In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:

# Read the CSV from a path relative to the current working directory.
DATA_PATH = "h1n1_vaccine_prediction.csv"
data = pd.read_csv(DATA_PATH)

In [3]:
# Drop the 'unique_id' column (presumably a row identifier with no predictive
# value -- confirm against the dataset description), then separate the
# predictors (X) from the target column 'h1n1_vaccine' (y).
#
# `errors='ignore'` keeps this cell idempotent: `data` is overwritten with the
# reduced frame, so without it a second run of the cell would raise KeyError
# because 'unique_id' is already gone.
data = data.drop(columns=['unique_id'], errors='ignore')
X = data.drop(columns=['h1n1_vaccine'])
y = data['h1n1_vaccine']


In [4]:
# Fill the missing entries of every column with that column's most frequent value.
# NOTE(review): the imputer is fit on all rows, i.e. before the train/test split
# that happens further down -- consider fitting it on the training rows only.
imputer = SimpleImputer(strategy='most_frequent')

# fit_transform returns a plain array. When X mixes numeric and string columns
# (as the object-dtype selection in the encoding step suggests) that array is
# object-typed, so a frame rebuilt from it would report *every* column as
# `object` and the later `select_dtypes(include=['object'])` would label-encode
# the numeric columns as well. Casting back to the original per-column dtypes
# keeps numeric columns numeric; the original index is preserved too.
X_imputed = pd.DataFrame(
    imputer.fit_transform(X),
    columns=X.columns,
    index=X.index,
).astype(X.dtypes.to_dict())

In [5]:
# Integer-encode every object-dtype column of X_imputed in place, keeping the
# fitted encoder for each column under that column's name.
categorical_cols = X_imputed.select_dtypes(include=['object']).columns
label_encoders = {name: LabelEncoder() for name in categorical_cols}
for col, encoder in label_encoders.items():
    X_imputed[col] = encoder.fit_transform(X_imputed[col])

In [6]:
# Standardize every column and wrap the result back into a DataFrame that
# carries the same column names as X_imputed.
# NOTE(review): the scaler is fit on all rows, before the train/test split below,
# so test-set statistics feed into the scaling -- consider fitting on the
# training partition only.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X_imputed)
X_scaled = pd.DataFrame(scaled_values, columns=X_imputed.columns)


In [7]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# NumPy versions of the four split parts (consumed by the KNN branch of the
# training loop).
X_train_np, X_test_np, y_train_np, y_test_np = (
    part.to_numpy() for part in (X_train, X_test, y_train, y_test)
)


In [9]:
# Candidate classifiers keyed by display name. Insertion order is the order in
# which they are trained and reported. Every estimator that accepts a
# random_state shares the same seed.
SEED = 42
models = dict([
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=SEED)),
    ('Decision Tree', DecisionTreeClassifier(random_state=SEED)),
    ('SVM', SVC(random_state=SEED)),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Random Forest', RandomForestClassifier(random_state=SEED)),
])

In [10]:
# Per-model evaluation metrics, keyed by the display name used in `models`
results = {}

# Fit each estimator, predict on the held-out split, and record its metrics
for model_name, model in models.items():
    if model_name == 'K-Nearest Neighbors':
        # KNN alone is trained and queried with the NumPy copies
        train_X, train_y, test_X = X_train_np, y_train_np, X_test_np
    else:
        # Every other estimator works directly on the pandas objects
        train_X, train_y, test_X = X_train, y_train, X_test

    model.fit(train_X, train_y)
    y_pred = model.predict(test_X)

    # Score the predictions against the held-out labels
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    results[model_name] = dict(accuracy=accuracy, classification_report=report)

In [11]:
# Print each model's stored accuracy and classification report, followed by a
# 50-dash divider line.
for model_name, result in results.items():
    print(
        f"Model: {model_name}",
        f"Accuracy: {result['accuracy']}",
        f"Classification Report:\n{result['classification_report']}",
        "-" * 50,
        sep="\n",
    )

Model: Logistic Regression
Accuracy: 0.8399475851740921
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.95      0.90      4212
           1       0.70      0.42      0.53      1130

    accuracy                           0.84      5342
   macro avg       0.78      0.69      0.71      5342
weighted avg       0.83      0.84      0.82      5342

--------------------------------------------------
Model: Decision Tree
Accuracy: 0.7444777236989891
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.82      0.83      4212
           1       0.41      0.46      0.43      1130

    accuracy                           0.74      5342
   macro avg       0.63      0.64      0.63      5342
weighted avg       0.76      0.74      0.75      5342

--------------------------------------------------
Model: SVM
Accuracy: 0.8388244103332085
Classification Report:
              precision    re