In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Folder that holds the three competition CSV files.
# Set the VACCINE_DATA_DIR environment variable to point the notebook at a
# different location without editing this cell; when it is unset, the original
# local folder is used so existing runs keep working unchanged.
DATA_DIR = os.environ.get('VACCINE_DATA_DIR', r'C:\Users\aman\Downloads\dataset and all')

# Plain string paths (not Path objects) so downstream cells see the same type as before.
features_path = os.path.join(DATA_DIR, 'training_set_features.csv')
labels_path = os.path.join(DATA_DIR, 'training_set_labels.csv')
test_features_path = os.path.join(DATA_DIR, 'test_set_features.csv')


In [7]:
# Read the training features, the training labels and the competition test
# features from their configured CSV locations, in that order.
features_df, labels_df, test_features_df = (
    pd.read_csv(csv_path)
    for csv_path in (features_path, labels_path, test_features_path)
)

In [9]:
# Attach the labels to the feature rows that share the same respondent_id
data = features_df.merge(labels_df, on='respondent_id')

# The two label columns become the targets; every other column except the
# respondent_id join key is kept as model input.
target_cols = ['xyz_vaccine', 'seasonal_vaccine']
y = data[target_cols]
X = data.drop(columns=['respondent_id', *target_cols])

In [14]:
# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, every remaining column as numerical.
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

# Numerical branch: fill missing values with the column median, then standardize
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [11]:
# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode (handle_unknown='ignore' so unseen categories do not raise
# at transform time).
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [12]:
# Hold out 20% of the labelled rows for evaluation; the seed fixes the split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the preprocessing on the training split only, then reuse the fitted
# transformer on the hold-out split.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# One independent logistic regression per target column
model_xyz = LogisticRegression(max_iter=1000)
model_xyz.fit(X_train, y_train['xyz_vaccine'])

model_seasonal = LogisticRegression(max_iter=1000)
model_seasonal.fit(X_train, y_train['seasonal_vaccine'])

In [13]:
# XYZ vaccine: take column index 1 of predict_proba (the second entry of the
# model's classes_) on the hold-out split and score it with ROC AUC.
y_pred_prob_xyz = model_xyz.predict_proba(X_test)[:, 1]
auc_xyz = roc_auc_score(y_test['xyz_vaccine'], y_pred_prob_xyz)

# Seasonal vaccine: same evaluation with its own model and label column
y_pred_prob_seasonal = model_seasonal.predict_proba(X_test)[:, 1]
auc_seasonal = roc_auc_score(y_test['seasonal_vaccine'], y_pred_prob_seasonal)

# Unweighted average of the two per-target scores
mean_auc = np.mean([auc_xyz, auc_seasonal])

print(f'AUC for XYZ Vaccine: {auc_xyz}')
print(f'AUC for Seasonal Vaccine: {auc_seasonal}')
print(f'Mean AUC: {mean_auc}')

AUC for XYZ Vaccine: 0.831351217339418
AUC for Seasonal Vaccine: 0.8560669086421782
Mean AUC: 0.8437090629907981


In [15]:
# Run the competition test features through the already-fitted preprocessor
X_test_new = preprocessor.transform(test_features_df)

# Score every test row with each model, keeping column index 1 of predict_proba
new_pred_prob_xyz = model_xyz.predict_proba(X_test_new)[:, 1]
new_pred_prob_seasonal = model_seasonal.predict_proba(X_test_new)[:, 1]

# Submission table: respondent ids followed by one probability column per target
submission_df = test_features_df[['respondent_id']].assign(
    xyz_vaccine=new_pred_prob_xyz,
    seasonal_vaccine=new_pred_prob_seasonal,
)

# Preview the first rows
print(submission_df.head())

# Write the table to disk without the index column
submission_df.to_csv('submission.csv', index=False)

   respondent_id  xyz_vaccine  seasonal_vaccine
0          26707     0.050005          0.296906
1          26708     0.046367          0.046432
2          26709     0.367194          0.515821
3          26710     0.514111          0.881469
4          26711     0.150119          0.457567
