In [59]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LinearRegression

In [60]:
# Load the datasets
train_features = pd.read_csv('training_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')
test_features = pd.read_csv('test_set_features.csv')

In [61]:
# Identify categorical and numerical columns
categorical_features = train_features.select_dtypes(include=['object']).columns.tolist()
numeric_features = train_features.select_dtypes(include=[np.number]).columns.tolist()

In [62]:
# Remove 'respondent_id' from the features as it's not a predictive attribute
numeric_features.remove('respondent_id')

In [63]:
# Split the training set into features and labels (excluding 'respondent_id')
X_train = train_features.drop('respondent_id', axis=1)
y_train_xyz = train_labels['xyz_vaccine']
y_train_seasonal = train_labels['seasonal_vaccine']

In [64]:
# Preprocessing for numerical and categorical data

# changing the default BayesianRidge to LineraRegression to converge better
preprocessor = ColumnTransformer(
    transformers=[
        ('num', IterativeImputer(estimator=LinearRegression(),max_iter=30), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

In [65]:
# Define the models that can handle missing values directly
model_xyz = GradientBoostingClassifier(n_estimators=100, random_state=42)
model_seasonal = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Create pipelines for each target variable
pipeline_xyz = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', model_xyz)])
pipeline_seasonal = Pipeline(steps=[('preprocessor', preprocessor),
                                    ('model', model_seasonal)])

In [66]:
# Splitting the training data for validation purposes
X_train_split, X_val_split, y_train_split_xyz, y_val_split_xyz = train_test_split(X_train, y_train_xyz, test_size=0.2, random_state=42)
X_train_split, X_val_split, y_train_split_seasonal, y_val_split_seasonal = train_test_split(X_train, y_train_seasonal, test_size=0.2, random_state=42)

In [67]:
# Train the models on the training split
pipeline_xyz.fit(X_train_split, y_train_split_xyz)
pipeline_seasonal.fit(X_train_split, y_train_split_seasonal)

In [68]:
# Evaluate the models on the validation split
val_predictions_xyz = pipeline_xyz.predict_proba(X_val_split)[:, 1]
val_predictions_seasonal = pipeline_seasonal.predict_proba(X_val_split)[:, 1]
print(val_predictions_xyz)
print(val_predictions_seasonal)

[0.12844169 0.14161048 0.08930988 ... 0.73160499 0.06811546 0.04014181]
[0.20425618 0.18552783 0.83706536 ... 0.63588354 0.55373301 0.66101469]


In [69]:
roc_auc_score(y_val_split_xyz, val_predictions_xyz)
roc_auc_score(y_val_split_seasonal, val_predictions_seasonal)
print(roc_auc_score)

<function roc_auc_score at 0x0000026EBD2C8CC0>


In [70]:
# Predict on test set (excluding 'respondent_id' from features)
test_predictions_xyz = pipeline_xyz.predict_proba(test_features.drop('respondent_id', axis=1))[:, 1]
test_predictions_seasonal = pipeline_seasonal.predict_proba(test_features.drop('respondent_id', axis=1))[:, 1]
print(test_predictions_xyz)
print(test_predictions_seasonal)

[0.11918968 0.03431236 0.16458618 ... 0.14693432 0.03810482 0.58904286]
[0.24963985 0.05354874 0.57514252 ... 0.2698271  0.26473242 0.68953029]


In [75]:
# Prepare submission file
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': test_predictions_xyz,
    'seasonal_vaccine': test_predictions_seasonal
})
submission.to_csv('C:/Users/anwes/Downloads/dataset and all/submission.csv', index=False)