In [430]:
#Explanation of the code I've written, is given in the comments according to the cells 

#importing the given training and test datasets
import pandas as pd
import numpy as np
train_labels= pd.read_csv('training_set_labels.csv') # two labels ie, 2 outputs 
train_features=pd.read_csv('training_set_features.csv')
test_features=pd.read_csv('test_set_features.csv')

In [432]:
train_features.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [434]:
train_labels.head()

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,0,0,0
1,1,0,1
2,2,0,0
3,3,0,1
4,4,0,0


In [436]:
#other required libraries

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score


In [438]:
#respondent ids are similar in both i/p and target variables, so not much required so we drop it
#and sepearte out the two labels for the two vaccines
X = train_features.drop(columns=['respondent_id'])
y_xyz = train_labels['xyz_vaccine']
y_seasonal = train_labels['seasonal_vaccine']

#dataframe contained datas with object datatype, so we categorize then into categorial features
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numerical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                                        ('scaler', StandardScaler())])

#transforming categorial features
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                                          ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, numerical_features),
                                               ('cat', categorical_transformer, categorical_features)])


In [440]:
X_train_preprocessed = preprocessor.fit_transform(train_features)

clf_xyz= LogisticRegression()
clf_seasonal=LogisticRegression()

#training
clf_xyz.fit(X_train_preprocessed, y_xyz)
clf_seasonal.fit(X_train_preprocessed, y_seasonal)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [442]:
test_features.shape

(26708, 36)

In [444]:
X_test = test_features.drop(columns=['respondent_id'])
X_test_preprocessed = preprocessor.transform(X_test)

# Predict probabilities
xyz_prob= clf_xyz.predict_proba(X_test_preprocessed)[:,1]
seasonal_prob=clf_seasonal.predict_proba(X_test_preprocessed)[:,1]


result = pd.DataFrame({'respondent_id': test_features['respondent_id'],
                           'xyz_vaccine': xyz_prob,
                           'seasonal_vaccine': seasonal_prob})

# Saving submission
result.to_csv('result.csv', index=False)

In [446]:
#evaluating the set using ROC AUC Score
#split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(train_features, train_labels, test_size=0.2, random_state=42)

X_train_preprocessed = preprocessor.fit_transform(X_train)
X_valid_preprocessed = preprocessor.transform(X_valid)

clf_xyz = LogisticRegression()
clf_seasonal = LogisticRegression()

clf_xyz.fit(X_train_preprocessed, y_train['xyz_vaccine'])
clf_seasonal.fit(X_train_preprocessed, y_train['seasonal_vaccine'])

#predict probabilities
vxyz_probs = clf_xyz.predict_proba(X_valid_preprocessed)[:, 1]
vseasonal_probs = clf_seasonal.predict_proba(X_valid_preprocessed)[:, 1]

#ROC AUC scores
roc_auc_xyz = roc_auc_score(y_valid['xyz_vaccine'], vxyz_probs)
roc_auc_seasonal = roc_auc_score(y_valid['seasonal_vaccine'], vseasonal_probs)

#mean ROC AUC score
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(f"ROC AUC for xyz_vaccine: {roc_auc_xyz}")
print(f"ROC AUC for seasonal_vaccine: {roc_auc_seasonal}")
print(f"Mean ROC AUC: {mean_roc_auc}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


ROC AUC for xyz_vaccine: 0.8313680256158132
ROC AUC for seasonal_vaccine: 0.8560714246904496
Mean ROC AUC: 0.8437197251531314


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
