In [2]:
# importing the required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Load the training features, the training labels and the test features.
# The paths are relative, so the CSV files are looked up in the current
# working directory.
training_set_features, training_set_labels, test_set_features = map(
    pd.read_csv,
    (
        'training_set_features.csv',
        'training_set_labels.csv',
        'test_set_features.csv',
    ),
)

In [4]:
# Separate the model inputs (X) from the two target columns (y).
# `respondent_id` is a row identifier, so it is dropped from the inputs.
X = training_set_features.drop(columns=['respondent_id'])

# The features and the labels are read from two separate CSV files. Pairing
# them purely by row position would silently attach labels to the wrong
# respondents if the files were ever ordered differently, so when the labels
# carry `respondent_id` each feature row's labels are looked up by id instead.
# `.loc` raises KeyError if a respondent in the features has no label row.
if 'respondent_id' in training_set_labels.columns:
    y = training_set_labels.set_index('respondent_id').loc[
        training_set_features['respondent_id'].to_numpy(),
        ['xyz_vaccine', 'seasonal_vaccine'],
    ]
    id_aligned = True
else:
    # No id column to join on: fall back to positional pairing.
    y = training_set_labels[['xyz_vaccine', 'seasonal_vaccine']]
    id_aligned = False

# Duplicate ids in the labels (or files of different length) would make the
# two tables disagree on row count; fail loudly rather than train on garbage.
if len(y) != len(X):
    raise ValueError(
        f'features have {len(X)} rows but labels have {len(y)} rows; '
        'cannot pair respondents with their labels'
    )

if id_aligned:
    # The lookup indexed y by respondent_id; give it the same row index as X
    # so the two frames line up by label as well as by position.
    y.index = X.index

In [5]:
# Splitting the data: hold out 20% of the rows as a validation set.
# The fixed random_state makes the split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# One-hot encode the object-dtype (categorical) columns.
# The encoder is fitted on the training split only; categories that first
# appear in later data are ignored (encoded as all zeros) instead of raising.
#
# NOTE(review): only the object-dtype columns end up in X_train_encoded /
# X_val_encoded, and the models below are fitted on those matrices alone, so
# any non-object columns in X are never seen by the classifiers. Adding them
# would also require changing the test-set preprocessing cell.
categorical_cols = X_train.select_dtypes(include=['object']).columns
encoder = OneHotEncoder(handle_unknown='ignore').fit(X_train[categorical_cols])
X_train_encoded, X_val_encoded = (
    encoder.transform(split[categorical_cols]) for split in (X_train, X_val)
)

In [7]:
# Training: fit one independent linear SVM per target column on the one-hot
# encoded training features. Both models use identical hyperparameters;
# `fit` returns the fitted estimator, so each model is built and trained in
# a single expression.
svc_xyz, svc_seasonal = (
    LinearSVC(random_state=42, dual=False, max_iter=10000).fit(
        X_train_encoded, y_train[target]
    )
    for target in ('xyz_vaccine', 'seasonal_vaccine')
)

In [8]:
# Score the validation set with each model. decision_function returns the
# signed margin for every row; these are ranking scores, not class labels.
y_val_pred_xyz, y_val_pred_seasonal = (
    model.decision_function(X_val_encoded) for model in (svc_xyz, svc_seasonal)
)

In [9]:
# ROC AUC per target, computed from the validation labels and the margins.
roc_auc_xyz = roc_auc_score(
    y_true=y_val['xyz_vaccine'],
    y_score=y_val_pred_xyz,
)
roc_auc_seasonal = roc_auc_score(
    y_true=y_val['seasonal_vaccine'],
    y_score=y_val_pred_seasonal,
)

In [10]:
# Average the two per-target ROC AUC values and report all three numbers,
# one per line.
mean_roc_auc = sum((roc_auc_xyz, roc_auc_seasonal)) / 2

print(
    '\n'.join(
        f'{caption} {value}'
        for caption, value in (
            ('ROC AUC for xyz_vaccine:', roc_auc_xyz),
            ('ROC AUC for seasonal_vaccine:', roc_auc_seasonal),
            ('Mean ROC AUC:', mean_roc_auc),
        )
    )
)

ROC AUC for xyz_vaccine: 0.6237075485969292
ROC AUC for seasonal_vaccine: 0.7130124709261751
Mean ROC AUC: 0.6683600097615522


In [11]:
# Preprocess the test set with the encoder that was fitted on the training
# split, using the same categorical columns, so the feature layout matches
# what the models were trained on.
test_set_encoded = encoder.transform(test_set_features.loc[:, categorical_cols])

In [12]:
# Score the test set with each fitted model (signed margins per row).
test_pred_xyz, test_pred_seasonal = (
    model.decision_function(test_set_encoded) for model in (svc_xyz, svc_seasonal)
)

In [13]:
# Build the submission table: respondent ids plus one score column per target.
# The logistic (sigmoid) function squashes each SVM margin into the 0-1 range.
# It is a monotone transform, so row ranking is unchanged; the values are
# scores shaped like probabilities rather than calibrated probabilities.
finalsubmission = test_set_features[['respondent_id']].assign(
    xyz_vaccine=1 / (1 + np.exp(-test_pred_xyz)),
    seasonal_vaccine=1 / (1 + np.exp(-test_pred_seasonal)),
)

In [14]:
# Write the submission to a CSV file in the working directory, leaving out
# the DataFrame's row index, then print a confirmation message.
finalsubmission.to_csv(path_or_buf='finalsubmission.csv', index=False)

print("Submission file created successfully!")

Submission file created successfully!
