In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Load datasets
train_features = pd.read_csv('C:/Users/apurv/OneDrive/Desktop/Data Science/Summer Analytics IIT Guwahati/Hackathon 3/dataset and all/training_set_features.csv')
test_features = pd.read_csv('C:/Users/apurv/OneDrive/Desktop/Data Science/Summer Analytics IIT Guwahati/Hackathon 3/dataset and all/test_set_features.csv')
train_labels = pd.read_csv('C:/Users/apurv/OneDrive/Desktop/Data Science/Summer Analytics IIT Guwahati/Hackathon 3/dataset and all/training_set_labels.csv')



In [2]:
# Debugging: Print the column names to check for 'respondent_id'
print("Train Features Columns:", train_features.columns)
print("Train Labels Columns:", train_labels.columns)

Train Features Columns: Index(['respondent_id', 'xyz_concern', 'xyz_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children', 'employment_industry',
       'employment_occupation'],
      dtype='object')
Train Labels Columns: Index(['    respondent_id', 'xyz_vaccine', 'seasonal_vaccine'], dtype='object')

In [3]:
# Ensure column names have no leading or trailing spaces
train_features.columns = train_features.columns.str.strip()
train_labels.columns = train_labels.columns.str.strip()

In [4]:
# Debugging: Print the column names to check for 'respondent_id'
print("Train Features Columns:", train_features.columns)
print("Train Labels Columns:", train_labels.columns)

Train Features Columns: Index(['respondent_id', 'xyz_concern', 'xyz_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children', 'employment_industry',
       'employment_occupation'],
      dtype='object')
Train Labels Columns: Index(['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'], dtype='object')


In [5]:
# Merge training features with labels
train_df = pd.merge(train_features, train_labels, on='respondent_id')


# Separate features and target variables
X_train = train_df.drop(['xyz_vaccine', 'seasonal_vaccine'], axis=1)
y_train_xyz = train_df['xyz_vaccine']
y_train_seasonal = train_df['seasonal_vaccine']

In [6]:
# Define preprocessing steps for numerical and categorical features
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Handle missing values with median
    ('scaler', StandardScaler())  # Standardize numerical features
])

# For categorical features, assuming they are all binary and need encoding
categorical_features = X_train.select_dtypes(include=['object']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing values with most frequent value
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [7]:
# Define the models
model_xyz = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', random_state=42))
])

model_seasonal = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', random_state=42))
])

# Train the models
model_xyz.fit(X_train, y_train_xyz)
model_seasonal.fit(X_train, y_train_seasonal)


In [8]:
# Predict probabilities for test set
X_test = test_features  # Assuming test_features.csv contains test data
xyz_vaccine_probs = model_xyz.predict_proba(X_test)[:, 1]
seasonal_vaccine_probs = model_seasonal.predict_proba(X_test)[:, 1]

# Create submission DataFrame
submission_df = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': xyz_vaccine_probs,
    'seasonal_vaccine': seasonal_vaccine_probs
})

# Save predictions to a CSV file
submission_df.to_csv('Vaccine_predictor.csv', index=False)


In [9]:
submission_df

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,26707,0.055405,0.287941
1,26708,0.046782,0.044357
2,26709,0.411043,0.585090
3,26710,0.497725,0.878421
4,26711,0.162142,0.463299
...,...,...,...
26703,53410,0.342396,0.518435
26704,53411,0.101731,0.271867
26705,53412,0.133189,0.188208
26706,53413,0.061406,0.352923


In [10]:
from IPython.display import FileLink

# Provide a link to download the file
FileLink('Vaccine_predictor.csv')