In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier


In [9]:

# Load the training set features
train_data = pd.read_csv('training_set_features.csv')

# Load the training set labels
train_labels = pd.read_csv('training_set_labels.csv')

# Load the test set features
test_data = pd.read_csv('test_set_features.csv')

# Load the submission format
submission_format = pd.read_csv('submission_format.csv')


In [10]:
# Display the first few rows of each dataset
print("Training Set Features:")
print(train_data.head())

print("\nTraining Set Labels:")
print(train_labels.head())

print("\nTest Set Features:")
print(test_data.head())

print("\nSubmission Format:")
print(submission_format.head())

# Check for missing values in each dataset
print("\nMissing Values in Training Set Features:")
print(train_data.isnull().sum())

print("\nMissing Values in Training Set Labels:")
print(train_labels.isnull().sum())

print("\nMissing Values in Test Set Features:")
print(test_data.isnull().sum())



Training Set Features:
   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              0          1.0            0.0                        0.0   
1              1          3.0            2.0                        0.0   
2              2          1.0            1.0                        0.0   
3              3          1.0            1.0                        0.0   
4              4          2.0            1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0    

In [27]:
# Define categorical and numerical features based on your dataset
categorical_features = ['age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa', 'employment_industry', 'employment_occupation']
numerical_features = ['household_adults', 'household_children']

# Check if columns exist and handle missing columns
for feature in categorical_features + numerical_features:
    if feature not in train_data.columns:
        print(f"Column {feature} is missing in train_data")

# Impute missing values
imputer = SimpleImputer(strategy='most_frequent')
train_data[categorical_features + numerical_features] = imputer.fit_transform(train_data[categorical_features + numerical_features])
test_data[categorical_features + numerical_features] = imputer.transform(test_data[categorical_features + numerical_features])

# Encode categorical variables
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_train_features = encoder.fit_transform(train_data[categorical_features])
encoded_test_features = encoder.transform(test_data[categorical_features])

# Normalize numerical features
scaler = StandardScaler()
scaled_train_features = scaler.fit_transform(train_data[numerical_features])
scaled_test_features = scaler.fit_transform(test_data[numerical_features])

# Combine preprocessed features
X_train = np.hstack((encoded_train_features, scaled_train_features))
X_test = np.hstack((encoded_test_features, scaled_test_features))

# Extract target variables
y_train = train_labels[['xyz_vaccine', 'seasonal_vaccine']].values






In [12]:
# Display the column names of the training set features
print(train_data.columns)





Index(['respondent_id', 'xyz_concern', 'xyz_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children', 'employment_industry',
       'employment_occupation'],
      dtype='object')


In [22]:
# Display the column names of the training set labels
print(train_labels.columns)







Index(['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'], dtype='object')


In [24]:

# Initialize the model
model = MultiOutputClassifier(LogisticRegression(max_iter=1000), n_jobs=-1)

# Train the model
model.fit(X_train, y_train)


In [30]:
# Predict probabilities on test data
test_predictions = model.predict_proba(X_test)

# Extract the probabilities for the positive class (class 1)
xyz_vaccine_probs = test_predictions[0][:, 1]
seasonal_vaccine_probs = test_predictions[1][:, 1]



In [33]:
# Display a sample of the predictions
print("xyz Vaccine Probabilities:", xyz_vaccine_probs[:10])
print("Seasonal Vaccine Probabilities:", seasonal_vaccine_probs[:10])


xyz Vaccine Probabilities: [0.14305299 0.15720037 0.2154799  0.2176365  0.11296255 0.51142761
 0.24079305 0.21394798 0.24559814 0.27674042]
Seasonal Vaccine Probabilities: [0.37015785 0.1971173  0.37489668 0.66876516 0.28979249 0.72554615
 0.7202547  0.42236753 0.51808837 0.73353789]


In [35]:
# Prepare submission file
submission = pd.DataFrame({
    'respondent_id': test_data['respondent_id'],
    'xyz_vaccine': xyz_vaccine_probs,
    'seasonal_vaccine': seasonal_vaccine_probs
})

# Save to CSV
submission.to_csv('submission.csv', index=False)

# Confirm that the file is saved and display the first few rows
print(submission.head())


   respondent_id  xyz_vaccine  seasonal_vaccine
0          26707     0.143053          0.370158
1          26708     0.157200          0.197117
2          26709     0.215480          0.374897
3          26710     0.217636          0.668765
4          26711     0.112963          0.289792
