In [17]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

def preprocess_data(df):
    # One hot encode with a new class for missing values
    def one_hot_encode_with_missing(df, column, missing_value_class):
        df[column].fillna(missing_value_class, inplace=True)
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoded = encoder.fit_transform(df[[column]])
        columns = [f"{column}_{cat}" for cat in encoder.categories_[0]]
        df_encoded = pd.DataFrame(encoded, columns=columns, index=df.index)
        df = pd.concat([df.drop(columns=[column]), df_encoded], axis=1)
        return df

    # Fill missing values with 0
    def fill_missing_with_zero(df, columns):
        for column in columns:
            df[column].fillna(0, inplace=True)
        return df

    # Fill missing values with the most frequent class
    def fill_missing_with_most_frequent(df, column):
        most_frequent = df[column].mode()[0]
        df[column].fillna(most_frequent, inplace=True)
        return df

    # Columns to preprocess
    binary_columns = [
        'xyz_concern', 'xyz_knowledge', 'behavioral_antiviral_meds', 'behavioral_avoidance',
        'behavioral_face_mask', 'behavioral_wash_hands', 'behavioral_large_gatherings',
        'behavioral_outside_home', 'behavioral_touch_face', 'doctor_recc_xyz',
        'doctor_recc_seasonal', 'chronic_med_condition', 'child_under_6_months', 'health_worker',
        'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
        'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective', 'opinion_seas_risk',
        'opinion_seas_sick_from_vacc'
    ]

    # Handle 'employment_occupation' and 'employment_industry'
    df = one_hot_encode_with_missing(df, 'employment_occupation', 'Missing')
    df = one_hot_encode_with_missing(df, 'employment_industry', 'Missing')

    # Handle 'household_adults' and 'household_children'
    df = fill_missing_with_zero(df, ['household_adults', 'household_children'])

    # One hot encode 'hhs_geo_region' and 'census_msa'
    df = one_hot_encode_with_missing(df, 'hhs_geo_region', 'Missing')  # Treat missing values as a separate class
    df = one_hot_encode_with_missing(df, 'census_msa', 'Missing')  # Treat missing values as a separate class

    # Handle 'employment_status'
    df['employment_status'].fillna('Unemployed', inplace=True)
    df = one_hot_encode_with_missing(df, 'employment_status', 'Unemployed')

    # Handle 'rent_or_own'
    df = one_hot_encode_with_missing(df, 'rent_or_own', 'Missing')

    # Handle 'marital_status'
    df['marital_status'].fillna('Not Married', inplace=True)
    df = one_hot_encode_with_missing(df, 'marital_status', 'Not Married')

    # Handle 'income_poverty'
    df['income_poverty'].fillna('<= $75,000, Above Poverty', inplace=True)
    df = one_hot_encode_with_missing(df, 'income_poverty', '<= $75,000, Above Poverty')

    # One hot encode 'sex', 'race', and 'age_group'
    df = one_hot_encode_with_missing(df, 'sex', 'Missing')  # Treat missing values as a separate class
    df = one_hot_encode_with_missing(df, 'race', 'Missing')  # Treat missing values as a separate class
    df = one_hot_encode_with_missing(df, 'age_group', 'Missing')  # Treat missing values as a separate class

    # Handle 'education'
    most_frequent_education = df['education'].mode()[0]
    df['education'].fillna(most_frequent_education, inplace=True)
    df = one_hot_encode_with_missing(df, 'education', most_frequent_education)
    df = df.drop(columns=['respondent_id'])

    # Handle remaining binary columns
    for column in binary_columns:
        df = fill_missing_with_most_frequent(df, column)

    return df

def load_and_preprocess_data(features_csv, labels_csv, test_features_csv):
    # Load the data
    df_features = pd.read_csv(features_csv)
    df_labels = pd.read_csv(labels_csv)
    df_test_features = pd.read_csv(test_features_csv)

    # Preprocess the features
    df_features = preprocess_data(df_features)
    df_test_features = preprocess_data(df_test_features)

    return df_features, df_labels, df_test_features

def train_rf_and_predict(features, labels, test_features):
    # Split the labels into the two target columns
    y_xyz_vaccine = labels['xyz_vaccine']
    y_seasonal_vaccine = labels['seasonal_vaccine']

    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_features': ['sqrt', 'log2'],
        'max_depth': [10, 12,16,18],
        'criterion': ['gini', 'entropy']
    }

    # Train Random Forest for xyz_vaccine
    rf_xyz = RandomForestClassifier()
    grid_search_xyz = GridSearchCV(rf_xyz, param_grid, refit=True, verbose=3, cv=5, n_jobs=-1)
    grid_search_xyz.fit(features, y_xyz_vaccine)
    best_model_xyz = grid_search_xyz.best_estimator_

    # Print the best parameters and best score for xyz_vaccine
    print("Best Parameters for xyz_vaccine:", grid_search_xyz.best_params_)
    print("Best Score for xyz_vaccine:", grid_search_xyz.best_score_)

    # Train Random Forest for seasonal_vaccine
    rf_seasonal = RandomForestClassifier()
    grid_search_seasonal = GridSearchCV(rf_seasonal, param_grid, refit=True, verbose=3, cv=5, n_jobs=-1)
    grid_search_seasonal.fit(features, y_seasonal_vaccine)
    best_model_seasonal = grid_search_seasonal.best_estimator_

    # Print the best parameters and best score for seasonal_vaccine
    print("Best Parameters for seasonal_vaccine:", grid_search_seasonal.best_params_)
    print("Best Score for seasonal_vaccine:", grid_search_seasonal.best_score_)

    # Predict probabilities for the test set
    predictions_xyz = best_model_xyz.predict_proba(test_features)[:, 1]
    predictions_seasonal = best_model_seasonal.predict_proba(test_features)[:, 1]

    return predictions_xyz, predictions_seasonal

def create_submission_file(test_features_csv, predictions_xyz, predictions_seasonal, output_csv):
    # Load the test features to get respondent_ids
    df_test_features = pd.read_csv(test_features_csv)
    df_submission = pd.DataFrame({
        'respondent_id': df_test_features['respondent_id'],
        'xyz_vaccine': predictions_xyz,
        'seasonal_vaccine': predictions_seasonal
    })
    df_submission.to_csv(output_csv, index=False)

# Example usage:
features_csv = 'training_set_features.csv'
labels_csv = 'training_set_labels.csv'
test_features_csv = 'test_set_features.csv'
output_csv = 'output67890.csv'

df_features, df_labels, df_test_features = load_and_preprocess_data(features_csv, labels_csv, test_features_csv)
predictions_xyz, predictions_seasonal = train_rf_and_predict(df_features, df_labels, df_test_features)
create_submission_file(test_features_csv, predictions_xyz, predictions_seasonal, output_csv)


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters for xyz_vaccine: {'criterion': 'entropy', 'max_depth': 16, 'max_features': 'sqrt', 'n_estimators': 200}
Best Score for xyz_vaccine: 0.8365598422690445
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters for seasonal_vaccine: {'criterion': 'entropy', 'max_depth': 18, 'max_features': 'sqrt', 'n_estimators': 300}
Best Score for seasonal_vaccine: 0.7825665782337926


In [18]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder

def preprocess_data(df):
    def one_hot_encode_with_missing(df, column, missing_value_class):
        df[column].fillna(missing_value_class, inplace=True)
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoded = encoder.fit_transform(df[[column]])
        columns = [f"{column}_{cat}".replace("[", "").replace("]", "").replace("<", "") for cat in encoder.categories_[0]]
        df_encoded = pd.DataFrame(encoded, columns=columns, index=df.index)
        df = pd.concat([df.drop(columns=[column]), df_encoded], axis=1)
        return df

    def fill_missing_with_zero(df, columns):
        for column in columns:
            df[column].fillna(0, inplace=True)
        return df

    def fill_missing_with_most_frequent(df, column):
        most_frequent = df[column].mode()[0]
        df[column].fillna(most_frequent, inplace=True)
        return df

    binary_columns = [
        'xyz_concern', 'xyz_knowledge', 'behavioral_antiviral_meds', 'behavioral_avoidance',
        'behavioral_face_mask', 'behavioral_wash_hands', 'behavioral_large_gatherings',
        'behavioral_outside_home', 'behavioral_touch_face', 'doctor_recc_xyz',
        'doctor_recc_seasonal', 'chronic_med_condition', 'child_under_6_months', 'health_worker',
        'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
        'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective', 'opinion_seas_risk',
        'opinion_seas_sick_from_vacc'
    ]

    df = one_hot_encode_with_missing(df, 'employment_occupation', 'Missing')
    df = one_hot_encode_with_missing(df, 'employment_industry', 'Missing')
    df = fill_missing_with_zero(df, ['household_adults', 'household_children'])
    df = one_hot_encode_with_missing(df, 'hhs_geo_region', 'Missing')
    df = one_hot_encode_with_missing(df, 'census_msa', 'Missing')
    df['employment_status'].fillna('Unemployed', inplace=True)
    df = one_hot_encode_with_missing(df, 'employment_status', 'Unemployed')
    df = one_hot_encode_with_missing(df, 'rent_or_own', 'Missing')
    df['marital_status'].fillna('Not Married', inplace=True)
    df = one_hot_encode_with_missing(df, 'marital_status', 'Not Married')
    df['income_poverty'].fillna('<= $75,000, Above Poverty', inplace=True)
    df = one_hot_encode_with_missing(df, 'income_poverty', '<= $75,000, Above Poverty')
    df = one_hot_encode_with_missing(df, 'sex', 'Missing')
    df = one_hot_encode_with_missing(df, 'race', 'Missing')
    df = one_hot_encode_with_missing(df, 'age_group', 'Missing')
    most_frequent_education = df['education'].mode()[0]
    df['education'].fillna(most_frequent_education, inplace=True)
    df = one_hot_encode_with_missing(df, 'education', most_frequent_education)
    df = df.drop(columns=['respondent_id'])

    for column in binary_columns:
        df = fill_missing_with_most_frequent(df, column)

    return df

def load_and_preprocess_data(features_csv, labels_csv, test_features_csv):
    df_features = pd.read_csv(features_csv)
    df_labels = pd.read_csv(labels_csv)
    df_test_features = pd.read_csv(test_features_csv)

    df_features = preprocess_data(df_features)
    df_test_features = preprocess_data(df_test_features)

    return df_features, df_labels, df_test_features

def train_xgb_and_predict(features, labels, test_features):
    y_xyz_vaccine = labels['xyz_vaccine']
    y_seasonal_vaccine = labels['seasonal_vaccine']

    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.7, 0.8, 0.9],
        'colsample_bytree': [0.7, 0.8, 0.9],
        'gamma': [0, 0.1, 0.2]
    }

    xgb_xyz = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    grid_search_xyz = GridSearchCV(xgb_xyz, param_grid, refit=True, verbose=3, cv=5, n_jobs=-1)
    grid_search_xyz.fit(features, y_xyz_vaccine)
    best_model_xyz = grid_search_xyz.best_estimator_

    print("Best Parameters for xyz_vaccine:", grid_search_xyz.best_params_)
    print("Best Score for xyz_vaccine:", grid_search_xyz.best_score_)

    xgb_seasonal = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    grid_search_seasonal = GridSearchCV(xgb_seasonal, param_grid, refit=True, verbose=3, cv=5, n_jobs=-1)
    grid_search_seasonal.fit(features, y_seasonal_vaccine)
    best_model_seasonal = grid_search_seasonal.best_estimator_

    print("Best Parameters for seasonal_vaccine:", grid_search_seasonal.best_params_)
    print("Best Score for seasonal_vaccine:", grid_search_seasonal.best_score_)

    predictions_xyz = best_model_xyz.predict_proba(test_features)[:, 1]
    predictions_seasonal = best_model_seasonal.predict_proba(test_features)[:, 1]

    return predictions_xyz, predictions_seasonal

def create_submission_file(test_features_csv, predictions_xyz, predictions_seasonal, output_csv):
    df_test_features = pd.read_csv(test_features_csv)
    df_submission = pd.DataFrame({
        'respondent_id': df_test_features['respondent_id'],
        'xyz_vaccine': predictions_xyz,
        'seasonal_vaccine': predictions_seasonal
    })
    df_submission.to_csv(output_csv, index=False)

# Example usage:
features_csv = 'training_set_features.csv'
labels_csv = 'training_set_labels.csv'
test_features_csv = 'test_set_features.csv'
output_csv = 'output12345.csv'

df_features, df_labels, df_test_features = load_and_preprocess_data(features_csv, labels_csv, test_features_csv)
predictions_xyz, predictions_seasonal = train_xgb_and_predict(df_features, df_labels, df_test_features)
create_submission_file(test_features_csv, predictions_xyz, predictions_seasonal, output_csv)


Fitting 5 folds for each of 729 candidates, totalling 3645 fits
Best Parameters for xyz_vaccine: {'colsample_bytree': 0.7, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.7}
Best Score for xyz_vaccine: 0.8412028800886258
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
Best Parameters for seasonal_vaccine: {'colsample_bytree': 0.9, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.7}
Best Score for seasonal_vaccine: 0.7883327488356603


In [15]:
df_features.head(17899)

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,race_White,age_group_18 - 34 Years,age_group_35 - 44 Years,age_group_45 - 54 Years,age_group_55 - 64 Years,age_group_65+ Years,education_12 Years,education_ 12 Years,education_College Graduate,education_Some College
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17894,2.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
17895,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
17896,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
17897,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
