In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

def preprocess_data(df):
    # One hot encode with a new class for missing values
    def one_hot_encode_with_missing(df, column, missing_value_class):
        df[column].fillna(missing_value_class, inplace=True)
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoded = encoder.fit_transform(df[[column]])
        columns = [f"{column}_{cat}" for cat in encoder.categories_[0]]
        df_encoded = pd.DataFrame(encoded, columns=columns, index=df.index)
        df = pd.concat([df.drop(columns=[column]), df_encoded], axis=1)
        return df

    # Fill missing values with 0
    def fill_missing_with_zero(df, columns):
        for column in columns:
            df[column].fillna(0, inplace=True)
        return df

    # Fill missing values with the most frequent class
    def fill_missing_with_most_frequent(df, column):
        most_frequent = df[column].mode()[0]
        df[column].fillna(most_frequent, inplace=True)
        return df

    # Columns to preprocess
    binary_columns = [
        'xyz_concern', 'xyz_knowledge', 'behavioral_antiviral_meds', 'behavioral_avoidance',
        'behavioral_face_mask', 'behavioral_wash_hands', 'behavioral_large_gatherings',
        'behavioral_outside_home', 'behavioral_touch_face', 'doctor_recc_xyz',
        'doctor_recc_seasonal', 'chronic_med_condition', 'child_under_6_months', 'health_worker',
        'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
        'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective', 'opinion_seas_risk',
        'opinion_seas_sick_from_vacc'
    ]

    # Handle 'employment_occupation' and 'employment_industry'
    df = one_hot_encode_with_missing(df, 'employment_occupation', 'Missing')
    df = one_hot_encode_with_missing(df, 'employment_industry', 'Missing')

    # Handle 'household_adults' and 'household_children'
    df = fill_missing_with_zero(df, ['household_adults', 'household_children'])

    # One hot encode 'hhs_geo_region' and 'census_msa'
    df = one_hot_encode_with_missing(df, 'hhs_geo_region', 'Missing')  # Treat missing values as a separate class
    df = one_hot_encode_with_missing(df, 'census_msa', 'Missing')  # Treat missing values as a separate class

    # Handle 'employment_status'
    df['employment_status'].fillna('Unemployed', inplace=True)
    df = one_hot_encode_with_missing(df, 'employment_status', 'Unemployed')

    # Handle 'rent_or_own'
    df = one_hot_encode_with_missing(df, 'rent_or_own', 'Missing')

    # Handle 'marital_status'
    df['marital_status'].fillna('Not Married', inplace=True)
    df = one_hot_encode_with_missing(df, 'marital_status', 'Not Married')

    # Handle 'income_poverty'
    df['income_poverty'].fillna('<= $75,000, Above Poverty', inplace=True)
    df = one_hot_encode_with_missing(df, 'income_poverty', '<= $75,000, Above Poverty')

    # One hot encode 'sex', 'race', and 'age_group'
    df = one_hot_encode_with_missing(df, 'sex', 'Missing')  # Treat missing values as a separate class
    df = one_hot_encode_with_missing(df, 'race', 'Missing')  # Treat missing values as a separate class
    df = one_hot_encode_with_missing(df, 'age_group', 'Missing')  # Treat missing values as a separate class

    # Handle 'education'
    most_frequent_education = df['education'].mode()[0]
    df['education'].fillna(most_frequent_education, inplace=True)
    df = one_hot_encode_with_missing(df, 'education', most_frequent_education)
    df = df.drop(columns=['respondent_id'])

    # Handle remaining binary columns
    for column in binary_columns:
        df = fill_missing_with_most_frequent(df, column)

    return df

def load_and_preprocess_data(features_csv, labels_csv, test_features_csv):
    # Load the data
    df_features = pd.read_csv(features_csv)
    df_labels = pd.read_csv(labels_csv)
    df_test_features = pd.read_csv(test_features_csv)

    # Preprocess the features
    df_features = preprocess_data(df_features)
    df_test_features = preprocess_data(df_test_features)

    return df_features, df_labels, df_test_features

def create_nn_model(input_dim):
    model = Sequential()
    model.add(Dense(128, activation='relu', input_dim=input_dim))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model

def train_nn_and_predict(features, labels, test_features):
    # Split the labels into the two target columns
    y_xyz_vaccine = labels['xyz_vaccine']
    y_seasonal_vaccine = labels['seasonal_vaccine']

    # Standardize the features
    scaler = StandardScaler()
    features = scaler.fit_transform(features)
    test_features = scaler.transform(test_features)


    # Create and train the model for xyz_vaccine
    model_xyz = create_nn_model(input_dim=features.shape[1])
    early_stopping = EarlyStopping(monitor='loss', patience=20, restore_best_weights=True)
    model_xyz.fit(features, y_xyz_vaccine, epochs=200, batch_size=32, callbacks=[early_stopping], verbose=2)
    final_accuracy_xyz = model_xyz.evaluate(features, y_xyz_vaccine, verbose=0)[1]
    print(f"Final Accuracy for xyz_vaccine: {final_accuracy_xyz:.4f}")

    # Create and train the model for seasonal_vaccine
    model_seasonal = create_nn_model(input_dim=features.shape[1])
    model_seasonal.fit(features, y_seasonal_vaccine, epochs=200, batch_size=32, callbacks=[early_stopping], verbose=2)
    final_accuracy_seasonal = model_seasonal.evaluate(features, y_seasonal_vaccine, verbose=0)[1]
    print(f"Final Accuracy for seasonal_vaccine: {final_accuracy_seasonal:.4f}")

    # Predict probabilities for the test set
    predictions_xyz = model_xyz.predict(test_features).flatten()
    predictions_seasonal = model_seasonal.predict(test_features).flatten()

    return predictions_xyz, predictions_seasonal

def create_submission_file(test_features_csv, predictions_xyz, predictions_seasonal, output_csv):
    # Load the test features to get respondent_ids
    df_test_features = pd.read_csv(test_features_csv)
    df_submission = pd.DataFrame({
        'respondent_id': df_test_features['respondent_id'],
        'xyz_vaccine': predictions_xyz,
        'seasonal_vaccine': predictions_seasonal
    })
    df_submission.to_csv(output_csv, index=False)

# Example usage:
features_csv = 'training_set_features.csv'
labels_csv = 'training_set_labels.csv'
test_features_csv = 'test_set_features.csv'
output_csv = 'output_nn.csv'

df_features, df_labels, df_test_features = load_and_preprocess_data(features_csv, labels_csv, test_features_csv)
predictions_xyz, predictions_seasonal = train_nn_and_predict(df_features, df_labels, df_test_features)
create_submission_file(test_features_csv, predictions_xyz, predictions_seasonal, output_csv)


Epoch 1/200


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


835/835 - 1s - 1ms/step - accuracy: 0.8129 - loss: 0.4325
Epoch 2/200
835/835 - 1s - 633us/step - accuracy: 0.8341 - loss: 0.3933
Epoch 3/200
835/835 - 1s - 617us/step - accuracy: 0.8377 - loss: 0.3819
Epoch 4/200
835/835 - 1s - 610us/step - accuracy: 0.8406 - loss: 0.3763
Epoch 5/200
835/835 - 1s - 615us/step - accuracy: 0.8418 - loss: 0.3741
Epoch 6/200
835/835 - 1s - 625us/step - accuracy: 0.8425 - loss: 0.3692
Epoch 7/200
835/835 - 1s - 617us/step - accuracy: 0.8443 - loss: 0.3661
Epoch 8/200
835/835 - 1s - 615us/step - accuracy: 0.8450 - loss: 0.3635
Epoch 9/200
835/835 - 1s - 620us/step - accuracy: 0.8478 - loss: 0.3591
Epoch 10/200
835/835 - 1s - 625us/step - accuracy: 0.8475 - loss: 0.3556
Epoch 11/200
835/835 - 1s - 621us/step - accuracy: 0.8502 - loss: 0.3538
Epoch 12/200
835/835 - 1s - 610us/step - accuracy: 0.8509 - loss: 0.3511
Epoch 13/200
835/835 - 1s - 633us/step - accuracy: 0.8523 - loss: 0.3486
Epoch 14/200
835/835 - 1s - 626us/step - accuracy: 0.8538 - loss: 0.3442
E