In [None]:
%matplotlib inline

from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [None]:
def _read_indexed_csv(filename):
    """Read one competition CSV, using its respondent_id column as the row index."""
    return pd.read_csv(filename, index_col="respondent_id")


# Training inputs and their labels.
training_set_features_df = _read_indexed_csv("training_set_features.csv")
training_set_labels_df = _read_indexed_csv("training_set_labels.csv")

# Inputs to predict on, plus the template frame the predictions are written into.
test_set_features_df = _read_indexed_csv("test_set_features.csv")
submission_df = _read_indexed_csv("submission_format.csv")

# Left as the final expression so the notebook renders the training features.
training_set_features_df

In [None]:
from sklearn.preprocessing import StandardScaler

# Categorical columns treated as ordered: each value is mapped to an integer
# code by OrdinalEncoder and the codes are then standardised.
# NOTE(review): OrdinalEncoder() with its default categories assigns codes in
# sorted order of the observed values, which may not match the intended ranking
# of these columns -- confirm, or pass an explicit `categories=` list.
hierarchical_categories = ["age_group", "education", "sex", "income_poverty", "marital_status", "rent_or_own"]
hierarchical_encoding = Pipeline(
    steps=[
        ('ordinal_encoder', OrdinalEncoder()),
        ('standard_scaler', StandardScaler()),
    ]
)

# Categorical columns with no natural order: one-hot encoded into a dense array.
non_hierarchical_categories = ["employment_industry", "employment_occupation", "race", "employment_status", "hhs_geo_region", "census_msa"]
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. Prefer the current name and fall back to the old one so the
# cell runs on both older and newer releases.
try:
    non_hierarchical_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    non_hierarchical_encoder = OneHotEncoder(sparse=False)

In [None]:
from sklearn.svm import SVC

# MultiOutputClassifier fits one copy of the wrapped estimator per target column.
# probability=True is needed so each SVC exposes predict_proba.
svclassifier = MultiOutputClassifier(
    estimator=SVC(
        kernel='linear',
        probability=True,
        random_state=31415,
    )
)

In [5]:
%%time
from sklearn.cluster import DBSCAN
encoder = ColumnTransformer(
    transformers = [
        ("non_hierarchical", non_hierarchical_encoder, non_hierarchical_categories),
        ("hierarchical", hierarchical_encoding, hierarchical_categories),
    ], 
    remainder='passthrough'
)
    
complete_pipeline = Pipeline(
    steps=[
        ("encoding", encoder),
        ("replace_nan", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("classify", svclassifier)
    ]
)

cross_val_score(complete_pipeline, training_set_features_df, training_set_labels_df, cv=5, scoring='roc_auc')

In [None]:
%%time

complete_pipeline.fit(training_set_features_df, training_set_labels_df)
test_set_predictions = complete_pipeline.predict_proba(test_set_features_df)

submission_df["h1n1_vaccine"] = test_set_predictions[0][:, 1]
submission_df["seasonal_vaccine"] = test_set_predictions[1][:, 1]

submission_df.to_csv(f"submission_{int(datetime.now().timestamp())}.csv", index=True)

submission_df.head()