In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from plot import vaccination_rate_plot
import matplotlib.pyplot as plt
pd.set_option("display.max_columns", 100)

In [2]:
features_df = pd.read_csv(
    "training_set_features.csv", 
    index_col="respondent_id"
)
labels_df = pd.read_csv(
    "training_set_labels.csv", 
    index_col="respondent_id"
)

In [3]:
from sklearn.preprocessing import StandardScaler

## FILL NA
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder
##
from sklearn.compose import ColumnTransformer

from sklearn.multioutput import MultiOutputClassifier

from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, roc_auc_score

RANDOM_SEED = 6    # Set a random seed for reproducibility!
from sklearn import preprocessing
from dirty_cat import SuperVectorizer

## MODELS
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_regression
##
from sklearn.model_selection import GridSearchCV

In [4]:
parameters = {
    'hidden_layer_sizes': range(100,500,100),
    'activation' : [ 'tanh', 'relu'],
    'learning_rate_init': [0.001,0.002,0.003,0.004,0.005]
}


clf = GridSearchCV(estimator= MLPClassifier(random_state = RANDOM_SEED),
                   param_grid = parameters,
                   n_jobs=-1, 
                   scoring='roc_auc',
                   cv=5,
                   verbose=2, refit=True)

In [5]:
estimators = MultiOutputClassifier(
    #estimator=LogisticRegressionCV(cv=5, random_state=0)
    #estimator= xgb.XGBClassifier(objective="multi:softprob", random_state=RANDOM_SEED,num_class = 2,learning_rate= 0.01, n_estimators=300, min_child_weight= 10,gamma= 0.1, maxdepth= 3, subsample= 0.6)
    estimator = clf
    #estimator = CatBoostClassifier()
    #estimator = RandomForestClassifier()
    #estimator = SVC()
    #estimator = MLPClassifier()
)

In [6]:
preprocessor = SuperVectorizer()
#preprocessor = OneHotEncoder(handle_unknown='ignore')


full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ('simple_imputer', SimpleImputer(strategy='mean')),
    #('knn_imputer', KNNImputer(n_neighbors = 5)),
    #('iterative_imputer',IterativeImputer(random_state=0)),
    ("estimators", estimators)
    #('grid_search' , clf)
])
full_pipeline

In [7]:
X_train, X_eval, y_train, y_eval = train_test_split(
    features_df,
    labels_df,
    test_size=0.33,
    shuffle=True,
    stratify=labels_df,
    random_state=RANDOM_SEED
)

In [None]:
### Train model
full_pipeline.fit(X_train, y_train)
#preds = full_pipeline.predict_proba(X_eval)

#preds

Fitting 5 folds for each of 40 candidates, totalling 200 fits


In [11]:
preds = full_pipeline.predict(X_eval)
preds

array([[0, 0],
       [0, 0],
       [0, 1],
       ...,
       [0, 0],
       [0, 0],
       [0, 1]], dtype=int64)

In [13]:
y_preds = pd.DataFrame(
    {
       "h1n1_vaccine": preds[0][:, 1],
       "seasonal_vaccine": preds[1][:, 1],
    },
    index = y_eval.index
)
print("y_preds.shape:", y_preds.shape)
y_preds.head()

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [14]:
y_preds = pd.DataFrame(
    {
       "h1n1_vaccine": preds[:, 0],
       "seasonal_vaccine": preds[:, 1],
    },
    index = y_eval.index
)
print("y_preds.shape:", y_preds.shape)
y_preds.head()

y_preds.shape: (8814, 2)


Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6728,0,0
16516,0,0
3106,0,1
16981,1,1
19111,0,1


In [15]:
roc_auc_score(y_eval, y_preds)

0.7379142495369231

In [16]:
full_pipeline[2].estimators_[0].best_params_

{'gamma': 'scale', 'kernel': 'linear'}

In [17]:
final_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ('simple_imputer', SimpleImputer(strategy='mean')),
    ("estimators", MultiOutputClassifier(estimator = MLPClassifier(hidden_layer_sizes = (200,))))
])

In [10]:
final_pipeline.fit(features_df, labels_df)



In [19]:
preds = final_pipeline.predict(X_eval)
y_preds = pd.DataFrame(
    {
       "h1n1_vaccine": preds[:, 0],
       "seasonal_vaccine": preds[:, 1],
    },
    index = y_eval.index
)
roc_auc_score(y_eval, y_preds)


0.6959070655223973

In [136]:
test_features_df = pd.read_csv("test_set_features.csv", 
                               index_col = 'respondent_id')
test_probas = final_pipeline.predict_proba(test_features_df)
test_probas

[array([[0.91860353, 0.08139647],
        [0.9786918 , 0.0213082 ],
        [0.5842006 , 0.4157994 ],
        ...,
        [0.84411101, 0.15588899],
        [0.94431004, 0.05568996],
        [0.44539718, 0.55460282]]),
 array([[0.70670336, 0.29329664],
        [0.96121181, 0.03878819],
        [0.37666232, 0.62333768],
        ...,
        [0.78838295, 0.21161705],
        [0.66020904, 0.33979096],
        [0.49953826, 0.50046174]])]

In [None]:
from plot import plot_roc
fig, ax = plt.subplots(1, 2, figsize=(7, 3.5))

plot_roc(
    y_eval['h1n1_vaccine'], 
    y_preds['h1n1_vaccine'], 
    'h1n1_vaccine',
    ax=ax[0]
)
plot_roc(
    y_eval['seasonal_vaccine'], 
    y_preds['seasonal_vaccine'], 
    'seasonal_vaccine',
    ax=ax[1]
)
fig.tight_layout()

In [138]:
submission_df = pd.read_csv("submission_format.csv", 
                            index_col="respondent_id")
np.testing.assert_array_equal(test_features_df.index.values, 
                              submission_df.index.values)
submission_df["h1n1_vaccine"] = test_probas[0][:, 1]
submission_df["seasonal_vaccine"] = test_probas[1][:, 1]


submission_df.head()


Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.081396,0.293297
26708,0.021308,0.038788
26709,0.415799,0.623338
26710,0.479483,0.875158
26711,0.19472,0.499422


In [139]:
submission_df.to_csv('my_submission_logisticRegression.csv', index=True)