In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from plot import vaccination_rate_plot
pd.set_option("display.max_columns", 100)

In [2]:
features_df = pd.read_csv(
    "training_set_features.csv", 
    index_col="respondent_id"
)
labels_df = pd.read_csv(
    "training_set_labels.csv", 
    index_col="respondent_id"
)

In [3]:
import matplotlib.pyplot as plt

In [4]:
joined_df = features_df.join(labels_df)


In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, roc_auc_score

RANDOM_SEED = 6    # Set a random seed for reproducibility!
from sklearn import preprocessing
from dirty_cat import SuperVectorizer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestRegressor

In [20]:
preprocessor = SuperVectorizer()
estimators = MultiOutputClassifier(
    estimator=LogisticRegression(penalty="l2", C=1)
)
full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ('simple_imputer', SimpleImputer(strategy='mean')),
    ("estimators", estimators),
])
full_pipeline

In [21]:
X_train, X_eval, y_train, y_eval = train_test_split(
    features_df,
    labels_df,
    test_size=0.33,
    shuffle=True,
    stratify=labels_df,
    random_state=RANDOM_SEED
)


In [22]:
# Train model
full_pipeline.fit(X_train, y_train)
preds = full_pipeline.predict_proba(X_eval)
preds

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[array([[0.66034523, 0.33965477],
        [0.91045635, 0.08954365],
        [0.79043215, 0.20956785],
        ...,
        [0.94611352, 0.05388648],
        [0.91513041, 0.08486959],
        [0.93784717, 0.06215283]]),
 array([[0.50151369, 0.49848631],
        [0.53549147, 0.46450853],
        [0.21988121, 0.78011879],
        ...,
        [0.65495858, 0.34504142],
        [0.8706171 , 0.1293829 ],
        [0.10348695, 0.89651305]])]

In [23]:
full_pipeline.score(X_train, y_train)
preds

[array([[0.66034523, 0.33965477],
        [0.91045635, 0.08954365],
        [0.79043215, 0.20956785],
        ...,
        [0.94611352, 0.05388648],
        [0.91513041, 0.08486959],
        [0.93784717, 0.06215283]]),
 array([[0.50151369, 0.49848631],
        [0.53549147, 0.46450853],
        [0.21988121, 0.78011879],
        ...,
        [0.65495858, 0.34504142],
        [0.8706171 , 0.1293829 ],
        [0.10348695, 0.89651305]])]

In [24]:
y_preds = pd.DataFrame(
    {
        "h1n1_vaccine": preds[0][:, 1],
        "seasonal_vaccine": preds[1][:, 1],
    },
    index = y_eval.index
)
print("y_preds.shape:", y_preds.shape)


y_preds.shape: (8814, 2)


In [25]:
roc_auc_score(y_eval, y_preds)


0.8449722033393452

In [19]:
full_pipeline.fit(features_df, labels_df)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
test_features_df = pd.read_csv("test_set_features.csv", 
                               index_col="respondent_id")
test_probas = full_pipeline.predict_proba(test_features_df)
test_probas

In [None]:
submission_df = pd.read_csv("submission_format.csv", 
                            index_col="respondent_id")
np.testing.assert_array_equal(test_features_df.index.values, 
                              submission_df.index.values)
submission_df["h1n1_vaccine"] = test_probas[0][:, 1]
submission_df["seasonal_vaccine"] = test_probas[1][:, 1]

submission_df.head()


In [None]:
submission_df.to_csv('my_submission_cat.csv', index=True)