In [1]:
# !pip install pandas
import pandas as pd
import numpy as np

### Importa as features da base de treino

In [2]:
# Load the training feature table from its hard-coded location and preview the first rows.
training_features_path = '/content/drive/MyDrive/DrivenData/case_h1n1_flu/python/training_set_features.csv'
training_set_features = pd.read_csv(training_features_path)
training_set_features.head()

FileNotFoundError: ignored

### Importa as features da base de teste

In [None]:
# Load the test feature table from its hard-coded location and preview the first rows.
test_features_path = '/content/drive/MyDrive/DrivenData/case_h1n1_flu/python/test_set_features.csv'
test_set_features = pd.read_csv(test_features_path)
test_set_features.head()

### Importa a variável target de cada case

In [None]:
# Load the target labels for the training respondents and preview the last rows.
training_labels_path = '/content/drive/MyDrive/DrivenData/case_h1n1_flu/python/training_set_labels.csv'
training_set_labels = pd.read_csv(training_labels_path)
training_set_labels.tail()

### Cruza base das features de treino com a base de variáveis target para treinamento dos modelos

In [None]:
# Build the training analytical base table (single snapshot): every row of the
# training features is kept and the target columns are attached by respondent_id.
#
# validate='one_to_one' makes pandas raise a MergeError if respondent_id is
# duplicated on either side; without it a duplicated key would silently
# multiply rows and distort every downstream metric.
df_treino = (
    training_set_features
    .merge(
        training_set_labels,
        on='respondent_id',
        how='left',
        validate='one_to_one',
    )
)
df_treino.tail()

In [None]:
# Identifier column; it is not part of `features`.
key_vars = ['respondent_id']

# Columns handled as numeric inputs.
num_vars = [
    # H1N1 awareness
    'h1n1_concern',
    'h1n1_knowledge',
    # self-reported behaviours
    'behavioral_antiviral_meds',
    'behavioral_avoidance',
    'behavioral_face_mask',
    'behavioral_wash_hands',
    'behavioral_large_gatherings',
    'behavioral_outside_home',
    'behavioral_touch_face',
    # medical context
    'doctor_recc_h1n1',
    'doctor_recc_seasonal',
    'chronic_med_condition',
    'child_under_6_months',
    'health_worker',
    'health_insurance',
    # opinions
    'opinion_h1n1_vacc_effective',
    'opinion_h1n1_risk',
    'opinion_h1n1_sick_from_vacc',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
    # household composition
    'household_adults',
    'household_children',
]

# Columns handled as categorical inputs.
cat_vars = [
    'age_group',
    'education',
    'race',
    'sex',
    'income_poverty',
    'marital_status',
    'rent_or_own',
    'employment_status',
    'hhs_geo_region',
    'census_msa',
]

# Names of the two target columns.
target_h1n1 = 'h1n1_vaccine'
target_seasonal = 'seasonal_vaccine'

# Full model input: categorical columns first, then numeric ones.
features = [*cat_vars, *num_vars]

# Training inputs: only the selected feature columns.
X_train = df_treino.loc[:, features]

# Training targets, one series per vaccine.
y_train_h1n1 = df_treino.loc[:, target_h1n1]
y_train_seasonal = df_treino.loc[:, target_seasonal]

# Test inputs, restricted to the same feature columns as the training data.
X_test_h1n1 = test_set_features.loc[:, features]

### Rodar o modelo utilizando o pipeline

In [None]:
# !pip install feature-engine

In [None]:
from sklearn.pipeline import Pipeline
from feature_engine.imputation import ArbitraryNumberImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine.encoding import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from feature_engine.wrappers import SklearnTransformerWrapper

### Modelos de pessoas propensas a tomar a vacina do H1N1

### Regressão Logística

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression pipeline, applied in this order:
#   1. numeric_imputer   - fill missing numeric values with the sentinel -999
#   2. numeric_scaler    - standardise the numeric columns
#   3. categoric_imputer - fill missing categories with the label 'missing'
#   4. one_hot_encoder   - one-hot encode the categorical columns
#   5. algoritmo         - the classifier itself
lr_steps = [
    ('numeric_imputer', ArbitraryNumberImputer(arbitrary_number=-999, variables=num_vars)),
    ('numeric_scaler', SklearnTransformerWrapper(transformer=StandardScaler(), variables=num_vars)),
    ('categoric_imputer', CategoricalImputer(fill_value='missing', variables=cat_vars)),
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),
    ('algoritmo', LogisticRegression(random_state=42)),
]
lr_model = Pipeline(steps=lr_steps)

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_validate

# Stratified, shuffled 5-fold splitter with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validate the logistic-regression pipeline on the H1N1 target and keep
# the per-fold scores as a DataFrame.
cv_results = cross_validate(
    lr_model,
    X_train,
    y_train_h1n1,
    scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'],
    cv=skf,
    n_jobs=-1,
)
lr_model_results = pd.DataFrame(cv_results)

In [None]:
# Empty accumulator for the model comparison: one list per column, filled in
# as each model is evaluated. Each key gets its own independent list.
metric_columns = ['model', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc']
df_metrics = {column_name: [] for column_name in metric_columns}
pd.DataFrame(df_metrics)

In [None]:
# Average every cross-validation column across folds once, then pick the scores.
lr_fold_means = lr_model_results.mean()
accuracy = lr_fold_means['test_accuracy']
precision = lr_fold_means['test_precision']
recall = lr_fold_means['test_recall']
f1 = lr_fold_means['test_f1']
roc_auc = lr_fold_means['test_roc_auc']

# Append the logistic-regression row to the results accumulator.
lr_row = {
    'model': 'Logistic Regression',
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1': f1,
    'roc_auc': roc_auc,
}
for metric_column, metric_value in lr_row.items():
    df_metrics[metric_column].append(metric_value)
pd.DataFrame(df_metrics)

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision-tree pipeline: sentinel imputation for numeric columns, 'missing'
# label for categorical columns, one-hot encoding, then the classifier.
tree_steps = [
    ('numeric_imputer', ArbitraryNumberImputer(arbitrary_number=-999, variables=num_vars)),
    ('categoric_imputer', CategoricalImputer(fill_value='missing', variables=cat_vars)),
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),
    ('algoritmo', DecisionTreeClassifier(random_state=42)),
]
tree_model = Pipeline(steps=tree_steps)

# Cross-validate on the H1N1 target using the existing `skf` splitter.
cv_results = cross_validate(
    estimator=tree_model,
    X=X_train,
    y=y_train_h1n1,
    scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'],
    cv=skf,
    n_jobs=-1,
)
tree_model_results = pd.DataFrame(cv_results)

# Average every cross-validation column across folds once, then pick the scores.
tree_fold_means = tree_model_results.mean()
accuracy = tree_fold_means['test_accuracy']
precision = tree_fold_means['test_precision']
recall = tree_fold_means['test_recall']
f1 = tree_fold_means['test_f1']
roc_auc = tree_fold_means['test_roc_auc']

# Append the decision-tree row to the results accumulator.
tree_row = {
    'model': 'Decision Tree',
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1': f1,
    'roc_auc': roc_auc,
}
for metric_column, metric_value in tree_row.items():
    df_metrics[metric_column].append(metric_value)

pd.DataFrame(df_metrics)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest pipeline: sentinel imputation for numeric columns, 'missing'
# label for categorical columns, one-hot encoding, then the classifier.
rf_steps = [
    ('numeric_imputer', ArbitraryNumberImputer(arbitrary_number=-999, variables=num_vars)),
    ('categoric_imputer', CategoricalImputer(fill_value='missing', variables=cat_vars)),
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),
    ('algoritmo', RandomForestClassifier(random_state=42)),
]
rf_model = Pipeline(steps=rf_steps)

# Cross-validate on the H1N1 target using the existing `skf` splitter.
cv_results = cross_validate(
    estimator=rf_model,
    X=X_train,
    y=y_train_h1n1,
    scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'],
    cv=skf,
    n_jobs=-1,
)
rf_model_results = pd.DataFrame(cv_results)

# Average every cross-validation column across folds once, then pick the scores.
rf_fold_means = rf_model_results.mean()
accuracy = rf_fold_means['test_accuracy']
precision = rf_fold_means['test_precision']
recall = rf_fold_means['test_recall']
f1 = rf_fold_means['test_f1']
roc_auc = rf_fold_means['test_roc_auc']

# Append the random-forest row to the results accumulator.
rf_row = {
    'model': 'Random Forest',
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1': f1,
    'roc_auc': roc_auc,
}
for metric_column, metric_value in rf_row.items():
    df_metrics[metric_column].append(metric_value)

pd.DataFrame(df_metrics)

### Support Vector Machine - SVM

In [None]:
from sklearn.svm import SVC

# SVM pipeline.
# SVC is not scale invariant, and the numeric columns contain the -999
# imputation sentinel next to small survey scores. Without standardisation that
# sentinel dominates the kernel distances, so the numeric columns are scaled
# here exactly as in the logistic-regression pipeline.
svm_model = Pipeline(steps=[
    ('numeric_imputer', ArbitraryNumberImputer(variables=num_vars, arbitrary_number=-999)),
    ('numeric_scaler', SklearnTransformerWrapper(variables=num_vars, transformer=StandardScaler())),
    ('categoric_imputer', CategoricalImputer(variables=cat_vars, fill_value='missing')),
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),
    ('algoritmo', SVC(random_state=42)),
])

# Cross-validate on the H1N1 target using the existing `skf` splitter.
cv_results = cross_validate(
    estimator=svm_model,
    X=X_train,
    y=y_train_h1n1,
    scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'],
    cv=skf,
    n_jobs=-1,
)
svm_model_results = pd.DataFrame(cv_results)

# Average every cross-validation column across folds once, then pick the scores.
svm_fold_means = svm_model_results.mean()
accuracy = svm_fold_means['test_accuracy']
precision = svm_fold_means['test_precision']
recall = svm_fold_means['test_recall']
f1 = svm_fold_means['test_f1']
roc_auc = svm_fold_means['test_roc_auc']

# Append the SVM row to the results accumulator.
df_metrics['model'].append('SVM')
df_metrics['accuracy'].append(accuracy)
df_metrics['precision'].append(precision)
df_metrics['recall'].append(recall)
df_metrics['f1'].append(f1)
df_metrics['roc_auc'].append(roc_auc)

pd.DataFrame(df_metrics)

### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosting pipeline: sentinel imputation for numeric columns, 'missing'
# label for categorical columns, one-hot encoding, then the classifier.
gb_steps = [
    ('numeric_imputer', ArbitraryNumberImputer(arbitrary_number=-999, variables=num_vars)),
    ('categoric_imputer', CategoricalImputer(fill_value='missing', variables=cat_vars)),
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),
    ('algoritmo', GradientBoostingClassifier(random_state=42)),
]
gb_model = Pipeline(steps=gb_steps)

# Cross-validate on the H1N1 target using the existing `skf` splitter.
cv_results = cross_validate(
    estimator=gb_model,
    X=X_train,
    y=y_train_h1n1,
    scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'],
    cv=skf,
    n_jobs=-1,
)
gb_model_results = pd.DataFrame(cv_results)

# Average every cross-validation column across folds once, then pick the scores.
gb_fold_means = gb_model_results.mean()
accuracy = gb_fold_means['test_accuracy']
precision = gb_fold_means['test_precision']
recall = gb_fold_means['test_recall']
f1 = gb_fold_means['test_f1']
roc_auc = gb_fold_means['test_roc_auc']

# Append the gradient-boosting row to the results accumulator.
gb_row = {
    'model': 'Gradient Boosting',
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1': f1,
    'roc_auc': roc_auc,
}
for metric_column, metric_value in gb_row.items():
    df_metrics[metric_column].append(metric_value)

pd.DataFrame(df_metrics)

### XGBoost

In [None]:
from xgboost import XGBClassifier

# XGBoost pipeline: sentinel imputation for numeric columns, 'missing'
# label for categorical columns, one-hot encoding, then the classifier.
xgb_steps = [
    ('numeric_imputer', ArbitraryNumberImputer(arbitrary_number=-999, variables=num_vars)),
    ('categoric_imputer', CategoricalImputer(fill_value='missing', variables=cat_vars)),
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),
    ('algoritmo', XGBClassifier(random_state=42)),
]
xgb_model = Pipeline(steps=xgb_steps)

# Cross-validate on the H1N1 target using the existing `skf` splitter.
cv_results = cross_validate(
    estimator=xgb_model,
    X=X_train,
    y=y_train_h1n1,
    scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'],
    cv=skf,
    n_jobs=-1,
)
xgb_model_results = pd.DataFrame(cv_results)

# Average every cross-validation column across folds once, then pick the scores.
xgb_fold_means = xgb_model_results.mean()
accuracy = xgb_fold_means['test_accuracy']
precision = xgb_fold_means['test_precision']
recall = xgb_fold_means['test_recall']
f1 = xgb_fold_means['test_f1']
roc_auc = xgb_fold_means['test_roc_auc']

# Append the XGBoost row to the results accumulator.
xgb_row = {
    'model': 'XGBoost',
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1': f1,
    'roc_auc': roc_auc,
}
for metric_column, metric_value in xgb_row.items():
    df_metrics[metric_column].append(metric_value)

pd.DataFrame(df_metrics)

### Light Gradient Boosting (LGBM)

In [None]:
from lightgbm import LGBMClassifier

# LightGBM pipeline: sentinel imputation for numeric columns, 'missing'
# label for categorical columns, one-hot encoding, then the classifier.
lgbm_steps = [
    ('numeric_imputer', ArbitraryNumberImputer(arbitrary_number=-999, variables=num_vars)),
    ('categoric_imputer', CategoricalImputer(fill_value='missing', variables=cat_vars)),
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),
    ('algoritmo', LGBMClassifier(random_state=42)),
]
lgbm_model = Pipeline(steps=lgbm_steps)

# Cross-validate on the H1N1 target using the existing `skf` splitter.
cv_results = cross_validate(
    estimator=lgbm_model,
    X=X_train,
    y=y_train_h1n1,
    scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'],
    cv=skf,
    n_jobs=-1,
)
lgbm_model_results = pd.DataFrame(cv_results)

# Average every cross-validation column across folds once, then pick the scores.
lgbm_fold_means = lgbm_model_results.mean()
accuracy = lgbm_fold_means['test_accuracy']
precision = lgbm_fold_means['test_precision']
recall = lgbm_fold_means['test_recall']
f1 = lgbm_fold_means['test_f1']
roc_auc = lgbm_fold_means['test_roc_auc']

# Append the LightGBM row to the results accumulator.
lgbm_row = {
    'model': 'LGBM',
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1': f1,
    'roc_auc': roc_auc,
}
for metric_column, metric_value in lgbm_row.items():
    df_metrics[metric_column].append(metric_value)

pd.DataFrame(df_metrics)

### Catboost

In [None]:
from catboost import CatBoostClassifier

# CatBoost pipeline: sentinel imputation for numeric columns, 'missing'
# label for categorical columns, one-hot encoding, then the classifier.
cat_steps = [
    ('numeric_imputer', ArbitraryNumberImputer(arbitrary_number=-999, variables=num_vars)),
    ('categoric_imputer', CategoricalImputer(fill_value='missing', variables=cat_vars)),
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),
    ('algoritmo', CatBoostClassifier(random_state=42)),
]
cat_model = Pipeline(steps=cat_steps)

# Cross-validate on the H1N1 target using the existing `skf` splitter.
cv_results = cross_validate(
    estimator=cat_model,
    X=X_train,
    y=y_train_h1n1,
    scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'],
    cv=skf,
    n_jobs=-1,
)
cat_model_results = pd.DataFrame(cv_results)

# Average every cross-validation column across folds once, then pick the scores.
cat_fold_means = cat_model_results.mean()
accuracy = cat_fold_means['test_accuracy']
precision = cat_fold_means['test_precision']
recall = cat_fold_means['test_recall']
f1 = cat_fold_means['test_f1']
roc_auc = cat_fold_means['test_roc_auc']

# Append the CatBoost row to the results accumulator.
cat_row = {
    'model': 'Catboost',
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1': f1,
    'roc_auc': roc_auc,
}
for metric_column, metric_value in cat_row.items():
    df_metrics[metric_column].append(metric_value)

pd.DataFrame(df_metrics)

In [None]:
# Materialise the accumulated metrics and rank the models by ROC AUC, best first.
df_metrics_results = pd.DataFrame.from_dict(df_metrics)
df_metrics_results.sort_values('roc_auc', ascending=False)

In [None]:
# Show the hyper-parameters of the final pipeline step (the classifier).
cat_model.steps[-1][1].get_params()

In [None]:
# Fit the champion model on the complete training set.
cat_model.fit(X_train, y=y_train_h1n1)

In [None]:
from sklearn import metrics as sk_metrics

# ROC curve of the fitted pipeline.
# NOTE: the curve is drawn on X_train, the same data the pipeline is fitted on,
# so it is an in-sample (optimistic) view of the model.
#
# `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in 1.2; its
# replacement is `RocCurveDisplay.from_estimator`. The new API is preferred and
# the legacy function is only used on installations that predate it.
roc_display_cls = getattr(sk_metrics, 'RocCurveDisplay', None)
if hasattr(roc_display_cls, 'from_estimator'):
    roc_display_cls.from_estimator(cat_model, X_train, y_train_h1n1)
else:
    sk_metrics.plot_roc_curve(cat_model, X_train, y_train_h1n1)

In [None]:
from sklearn import metrics as sk_metrics

# Confusion matrix of the fitted pipeline on the training data (in-sample).
#
# Fix 1 - label order: display_labels are applied in class order, i.e. class 0
# first and class 1 second. Assuming the usual coding of h1n1_vaccine
# (0 = did not take the vaccine, 1 = took it), the labels must read
# ['não tomou', 'tomou']; the previous order swapped the two classes.
#
# Fix 2 - removed API: `plot_confusion_matrix` was deprecated in scikit-learn
# 1.0 and removed in 1.2; `ConfusionMatrixDisplay.from_estimator` replaces it.
# The legacy function is only used on installations that predate the new API.
cm_labels = ['não tomou', 'tomou']
cm_display_cls = getattr(sk_metrics, 'ConfusionMatrixDisplay', None)
if hasattr(cm_display_cls, 'from_estimator'):
    cm_display_cls.from_estimator(
        cat_model, X_train, y_train_h1n1,
        values_format='d', display_labels=cm_labels,
    )
else:
    sk_metrics.plot_confusion_matrix(
        cat_model, X_train, y_train_h1n1,
        values_format='d', display_labels=cm_labels,
    )

In [None]:
# Hard class predictions of the fitted pipeline for the training rows.
y_pred = cat_model.predict(X=X_train)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# `y_pred` holds predictions for X_train, the same data the model was fitted
# on, so these are in-sample (training) scores and NOT out-of-time (OOT)
# scores. The printed labels now say so. The `_oot` variable names are kept
# unchanged only so that any cell relying on them keeps working.
accuracy_oot  = accuracy_score(y_train_h1n1, y_pred)
precision_oot = precision_score(y_train_h1n1, y_pred)
recall_oot    = recall_score(y_train_h1n1, y_pred)
f1_oot        = f1_score(y_train_h1n1, y_pred)

print(f"Accuracy (train): {accuracy_oot:.3f}")
print(f"Precision (train): {precision_oot:.3f}")
print(f"Recall (train): {recall_oot:.3f}")
print(f"F1 (train): {f1_oot:.3f}")

In [None]:
# !pip install shap
import shap

# Explain the classifier (last pipeline step) on the training data after it has
# passed through every preprocessing step of the pipeline.
explainer = shap.TreeExplainer(cat_model.steps[-1][1])
X_train_transformado = cat_model[:-1].transform(X_train)
shap_values = explainer.shap_values(X_train_transformado)
shap.summary_plot(shap_values, features=X_train_transformado)

In [None]:
# Hard class predictions of the fitted pipeline for the test rows.
y_pred_test = cat_model.predict(X=X_test_h1n1)
y_pred_test

In [None]:
# Keep only the second probability column returned by predict_proba and wrap it
# in a one-column frame named after the target.
second_column_proba = cat_model.predict_proba(X_test_h1n1)[:, 1]
y_proba_test = pd.DataFrame({'h1n1_vaccine': second_column_proba})
y_proba_test.head()

In [None]:
# One-column frame holding the respondent ids of the test set.
respondent_id_test = test_set_features['respondent_id'].to_frame(name='respondent_id')
type(respondent_id_test)

In [None]:
# Display both shapes side by side before the two frames are concatenated.
(respondent_id_test.shape, y_proba_test.shape)

In [None]:
# Put ids and predicted probabilities side by side; join="inner" keeps only the
# index labels present in both frames.
submission_parts = [respondent_id_test, y_proba_test]
pred_h1n1 = pd.concat(submission_parts, join="inner", axis=1)
pred_h1n1.head()

In [None]:
# Write the predictions to CSV without the index column.
pred_h1n1_path = '/content/drive/MyDrive/DrivenData/case_h1n1_flu/python/pred_h1n1.csv'
pred_h1n1.to_csv(pred_h1n1_path, index=False)