In [25]:
# Importing Required Packages.
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd
import category_encoders as ce
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_validate

# Training labels
training_labels = pd.read_csv('../../Data/training_set_labels.csv', index_col='respondent_id')
training_features = pd.read_csv('../../Data/training_set_features.csv', index_col='respondent_id')

In [26]:
# TTS
X_train, X_test, y_train, y_test = train_test_split(training_features, training_labels['h1n1_vaccine'], test_size=0.33, random_state=42)

In [27]:
# Preproccessing columns
num_cols = []
ohe_cols = []
freq_cols = []


# Seperate columns into numerical, categorical, and freq
for c in training_features.columns:
    if training_features[c].dtype in ['float64', 'int64']:
        num_cols.append(c)
    elif training_features[c].nunique() < 10:
        ohe_cols.append(c)
    else:
        freq_cols.append(c)

In [28]:
# Scale the data using a MinMax scaler, and fill NaN values with the mean
num_transformer = Pipeline(steps=[
    ('minmaxscaler', MinMaxScaler()),
    ('num_imputer', SimpleImputer(strategy='median'))
])

# Onehot
ohe_transformer = Pipeline(steps=[
    ('ohe_imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('oh_encoder', OneHotEncoder(handle_unknown='ignore'))
])

freq_transformer = Pipeline(steps=[
    ('freq_encoder', ce.count.CountEncoder(normalize=True, min_group_size=.05)),
    ('freq_imputer', SimpleImputer(strategy='constant', fill_value=0))
])


In [36]:


# Package those pieces together using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('ohe', ohe_transformer, ohe_cols),
        ('freq', freq_transformer, freq_cols)
    ])
preprocessor.fit(X_train)

AttributeError: Transformer num (type Pipeline) does not provide get_feature_names.

In [30]:
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('gb_clf', GradientBoostingClassifier())
])

In [31]:
# Cross validate test
cross_validate(clf, X_train, y_train)

{'fit_time': array([4.45724797, 3.87114096, 4.29825616, 6.10275221, 3.93941808]),
 'score_time': array([0.05066705, 0.12074804, 0.038378  , 0.05744195, 0.04154992]),
 'test_score': array([0.83766415, 0.83822297, 0.83095837, 0.83594187, 0.84237004])}

In [33]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score
clf.fit(X_train, y_train)
_preds = clf.predict(X_test)


print('accuracy: {:0.3f}'.format(accuracy_score(y_test, _preds)))
print('recall: {:0.3f}'.format(recall_score(y_test, _preds)))
print('f1: {:0.3f}'.format(f1_score(y_test, _preds)))
print('roc_auc: {:0.3f}'.format(roc_auc_score(y_test , _preds)))

accuracy: 0.839
recall: 0.436
f1: 0.535
roc_auc: 0.692


TypeError: _check_n_features() missing 2 required positional arguments: 'X' and 'reset'