In [910]:
from time import time

import numpy as np
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score, accuracy_score, roc_auc_score, make_scorer
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.base import clone

%matplotlib inline

data = pd.read_csv("census.csv")
data.head(2)

Unnamed: 0,age,workclass,education_level,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K


In [793]:
# Separate the target from the predictors.
# `income` is recoded to an integer label: '<=50K' -> 0, '>50K' -> 1.
y = data['income']
X = data.drop(columns=['income'])
y_mapped = pd.DataFrame(
    y.map({'<=50K': 0, '>50K': 1})
)


In [794]:
class OneHotEncoderTransform(TransformerMixin, BaseEstimator):
    """One-hot encode columns with ``pd.get_dummies``.

    ``fit`` records the dummy-column layout produced for the fitting data in
    ``columns_``. ``transform`` then aligns its output to that layout, so a
    frame that lacks (or adds) a category still yields the same columns in the
    same order. An instance that was never fitted simply returns whatever
    ``pd.get_dummies`` produces for the frame it is given.
    """

    def fit(self, X, y=None, **fit_params):
        """Remember the encoded column layout of ``X``.

        Parameters
        ----------
        X : DataFrame
            Frame holding the columns to encode.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self.columns_ = pd.get_dummies(X).columns.tolist()
        return self

    def transform(self, X, **transform_params):
        """Return the one-hot encoded version of ``X`` as a DataFrame."""
        encoded_features = pd.get_dummies(X)
        fitted_columns = getattr(self, "columns_", None)
        # Only realign when the layout actually differs from the fitted one:
        # categories unseen at fit time are dropped, and categories missing
        # from X become all-zero columns, so the output width stays stable.
        if fitted_columns is not None and list(encoded_features.columns) != fitted_columns:
            encoded_features = encoded_features.reindex(columns=fitted_columns, fill_value=0)
        return encoded_features

    def get_feature_names(self):
        """Return the names of the encoded columns as a list.

        A fitted instance reports the layout recorded by ``fit``.
        """
        fitted_columns = getattr(self, "columns_", None)
        if fitted_columns is not None:
            return list(fitted_columns)
        # Legacy fallback, kept so callers that query an *unfitted* instance
        # keep working: encode the notebook-level ``X`` frame.
        # NOTE(review): this depends on a global named ``X`` existing.
        return self.transform(X).columns.tolist()

class LogTransform(TransformerMixin, BaseEstimator):
    """Apply ``log(1 + x)`` element-wise to numeric columns.

    The transform itself is stateless; ``fit`` only records the input column
    labels so that ``get_feature_names`` can report them afterwards.
    """

    def fit(self, X, y=None, **fit_params):
        """Record the column labels of ``X`` and return ``self``.

        ``y`` is accepted only for scikit-learn API compatibility.
        """
        # Wrapping in DataFrame lets array-like input work too (labels are
        # then the default integer positions).
        self.feature_names_ = pd.DataFrame(X).columns.tolist()
        return self

    def transform(self, X, **transform_params):
        """Return ``np.log1p(X)`` wrapped in a DataFrame."""
        log_X = pd.DataFrame(np.log1p(X))
        return log_X

    def get_feature_names(self):
        """Return the column labels seen by ``fit``.

        Raises
        ------
        AttributeError
            If the transformer has not been fitted yet.
        """
        # Previously this re-transformed the notebook-level ``X`` frame, which
        # ties the result to a global and fails when that frame holds
        # non-numeric columns.
        if not hasattr(self, "feature_names_"):
            raise AttributeError(
                "LogTransform has not been fitted yet; call fit() before get_feature_names()."
            )
        return list(self.feature_names_)
    
class ScaleTransform(TransformerMixin, BaseEstimator):
    """Scale columns with a ``MinMaxScaler`` and return a DataFrame.

    The wrapped scaler is created in ``__init__`` and fitted in ``fit``.
    ``transform`` returns the scaled values in a new DataFrame with default
    integer column labels; the original labels are available through
    ``get_feature_names`` once the transformer has been fitted.
    """

    def __init__(self):
        self.scaler = MinMaxScaler()

    def fit(self, X, y=None, **fit_params):
        """Fit the wrapped scaler on ``X`` cast to float and return ``self``.

        ``y`` is accepted only for scikit-learn API compatibility.
        """
        # Remember the incoming labels: transform() discards them.
        self.feature_names_ = pd.DataFrame(X).columns.tolist()
        self.scaler = self.scaler.fit(X.astype(float))
        return self

    def transform(self, X, **transform_params):
        """Return the scaled values of ``X`` as a DataFrame."""
        return pd.DataFrame(self.scaler.transform(X))

    def get_feature_names(self):
        """Return the column labels seen by ``fit``.

        Raises
        ------
        AttributeError
            If the transformer has not been fitted yet.
        """
        # Previously this re-transformed the notebook-level ``X`` frame, which
        # ties the result to a global instead of the data this instance saw.
        if not hasattr(self, "feature_names_"):
            raise AttributeError(
                "ScaleTransform has not been fitted yet; call fit() before get_feature_names()."
            )
        return list(self.feature_names_)

class ModelTransformer(TransformerMixin, BaseEstimator):
    """Adapter that exposes a predictor's ``predict`` output as a transform.

    This lets several classifiers sit side by side in a ``FeatureUnion``:
    each one contributes a column holding its predictions.

    Inheriting from ``BaseEstimator`` (like the other transformers in this
    notebook) provides ``get_params`` / ``set_params``, a readable repr and
    ``clone`` support, with ``model`` as the single constructor parameter.

    Parameters
    ----------
    model : estimator
        Object providing ``fit`` and ``predict``.
    """

    def __init__(self, model):
        self.model = model

    def fit(self, *args, **kwargs):
        """Fit the wrapped model, forwarding every argument, and return ``self``."""
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        """Return the wrapped model's predictions for ``X`` as a DataFrame."""
        return pd.DataFrame(self.model.predict(X))


In [847]:
# Column groups; each group is routed to its own preprocessing branch
skewed = ['capital-gain', 'capital-loss']
numerical = ['age', 'education-num', 'hours-per-week']
categorical = [
    'workclass', 'education_level', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
]

# Numeric branch: skewed columns are log-transformed and then scaled,
# the remaining numeric columns are only scaled
log_and_scale_transformer = ColumnTransformer(
    transformers=[
        (
            "log_and_scale",
            Pipeline(steps=[('log', LogTransform()), ('scale', ScaleTransform())]),
            skewed,
        ),
        ("scaler", ScaleTransform(), numerical),
    ],
    remainder='drop',
)

# Categorical branch: one-hot encoding
encoder_transformer = ColumnTransformer(
    transformers=[("encode", OneHotEncoderTransform(), categorical)],
    remainder='drop',
)

# Both branches side by side: numeric block first, encoded block second
preprocessing = Pipeline(steps=[
    ('preprocessing', FeatureUnion(transformer_list=[
        ('numerical_features', log_and_scale_transformer),
        ('categorical_features', encoder_transformer),
    ])),
])

# Three candidate classifiers; each contributes one column of predictions
classifier_pipeline = Pipeline(steps=[
    ('classifiers', FeatureUnion(transformer_list=[
        ('RF', ModelTransformer(RandomForestClassifier(random_state=42))),
        ('Ada', ModelTransformer(AdaBoostClassifier(random_state=42))),
        ('SVC', ModelTransformer(SVC(random_state=42, gamma='auto'))),
    ])),
])

# Run the preprocessing on the full feature frame
# NOTE(review): preprocessing is fitted on every row *before* the split below,
# so the scaler has seen the test rows; consider fitting on the training part only.
X_transformed = preprocessing.fit_transform(X)

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, y_mapped, test_size=0.2, random_state=0
)

# Baseline comparison: fit every classifier, collect its test predictions
# and print the F0.5 score per model
classifier_pipeline.fit(X_train, y_train.values.ravel())
vec_preds = classifier_pipeline.transform(X_test)
classifier_labels = ['Random Forest', 'AdaBoost', 'SVC']
df_preds = pd.DataFrame(vec_preds, columns=classifier_labels)
for clf in classifier_labels:
    print(clf, ':', fbeta_score(y_test['income'], df_preds[clf], beta=0.5))



Random Forest : 0.6836744983361903
AdaBoost : 0.7245508982035928
SVC : 0.6744771706996605


In [849]:
# Tune AdaBoost, the best scorer among the three baseline classifiers
pipe = Pipeline(steps=[('ada', AdaBoostClassifier(random_state=42))])

# Search space (every tunable name is listed by pipe.get_params().keys())
parameters = {
    'ada__n_estimators': [100, 200, 300],
    'ada__learning_rate': [0.01, 0.1, 1],
}

# Rank candidates by F0.5 using 5-fold cross-validation
scorer = make_scorer(fbeta_score, beta=0.5)
grid_obj = GridSearchCV(estimator=pipe, param_grid=parameters, scoring=scorer, cv=5)
grid_fit = grid_obj.fit(X_train, y_train.values.ravel())
best_clf = grid_fit.best_estimator_

# Predictions of the untuned baseline and of the tuned model
predictions = df_preds['AdaBoost']
best_predictions = best_clf.predict(X_test)

# Report the before-and-after scores
print("Unoptimized model\n------")
baseline_accuracy = accuracy_score(y_test, predictions)
baseline_fscore = fbeta_score(y_test, predictions, beta=0.5)
print("Accuracy score on testing data: {:.4f}".format(baseline_accuracy))
print("F-score on testing data: {:.4f}".format(baseline_fscore))

print("\nOptimized Model\n------")
tuned_accuracy = accuracy_score(y_test, best_predictions)
tuned_fscore = fbeta_score(y_test, best_predictions, beta=0.5)
print("Final accuracy score on the testing data: {:.4f}".format(tuned_accuracy))
print("Final F-score on the testing data: {:.4f}".format(tuned_fscore))


Unoptimized model
------
Accuracy score on testing data: 0.8576
F-score on testing data: 0.7246

Optimized Model
------
Final accuracy score on the testing data: 0.8663
Final F-score on the testing data: 0.7425


In [899]:
# Use recursive feature elimination to check the score on a reduced feature set.
# `n_features_to_select` is passed by keyword: current scikit-learn releases no
# longer accept it positionally.
# step=0.10 removes 10% of the features at each iteration.
selector = RFE(best_clf.named_steps['ada'], n_features_to_select=5, step=0.10)
selector = selector.fit(X_train, y_train.values.ravel())
# Boolean mask over the columns of X_train marking the features that were kept
most_important_columns = selector.support_ 

In [900]:
# Create a DataFrame from X_train and label its columns.
# The preprocessing FeatureUnion lists the numeric block first (the `skewed`
# columns, then the `numerical` columns) followed by the one-hot block, so the
# labels are assembled in that same order from the lists defined with the pipeline.
columns_labels_numerical = skewed + numerical
# Encode only the categorical columns to obtain exactly the dummy-column names.
# This replaces asking the transformer to re-encode the whole of X and slicing
# the leading numeric names off with a hard-coded [5:].
# `columns_labels_encoded` therefore now holds the dummy names only.
columns_labels_encoded = pd.get_dummies(X[categorical]).columns.tolist()
df_X_train = pd.DataFrame(X_train)
# Assigning a label list of the wrong length raises, which guards the ordering assumption
df_X_train.columns = columns_labels_numerical + columns_labels_encoded

# get name of most important columns
df_X_train.columns[most_important_columns]

Index(['capital-gain', 'capital-loss', 'age', 'education-num',
       'hours-per-week'],
      dtype='object')

In [919]:
# Positions of the columns kept by the RFE selector
idx = np.where(selector.support_)[0]

# Keep only those columns in both splits
X_train_reduced = X_train[:, idx]
X_test_reduced = X_test[:, idx]

# Refit a fresh, unfitted copy of the tuned model on the reduced features
clf = clone(best_clf)
clf.fit(X_train_reduced, y_train.values.ravel())

# Predict the test split with the reduced-feature model
reduced_predictions = clf.predict(X_test_reduced)

# Compare the tuned model on the full and on the reduced feature set
print("Final Model trained on full data\n------")
full_accuracy = accuracy_score(y_test, best_predictions)
full_fscore = fbeta_score(y_test, best_predictions, beta=0.5)
print("Accuracy on testing data: {:.4f}".format(full_accuracy))
print("F-score on testing data: {:.4f}".format(full_fscore))

print("\nFinal Model trained on reduced data\n------")
reduced_accuracy = accuracy_score(y_test, reduced_predictions)
reduced_fscore = fbeta_score(y_test, reduced_predictions, beta=0.5)
print("Accuracy on testing data: {:.4f}".format(reduced_accuracy))
print("F-score on testing data: {:.4f}".format(reduced_fscore))

Final Model trained on full data
------
Accuracy on testing data: 0.8663
F-score on testing data: 0.7425

Final Model trained on reduced data
------
Accuracy on testing data: 0.8404
F-score on testing data: 0.6973
