# Import libraries
Import libraries necessary for this project

In [48]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, fbeta_score, roc_auc_score, make_scorer
from sklearn.preprocessing import Imputer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold,StratifiedKFold, GridSearchCV
from time import time
from IPython.display import display # Allows the use of display() for DataFrames
from sklearn.neighbors import KNeighborsClassifier


# Pretty display for notebooks
%matplotlib inline

# Data Loading
Load the data from the CSV file into a pandas DataFrame.

In [49]:
# Read the census records into a DataFrame (the path is relative to the working directory)
data = pd.read_csv("census.csv")


# Data Exploration

In this section, we explore our dataset in order to:
- get familiar with the structure of the dataset
- identify numerical and categorical variables.
- check if there are missing data.
- analyse the distribution (skewness) of the features and the target variable.

### Look at some records in the dataset

In [51]:
# Preview the first two rows to see the column layout
data.head(2)

Unnamed: 0,age,workclass,education_level,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K


### Check out the datatypes of the features

In [53]:
# Print each column's dtype and non-null count
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45222 entries, 0 to 45221
Data columns (total 14 columns):
age                45222 non-null int64
workclass          45222 non-null object
education_level    45222 non-null object
education-num      45222 non-null float64
marital-status     45222 non-null object
occupation         45222 non-null object
relationship       45222 non-null object
race               45222 non-null object
sex                45222 non-null object
capital-gain       45222 non-null float64
capital-loss       45222 non-null float64
hours-per-week     45222 non-null float64
native-country     45222 non-null object
income             45222 non-null object
dtypes: float64(4), int64(1), object(9)
memory usage: 4.8+ MB


Based on that information, we can identify whether a variable is numerical or categorical.

**Numerical variables:**

- age: continuous.
- education-num: continuous.
- capital-gain: continuous.
- capital-loss: continuous.
- hours-per-week: continuous.

**Categorical variables:**
- workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
- education_level: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
- marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
- occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
- relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
- race: Black, White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other.
- sex: Female, Male.
- native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.

### Statistical description of the dataset

We can analyse the main statistical metrics for both numerical and categorical features

In [61]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numerical variables
data.describe()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week
count,45222.0,45222.0,45222.0,45222.0,45222.0
mean,38.547941,10.11846,1101.430344,88.595418,40.938017
std,13.21787,2.552881,7506.430084,404.956092,12.007508
min,17.0,1.0,0.0,0.0,1.0
25%,28.0,9.0,0.0,0.0,40.0
50%,37.0,10.0,0.0,0.0,40.0
75%,47.0,13.0,0.0,0.0,45.0
max,90.0,16.0,99999.0,4356.0,99.0


In [62]:
# Summary statistics (count, unique, top, freq) for the categorical (object-dtype) variables.
# The builtin `object` is used because the `np.object` alias was removed in NumPy 1.24.
data.describe(include=object)

Unnamed: 0,workclass,education_level,marital-status,occupation,relationship,race,sex,native-country,income
count,45222,45222,45222,45222,45222,45222,45222,45222,45222
unique,7,16,7,14,6,5,2,41,2
top,Private,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,Male,United-States,<=50K
freq,33307,14783,21055,6020,18666,38903,30527,41292,34014


Separate the columns according to the transformation that is applied to them

In [3]:
# Columns that are min-max scaled as numerical features
num_cols = [
    'age',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

# Columns that are one-hot encoded as categorical features
cat_cols = [
    'workclass',
    'education_level',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

# Columns that additionally go through a log(1 + x) transformation
log_transform_cols = [
    'capital-gain',
    'capital-loss',
]


Functions used in the pipeline

In [4]:
def get_num_cols(X):
    """Return the part of `X` made of the columns listed in `num_cols`."""
    numerical_part = X[num_cols]
    return numerical_part

def get_categorical_cols(X):
    """Return the part of `X` made of the columns listed in `cat_cols`."""
    categorical_part = X[cat_cols]
    return categorical_part

def get_log_transform_cols(X):
    """Return the part of `X` made of the columns listed in `log_transform_cols`."""
    skewed_part = X[log_transform_cols]
    return skewed_part

def get_dummies(X):
    """One-hot encode the categorical variables of `X` with `pd.get_dummies`."""
    encoded = pd.get_dummies(X)
    return encoded


Pipeline steps

In [5]:
# log transform Pipeline: select the columns in log_transform_cols, fill
# missing values with the column mean, then apply log(1 + x).
# NOTE(review): 'capital-gain' and 'capital-loss' are also listed in num_cols,
# so they reach the final feature matrix twice (log-transformed here and
# min-max scaled in the numerical pipeline) - confirm this is intended.
log_transform_pipeline = make_pipeline(
 FunctionTransformer(get_log_transform_cols, validate=False),
 Imputer(strategy='mean'),
 FunctionTransformer(np.log1p))

# Pipeline for all numerical variables: select numerical columns, fill missing
# values with the column mean, and then apply min-max scaling
num_preprocess_pipeline = make_pipeline(
    FunctionTransformer(get_num_cols, validate=False),
    Imputer(strategy='mean'),
    MinMaxScaler())

# Pipeline for all categorical variables: select those columns and then apply one-hot encoding.
# NOTE(review): get_dummies derives its output columns from the frame it is given,
# so two frames with different category sets yield different columns - verify
# before transforming any data other than the frame used for fitting.
cat_preprocess_pipeline = make_pipeline(
    FunctionTransformer(get_categorical_cols, validate=False),
    FunctionTransformer(get_dummies, validate = False))


Concatenate the results of the individual pipelines

In [6]:
# Preprocess pipeline: run the three branches on the same input and
# concatenate their outputs in the order listed
preprocess_pipeline = FeatureUnion(
    transformer_list=[
        ('log_transform_pipeline', log_transform_pipeline),
        ('num_preprocess_pipeline', num_preprocess_pipeline),
        ('cat_preprocess_pipeline', cat_preprocess_pipeline),
    ]
)


In [7]:
# Preprocess the target: binarize the target label
# (any label not present in the mapping becomes NaN)
targetLabel = {'<=50K': 0, '>50K': 1}
y = data['income'].map(targetLabel)

# Preprocess the data
# NOTE(review): the pipeline is fitted on the full dataset before the split, so
# the imputer means and min-max ranges also see the rows that end up in the
# test set - confirm this is acceptable for the evaluation below.
X = pd.DataFrame(preprocess_pipeline.fit_transform(data))
# Split the data into training (70%) and test (30%) with a fixed seed
X_train_original, X_test, y_train_original, y_test = train_test_split(X,y, test_size=0.30, random_state=10)




In [8]:
# Seed shared by the cross-validation splitters and the seeded classifiers below
random_state = 10

In [9]:
class sklearnClassifier(object):
    """Thin wrapper that instantiates a classifier with a fixed seed.

    Parameters
    ----------
    clf : callable
        Classifier class (or factory) invoked as ``clf(**params)``.
    random_state : int
        Seed added to the constructor arguments under ``'random_state'``.
    params : dict or None
        Extra constructor keyword arguments. The mapping is copied, so the
        caller's dict is never modified.
    """

    def __init__(self, clf, random_state, params):
        # Copy first: writing 'random_state' into the caller's dict would leak
        # the key into every later use of that dict.
        init_kwargs = dict(params) if params else {}
        init_kwargs['random_state'] = random_state
        self.clf = clf(**init_kwargs)

    def training(self, x_train, y_train):
        """Fit the wrapped classifier on ``x_train`` / ``y_train``."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped classifier's predictions for ``x``."""
        return self.clf.predict(x)

    def feature_importance(self, x=None):
        """Return the wrapped classifier's ``feature_importances_``.

        ``x`` is not used; it is kept (now optional) so existing calls that
        pass an argument keep working.
        """
        return self.clf.feature_importances_

In [10]:
def crossValidationSearch(clf, parameters, n_splits,x_train_,y_train_):
    """Grid-search `clf` with stratified K-fold CV and return the best estimator.

    Parameters
    ----------
    clf : estimator
        Unfitted scikit-learn style classifier to tune.
    parameters : dict
        Parameter grid handed to ``GridSearchCV`` (may be empty).
    n_splits : int
        Number of stratified folds.
    x_train_, y_train_ :
        Training features and labels used for the search.

    Returns
    -------
    The ``best_estimator_`` found by the grid search, selected by
    F-beta score with ``beta=0.5``.
    """
    # Honour the requested number of folds. shuffle=True is required for
    # random_state to have any effect (recent scikit-learn raises otherwise).
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    scorer = make_scorer(fbeta_score, beta=0.5)
    # Pass the splitter explicitly; without cv= the search silently falls back
    # to the library's default folding.
    grid = GridSearchCV(clf, param_grid=parameters, scoring=scorer, cv=cv)

    grid.fit(x_train_, y_train_)
    return grid.best_estimator_
    

In [28]:
def stackingModel(list_clf, x_train_, y_train_, x_test_):
    """Build the level-2 (meta) train and test frames for a stacked ensemble.

    For every classifier in `list_clf` a column ``p_M<i>`` is appended:

    * in the training frame it holds out-of-fold positive-class probabilities
      (each row is predicted by a model fitted on the other folds);
    * in the test frame it holds the probabilities of the model fitted on the
      whole training set.

    Parameters
    ----------
    list_clf : list
        Classifiers exposing ``fit`` and ``predict_proba``. They are refitted
        in place and are left fitted on the full training set on return.
    x_train_, x_test_ : pandas.DataFrame
        Level-1 features; the inputs are copied, not modified.
    y_train_ : pandas.Series
        Training labels, positionally aligned with `x_train_`.

    Returns
    -------
    (train_meta, test_meta) : tuple of pandas.DataFrame
    """
    n_models = len(list_clf)
    meta_cols = ["p_M%d" % i for i in range(n_models)]

    # Partition the training data into 5 folds
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)

    # Out-of-fold probabilities are collected positionally in a plain array.
    # Writing through `frame[col].iloc[idx] = ...` is chained assignment, which
    # pandas may apply to a temporary copy instead of the frame.
    oof_proba = np.full((len(x_train_), n_models), np.nan)

    # fit the models on each training fold and predict the held-out fold
    for idx_train, idx_test in kf.split(x_train_):
        x_kf_train, x_kf_test = x_train_.iloc[idx_train], x_train_.iloc[idx_test]
        y_kf_train = y_train_.iloc[idx_train]

        for clf_idx, clf in enumerate(list_clf):
            clf.fit(x_kf_train, y_kf_train)
            oof_proba[idx_test, clf_idx] = clf.predict_proba(x_kf_test)[:, 1]

    # training set for level 2: original features + out-of-fold probabilities
    train_meta = x_train_.copy()
    for clf_idx, col in enumerate(meta_cols):
        train_meta[col] = oof_proba[:, clf_idx]

    # test set for level 2: fit the models on the whole training dataset and
    # predict the test dataset
    test_meta = x_test_.copy()
    for col, clf in zip(meta_cols, list_clf):
        clf.fit(x_train_, y_train_)
        test_meta[col] = clf.predict_proba(x_test_)[:, 1]

    return train_meta, test_meta
    

In [12]:
# Hyper-parameter grid explored for the AdaBoost classifier
parametersAda = {
    'n_estimators': [200, 300, 400],
    'learning_rate': [1],
}

# Instantiate the AdaBoost classifier with the shared seed
ada_clf = AdaBoostClassifier(random_state=random_state)

# Grid-search the best AdaBoost configuration on the training split
best_ada_clf = crossValidationSearch(
    clf=ada_clf,
    parameters=parametersAda,
    n_splits=5,
    x_train_=X_train_original,
    y_train_=y_train_original,
)

In [13]:
# GaussianNB is searched with an empty grid, i.e. its default settings only
parametersGnb = dict()

# Instantiate the GaussianNB classifier
gnb_clf = GaussianNB()

# Run the (single-candidate) search on the training split
best_gnb_clf = crossValidationSearch(
    clf=gnb_clf,
    parameters=parametersGnb,
    n_splits=5,
    x_train_=X_train_original,
    y_train_=y_train_original,
)


In [14]:
# Logistic regression is searched with an empty grid, i.e. its default settings only
log_param_grid = dict()

# Instantiate the logistic regression classifier with the shared seed
log_clf = LogisticRegression(random_state=random_state)

# Run the (single-candidate) search on the training split
best_log_clf = crossValidationSearch(
    clf=log_clf,
    parameters=log_param_grid,
    n_splits=5,
    x_train_=X_train_original,
    y_train_=y_train_original,
)

In [15]:
# Hyper-parameter grid explored for the random forest classifier
rf_param_grid = {
    'min_samples_split': [5, 10, 20],
    'n_estimators': [50, 100, 200],
}

# Instantiate the random forest classifier with the shared seed
rf_clf = RandomForestClassifier(random_state=random_state)

# Grid-search the best random forest configuration on the training split
best_rf_clf = crossValidationSearch(
    clf=rf_clf,
    parameters=rf_param_grid,
    n_splits=5,
    x_train_=X_train_original,
    y_train_=y_train_original,
)


In [16]:
# Hyper-parameter grid explored for the k-nearest-neighbours classifier
kn_param_grid = {
    'n_neighbors': [2, 5, 10],
}

# Instantiate the k-nearest-neighbours classifier
kn_clf = KNeighborsClassifier()

# Grid-search the best number of neighbours on the training split
best_kn_clf = crossValidationSearch(
    clf=kn_clf,
    parameters=kn_param_grid,
    n_splits=5,
    x_train_=X_train_original,
    y_train_=y_train_original,
)

In [33]:
# Level-1 models fed to the stacker: AdaBoost, random forest and logistic regression
best_clfs=[best_ada_clf,best_rf_clf,best_log_clf]
# Build the level-2 train/test frames: the original features plus one 'p_M<i>'
# probability column per base model. stackingModel refits the estimators in
# best_clfs in place.
train_meta, test_meta = stackingModel(best_clfs, X_train_original, y_train_original, X_test)

In [34]:
# Level-2 (stacking) model: an AdaBoost classifier trained on the base models' probabilities
stk_clf = AdaBoostClassifier(random_state=random_state)

# Keep only the 'p_M<i>' probability columns as level-2 features
X_train_meta =train_meta[["p_M%d"  %d for d in range(len(best_clfs))]]

# Tune the stacking classifier itself (stk_clf) rather than re-using the
# level-1 ada_clf object, which left stk_clf unused
best_stack_clf = crossValidationSearch(stk_clf, parametersAda, 5,X_train_meta,y_train_original)

In [35]:
# fit best model on the full level-2 training set
# NOTE(review): best_stack_clf is GridSearchCV's best_estimator_, which is
# presumably already refit on this data - confirm whether this extra fit is needed.
best_stack_clf.fit(X_train_meta,y_train_original)

# X test meta: the base-model probability columns of the test split
X_test_meta = test_meta[["p_M%d" %d for d in range(len(best_clfs))]]

# make prediction on the test_meta using the stacked model
# (left disabled in this cell)
#y_stack_pred = best_stack_clf.predict(X_test_meta)
#y_stack_pred_prob = best_stack_clf.predict_proba(X_test_meta)[:,1]

In [21]:
def evaluateClassifiers(clf, X_test_, y_test_):
    """Print classification metrics of a fitted classifier on a test set.

    Two lines are printed: the F-beta score (beta=0.5) of the hard
    predictions and the ROC AUC of the positive-class probabilities
    (column 1 of ``predict_proba``). Nothing is returned.
    """
    hard_labels = clf.predict(X_test_)
    positive_proba = clf.predict_proba(X_test_)[:,1]

    print("fBeta score is: %.02f" % fbeta_score(y_test_, hard_labels, beta=0.5))
    print("AUC is: %.02f" % roc_auc_score(y_test_, positive_proba))

In [23]:
# Score every tuned level-1 model on the held-out test split
level_one_models = [
    ("level 1 adaboost", best_ada_clf),
    ("level 1 rf", best_rf_clf),
    ('level 1 log', best_log_clf),
    ('level 1 kn', best_kn_clf),
    ('level 1 gnb', best_gnb_clf),
]

for label, model in level_one_models:
    print(label)
    evaluateClassifiers(model, X_test, y_test)


level 1 adaboost
fBeta score is: 0.74
AUC is: 0.92
level 1 rf
fBeta score is: 0.73
AUC is: 0.91
level 1 log
fBeta score is: 0.69
AUC is: 0.90
level 1 kn
fBeta score is: 0.69
AUC is: 0.88
level 1 gnb
fBeta score is: 0.42
AUC is: 0.81
stacked model
fBeta score is: 0.75
AUC is: 0.86


In [38]:
# Score the level-2 (stacked) model on the test split's base-model probabilities
print('stacked model')
evaluateClassifiers(best_stack_clf, X_test_meta, y_test)

stacked model
fBeta score is: 0.75
AUC is: 0.92


In [42]:
# load data from csv
test_competition = pd.read_csv("test_census.csv")

# Preprocess the data with the pipeline that was already fitted on census.csv.
# transform() (instead of fit_transform()) keeps the imputer means and min-max
# ranges learned during training rather than re-estimating them on unseen data.
X_competition = preprocess_pipeline.transform(test_competition)

# The one-hot encoding is recomputed per frame, so a category missing from the
# competition file changes the number of columns. Fail early with a clear message.
# NOTE(review): equal column counts do not prove the dummy columns line up -
# verify the category sets of both files match.
if X_competition.shape[1] != X_train_original.shape[1]:
    raise ValueError(
        "competition features have %d columns but the models were trained on %d"
        % (X_competition.shape[1], X_train_original.shape[1]))

# Level-1 probabilities: each base model is fitted on the full training split
# and its positive-class probability is stored as 'p_M<i>'
for clf_idx, clf in enumerate(best_clfs):
    clf.fit(X_train_original,y_train_original)
    test_competition['p_M%d' %clf_idx] = clf.predict_proba(X_competition)[:,1]

# Level-2 features: only the base-model probability columns
test_meta_competition = test_competition[["p_M%d" %d for d in range(len(best_clfs))]]

# Predict the new data: the first column of the file is used as the row id and
# the stacked model's positive-class probability as the 'income' prediction
test_competition['id'] = test_competition.iloc[:,0]
y_pred_competition = pd.DataFrame(test_competition['id'], columns=['id'])
y_pred_competition['income'] = best_stack_clf.predict_proba(test_meta_competition)[:,1]


In [43]:
# Write the id/income predictions to disk without the DataFrame index
y_pred_competition.to_csv('submission.csv', index=False)

In [47]:
# Display the ids only; drop() returns a new frame, so y_pred_competition keeps its 'income' column
y_pred_competition.drop('income',axis=1)

Unnamed: 0,id
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9
