In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from timeit import default_timer as timer
%matplotlib inline

In [2]:
# Build the CSV column names from the metadata file. Every line from index 142
# onward contributes the text before its first ':' with spaces replaced by '_'.
with open("us_census_full/census_income_metadata.txt") as metadata_file:
    lines = list(metadata_file)
header_lines = lines[142:]
headers = []
for header_line in header_lines:
    column_name = header_line.partition(":")[0]
    headers.append(column_name.replace(" ", "_"))
# Discard the '|_instance_weight' entry (raises ValueError if it is missing),
# then name the final column, which holds the income label.
headers.remove('|_instance_weight')
headers.append('earnings')

In [3]:
# The CSV has no header row of its own; column names come from `headers`.
df_train = pd.read_csv(
    "us_census_full/census_income_learn.csv",
    header=None,
    names=headers,
)

In [4]:
# The CSV has no header row of its own; column names come from `headers`.
df_test = pd.read_csv(
    "us_census_full/census_income_test.csv",
    header=None,
    names=headers,
)

In [5]:
# Remove the 'instance_weight' column from both frames.
df_train = df_train.drop(columns=['instance_weight'])
df_test = df_test.drop(columns=['instance_weight'])

In [6]:
# Flag training rows whose feature values repeat an earlier row.
# The label column and the flag itself are excluded by name rather than by
# position, so re-running this cell after 'is_duplicated' has been added still
# compares exactly the same feature columns.
train_feature_cols = [
    col for col in df_train.columns if col not in ('earnings', 'is_duplicated')
]
df_train['is_duplicated'] = df_train.duplicated(subset=train_feature_cols)

In [7]:
# Flag test rows whose feature values repeat an earlier row.
# The label column and the flag itself are excluded by name rather than by
# position, so re-running this cell after 'is_duplicated' has been added still
# compares exactly the same feature columns.
test_feature_cols = [
    col for col in df_test.columns if col not in ('earnings', 'is_duplicated')
]
df_test['is_duplicated'] = df_test.duplicated(subset=test_feature_cols)

In [8]:
# Keep only the rows not flagged as duplicates and drop the helper flag column.
df_train_unique = (
    df_train
    .loc[~df_train['is_duplicated']]
    .drop(columns='is_duplicated')
)

In [9]:
# Keep only the rows not flagged as duplicates and drop the helper flag column.
df_test_unique = (
    df_test
    .loc[~df_test['is_duplicated']]
    .drop(columns='is_duplicated')
)

In [10]:
# (rows, columns) of the de-duplicated training frame.
df_train_unique.shape

(152807, 41)

In [11]:
# Distinct label strings in the training data, in order of first appearance.
earnings_vals = pd.unique(df_train['earnings'])

In [12]:
# Display the distinct values found in the 'earnings' column.
earnings_vals

array([' - 50000.', ' 50000+.'], dtype=object)

In [13]:
# Work on independent copies so later column edits leave the de-duplicated
# frames untouched.
df_train_new = df_train_unique.copy(deep=True)
df_test_new = df_test_unique.copy(deep=True)

In [14]:
# Boolean target: True when the row carries the "50000+." income label.
# The label text is matched explicitly (after trimming the surrounding
# whitespace the raw values carry) instead of through earnings_vals[1], whose
# position depends on which label happens to appear first in the training
# file; a different row order would otherwise silently invert the target.
POSITIVE_LABEL = '50000+.'
df_train_new['Y'] = df_train_new['earnings'].str.strip() == POSITIVE_LABEL
df_test_new['Y'] = df_test_new['earnings'].str.strip() == POSITIVE_LABEL

In [15]:
# Remove the raw 'earnings' label column from both frames.
df_train_new = df_train_new.drop(columns=['earnings'])
df_test_new = df_test_new.drop(columns=['earnings'])

In [16]:
# Pull the target column out of each frame as its own Series.
Y_train, Y_test = df_train_new['Y'], df_test_new['Y']

In [17]:
# Remove the 'Y' column so the frames hold features only.
df_train_new = df_train_new.drop(columns=['Y'])
df_test_new = df_test_new.drop(columns=['Y'])

In [18]:
# OK, now let's one-hot encode all object-dtype columns, including missing data

In [19]:
# (rows, columns) of the training features before one-hot encoding.
df_train_new.shape

(152807, 40)

In [20]:
# Number of training rows; used later to split the stacked train/test frame.
train_size = df_train_new.shape[0]

In [21]:
# Stack train and test so both end up with the same dummy columns, one-hot
# encode every object-dtype column in a single call (instead of re-concatenating
# the whole frame once per column), then split back by position.
full_data = pd.concat([df_train_new, df_test_new])
full_data_objects = full_data.select_dtypes(include=['object'])
# Non-encoded columns keep their order; dummy columns follow, named
# '<column>_<value>', in the order of the encoded columns.
full_data = pd.get_dummies(full_data, columns=list(full_data_objects.columns))
# The first `train_size` rows are the training rows because they were
# concatenated first.
df_train_new = full_data.iloc[:train_size].copy()
df_test_new = full_data.iloc[train_size:].copy()

In [22]:
# (rows, columns) of the training features after one-hot encoding.
df_train_new.shape

(152807, 408)

In [23]:
# Use a grid search to compare different hyperparameter settings on the full dataset

## Decision tree

In [None]:
# Grid-search a single decision tree over class weighting, depth, number of
# features considered per split and minimum leaf size, using 5-fold CV accuracy.
# NOTE(review): accuracy is the selection metric; if the positive class is rare,
# confirm the ranking with a metric that is less sensitive to class imbalance.
dec_tree = DecisionTreeClassifier(random_state=0)
dec_tree_parameters = [
    dict(
        class_weight=[None, "balanced"],
        max_depth=[3, 5, 10],
        max_features=[30, 100, 300],
        min_samples_leaf=[1, 5, 10],
    )
]

dec_tree_clf = GridSearchCV(
    estimator=dec_tree,
    param_grid=dec_tree_parameters,
    cv=5,
    scoring="accuracy",
    n_jobs=2,
)
dec_tree_clf.fit(df_train_new, Y_train)
# Best-scoring estimator found by the search.
best_estimator = dec_tree_clf.best_estimator_

In [29]:
# Report the best mean cross-validation score and the parameters that produced it.
print('Best score:', dec_tree_clf.best_score_)
print('Best hyperparameters:', dec_tree_clf.best_params_)

Best score: 0.936998959472
Best hyperparameters: {'class_weight': None, 'max_features': 300, 'max_depth': 10, 'min_samples_leaf': 10}


## Boosted decision tree

In [36]:
# Time a 5-fold grid search over gradient-boosted trees. Only the number of
# boosting stages varies; the other settings are pinned to a single value
# (the values noted beside each entry are commented out of the grid).
start = timer()

boost_tree = GradientBoostingClassifier(random_state=0)
boost_tree_parameters = [
    dict(
        max_depth=[10],                 # left out: 3, 5
        max_features=[300],             # left out: 30, 100
        min_samples_leaf=[10],          # left out: 1, 5
        n_estimators=[50, 100, 500],    # left out: 20
    )
]

boost_tree_clf = GridSearchCV(
    estimator=boost_tree,
    param_grid=boost_tree_parameters,
    cv=5,
    scoring="accuracy",
    n_jobs=2,
    verbose=1,
)
boost_tree_clf.fit(df_train_new, Y_train)
# Best-scoring estimator found by the search.
best_estimator_boost = boost_tree_clf.best_estimator_

end = timer()
# Elapsed wall-clock time in seconds.
print("Time to run 5 CV, 3 combos: ", end - start)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=2)]: Done  15 out of  15 | elapsed: 349.2min finished


Time to run 5 CV, 3 combos:  23811.57471499199


In [None]:
# Report the best mean cross-validation score and the parameters that produced it.
print('Best score:', boost_tree_clf.best_score_)
print('Best hyperparameters:', boost_tree_clf.best_params_)

Best score: 0.944446262279
Best hyperparameters: {'max_features': 300, 'max_depth': 10, 'n_estimators': 100, 'min_samples_leaf': 10}


## Random Forest

In [24]:
# Time a 5-fold grid search over random forests. Only the number of trees
# varies; the other settings are pinned to a single value (the values noted
# beside each entry are commented out of the grid).
start = timer()

rand_for = RandomForestClassifier(random_state=0)
rand_for_parameters = [
    dict(
        max_depth=[10],                 # left out: 3, 5
        max_features=[300],             # left out: 30, 100
        min_samples_leaf=[10],          # left out: 1, 5
        n_estimators=[50, 100, 500],    # left out: 20
    )
]

rand_for_clf = GridSearchCV(
    estimator=rand_for,
    param_grid=rand_for_parameters,
    cv=5,
    scoring="accuracy",
    n_jobs=2,
    verbose=1,
)
rand_for_clf.fit(df_train_new, Y_train)
# Best-scoring estimator found by the search.
best_estimator_rand_for = rand_for_clf.best_estimator_

end = timer()
# Elapsed wall-clock time in seconds.
print("Time to run 5 CV, 3 combos: ", end - start)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=2)]: Done  15 out of  15 | elapsed: 100.4min finished


Time to run 5 CV, 3 combos:  6342.686642062999


In [25]:
# Report the best mean cross-validation score and the parameters that produced it.
print('Best score:', rand_for_clf.best_score_)
print('Best hyperparameters:', rand_for_clf.best_params_)

Best score: 0.938988397128
Best hyperparameters: {'max_features': 300, 'max_depth': 10, 'n_estimators': 100, 'min_samples_leaf': 10}
