In [46]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, precision_score, recall_score

In [6]:
income_data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", header = None, sep = ", ")
income_data.columns = ["age", "workclass", "fnlwgt", "education5th-6th, Preschool.", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "label"]
income_data.head()

  if __name__ == '__main__':


Unnamed: 0,age,workclass,fnlwgt,"education5th-6th, Preschool.",education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [16]:
features = pd.get_dummies(income_data.drop("label", axis=1))
X_train, X_test, y_train, y_test = train_test_split(features, income_data.label)

In [17]:
logistic_regression = LogisticRegression()

In [19]:
params = {"penalty": ["l1", "l2"], "C": [0.01, 0.1, 10, 100], "fit_intercept": [True, False], "max_iter": [50, 100, 1000]}
grid = GridSearchCV(logistic_regression, params, cv = 5, n_jobs = -1)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 10, 100], 'fit_intercept': [True, False], 'max_iter': [50, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [20]:
grid.best_params_

{'C': 100, 'fit_intercept': False, 'max_iter': 1000, 'penalty': 'l1'}

In [27]:
logistic_regression = LogisticRegression(C = 100, fit_intercept=False, penalty="l1")
logistic_regression.fit(X_train, y_train)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [39]:
predicted = logistic_regression.predict(X_test)
confusion_matrix(y_test, predicted)

array([[5715,  390],
       [ 815, 1221]])

In [44]:
print("<=50K: " + str(income_data.label[income_data.label == "<=50K"].count()))
print(">50K: " + str(income_data.label[income_data.label == ">50K"].count()))

label_codes = LabelEncoder().fit_transform(y_test)
predicted_codes = LabelEncoder().fit_transform(predicted)

print(precision_score(label_codes, predicted_codes))
print(recall_score(label_codes, predicted_codes))

<=50K: 24720
>50K: 7841
0.75791433892
0.599705304519


In [45]:
###

In [49]:
random_forest = RandomForestClassifier()
params = {"n_estimators": [5, 10, 20], "max_depth": [None, 2, 5]}
forest_grid = GridSearchCV(random_forest, params, cv = 5, n_jobs = -1)
forest_grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [5, 10, 20], 'max_depth': [None, 2, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [50]:
random_forest = forest_grid.best_estimator_

In [51]:
random_forest.score(X_test, y_test)

0.8478073946689596

In [52]:
predicted = random_forest.predict(X_test)
confusion_matrix(y_test, predicted)

array([[5681,  424],
       [ 815, 1221]])

In [54]:
label_codes = LabelEncoder().fit_transform(y_test)
predicted_codes = LabelEncoder().fit_transform(predicted)

print(precision_score(label_codes, predicted_codes))
print(recall_score(label_codes, predicted_codes))

0.742249240122
0.599705304519
