# ML Project: Classification 

## Predict what has the most influence over income 

- Education vs Salary
- Sex vs Salary 
- Race vs Salary
- Native-Country vs Salary

$50K/yr based on census data

Authors:
`Andrea Murphy` and `Josh Quigley`

## Setup

In [1]:
%matplotlib inline
import random

import pandas as pd
from pandas import Series,DataFrame
import numpy as np

from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier

import numpy as np
import matplotlib.pyplot as plt


In [2]:
import pylab as plot
params = { 
    "axes.labelsize": "large",
    "xtick.labelsize" : "x-large",
    "legend.fontsize": 20,
    "figure.dpi": 150,
    "figure.figsize" : [25, 7]
}
plot.rcParams.update(params)

In [3]:

data = pd.read_csv('data/adult.data.txt')
test_data = pd.read_csv('data/adult.test.txt') 

data.columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race",
                           "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "salary"]
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,0


In [4]:
data.tail()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
32555,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,0
32556,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,1
32557,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0
32558,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,0
32559,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,1


In [5]:
type(data.index)

pandas.core.indexes.range.RangeIndex

In [6]:
data.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
salary            0
dtype: int64

## Education vs Salary 

In [248]:
# drop columns won't be useful in analysis and prediction
#data_education = data.drop(['fnlwgt', 'hours-per-week', 'age', 'native-country', 'sex', 'race', 'relationship', 'occupation', 'marital-status', 'education', 'workclass', 'capital-gain', 'capital-loss'], axis=1)
data_education = data.drop(['fnlwgt'], axis=1)



In [249]:
# sample 1000 random entries to train
data_education = data_education.sample(n=1000, random_state=10)


In [250]:
data_education.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
11358,41,Private,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,0,0,50,United-States,1
10859,38,Private,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,60,United-States,1
30948,24,Private,HS-grad,9,Separated,Machine-op-inspct,Unmarried,Other,Female,0,0,40,United-States,0
29811,35,Self-emp-inc,Bachelors,13,Separated,Prof-specialty,Not-in-family,White,Female,0,0,50,United-States,0
18408,69,?,Bachelors,13,Married-civ-spouse,?,Husband,White,Male,10605,0,10,United-States,1


In [251]:
data_education.shape

(1000, 14)

In [252]:
data_education["education-num"].value_counts()

9     313
10    212
13    193
14     56
7      38
11     36
12     30
6      27
15     20
4      20
8      16
5      14
3      11
16      9
2       3
1       2
Name: education-num, dtype: int64

We will focus on the 3 largest categories, Highschool Graduates (education-num 9), Some College Education (education-num 10), and Bachelors Degrees (education-num 13). 

In [253]:
numeric_variables = list(data_education.dtypes[data_education.dtypes != "object"].index)
data_education = data_education[numeric_variables]

In [254]:
X_train = data_education.iloc[:900]
X_train = X_train.drop(['salary'], axis=1)

X_test = data_education.iloc[900:]
X_test = X_test.drop(['salary'], axis=1)

y_train = data_education.iloc[:900]
y_train = y_train['salary']

y_test = data_education.iloc[900:]
y_test = y_test['salary']

In [255]:
from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier(max_depth=20)
tree_clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=20,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [256]:
from sklearn.model_selection import cross_val_score
cross_val_score(tree_clf, X_train, y_train, cv=3, scoring="accuracy")

array([0.75083056, 0.73666667, 0.72240803])

In [257]:
from sklearn.model_selection import cross_val_predict
y_train_pred_tree_clf = cross_val_predict(tree_clf, X_train, y_train, cv=3)

In [258]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train, y_train_pred_tree_clf)

array([[550, 112],
       [130, 108]])

In [259]:
from sklearn.metrics import precision_score, recall_score
precision_score(y_train, y_train_pred_tree_clf)

0.4909090909090909

In [260]:
recall_score(y_train, y_train_pred_tree_clf)

0.453781512605042

Try on the test data

In [261]:
y_pred_tree_clf = tree_clf.predict(X_test)

In [262]:
confusion_matrix(y_test, y_pred_tree_clf)

array([[61, 14],
       [13, 12]])

In [263]:
precision_score(y_test, y_pred_tree_clf)

0.46153846153846156

In [264]:
recall_score(y_test, y_pred_tree_clf)

0.48

## Sex vs Salary

In [None]:
# drop columns won't be useful in analysis and prediction
data_sex = data.drop(['fnlwgt', 'education-num', 'education', 'race'], axis=1)

In [None]:
data_sex.head()

In [None]:
data_sex.shape

## Decreasing the sample size `n=100` for ease of use

In [None]:
data_sex.sample(n=100, random_state=10)

## Race vs Salary

In [None]:
# drop columns won't be useful in analysis and prediction
data_race = data.drop(['fnlwgt', 'education-num', 'education','age'], axis=1)

In [None]:
data_race.head()

In [None]:
data_race.shape

## Decreasing the sample size `n=100` for ease of use

In [None]:
data_race.sample(n=100, random_state=10)

## Native-Country vs Salary

In [None]:
# drop columns won't be useful in analysis and prediction
data_country = data.drop(['fnlwgt', 'education-num', 'education','race'], axis=1)

In [None]:
data_country

## Decreasing the sample size `n=100` for ease of use

In [None]:
data_country.sample(n=100, random_state=10)