# ML Project: Classification 

## Predict what has the most influence over income 

- Education vs Salary
- Sex vs Salary 
- Race vs Salary
- Native-Country vs Salary

Goal: predict whether a person's income exceeds $50K/yr based on census data.

Authors:
`Andrea Murphy` and `Josh Quigley`

## Setup

In [None]:
%matplotlib inline
import random

import pandas as pd
from pandas import Series,DataFrame
import numpy as np

from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier

import numpy as np
import matplotlib.pyplot as plt


In [None]:
import pylab as plot

# Notebook-wide matplotlib defaults, applied once so that every figure drawn
# afterwards picks them up.
params = {
    "figure.figsize": [25, 7],
    "figure.dpi": 150,
    "axes.labelsize": "large",
    "xtick.labelsize": "x-large",
    "legend.fontsize": 20,
}
plot.rcParams.update(params)

In [None]:

# Column names for the census extract. They are passed to read_csv directly
# instead of being assigned afterwards: with the default header handling the
# first line of the file is consumed as a header and then overwritten, which
# silently drops one record.
# NOTE(review): this assumes data/adult.data.txt has no header row of its own
# (the original code replaced whatever header was read) - confirm.
column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "salary",
]

data = pd.read_csv('data/adult.data.txt', header=None, names=column_names)

# NOTE(review): the test file is still read with pandas' default header
# handling and is given no column names here - confirm its layout before use.
test_data = pd.read_csv('data/adult.test.txt')

data.head()

In [None]:
# Show the last rows of the frame (a quick check that the file was read to the end).
data.tail()

In [None]:
# Report which pandas index class the frame uses.
type(data.index)

In [None]:
# Count null (NaN) entries per column.
# NOTE(review): placeholder strings for missing values (e.g. "?") would not be
# counted here - confirm how the file encodes missing data.
data.isnull().sum()

## Education vs Salary 

In [None]:
# Start the education analysis from a copy of the full frame without `fnlwgt`.
# (An earlier variant, since disabled, removed every column except
# `education-num` and `salary`.)
data_education = data.drop(columns=['fnlwgt'])



In [None]:
# Work on a reproducible random sample of 1000 rows (fixed seed); the later
# cells draw both the training and the evaluation rows from this sample.
data_education = data_education.sample(n=1000, random_state=10)


In [None]:
# Preview the first rows of the sampled frame.
data_education.head()

In [None]:
# (rows, columns) of the sampled frame.
data_education.shape

In [None]:
# Number of sampled rows per `education-num` value, most frequent first.
data_education["education-num"].value_counts()

We will focus on the 3 largest categories: high school graduates (`education-num` 9), some college education (`education-num` 10), and bachelor's degrees (`education-num` 13).

In [None]:
# Keep only the numeric feature columns, but carry the target along.
# `salary` is a text label, so a plain dtype filter would discard it and the
# train/test split that follows (which selects `salary`) would raise KeyError.
salary_target = data_education["salary"]
if not pd.api.types.is_numeric_dtype(salary_target):
    # Encode the label as 1 for the high-income class and 0 otherwise.
    # strip() tolerates padding around the value.
    # NOTE(review): assumes the positive label reads ">50K" once whitespace is
    # stripped - confirm against the data file.
    salary_target = (salary_target.str.strip() == ">50K").astype(int)

# Feature names only; the target is re-attached separately below.
numeric_variables = [
    column
    for column in data_education.select_dtypes(include="number").columns
    if column != "salary"
]
data_education = data_education[numeric_variables].assign(salary=salary_target)

In [None]:
# Positional split of the sampled rows: the first 900 are used for training,
# the rows from position 900 onward are held out for evaluation.
# `salary` is the label; every other remaining column is a feature.
X_train = data_education.iloc[:900].drop(columns=['salary'])
X_test = data_education.iloc[900:].drop(columns=['salary'])

y_train = data_education['salary'].iloc[:900]
y_test = data_education['salary'].iloc[900:]

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree limited to a depth of 20, fitted on the training rows.
# NOTE(review): no random_state is passed, so repeated runs may not produce
# an identical tree - confirm whether reproducibility matters here.
tree_clf = DecisionTreeClassifier(max_depth=20)
tree_clf.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_val_score
# Accuracy of the tree on each of 3 cross-validation folds of the training rows.
cross_val_score(tree_clf, X_train, y_train, cv=3, scoring="accuracy")

In [None]:
from sklearn.model_selection import cross_val_predict
# Out-of-fold predictions for every training row (3 folds), used by the
# confusion matrix and precision/recall cells below.
y_train_pred_tree_clf = cross_val_predict(tree_clf, X_train, y_train, cv=3)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true training labels against the out-of-fold predictions.
confusion_matrix(y_train, y_train_pred_tree_clf)

In [None]:
from sklearn.metrics import precision_score, recall_score
# Precision of the out-of-fold predictions.
# NOTE(review): called with default arguments, which score a binary problem
# against positive label 1 - confirm y_train is encoded that way.
precision_score(y_train, y_train_pred_tree_clf)

In [None]:
# Recall of the out-of-fold predictions (same default-argument caveat as precision).
recall_score(y_train, y_train_pred_tree_clf)

Evaluate the fitted tree on the held-out rows (`X_test`, `y_test`).

In [None]:
# Predict labels for the held-out rows with the tree fitted on the training rows.
y_pred_tree_clf = tree_clf.predict(X_test)

In [None]:
# Confusion matrix of true held-out labels against the predictions.
confusion_matrix(y_test, y_pred_tree_clf)

In [None]:
# Precision on the held-out rows.
precision_score(y_test, y_pred_tree_clf)

In [None]:
# Recall on the held-out rows.
recall_score(y_test, y_pred_tree_clf)

## Sex vs Salary

In [None]:
# Frame for the sex-vs-salary analysis: the full data minus the sampling
# weight, both education columns and `race`.
data_sex = data.drop(columns=['fnlwgt', 'education-num', 'education', 'race'])

In [None]:
# Preview the first rows of the sex-analysis frame.
data_sex.head()

In [None]:
# (rows, columns) of the sex-analysis frame.
data_sex.shape

In [None]:
# Number of rows per value of `sex`.
data_sex["sex"].value_counts()

In [None]:
# group by Sex, Salary
# One group per (sex, salary) combination; aggregated in the next cell.
grouped = data_sex.groupby(['sex','salary'])

In [None]:
# Number of rows in each (sex, salary) group. `salary` is a text label and is
# itself one of the grouping keys, so a median over it is undefined (pandas
# raises on non-numeric data); the group sizes are the meaningful summary.
grouped.size()

## Apply one-hot encoding to the categorical feature `sex`

In [None]:
data_sex.dtypes

In [None]:
data = pd.get_dummies(data, columns=['sex'], prefix = ['sex'])

In [None]:
data_sex.head()

## Decrease the sample size to `n=100` for ease of use

In [None]:
# Keep the reproducible 100-row sample (fixed seed) instead of discarding it,
# matching how the education frame is sampled, then display it.
data_sex = data_sex.sample(n=100, random_state=10)
data_sex

## Race vs Salary

In [None]:
# Frame for the race-vs-salary analysis: the full data minus the sampling
# weight, both education columns and `age`.
data_race = data.drop(columns=['fnlwgt', 'education-num', 'education', 'age'])

In [None]:
# Preview the first rows of the race-analysis frame.
data_race.head()

In [None]:
# (rows, columns) of the race-analysis frame.
data_race.shape

## Decrease the sample size to `n=100` for ease of use

In [None]:
# Keep the reproducible 100-row sample (fixed seed) instead of discarding it,
# matching how the education frame is sampled, then display it.
data_race = data_race.sample(n=100, random_state=10)
data_race

## Native-Country vs Salary

In [None]:
# Frame for the native-country-vs-salary analysis: the full data minus the
# sampling weight, both education columns and `race`.
data_country = data.drop(columns=['fnlwgt', 'education-num', 'education', 'race'])

In [None]:
# Display the country-analysis frame.
data_country

## Decrease the sample size to `n=100` for ease of use

In [None]:
# Keep the reproducible 100-row sample (fixed seed) instead of discarding it,
# matching how the education frame is sampled, then display it.
data_country = data_country.sample(n=100, random_state=10)
data_country