# ML Project: Classification 

## Predict what has the most influence over income 

- Education vs Salary
- Sex vs Salary 
- Race vs Salary
- Native-Country vs Salary

$50K/yr based on census data

Authors:
`Andrea Murphy` and `Josh Quigley`

## Setup

In [1]:
%matplotlib inline
import random

import pandas as pd
from pandas import Series,DataFrame
import numpy as np

from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier

import numpy as np
import matplotlib.pyplot as plt


In [2]:
import pylab as plot
params = { 
    "axes.labelsize": "large",
    "xtick.labelsize" : "x-large",
    "legend.fontsize": 20,
    "figure.dpi": 150,
    "figure.figsize" : [25, 7]
}
plot.rcParams.update(params)

In [3]:

data = pd.read_csv('data/adult.data.txt')
test_data = pd.read_csv('data/adult.test.txt') 

data.columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race",
                           "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "salary"]
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [4]:
data.tail()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
32555,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32556,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32557,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32558,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
32559,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [5]:
type(data.index)

pandas.core.indexes.range.RangeIndex

In [6]:
data.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
salary            0
dtype: int64

## Education vs Salary 

In [7]:
# drop columns won't be useful in analysis and prediction
data_education = data.drop(['fnlwgt', 'education-num'], axis=1)

In [9]:
data_education.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [11]:
data_education.shape

(32560, 13)

## Decreasing the sample size `n=100` for ease of use

In [14]:
data_education.sample(n=100, random_state=10)

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
11358,41,Private,Bachelors,Married-civ-spouse,Sales,Husband,White,Male,0,0,50,United-States,>50K
10859,38,Private,Some-college,Married-civ-spouse,Sales,Husband,White,Male,0,0,60,United-States,>50K
30948,24,Private,HS-grad,Separated,Machine-op-inspct,Unmarried,Other,Female,0,0,40,United-States,<=50K
29811,35,Self-emp-inc,Bachelors,Separated,Prof-specialty,Not-in-family,White,Female,0,0,50,United-States,<=50K
18408,69,?,Bachelors,Married-civ-spouse,?,Husband,White,Male,10605,0,10,United-States,>50K
2879,31,Private,HS-grad,Never-married,Handlers-cleaners,Other-relative,White,Male,0,0,40,United-States,<=50K
21575,35,Private,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,<=50K
26396,25,Private,HS-grad,Separated,Exec-managerial,Unmarried,White,Female,0,0,37,United-States,<=50K
28644,56,Private,Assoc-voc,Married-civ-spouse,Exec-managerial,Husband,White,Male,15024,0,40,United-States,>50K
15694,22,Without-pay,HS-grad,Never-married,Handlers-cleaners,Own-child,White,Male,4416,0,40,United-States,<=50K


## Sex vs Salary

In [22]:
# drop columns won't be useful in analysis and prediction
data_sex = data.drop(['fnlwgt', 'education-num', 'education', 'race'], axis=1)

In [23]:
data_sex.head()

Unnamed: 0,age,workclass,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,50,Self-emp-not-inc,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,13,United-States,<=50K
1,38,Private,Divorced,Handlers-cleaners,Not-in-family,Male,0,0,40,United-States,<=50K
2,53,Private,Married-civ-spouse,Handlers-cleaners,Husband,Male,0,0,40,United-States,<=50K
3,28,Private,Married-civ-spouse,Prof-specialty,Wife,Female,0,0,40,Cuba,<=50K
4,37,Private,Married-civ-spouse,Exec-managerial,Wife,Female,0,0,40,United-States,<=50K


In [24]:
data_sex.shape

(32560, 11)

## Decreasing the sample size `n=100` for ease of use

In [25]:
data_sex.sample(n=100, random_state=10)

Unnamed: 0,age,workclass,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
11358,41,Private,Married-civ-spouse,Sales,Husband,Male,0,0,50,United-States,>50K
10859,38,Private,Married-civ-spouse,Sales,Husband,Male,0,0,60,United-States,>50K
30948,24,Private,Separated,Machine-op-inspct,Unmarried,Female,0,0,40,United-States,<=50K
29811,35,Self-emp-inc,Separated,Prof-specialty,Not-in-family,Female,0,0,50,United-States,<=50K
18408,69,?,Married-civ-spouse,?,Husband,Male,10605,0,10,United-States,>50K
2879,31,Private,Never-married,Handlers-cleaners,Other-relative,Male,0,0,40,United-States,<=50K
21575,35,Private,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,45,United-States,<=50K
26396,25,Private,Separated,Exec-managerial,Unmarried,Female,0,0,37,United-States,<=50K
28644,56,Private,Married-civ-spouse,Exec-managerial,Husband,Male,15024,0,40,United-States,>50K
15694,22,Without-pay,Never-married,Handlers-cleaners,Own-child,Male,4416,0,40,United-States,<=50K


## Race vs Salary

In [26]:
# drop columns won't be useful in analysis and prediction
data_race = data.drop(['fnlwgt', 'education-num', 'education','age'], axis=1)

In [27]:
data_race.head()

Unnamed: 0,workclass,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,Self-emp-not-inc,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,Private,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,Private,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,Private,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,Private,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [28]:
data_race.shape

(32560, 11)

## Decreasing the sample size `n=100` for ease of use

In [29]:
data_race.sample(n=100, random_state=10)

Unnamed: 0,workclass,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
11358,Private,Married-civ-spouse,Sales,Husband,White,Male,0,0,50,United-States,>50K
10859,Private,Married-civ-spouse,Sales,Husband,White,Male,0,0,60,United-States,>50K
30948,Private,Separated,Machine-op-inspct,Unmarried,Other,Female,0,0,40,United-States,<=50K
29811,Self-emp-inc,Separated,Prof-specialty,Not-in-family,White,Female,0,0,50,United-States,<=50K
18408,?,Married-civ-spouse,?,Husband,White,Male,10605,0,10,United-States,>50K
2879,Private,Never-married,Handlers-cleaners,Other-relative,White,Male,0,0,40,United-States,<=50K
21575,Private,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,<=50K
26396,Private,Separated,Exec-managerial,Unmarried,White,Female,0,0,37,United-States,<=50K
28644,Private,Married-civ-spouse,Exec-managerial,Husband,White,Male,15024,0,40,United-States,>50K
15694,Without-pay,Never-married,Handlers-cleaners,Own-child,White,Male,4416,0,40,United-States,<=50K


## Native-Country vs Salary

In [30]:
# drop columns won't be useful in analysis and prediction
data_country = data.drop(['fnlwgt', 'education-num', 'education','race'], axis=1)

In [None]:
data_country

## Decreasing the sample size `n=100` for ease of use

In [None]:
data_country.sample(n=100, random_state=10)