## **Task** 
The task is to predict if a person's income is in excess of 50,000 given certain profile information, and more specifically to generate the labels for income being above 50,000 for each row in the test set. This will simply be a csv with a single column of the predictions [0,1] with 'wage' as the column header. One member from each group will submit the link to your group's GitHub repository by TBD with the csv file complete, with final changes made by 12:00am EST / 9:00pm PST.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, mean_squared_error, f1_score, plot_confusion_matrix
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

In [2]:
# Load both datasets from the local ./data directory using default CSV parsing.
train_path = './data/large_train_sample.csv'
test_path = './data/test_data.csv'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [3]:
# Report (rows, columns) of each frame, one per line, to confirm the load.
print(f'Train csv shape is {train.shape}', f'Test csv shape is {test.shape}', sep='\n')

Train csv shape is (32561, 14)
Test csv shape is (16281, 13)


In [4]:
# Preview the first rows of the training data to see the columns and raw values.
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,wage
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Female,0,0,40,Cuba,<=50K


In [5]:
# The classes are imbalanced.
# The majority-class share shown here is the baseline any model has to beat.
train['wage'].value_counts(normalize=True)

 <=50K    0.75919
 >50K     0.24081
Name: wage, dtype: float64

In [6]:
# describe() summarises only the numeric columns; these are all treated as continuous.
train.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [7]:
# Is education-num the number of years someone spent in school?
# capital-gain and hours-per-week look capped at 99,999 and 99 respectively:
# 157 people sit at capital-gain = 99,999 and 85 at hours-per-week = 99
# 20 people report only 1 hour per week
# 1836 rows have '?' as workclass
# 1843 rows have '?' as occupation
# 583 rows have '?' as native-country

In [8]:
# Quick look at the first two rows again before encoding the target.
train.head(2)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,wage
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,13,United-States,<=50K


In [9]:
# Encode the target as an integer Series: 1 for the '>50K' class, 0 for '<=50K'
# (drop_first removes the '<=50K' indicator, leaving only the '>50K' one).
# dtype=int keeps 0/1 on pandas >= 2.0, where get_dummies defaults to bool; a bool
# `wage` would be left out of describe() and break the numeric-column handling below.
# .iloc[:, 0] assigns a Series rather than a one-column DataFrame.
train['wage'] = pd.get_dummies(train['wage'], drop_first=True, dtype=int).iloc[:, 0]

In [10]:
# Confirm the encoding kept the class proportions (now labelled 0 and 1).
train['wage'].value_counts(normalize=True)

0    0.75919
1    0.24081
Name: wage, dtype: float64

In [11]:
# Columns screened by the outlier filter: the numeric features plus the
# encoded target, in the order the filter visits them.
train_col = [
    'age', 'fnlwgt', 'education-num',
    'capital-gain', 'capital-loss', 'hours-per-week',
    'wage',
]

In [12]:
# Trim upper-tail outliers one column at a time: a row is dropped when its value
# exceeds |3 * std| + |mean| of that column. The statistics are recomputed on the
# rows that survived the previous columns, so the column order matters, and
# re-running this cell trims the data further.
for column in train_col:
    values = train[column]
    upper_limit = abs(values.std() * 3) + abs(values.mean())
    too_high = values > upper_limit
    train = train.drop(index=train.index[too_high])

In [13]:
# Re-check the summary statistics after the outlier rows were dropped.
train.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,wage
count,30108.0,30108.0,30108.0,30108.0,30108.0,30108.0,30108.0
mean,38.226485,185488.998738,10.022486,567.151156,1.212037,39.784011,0.221503
std,13.424234,94667.976151,2.53823,2326.937114,32.648206,11.294637,0.415265
min,17.0,12285.0,1.0,0.0,0.0,1.0,0.0
25%,27.0,117606.0,9.0,0.0,0.0,40.0,0.0
50%,37.0,177817.0,10.0,0.0,0.0,40.0,0.0
75%,47.0,234723.5,12.0,0.0,0.0,45.0,0.0
max,79.0,506436.0,16.0,22040.0,1258.0,77.0,1.0


In [14]:
# Row and column count remaining after outlier removal.
train.shape

(30108, 14)

In [2]:
# NOTE(review): this repeats the shape report from the top of the notebook. Its saved
# output is a NameError and its execution count is out of sequence, so it looks like it
# was run on a fresh kernel before the data was loaded. Re-run the notebook top to
# bottom (or delete this cell) before sharing.
print(f'Train csv shape is {train.shape}')
print(f'Test csv shape is {test.shape}')

NameError: name 'train' is not defined

## **Continuous Modeling**

In [None]:
# Features: every numeric column except the target. Selecting by dtype and name
# (instead of slicing off describe()'s last column) keeps the feature set right
# even if `wage` is not the last numeric column or is not numeric at all.
X = train.select_dtypes(include='number').drop(columns='wage', errors='ignore')
# Target: the encoded wage label.
y = train['wage']

In [None]:
# Sanity check: feature matrix and target should have the same number of rows.
print(X.shape, y.shape, sep='\n')

In [None]:
# Hold out a validation split of scikit-learn's default size, stratified on the
# target so both parts keep the same class balance; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [None]:
# Learn the scaling statistics from the training split only, then apply the same
# transformation to both splits so no held-out information leaks into the fit.
sc = StandardScaler().fit(X_train)
X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)

In [None]:
# Inspect the standardized training features.
X_train_sc

In [None]:
# Baseline random forest with default hyperparameters. The fixed seed (the same
# value used for the train/test split) makes its scores repeatable across runs.
rf = RandomForestClassifier(random_state=42)

In [None]:
# Fit the baseline forest on the standardized training split.
rf.fit(X_train_sc, y_train)

In [None]:
# Accuracy on the training split (score() is mean accuracy for classifiers).
rf.score(X_train_sc, y_train)

In [None]:
# Accuracy on the held-out split; compare with the training score to gauge overfitting.
rf.score(X_test_sc, y_test)

In [None]:
# Candidate hyperparameters for the forest: tree count, depth limit and a single
# cost-complexity pruning strength.
rf_params = dict(
    n_estimators=[250, 300],
    max_depth=[7, 9, 12],
    ccp_alpha=[0.0001],
)
# Exhaustive search over the grid with 5-fold cross-validation on all CPU cores.
gs = GridSearchCV(
    rf,
    param_grid=rf_params,
    cv=5,
    n_jobs=-1,
)
gs.fit(X_train_sc, y_train)

In [None]:
# Training-split score of the best estimator found by the grid search.
gs.score(X_train_sc, y_train)

In [None]:
# Held-out score of the best estimator found by the grid search.
gs.score(X_test_sc, y_test)

In [None]:
# Hyperparameter combination with the best mean cross-validated score.
gs.best_params_

In [None]:
# Final forest for the continuous features. The hyperparameters are hard-coded —
# presumably copied from the grid search's best_params_; confirm after re-running it.
# The fixed seed (same value as the train/test split) makes the fit and the
# feature importances derived from it repeatable.
cont_rf = RandomForestClassifier(ccp_alpha=0.0001, max_depth=9, n_estimators=250, random_state=42)

In [None]:
# Fit the tuned forest on the standardized training split.
cont_rf.fit(X_train_sc, y_train)

In [None]:
# Accuracy of the tuned forest on the training split.
cont_rf.score(X_train_sc, y_train)

In [None]:
# Accuracy of the tuned forest on the held-out split.
cont_rf.score(X_test_sc, y_test)

# Visualizations

In [None]:
# Predict held-out labels with the tuned random forest
preds = cont_rf.predict(X_test_sc)
# Unpack the confusion matrix into its four counts
# (for a binary problem ravel() yields tn, fp, fn, tp in that order)
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()

In [None]:
# Plot the confusion matrix of the tuned forest on the held-out split, with integer counts.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# on newer versions use ConfusionMatrixDisplay.from_estimator (and update the import cell).
plot_confusion_matrix(cont_rf, X_test_sc, y_test, cmap='Blues', values_format='d');

In [None]:
# Tabulate the tuned forest's feature importances, labelled with the original
# (unscaled) column names and ordered from most to least important.
feature_importances = (
    pd.DataFrame({'importance': cont_rf.feature_importances_}, index=X_train.columns)
    .sort_values(by='importance', ascending=False)
)

In [None]:
# Show the importance table, most influential feature first.
feature_importances