## **Task** 
The task is to predict if a person's income is in excess of 50,000 given certain profile information, and more specifically to generate the labels for income being above 50,000 for each row in the test set. This will simply be a csv with a single column of the predictions [0,1] with 'wage' as the column header. One member from each group will submit the link to your group's GitHub repository by TBD with the csv file complete, with final changes made by 12:00am EST / 9:00pm PST.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, mean_squared_error, f1_score, plot_confusion_matrix
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

In [2]:
train = pd.read_csv('./data/large_train_sample.csv')
test = pd.read_csv('./data/test_data.csv')

In [3]:
print(f'Train csv shape is {train.shape}')
print(f'Test csv shape is {test.shape}')

Train csv shape is (32561, 14)
Test csv shape is (16281, 13)


In [4]:
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,wage
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Female,0,0,40,Cuba,<=50K


In [5]:
# We have imbalanced classes
# This is the baseline
train['wage'].value_counts(normalize=True)

 <=50K    0.75919
 >50K     0.24081
Name: wage, dtype: float64

In [6]:
# These are all continous
train.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [7]:
# Education num is the amount of years someone went to school?
# Capital-gain and hours-per-week are capped at 99,999 and 99
# Capital-gain there are 157 people at 99,999 and hours-per-week are 85 for 99
# 20 people with only 1 for hours-per-week
# 1836 who are working class ?
# 1843 whos occupation is a ?
# 583 native country ?

In [8]:
train.head(2)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,wage
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,13,United-States,<=50K


In [9]:
train['wage'] = pd.get_dummies(train['wage'], drop_first=True)

In [10]:
train['wage'].value_counts(normalize=True)

0    0.75919
1    0.24081
Name: wage, dtype: float64

In [11]:
train_col = ['age',
            'fnlwgt',
            'education-num',
            'capital-gain',
            'capital-loss',
            'hours-per-week',
            'wage'    
            ]

In [12]:
for col in train[train_col]:
    outlier = abs(train[col].std() * 3) + abs(train[col].mean())
    train.drop(train[train[col] > outlier].index, inplace=True)

In [13]:
train.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,wage
count,30108.0,30108.0,30108.0,30108.0,30108.0,30108.0,30108.0
mean,38.226485,185488.998738,10.022486,567.151156,1.212037,39.784011,0.221503
std,13.424234,94667.976151,2.53823,2326.937114,32.648206,11.294637,0.415265
min,17.0,12285.0,1.0,0.0,0.0,1.0,0.0
25%,27.0,117606.0,9.0,0.0,0.0,40.0,0.0
50%,37.0,177817.0,10.0,0.0,0.0,40.0,0.0
75%,47.0,234723.5,12.0,0.0,0.0,45.0,0.0
max,79.0,506436.0,16.0,22040.0,1258.0,77.0,1.0


In [14]:
train.shape

(30108, 14)

In [38]:
def dummies(train, test):    
    dums_train = pd.get_dummies(train[['education', 'marital-status', 'occupation', 'relationship', 'sex']], drop_first=True)
    dums_test = pd.get_dummies(test[['education', 'marital-status', 'occupation', 'relationship', 'sex']], drop_first=True)
    
    final_train = train.join(dums_train)
    final_test = test.join(dums_test)
    
    final_train.drop(columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'sex', 'native-country'], inplace=True)
    final_test.drop(columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'sex', 'native-country'], inplace=True)
    
    return final_train, final_test
    

In [39]:
train_df, test_df = dummies(train, test)

In [40]:
train_df.head(1)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,wage,education_ 11th,education_ 12th,education_ 1st-4th,...,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,relationship_ Not-in-family,relationship_ Other-relative,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife,sex_ Male
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1


In [41]:
test_df.head(1)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,education_ 11th,education_ 12th,education_ 1st-4th,education_ 5th-6th,...,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,relationship_ Not-in-family,relationship_ Other-relative,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife,sex_ Male
0,25,226802,7,0,0,40,1,0,0,0,...,0,0,0,0,0,0,1,0,0,1


In [42]:
print(f'Train_df csv shape is {train_df.shape}')
print(f'Test_df csv shape is {test_df.shape}')

Train_df csv shape is (30108, 48)
Test_df csv shape is (16281, 47)


## **Best Model**

In [43]:
X = train_df.drop(columns='wage')
y = train_df['wage']

In [44]:
print(X.shape)
print(y.shape)

(30108, 47)
(30108,)


In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [46]:
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [47]:
X_train_sc

array([[ 0.28264404, -0.6756093 , -0.00559563, ..., -0.35069342,
        -0.22575079,  0.71352821],
       [-0.01489265,  0.75488654, -0.39922502, ..., -0.35069342,
        -0.22575079,  0.71352821],
       [-1.57696024,  0.05838372, -1.18648379, ..., -0.35069342,
        -0.22575079,  0.71352821],
       ...,
       [-0.46119767,  0.09681328, -0.39922502, ..., -0.35069342,
        -0.22575079,  0.71352821],
       [ 0.43141238,  1.27407087, -0.39922502, ..., -0.35069342,
        -0.22575079,  0.71352821],
       [-1.50257607,  1.03027905, -1.18648379, ..., -0.35069342,
        -0.22575079,  0.71352821]])

In [48]:
rf = RandomForestClassifier()

In [49]:
rf.fit(X_train_sc, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [50]:
rf.score(X_train_sc, y_train)

0.9999557149816217

In [51]:
rf.score(X_test_sc, y_test)

0.8436296001062841

In [69]:
rf_params = {
    'n_estimators': [300, 350, 400],
    'max_depth': [17, 22],
#     'learning_rate' : (np.logspae(-1.6, -1, 20)),
    'ccp_alpha' : [0.0001, .00001],
#     'gamma' : np.logspace(-3,3,7)
}
gs = GridSearchCV(rf, param_grid=rf_params, cv=5, n_jobs=2)
gs.fit(X_train_sc, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [70]:
gs.score(X_train_sc, y_train)

0.8792790399008016

In [71]:
gs.score(X_test_sc, y_test)

0.8624950179354325

In [72]:
gs.best_params_

{'ccp_alpha': 0.0001, 'max_depth': 17, 'n_estimators': 400}

In [73]:
feature_importances = pd.DataFrame(rf.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)

In [74]:
feature_importances

Unnamed: 0,importance
fnlwgt,0.220893
age,0.18927
capital-gain,0.106388
hours-per-week,0.095753
marital-status_ Married-civ-spouse,0.086335
education-num,0.066039
marital-status_ Never-married,0.032405
occupation_ Exec-managerial,0.018573
relationship_ Not-in-family,0.016408
occupation_ Prof-specialty,0.01548


## **Predictions**

In [104]:
# Scale the testing data
s_test = StandardScaler()
scaled_test = s_test.fit_transform(test_df)

In [129]:
y_preds = gs.predict(scaled_test)

In [130]:
wage_predictions = pd.DataFrame({'wage': y_preds})

In [131]:
wage_predictions.to_csv('./data/wage_predictions.csv', index=False)

In [132]:
wage_predictions

Unnamed: 0,wage
0,0
1,0
2,0
3,0
4,0
...,...
16276,0
16277,0
16278,0
16279,0


In [127]:
wage_predictions.value_counts(normalize=True)

wage
0       0.980468
1       0.019532
dtype: float64