In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, plot_confusion_matrix, mean_squared_error, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
train = pd.read_csv('../data/large_train_sample.csv')
test = pd.read_csv('../data/test_data.csv')

In [3]:
print(f'Train csv shape is {train.shape}')
print(f'Train csv shape is {test.shape}')

Train csv shape is (32561, 14)
Train csv shape is (16281, 13)


In [4]:
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,wage
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Female,0,0,40,Cuba,<=50K


In [5]:
def columns_values(dataframe):
    for n in dataframe.columns:                               # cycle through the columns in dataframe
        column = n  
        value_counts = (dataframe[column].value_counts())     # run value counts for each column
        unique_values = (dataframe[column].unique())          # run unique values for each column 
        len_counts = len(value_counts)                        # how many items are counted in value counts
        len_unique = len(unique_values)                       # how many items are counted in unique values
        if (len(value_counts)< 100):                          # ignore columns where counts over 100
                print(f"{column} has length of {len_unique}") # print how many unique values there are
                print(unique_values)                          # print unique values for column 
                print("="*30)                                  
                print(f"{column} has length of {len_counts}") # print how many values are counted
                print(value_counts)                           # print unique values counted
                print("="*60)

In [6]:
columns_values(train)

age has length of 73
[39 50 38 53 28 37 49 52 31 42 30 23 32 40 34 25 43 54 35 59 56 19 20 45
 22 48 21 24 57 44 41 29 18 47 46 36 79 27 67 33 76 17 55 61 70 64 71 68
 66 51 58 26 60 90 75 65 77 62 63 80 72 74 69 73 81 78 88 82 83 84 85 86
 87]
age has length of 73
36    898
31    888
34    886
23    877
35    876
33    875
28    867
30    861
37    858
25    841
27    835
32    828
38    827
39    816
29    813
41    808
24    798
40    794
26    785
42    780
43    770
22    765
20    753
46    737
45    734
44    724
21    720
19    712
47    708
50    602
51    595
49    577
18    550
48    543
52    478
53    464
55    419
54    415
17    395
58    366
56    366
57    358
59    355
60    312
61    300
62    258
63    230
64    208
65    178
67    151
66    150
68    120
69    108
70     89
71     72
72     67
73     64
74     51
76     46
75     45
90     43
77     29
78     23
80     22
79     22
81     20
82     12
84     10
83      6
85      3
88      3
87      1
86      1
Name

In [7]:
train['wage'] = pd.get_dummies(train['wage'], drop_first=True)

In [8]:
train.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,wage
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456,0.24081
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429,0.427581
min,17.0,12285.0,1.0,0.0,0.0,1.0,0.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0,0.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0,0.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0,0.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0,1.0


In [9]:
dums_train = pd.get_dummies(train[['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'sex', 'native-country']], drop_first=True)
final_train = train.join(dums_train)
final_train.drop(columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'sex', 'native-country'], inplace=True)

In [10]:
dums_train.shape

(32561, 90)

In [11]:
def dummies(train, test):
    dums_train = pd.get_dummies(train[['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'sex', 'native-country']], drop_first=True)
    dums_test = pd.get_dummies(test[['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'sex', 'native-country']], drop_first=True)
    final_train = train.join(dums_train)
    final_test = test.join(dums_test)
    final_train.drop(columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'sex', 'native-country'], inplace=True)
    final_test.drop(columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'sex', 'native-country'], inplace=True)
    return final_train, final_test

In [12]:
train_df, test_df = dummies(train, test)

In [13]:
train_df.shape

(32561, 97)

In [14]:
test_df.shape

(16281, 95)

In [15]:
train_df = train_df.drop(columns='native-country_ Holand-Netherlands')

In [16]:
X = train_df.drop(columns=['wage'])
y = train_df['wage']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [18]:
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [20]:
X_train_sc

array([[-0.7741844 ,  0.40140267, -0.02973945, ...,  0.34146461,
        -0.04437875, -0.0181027 ],
       [-0.2602043 ,  1.373397  ,  0.74367621, ...,  0.34146461,
        -0.04437875, -0.0181027 ],
       [-1.06788732, -0.47482015, -0.02973945, ..., -2.92856118,
        -0.04437875, -0.0181027 ],
       ...,
       [ 1.20831026, -1.0216555 , -0.41644728, ...,  0.34146461,
        -0.04437875, -0.0181027 ],
       [-0.92103586, -1.32428816,  1.13038404, ...,  0.34146461,
        -0.04437875, -0.0181027 ],
       [-1.43501596,  2.12561079, -1.57657076, ...,  0.34146461,
        -0.04437875, -0.0181027 ]])

In [21]:
rf = RandomForestClassifier()

In [22]:
rf.fit(X_train_sc, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [25]:
rf.score(X_train_sc, y_train)

0.9999590499590499

In [27]:
rf.score(X_test_sc, y_test)

0.8537034762314212