In [278]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import roc_auc_score

In [211]:
train = pd.read_csv('../data/clean_train.csv')

In [212]:
train['wage_ >50K'].value_counts()

0    24720
1     7841
Name: wage_ >50K, dtype: int64

In [213]:
train.drop(columns='native-country_ Holand-Netherlands', inplace=True)

In [214]:
np.random.seed(42)

train = pd.concat([train, train[train['wage_ >50K'] == 1].sample(24720-7841, replace = True)], axis=0)

In [215]:
train.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,...,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia,wage_ >50K
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,38,215646,9,0,0,40,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,53,234721,7,0,0,40,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,28,338409,13,0,0,40,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [216]:
train.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,...,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia,wage_ >50K
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,38,215646,9,0,0,40,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,53,234721,7,0,0,40,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,28,338409,13,0,0,40,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [355]:

X = train.drop(columns='wage_ >50K')
y = train['wage_ >50K']

In [315]:
remaining = list(X.columns)
starting = list(X.columns)
for col in remaining:
    for col_2 in starting:
        X[f'{col} * {col_2}'] = X[f'{col}'] * X[f'{col_2}']
    remaining.remove(col)

In [316]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 42)

In [317]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [318]:
sel = SelectFromModel(rf,prefit=True, threshold= 'mean')
good_features = list(X.columns[sel.get_support()])

In [320]:
len(X.columns)

4655

In [321]:
len(good_features)

425

In [322]:
rf.score(X_train, y_train), rf.score(X_test, y_test)

(1.0, 0.9224110032362459)

In [323]:
((rf.predict_proba(X_train)[:,0] - y_train)**2).mean(), ((rf.predict_proba(X_test)[:,0] - y_test)**2).mean()

(0.9152574541533202, 0.804525509708731)

In [324]:
roc_auc_score(y_train, rf.predict_proba(X_train)[:,1]), roc_auc_score(y_test, rf.predict_proba(X_test)[:,1])

(1.0, 0.9803732918129429)

In [325]:
X_selected = X[good_features]

In [326]:
list(X_selected.columns)

['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'workclass_ Federal-gov',
 'workclass_ Local-gov',
 'workclass_ Private',
 'workclass_ Self-emp-inc',
 'workclass_ Self-emp-not-inc',
 'workclass_ State-gov',
 'education_ Assoc-voc',
 'education_ Bachelors',
 'education_ Doctorate',
 'education_ HS-grad',
 'education_ Masters',
 'education_ Prof-school',
 'education_ Some-college',
 'marital-status_ Married-civ-spouse',
 'marital-status_ Never-married',
 'occupation_ Adm-clerical',
 'occupation_ Craft-repair',
 'occupation_ Exec-managerial',
 'occupation_ Farming-fishing',
 'occupation_ Machine-op-inspct',
 'occupation_ Other-service',
 'occupation_ Prof-specialty',
 'occupation_ Protective-serv',
 'occupation_ Sales',
 'occupation_ Tech-support',
 'occupation_ Transport-moving',
 'relationship_ Not-in-family',
 'relationship_ Own-child',
 'relationship_ Unmarried',
 'relationship_ Wife',
 'sex_ Male',
 'native-country_ Mexico',
 'native-country

In [343]:
X_selected.reset_index(inplace=True)

In [344]:
X_selected.to_csv('../data/feature_selected.csv',index=False)

In [358]:
y= y.reset_index()

In [359]:
y.to_csv('../data/y_bootstrapped.csv', index=False)

(49440,)

In [334]:
X_train, X_test, y_train, y_test = train_test_split(X_selected,y, random_state = 2)

In [335]:
rf_2 = RandomForestClassifier(n_estimators=100)
rf_2.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [336]:
rf_2.score(X_train, y_train), rf_2.score(X_test, y_test)

(0.9999730312837108, 0.9263754045307443)

In [337]:
((rf_2.predict_proba(X_train)[:,0] - y_train)**2).mean(), ((rf_2.predict_proba(X_test)[:,0] - y_test)**2).mean()

(0.9166325943906417, 0.8089722734627766)

In [338]:
roc_auc_score(y_train, rf_2.predict_proba(X_train)[:,1]), roc_auc_score(y_test, rf_2.predict_proba(X_test)[:,1])

(1.0, 0.9822717635006202)

In [291]:
y.to_csv('../data/test_engineered.csv', index=False)

  """Entry point for launching an IPython kernel.


In [231]:
test = pd.read_csv('../data/clean_test.csv')

In [232]:
X_submit = test

In [233]:
remaining = list(X_submit.columns)
starting = list(X_submit.columns)
for col in remaining:
    for col_2 in starting:
        X_submit[f'{col} * {col_2}'] = X_submit[f'{col}'] * X_submit[f'{col_2}']
    remaining.remove(col)

In [292]:
X_submit

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,...,native-country_ Yugoslavia * education_ HS-grad,native-country_ Yugoslavia * education_ Some-college,native-country_ Yugoslavia * marital-status_ Married-civ-spouse,native-country_ Yugoslavia * occupation_ Craft-repair,native-country_ Yugoslavia * occupation_ Exec-managerial,native-country_ Yugoslavia * occupation_ Farming-fishing,native-country_ Yugoslavia * occupation_ Other-service,native-country_ Yugoslavia * relationship_ Wife,native-country_ Yugoslavia * sex_ Male,native-country_ Yugoslavia * native-country_ Yugoslavia
0,25,226802,7,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,38,89814,9,0,0,50,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,28,336951,12,0,0,40,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,44,160323,10,7688,0,40,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,18,103497,10,0,0,30,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,39,215419,13,0,0,36,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
16277,64,321403,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16278,38,374983,13,0,0,50,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
16279,44,83891,13,5455,0,40,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [298]:
rf_final = RandomForestClassifier()
rf_final.fit(X_selected, y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [299]:
good_features_submit = [val for val in good_features if val in X_submit.columns]

In [300]:
X_submit = X_submit[good_features_submit]

In [301]:
submit = pd.DataFrame({'wage':rf_final.predict_proba(X_submit)[:,1]})

In [302]:
submit

Unnamed: 0,wage
0,0.0
1,0.1
2,0.5
3,0.9
4,0.0
...,...
16276,0.4
16277,0.1
16278,1.0
16279,0.6


In [303]:
submit.to_csv('../data/team5_1.csv', index=False)