In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

In [2]:
train = pd.read_csv('./data/cheap_train_sample.csv')
test = pd.read_csv('./data/test_data.csv')

In [3]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, f1_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
import xgboost as xgb



In [4]:
import warnings
warnings.filterwarnings('ignore')

## EDA

In [5]:
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,wage
0,56,Private,346033,9th,5,Divorced,Adm-clerical,Not-in-family,Male,0,0,40,United-States,<=50K
1,28,Private,96226,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,Male,0,0,45,United-States,<=50K
2,33,Private,251120,Bachelors,13,Married-civ-spouse,Sales,Husband,Male,7688,0,50,United-States,>50K
3,26,Private,178140,Bachelors,13,Married-civ-spouse,Other-service,Husband,Male,0,0,45,United-States,>50K
4,40,Federal-gov,56795,Masters,14,Never-married,Exec-managerial,Not-in-family,Female,14084,0,55,United-States,>50K


In [6]:
test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Male,0,0,40,United-States
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,Male,0,0,50,United-States
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,Male,0,0,40,United-States
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Male,7688,0,40,United-States
4,18,?,103497,Some-college,10,Never-married,?,Own-child,Female,0,0,30,United-States


In [7]:
train.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
wage              0
dtype: int64

In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6513 entries, 0 to 6512
Data columns (total 14 columns):
age               6513 non-null int64
workclass         6513 non-null object
fnlwgt            6513 non-null int64
education         6513 non-null object
education-num     6513 non-null int64
marital-status    6513 non-null object
occupation        6513 non-null object
relationship      6513 non-null object
sex               6513 non-null object
capital-gain      6513 non-null int64
capital-loss      6513 non-null int64
hours-per-week    6513 non-null int64
native-country    6513 non-null object
wage              6513 non-null object
dtypes: int64(6), object(8)
memory usage: 712.4+ KB


In [9]:
train['wage'].value_counts()

 <=50K    4945
 >50K     1568
Name: wage, dtype: int64

In [10]:
train['wage'] = pd.get_dummies(train['wage'], drop_first = True)

In [11]:
train

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,wage
0,56,Private,346033,9th,5,Divorced,Adm-clerical,Not-in-family,Male,0,0,40,United-States,0
1,28,Private,96226,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,Male,0,0,45,United-States,0
2,33,Private,251120,Bachelors,13,Married-civ-spouse,Sales,Husband,Male,7688,0,50,United-States,1
3,26,Private,178140,Bachelors,13,Married-civ-spouse,Other-service,Husband,Male,0,0,45,United-States,1
4,40,Federal-gov,56795,Masters,14,Never-married,Exec-managerial,Not-in-family,Female,14084,0,55,United-States,1
5,66,Private,284021,HS-grad,9,Widowed,Sales,Not-in-family,Female,0,0,40,United-States,0
6,30,Private,318749,Assoc-voc,11,Married-civ-spouse,Tech-support,Wife,Female,0,0,35,Germany,0
7,43,Private,456236,Masters,14,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,50,United-States,1
8,56,Private,244605,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,Male,0,0,40,United-States,0
9,29,Private,137063,HS-grad,9,Never-married,Sales,Unmarried,Male,0,0,38,United-States,0


In [12]:
train.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'sex', 'capital-gain',
       'capital-loss', 'hours-per-week', 'native-country', 'wage'],
      dtype='object')

In [13]:
train['wage'] = train['wage'].astype(int)

In [14]:
train.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
wage               int64
dtype: object

## Education Values

In [15]:
train['education'].value_counts()

 HS-grad         2103
 Some-college    1451
 Bachelors       1113
 Masters          334
 Assoc-voc        250
 11th             225
 Assoc-acdm       222
 10th             175
 7th-8th          142
 9th              106
 Prof-school      103
 12th              89
 Doctorate         81
 5th-6th           79
 1st-4th           27
 Preschool         13
Name: education, dtype: int64

In [16]:
train[train['education'] == ' Masters']['wage'].value_counts(normalize = True)

1    0.54491
0    0.45509
Name: wage, dtype: float64

In [17]:
train[train['education'] == ' Masters']['wage'].mean()

0.5449101796407185

In [18]:
train[train['education'] == ' Doctorate']['wage'].value_counts(normalize = True)

1    0.777778
0    0.222222
Name: wage, dtype: float64

In [19]:
train[train['education'] == ' HS-grad']['wage'].value_counts(normalize = True)

0    0.840704
1    0.159296
Name: wage, dtype: float64

In [20]:
train[train['education'] == ' Some-college']['wage'].value_counts(normalize = True)

0    0.796003
1    0.203997
Name: wage, dtype: float64

In [21]:
train[train['education'] == ' Some-college']['wage'].mean()

0.2039972432804962

In [22]:
train[train['education'] == ' Assoc-voc']['wage'].value_counts(normalize = True)

0    0.732
1    0.268
Name: wage, dtype: float64

In [23]:
train[train['education'] == ' 11th']['wage'].value_counts(normalize = True)

0    0.951111
1    0.048889
Name: wage, dtype: float64

In [24]:
train[train['education'] == ' 10th']['wage'].value_counts(normalize = True)

0    0.914286
1    0.085714
Name: wage, dtype: float64

In [25]:
train[train['education'] == ' 7th-8th']['wage'].value_counts(normalize = True)

0    0.929577
1    0.070423
Name: wage, dtype: float64

In [26]:
train[train['education'] == ' 5th-6th']['wage'].value_counts(normalize = True)

0    0.962025
1    0.037975
Name: wage, dtype: float64

In [27]:
train[train['education'] == ' 1st-4th']['wage'].value_counts(normalize = True)

0    0.925926
1    0.074074
Name: wage, dtype: float64

In [28]:
train[train['education'] == ' Preschool']['wage'].value_counts(normalize = True)

0    1.0
Name: wage, dtype: float64

In [29]:
train[train['education'] == ' 9th']['wage'].value_counts(normalize = True)

0    0.971698
1    0.028302
Name: wage, dtype: float64

In [30]:
train[train['education'] == ' Assoc-acdm']['wage'].value_counts(normalize = True)

0    0.765766
1    0.234234
Name: wage, dtype: float64

In [31]:
train[train['education'] == ' Prof-school']['wage'].value_counts(normalize = True)

1    0.728155
0    0.271845
Name: wage, dtype: float64

In [32]:
train[train['education'] == ' Bachelors']['wage'].value_counts(normalize = True)

0    0.598383
1    0.401617
Name: wage, dtype: float64

In [33]:
train[train['education'] == ' 12th']['wage'].value_counts(normalize = True)

0    0.921348
1    0.078652
Name: wage, dtype: float64

In [34]:
# Target-encode `education`: replace each category with the percentage of its
# rows whose `wage` flag is 1, rounded to two decimals (e.g. ' HS-grad' -> 15.93).
#
# The rates are derived from the frame rather than hand-copied from the
# per-category value_counts cells above, so they cannot drift from the data.
# It also makes the cell safe to re-run: a literal {label: rate} dict maps the
# already-encoded floats to NaN on a second run, whereas grouping by the encoded
# values maps each one back to itself (the category rates are all distinct).
#
# NOTE(review): the rates are computed over the whole `train` frame, i.e. before
# any train/test split. Using this column as a model feature would leak
# hold-out labels into training — confirm before adding it to X.
education_wage_pct = (
    train.groupby('education')['wage']
    .mean()
    .mul(100)
    .round(2)
)
train['education'] = train['education'].map(education_wage_pct)

In [35]:
train.dtypes

age                 int64
workclass          object
fnlwgt              int64
education         float64
education-num       int64
marital-status     object
occupation         object
relationship       object
sex                object
capital-gain        int64
capital-loss        int64
hours-per-week      int64
native-country     object
wage                int64
dtype: object

In [36]:
train

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,wage
0,56,Private,346033,2.83,5,Divorced,Adm-clerical,Not-in-family,Male,0,0,40,United-States,0
1,28,Private,96226,15.93,9,Married-civ-spouse,Craft-repair,Husband,Male,0,0,45,United-States,0
2,33,Private,251120,40.16,13,Married-civ-spouse,Sales,Husband,Male,7688,0,50,United-States,1
3,26,Private,178140,40.16,13,Married-civ-spouse,Other-service,Husband,Male,0,0,45,United-States,1
4,40,Federal-gov,56795,54.49,14,Never-married,Exec-managerial,Not-in-family,Female,14084,0,55,United-States,1
5,66,Private,284021,15.93,9,Widowed,Sales,Not-in-family,Female,0,0,40,United-States,0
6,30,Private,318749,26.80,11,Married-civ-spouse,Tech-support,Wife,Female,0,0,35,Germany,0
7,43,Private,456236,54.49,14,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,50,United-States,1
8,56,Private,244605,15.93,9,Married-civ-spouse,Craft-repair,Husband,Male,0,0,40,United-States,0
9,29,Private,137063,15.93,9,Never-married,Sales,Unmarried,Male,0,0,38,United-States,0


In [37]:
train.corr()[['wage']].sort_values(by = 'wage', ascending = False)

Unnamed: 0,wage
wage,1.0
education,0.357463
education-num,0.323418
hours-per-week,0.255101
age,0.241763
capital-gain,0.221365
capital-loss,0.136276
fnlwgt,-0.022836


In [38]:
train_dummies = pd.get_dummies(data = train, drop_first = True)

In [39]:
train_dummies.corr()[['wage']].sort_values(by = 'wage', ascending = False)

Unnamed: 0,wage
wage,1.000000
marital-status_ Married-civ-spouse,0.449148
education,0.357463
education-num,0.323418
hours-per-week,0.255101
age,0.241763
sex_ Male,0.229283
occupation_ Exec-managerial,0.227419
capital-gain,0.221365
occupation_ Prof-specialty,0.181904


## Basic Modeling

In [40]:
# One default-configured estimator per model family, compared below with
# fit_and_score / find_f1score.
lr = LogisticRegressionCV()
mnb = MultinomialNB()
knn = KNeighborsClassifier()
tree = DecisionTreeClassifier()
bag = BaggingClassifier()
rf = RandomForestClassifier()
ada = AdaBoostClassifier()
svc = SVC()
# The XGBoost estimator gets its own name. The previous `xgb = xgb.XGBClassifier()`
# rebound the imported `xgb` module to the instance, so running this cell a
# second time raised AttributeError (an XGBClassifier has no `.XGBClassifier`).
xgb_clf = xgb.XGBClassifier()
gb = GradientBoostingClassifier()


# Baseline feature set: a subset of the integer columns of `train`; the target
# is the 0/1 `wage` flag.
X = train[['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss']]
y = train['wage']

# Stratify on the target and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)

In [41]:
def fit_and_score(model):
    """Fit ``model`` on the current training split and print both scores.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    at call time, so the result depends on whichever split was assigned last.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``score(X, y)``.

    Returns
    -------
    None. The two scores are printed, not returned.
    """
    model.fit(X_train, y_train)

    train_score = model.score(X_train, y_train)
    print(f"Training score: {train_score}")

    test_score = model.score(X_test, y_test)
    print(f"Testing score: {test_score}")

In [42]:
fit_and_score(lr)

Training score: 0.7936117936117936
Testing score: 0.7900552486187845


In [43]:
fit_and_score(mnb)

Training score: 0.7843980343980343
Testing score: 0.7833026396562308


In [44]:
fit_and_score(knn)

Training score: 0.8196150696150696
Testing score: 0.7575199508901166


In [45]:
fit_and_score(tree)

Training score: 0.9991809991809992
Testing score: 0.7513812154696132


In [46]:
fit_and_score(bag)

Training score: 0.976044226044226
Testing score: 0.7931246163290362


In [47]:
fit_and_score(rf)

Training score: 0.9795249795249795
Testing score: 0.7949662369551872


In [48]:
fit_and_score(ada)

Training score: 0.8286240786240786
Testing score: 0.8213627992633518


In [49]:
fit_and_score(svc)

Training score: 0.995085995085995
Testing score: 0.7593615715162676


## Loading the Dummied and Cleaned DataFrames

In [50]:
train_clean = pd.read_csv('train_df_clean')
test_clean = pd.read_csv('test_df_clean')

In [51]:
train_clean.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,education_ 11th,education_ 12th,education_ 1st-4th,education_ 5th-6th,education_ 7th-8th,education_ 9th,education_ Assoc-acdm,education_ Assoc-voc,education_ Bachelors,education_ Doctorate,education_ HS-grad,education_ Masters,education_ Preschool,education_ Prof-school,education_ Some-college,marital-status_ Married-AF-spouse,marital-status_ Married-civ-spouse,marital-status_ Married-spouse-absent,marital-status_ Never-married,marital-status_ Separated,marital-status_ Widowed,occupation_ Adm-clerical,occupation_ Armed-Forces,occupation_ Craft-repair,occupation_ Exec-managerial,occupation_ Farming-fishing,occupation_ Handlers-cleaners,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,relationship_ Not-in-family,relationship_ Other-relative,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife,sex_ Male,native-country_ Cambodia,native-country_ Canada,native-country_ China,native-country_ Columbia,native-country_ Cuba,native-country_ Dominican-Republic,native-country_ Ecuador,native-country_ El-Salvador,native-country_ England,native-country_ France,native-country_ Germany,native-country_ Greece,native-country_ Guatemala,native-country_ Haiti,native-country_ Honduras,native-country_ Hong,native-country_ Hungary,native-country_ India,native-country_ Iran,native-country_ Ireland,native-country_ Italy,native-country_ Jamaica,native-country_ Japan,native-country_ Laos,native-country_ Mexico,native-country_ Nicaragua,native-country_ Outlying-US(Guam-USVI-etc),native-country_ Peru,native-country_ Philippines,native-country_ Poland,native-country_ 
Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia,wage
0,56,346033,5,0,0,40,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,28,96226,9,0,0,45,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,33,251120,13,7688,0,50,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
3,26,178140,13,0,0,45,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
4,40,56795,14,14084,0,55,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1


In [52]:
train_clean.corr()[['wage']].sort_values(by = 'wage', ascending = False)

Unnamed: 0,wage
wage,1.000000
marital-status_ Married-civ-spouse,0.449148
education-num,0.323418
hours-per-week,0.255101
age,0.241763
sex_ Male,0.229283
occupation_ Exec-managerial,0.227419
capital-gain,0.221365
occupation_ Prof-specialty,0.181904
education_ Bachelors,0.170823


In [53]:
train_clean.shape

(6513, 96)

In [54]:
test_clean.shape

(16281, 95)

In [55]:
# Features/target taken from `train_dummies` (the dummy-encoded copy of `train`).
# NOTE(review): the following cell rebinds X, y and all four split variables
# from `train_clean` before any model in the visible notebook is fit on this
# split — confirm whether this cell is still needed.
# NOTE(review): `train_dummies` was built after `education` was replaced by
# per-category wage rates computed on the full frame, so this X contains a
# target-derived column — confirm before modelling on it.
y = train_dummies['wage']
X = train_dummies.drop(columns = 'wage')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify = y,
    random_state = 42,
)

In [91]:
# Features/target taken from the pre-cleaned, dummy-encoded training file.
# This rebinds the notebook-level split that fit_and_score / find_f1score read.
y = train_clean['wage']
X = train_clean.drop(columns = 'wage')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify = y,
    random_state = 42,
)

In [92]:
fit_and_score(lr)

Training score: 0.7940212940212941
Testing score: 0.7888275015346838


In [93]:
fit_and_score(mnb)

Training score: 0.7843980343980343
Testing score: 0.7833026396562308


In [94]:
fit_and_score(knn)

Training score: 0.8187960687960688
Testing score: 0.7569060773480663


In [95]:
fit_and_score(tree)

Training score: 1.0
Testing score: 0.805402087170043


In [96]:
fit_and_score(bag)

Training score: 0.9881244881244882
Testing score: 0.8453038674033149


In [97]:
fit_and_score(ada)

Training score: 0.8617936117936118
Testing score: 0.858195211786372


In [98]:
fit_and_score(rf)

Training score: 0.9868959868959869
Testing score: 0.8336402701043585


In [99]:
fit_and_score(svc)

Training score: 0.9918099918099919
Testing score: 0.7550644567219152


In [100]:
fit_and_score(gb)

Training score: 0.8755118755118755
Testing score: 0.8557397176181707


## F1-Score

In [101]:
def find_f1score(model):
    """Fit ``model`` on the current training split and print train/test F1.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    at call time, and scores with ``f1_score`` using its default settings.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    None. The two F1 values are printed, not returned.
    """
    model.fit(X_train, y_train)

    # Predict on both splits first, then score them, then report.
    train_hat, test_hat = model.predict(X_train), model.predict(X_test)
    train_f1, test_f1 = f1_score(y_train, train_hat), f1_score(y_test, test_hat)

    print(f"Train F1-Score: {train_f1}", f"Test F1-Score: {test_f1}", sep="\n")

In [102]:
find_f1score(lr)

Train F1-Score: 0.36649874055415615
Test F1-Score: 0.3558052434456929


In [103]:
find_f1score(mnb)

Train F1-Score: 0.34717916924984504
Test F1-Score: 0.32504780114722753


In [104]:
find_f1score(knn)

Train F1-Score: 0.4910868315123635
Test F1-Score: 0.3465346534653465


In [105]:
find_f1score(tree)

Train F1-Score: 1.0
Test F1-Score: 0.5965346534653466


In [106]:
find_f1score(bag)

Train F1-Score: 0.9708315193730954
Test F1-Score: 0.6308539944903581


In [107]:
find_f1score(rf)

Train F1-Score: 0.9748482220294882
Test F1-Score: 0.6453488372093024


In [108]:
find_f1score(ada)

Train F1-Score: 0.683247301736274
Test F1-Score: 0.6760168302945301


In [109]:
find_f1score(svc)

Train F1-Score: 0.9826989619377162
Test F1-Score: 0.02444987775061125


In [110]:
find_f1score(gb)

Train F1-Score: 0.7074109720885466
Test F1-Score: 0.6647646219686162


In [111]:
ada.fit(X_train, y_train)
ada_pred_train = ada.predict(X_train)
ada_pred_test = ada.predict(X_test)

In [112]:
cm_train = confusion_matrix(y_train, ada_pred_train)

In [113]:
cm_train_df = pd.DataFrame(cm_train, columns=['pred <=50k', 'pred >50k'], index=['actual <=50k', 'actual >50k'])
cm_train_df

Unnamed: 0,pred <=50k,pred >50k
actual <=50k,3481,227
actual >50k,448,728


In [114]:
cm_test = confusion_matrix(y_test, ada_pred_test)

In [115]:
cm_test_df = pd.DataFrame(cm_test, columns=['pred <=50k', 'pred >50k'], index=['actual <=50k', 'actual >50k'])
cm_test_df

Unnamed: 0,pred <=50k,pred >50k
actual <=50k,1157,80
actual >50k,151,241


In [173]:
# Per-feature importances of the fitted decision tree. The cell source was the
# incomplete expression `tree.`, a SyntaxError; this is the attribute the
# recorded array output corresponds to.
tree.feature_importances_

array([1.24826793e-01, 1.54816650e-01, 9.94364985e-02, 1.08083892e-01,
       3.72495009e-02, 6.91237421e-02, 5.12658075e-03, 9.08053681e-03,
       0.00000000e+00, 1.12395708e-02, 5.80826140e-03, 1.23271280e-02,
       6.99340966e-03, 0.00000000e+00, 2.01458270e-03, 2.17783086e-04,
       6.09792641e-04, 1.86671216e-04, 6.53760554e-04, 5.60013649e-04,
       3.71906501e-03, 3.97688966e-03, 5.10040703e-03, 0.00000000e+00,
       4.60195832e-03, 3.67831188e-03, 0.00000000e+00, 2.81408406e-03,
       7.64198989e-03, 1.56373534e-03, 1.93917930e-01, 0.00000000e+00,
       2.97461796e-03, 1.94257247e-03, 0.00000000e+00, 6.31414025e-03,
       0.00000000e+00, 8.72863291e-03, 1.49484069e-02, 4.59265255e-03,
       1.51375548e-03, 6.04990517e-03, 4.67572455e-03, 0.00000000e+00,
       8.84222467e-03, 3.26372610e-03, 1.01003136e-02, 5.47491817e-03,
       5.62997928e-03, 2.28575727e-03, 0.00000000e+00, 8.64021059e-04,
       0.00000000e+00, 1.41799705e-03, 1.25221182e-02, 0.00000000e+00,
      

In [184]:
df = pd.DataFrame({'variable':X_train.columns,
                   'importance':tree.feature_importances_,
                  })

In [185]:
df[['variable', 'importance']].sort_values(by = 'importance', ascending = False)

Unnamed: 0,variable,importance
30,marital-status_ Married-civ-spouse,0.193918
1,fnlwgt,0.154817
0,age,0.124827
3,capital-gain,0.108084
2,education-num,0.099436
5,hours-per-week,0.069124
4,capital-loss,0.037250
38,occupation_ Exec-managerial,0.014948
54,sex_ Male,0.012522
11,workclass_ Self-emp-not-inc,0.012327


In [None]:
DecisionTreeClassifier()

In [116]:
ada = AdaBoostClassifier()

In [117]:
# Hyperparameter grid for AdaBoost: 6 ensemble sizes x 5 learning rates, with
# the seed pinned through a single-value entry (30 candidates in total).
ada_params = dict(
    n_estimators = [25, 50, 75, 100, 125, 150],
    learning_rate = [.0001, .001, .01, .1, 1],
    random_state = [42],
)

# 3-fold cross-validated search over the grid, using all available cores.
gs = GridSearchCV(
    estimator = ada,
    param_grid = ada_params,
    cv = 3,
    n_jobs = -1,
    verbose = 1,
)

In [118]:
gs.fit(X_train, y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:   21.8s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=None,
                                          learning_rate=1.0, n_estimators=50,
                                          random_state=None),
             iid='warn', n_jobs=-1,
             param_grid={'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1],
                         'n_estimators': [25, 50, 75, 100, 125, 150],
                         'random_state': [42]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [119]:
gs.best_params_

{'learning_rate': 1, 'n_estimators': 100, 'random_state': 42}

In [120]:
gs.score(X_train, y_train)

0.8628173628173628

In [121]:
gs.score(X_test, y_test)

0.8606507059545734

In [123]:
test_clean

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,education_ 11th,education_ 12th,education_ 1st-4th,education_ 5th-6th,education_ 7th-8th,education_ 9th,education_ Assoc-acdm,education_ Assoc-voc,education_ Bachelors,education_ Doctorate,education_ HS-grad,education_ Masters,education_ Preschool,education_ Prof-school,education_ Some-college,marital-status_ Married-AF-spouse,marital-status_ Married-civ-spouse,marital-status_ Married-spouse-absent,marital-status_ Never-married,marital-status_ Separated,marital-status_ Widowed,occupation_ Adm-clerical,occupation_ Armed-Forces,occupation_ Craft-repair,occupation_ Exec-managerial,occupation_ Farming-fishing,occupation_ Handlers-cleaners,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,relationship_ Not-in-family,relationship_ Other-relative,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife,sex_ Male,native-country_ Cambodia,native-country_ Canada,native-country_ China,native-country_ Columbia,native-country_ Cuba,native-country_ Dominican-Republic,native-country_ Ecuador,native-country_ El-Salvador,native-country_ England,native-country_ France,native-country_ Germany,native-country_ Greece,native-country_ Guatemala,native-country_ Haiti,native-country_ Honduras,native-country_ Hong,native-country_ Hungary,native-country_ India,native-country_ Iran,native-country_ Ireland,native-country_ Italy,native-country_ Jamaica,native-country_ Japan,native-country_ Laos,native-country_ Mexico,native-country_ Nicaragua,native-country_ Outlying-US(Guam-USVI-etc),native-country_ Peru,native-country_ Philippines,native-country_ Poland,native-country_ 
Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,25,226802,7,0,0,40,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,38,89814,9,0,0,50,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,28,336951,12,0,0,40,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,44,160323,10,7688,0,40,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,18,103497,10,0,0,30,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
5,34,198693,6,0,0,30,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
6,29,227026,9,0,0,40,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
7,63,104626,15,3103,0,32,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
8,24,369667,10,0,0,40,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
9,55,104996,4,0,0,10,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [138]:
test_final = pd.DataFrame()

In [139]:
test_final

In [163]:
# Class-probability predictions for `test_clean` from the fitted grid search.
probs = gs.predict_proba(test_clean)

# Keep only probability column 1 and label it 'wage'.
# NOTE(review): presumably column 1 is the probability of wage == 1 — confirm
# against gs.classes_.
probs_df = (
    pd.DataFrame(probs)
    .drop(columns = 0)
    .rename(columns = {1: 'wage'})
)

In [172]:
probs_df[probs_df['wage'] < 0.40]

Unnamed: 0,wage
4,0.393222
38,0.398623
50,0.392625
75,0.395839
109,0.393572
110,0.397543
121,0.394658
141,0.392933
168,0.393640
230,0.395370


In [None]:
# NOTE(review): this cell held only the incomplete expression `rf.`, which is a
# SyntaxError under Restart & Run All. It is commented out until the intended
# attribute or method is known.
# rf.