In [166]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [167]:
df = pd.read_csv('adult_data.csv',names = ['Age','Workclass','Final Weight','Education',
                                      'Years of study','Marital-status','Occupation',
                                      'Relationship','Race','Sex','Capital-gain',
                                      'Capital-loss','Hours-per-week','Native-country','class'],index_col=False )

In [168]:
df.head()

Unnamed: 0,Age,Workclass,Final Weight,Education,Years of study,Marital-status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,class
0,39,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


In [169]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38819 entries, 0 to 38818
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             38819 non-null  int64  
 1   Workclass       38819 non-null  object 
 2   Final Weight    38818 non-null  float64
 3   Education       38818 non-null  object 
 4   Years of study  38818 non-null  float64
 5   Marital-status  38818 non-null  object 
 6   Occupation      38818 non-null  object 
 7   Relationship    38818 non-null  object 
 8   Race            38818 non-null  object 
 9   Sex             38818 non-null  object 
 10  Capital-gain    38818 non-null  float64
 11  Capital-loss    38818 non-null  float64
 12  Hours-per-week  38818 non-null  float64
 13  Native-country  38818 non-null  object 
 14  class           38818 non-null  object 
dtypes: float64(5), int64(1), object(9)
memory usage: 3.1+ MB


In [170]:
features_int = ['Age','Final Weight','Years of study','Capital-gain','Capital-loss','Hours-per-week','class']
features_cat = [x for x in df.columns if x not in features_int]

In [171]:
df[features_cat].astype(str).fillna('None')

Unnamed: 0,Workclass,Education,Marital-status,Occupation,Relationship,Race,Sex,Native-country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba
...,...,...,...,...,...,...,...,...
38814,Private,Some-college,Married-civ-spouse,Transport-moving,Husband,White,Male,United-States
38815,Private,Some-college,Separated,Prof-specialty,Unmarried,White,Female,United-States
38816,Self-emp-not-inc,Some-college,Divorced,Exec-managerial,Unmarried,White,Male,United-States
38817,State-gov,Some-college,Never-married,Tech-support,Own-child,White,Male,United-States


In [172]:
df.drop(38818, axis = 0,inplace = True)

In [173]:
def counts(col):
    print(df[col].value_counts())
    
for i in features_cat:
    print('='*50)
    print(i)
    print('='*50)
    counts(i)
    

Workclass
Private             26962
Self-emp-not-inc     3035
Local-gov            2483
?                    2230
State-gov            1591
Self-emp-inc         1352
Federal-gov          1142
Without-pay            16
Never-worked            7
Name: Workclass, dtype: int64
Education
HS-grad         12484
Some-college     8696
Bachelors        6399
Masters          2076
Assoc-voc        1650
11th             1412
Assoc-acdm       1255
10th             1132
7th-8th           777
Prof-school       672
9th               609
12th              510
Doctorate         485
5th-6th           396
1st-4th           200
Preschool          65
Name: Education, dtype: int64
Marital-status
Married-civ-spouse       17764
Never-married            12802
Divorced                  5293
Separated                 1222
Widowed                   1211
Married-spouse-absent      498
Married-AF-spouse           28
Name: Marital-status, dtype: int64
Occupation
Prof-specialty       4938
Craft-repair         4856
Exec

In [174]:
for i in features_cat:
    df.loc[
        df[i].value_counts()[
        df[i]].values < 10,i] = 'rare'
    

In [175]:
df['class'].value_counts() # cross check

<=50K    29496
>50K      9322
Name: class, dtype: int64

In [176]:
dic = {'<=50K':0, '>50K':1}

In [177]:
df['class'] = df['class'].map(dic)

In [178]:
df['class'].value_counts()

0    29496
1     9322
Name: class, dtype: int64

In [180]:
X = df.drop('class',axis = 1)
Y = df['class']

In [181]:
from sklearn.model_selection import train_test_split

x_tr,x_val,y_tr,y_val = train_test_split(X,Y,test_size = 0.2,stratify = Y,random_state = 43)

In [182]:
x_tr.head()

Unnamed: 0,Age,Workclass,Final Weight,Education,Years of study,Marital-status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country
3468,38,Local-gov,116580.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,White,Female,0.0,0.0,20.0,United-States
18082,28,Self-emp-not-inc,190391.0,Bachelors,13.0,Never-married,Sales,Not-in-family,White,Male,0.0,0.0,50.0,United-States
32110,33,Private,263561.0,HS-grad,9.0,Divorced,Craft-repair,Not-in-family,White,Male,0.0,0.0,60.0,United-States
26558,44,State-gov,166597.0,Masters,14.0,Married-civ-spouse,Adm-clerical,Husband,White,Male,0.0,0.0,40.0,United-States
2278,17,Private,329783.0,10th,6.0,Never-married,Sales,Other-relative,White,Female,0.0,0.0,10.0,United-States


In [184]:
from sklearn.preprocessing import LabelEncoder

for col in features_cat:
    le = LabelEncoder()
    le.fit(x_tr[col])
    x_tr.loc[:,col] = le.transform(x_tr[col])
    x_val.loc[:,col] = le.transform(x_val[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [185]:
x_tr.head()

Unnamed: 0,Age,Workclass,Final Weight,Education,Years of study,Marital-status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country
3468,38,2,116580.0,9,13.0,2,10,5,4,0,0.0,0.0,20.0,38
18082,28,5,190391.0,9,13.0,4,12,1,4,1,0.0,0.0,50.0,38
32110,33,3,263561.0,11,9.0,0,3,1,4,1,0.0,0.0,60.0,38
26558,44,6,166597.0,12,14.0,2,1,0,4,1,0.0,0.0,40.0,38
2278,17,3,329783.0,0,6.0,4,12,2,4,0,0.0,0.0,10.0,38


In [193]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score

In [190]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

#print(random_grid)

model = RandomForestClassifier()

clf = RandomizedSearchCV(model,random_grid,scoring='roc_auc', \
                        n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs =-1)
op = clf.fit(x_tr,y_tr)
print(op.best_params_)
op.best_score_



Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  4.6min finished


{'n_estimators': 1100, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 15}


0.917544390835894

In [197]:
clf_rf = RandomForestClassifier(n_estimators= 1100, min_samples_split= 10, min_samples_leaf= 2,
                                max_features= 'sqrt', max_depth= 15,n_jobs = -1,random_state = 7)
clf_rf.fit(x_tr,y_tr)
y_tr_pred = clf_rf.predict_proba(x_tr)[:,1]
y_val_pred = clf_rf.predict_proba(x_val)[:,1]
print('train score:',roc_auc_score(y_tr,y_tr_pred))
print('test score:',roc_auc_score(y_val,y_val_pred))


train score: 0.9549811912750846
test score: 0.9218460391358114
