In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier

In [42]:
# Column names applied to adult.data / adult.test when they are loaded with
# header=None. The order here is the order the names are assigned to the
# file's columns, so it must not be rearranged.
col_labels = [
    # predictor columns
    'age', 'workclass', 'fnlwgt',
    'education', 'education_num', 'marital_status',
    'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week',
    'native_country',
    # column later used as the prediction target
    'wage_class',
]

In [43]:
# Neither file is parsed with a header row, so the column names come from
# col_labels for both reads.
shared_read_args = {'header': None, 'names': col_labels}

train_set = pd.read_csv('adult.data', **shared_read_args)
# The first line of adult.test is skipped instead of being parsed as data.
test_set = pd.read_csv('adult.test', skiprows=1, **shared_read_args)

In [44]:
# Peek at the first five rows of each frame. A notebook cell renders only its
# final expression, so the test-set preview is the one that gets displayed.
train_set.head(n=5)
test_set.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [54]:
# Build one indicator frame per categorical training column.
# drop_first=True leaves out the first level of each column.
train_categoricals = ['workclass', 'education', 'marital_status',
                      'occupation', 'relationship', 'race', 'sex']
(Workclass, Education, Marital_Status, Occupation,
 Relationship, Race, Sex) = [
    pd.get_dummies(train_set[column], drop_first=True)
    for column in train_categoricals
]


In [55]:
# Remove the raw categorical columns from the training frame.
# Note that native_country is removed here as well.
raw_categoricals = ['workclass', 'education', 'marital_status', 'occupation',
                    'relationship', 'race', 'sex', 'native_country']
train_set = train_set.drop(columns=raw_categoricals)

In [56]:
# Attach the indicator frames to the right of the remaining training columns.
train_parts = [train_set, Workclass, Education, Marital_Status,
               Occupation, Relationship, Race, Sex]
train_set = pd.concat(train_parts, axis='columns')

In [57]:
# One-hot encode the test set with the same recipe used for train_set, then
# force its columns to match train_set exactly.
#
# Encoding the two frames independently only produces identical columns when
# every category level occurs in both files. The final reindex makes the
# alignment explicit: levels missing from the test file become all-zero
# columns, levels missing from the training frame are dropped, and the column
# order follows train_set.
encoded_cols = ['workclass', 'education', 'marital_status', 'occupation',
                'relationship', 'race', 'sex']

# Guard against running this cell before train_set has been encoded;
# otherwise the reindex below would align to the raw, un-encoded layout.
leftover = [col for col in encoded_cols if col in train_set.columns]
if leftover:
    raise RuntimeError(
        'train_set still contains raw categorical columns %s; '
        'encode train_set before encoding test_set.' % leftover
    )

(Workclass, Education, Marital_Status, Occupation,
 Relationship, Race, Sex) = [
    pd.get_dummies(test_set[col], drop_first=True) for col in encoded_cols
]

# native_country is dropped without being encoded, as it is for train_set.
test_set = test_set.drop(encoded_cols + ['native_country'], axis=1)

test_set = pd.concat([test_set, Workclass, Education, Marital_Status,
                      Occupation, Relationship, Race, Sex], axis=1)

# Align to the training layout; the contents are unchanged when the columns
# already agree.
test_set = test_set.reindex(columns=train_set.columns, fill_value=0)

In [7]:
# In a top-to-bottom run the 'workclass' column has already been dropped from
# test_set by this point, so looking it up there raises KeyError. Read just
# that column from the raw file to list its distinct values.
pd.read_csv('adult.test', skiprows=1, header=None, names=col_labels,
            usecols=['workclass'])['workclass'].unique()

array([' Private', ' Local-gov', ' ?', ' Self-emp-not-inc',
       ' Federal-gov', ' State-gov', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [8]:
# In a top-to-bottom run the 'workclass' column has already been dropped from
# train_set by this point, so looking it up there raises KeyError. Read just
# that column from the raw file to list its distinct values.
pd.read_csv('adult.data', header=None, names=col_labels,
            usecols=['workclass'])['workclass'].unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [60]:
# Fit a label encoder on the training target and replace the 'wage_class'
# column with the resulting integer codes (the column is overwritten, not
# copied to a new one).
labelencoder = LabelEncoder()
train_wage_codes = labelencoder.fit_transform(train_set['wage_class'])
train_set['wage_class'] = train_wage_codes
train_set.head(n=10)

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,wage_class,Federal-gov,Local-gov,Never-worked,...,Not-in-family,Other-relative,Own-child,Unmarried,Wife,Asian-Pac-Islander,Black,Other,White,Male
0,39,77516,13,2174,0,40,0,0,0,0,...,1,0,0,0,0,0,0,0,1,1
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
2,38,215646,9,0,0,40,0,0,0,0,...,1,0,0,0,0,0,0,0,1,1
3,53,234721,7,0,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
4,28,338409,13,0,0,40,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
5,37,284582,14,0,0,40,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
6,49,160187,5,0,0,16,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
7,52,209642,9,0,0,45,1,0,0,0,...,0,0,0,0,0,0,0,0,1,1
8,31,45781,14,14084,0,50,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
9,42,159449,13,5178,0,40,1,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [61]:
# Encode the test target with the encoder that was fitted on the training
# labels, so both sets share a single label -> integer mapping.
#
# A separately fitted encoder would agree with the training mapping only by
# coincidence of sort order. The test file's labels end in '.' ('<=50K.' /
# '>50K.'), so that suffix is stripped before encoding.
# NOTE(review): assumes the training labels carry no trailing '.' - confirm.
# A test label the encoder did not see while fitting raises an error here
# rather than silently receiving a code.
test_wage_labels = test_set['wage_class'].str.rstrip('.')
test_set['wage_class'] = labelencoder.transform(test_wage_labels)
test_set.head(10)

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,wage_class,Federal-gov,Local-gov,Never-worked,...,Not-in-family,Other-relative,Own-child,Unmarried,Wife,Asian-Pac-Islander,Black,Other,White,Male
0,25,226802,7,0,0,40,0,0,0,0,...,0,0,1,0,0,0,1,0,0,1
1,38,89814,9,0,0,50,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
2,28,336951,12,0,0,40,1,0,1,0,...,0,0,0,0,0,0,0,0,1,1
3,44,160323,10,7688,0,40,1,0,0,0,...,0,0,0,0,0,0,1,0,0,1
4,18,103497,10,0,0,30,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
5,34,198693,6,0,0,30,0,0,0,0,...,1,0,0,0,0,0,0,0,1,1
6,29,227026,9,0,0,40,0,0,0,0,...,0,0,0,1,0,0,1,0,0,1
7,63,104626,15,3103,0,32,1,0,0,0,...,0,0,0,0,0,0,0,0,1,1
8,24,369667,10,0,0,40,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
9,55,104996,4,0,0,10,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [62]:
# Separate the predictors from the target column for both data sets.
target = 'wage_class'

X_train, y_train = train_set.drop(columns=target), train_set[target]
X_test, y_test = test_set.drop(columns=target), test_set[target]


In [64]:
# Show the first five rows of the training predictors.
X_train.head(n=5)

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,Federal-gov,Local-gov,Never-worked,Private,...,Not-in-family,Other-relative,Own-child,Unmarried,Wife,Asian-Pac-Islander,Black,Other,White,Male
0,39,77516,13,2174,0,40,0,0,0,0,...,1,0,0,0,0,0,0,0,1,1
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
2,38,215646,9,0,0,40,0,0,0,1,...,1,0,0,0,0,0,0,0,1,1
3,53,234721,7,0,0,40,0,0,0,1,...,0,0,0,0,0,0,1,0,0,1
4,28,338409,13,0,0,40,0,0,0,1,...,0,0,0,0,1,0,1,0,0,0


In [65]:
# Create an XGBoost classifier with its default settings and train it on the
# training predictors and target. The fit call is the cell's last expression,
# so its return value is what the notebook displays.
classifier = XGBClassifier()
classifier.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [52]:
# In a top-to-bottom run 'native_country' has already been dropped from
# train_set by this point, so looking it up there raises KeyError. Count its
# distinct values from the raw file instead. Only the count is computed: a
# preceding .unique() call would not be displayed, because a cell renders
# just its last expression.
pd.read_csv('adult.data', header=None, names=col_labels,
            usecols=['native_country'])['native_country'].nunique()

42

In [53]:
# In a top-to-bottom run 'native_country' has already been dropped from
# test_set by this point, so looking it up there raises KeyError. Count its
# distinct values from the raw file instead.
pd.read_csv('adult.test', skiprows=1, header=None, names=col_labels,
            usecols=['native_country'])['native_country'].nunique()

41

In [66]:
# Predict a class for every row of the test predictors with the fitted model.
y_pred = classifier.predict(X_test)

In [68]:
from sklearn.metrics import confusion_matrix

# Compare the true test labels with the model's predictions and display the
# resulting matrix.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[11681,   754],
       [ 1325,  2521]], dtype=int64)

In [69]:
from sklearn.model_selection import cross_val_score

# Score the classifier with 10-fold cross-validation on the training data and
# display the mean of the per-fold scores.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
accuracies.mean()

0.870519978138185