In [52]:
import pandas as pd
import numpy as np
import os
import xgboost2
import time
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

pd.options.mode.chained_assignment = None  # default='warn'

In [53]:
# import the adult censur dataset
dataset_dir = r'F:\Google Drive\umich\eecs545_machine_learning\final_project'
df = pd.read_csv(os.path.join(dataset_dir, 'adult.csv'))
df

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K


In [54]:
# preprocess data

# remove incomplete samples
df[df=='?']=np.nan  # replace ? with nan
n_rows = len(df)
df=df.dropna(axis=0) # drop any rows with nan values
n_dropped = n_rows - len(df)
print('dropped {} rows out of {}'.format(n_dropped, n_rows))

# set classification column to binary
df['income'].replace({'<=50K':0,'>50K':1},inplace=True)

df = df.drop('education.num',axis=1) # remove education num column, since redundant
df = df.drop('fnlwgt', axis=1) # shown to have negative correlation 
df.shape

# encode categorical features with label encoding, not one-hot encoding
cat_features = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']
for feature in cat_features:
    labelencoder = LabelEncoder()
    df[feature] = labelencoder.fit_transform(df[feature])
df

dropped 2399 rows out of 32561


Unnamed: 0,age,workclass,education,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
1,82,2,11,6,3,1,4,0,0,4356,18,38,0
3,54,2,5,0,6,4,4,0,0,3900,40,38,0
4,41,2,15,5,9,3,4,0,0,3900,40,38,0
5,34,2,11,0,7,4,4,0,0,3770,45,38,0
6,38,2,0,5,0,4,4,1,0,3770,40,38,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,2,15,4,10,1,4,1,0,0,40,38,0
32557,27,2,7,2,12,5,4,0,0,0,38,38,0
32558,40,2,11,2,6,0,4,1,0,0,40,38,1
32559,58,2,11,6,0,4,4,0,0,0,40,38,0


In [55]:
# randomly generate train and test sets


x_df = df.drop(['income'], axis=1)
y_df = df['income']

data_train, data_test, labels_train, labels_test = train_test_split(x_df.values, y_df.values, test_size=0.2, random_state=1)

In [58]:
# predict with xgboost

model = xgboost2.XGBoostClassifier()
start = time.time()
model.fit(data_train, labels_train, min_num_leaf=5, boosting_rounds=5, max_depth=10, lr=0.8, reg=1.2)
end = time.time()
print('training time: {}'.format(end-start))

boosting round 0
boosting round 1
boosting round 2
boosting round 3
boosting round 4
training time: 2672.860722541809


In [59]:
3672/60

61.2

In [60]:
pred = model.predict(data_test)
acc = np.sum(pred == labels_test)/len(pred)
print('accuracy = {}'.format(acc))

f1_score = sklearn.metrics.f1_score(labels_test, pred)
print('f1_score: {}'.format(f1_score))

accuracy = 0.819327034642798
f1_score: 0.6978935698447895
