In [78]:
import os
import time

import matplotlib.pylab as pl
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder

# Seed NumPy's global random number generator so NumPy-based randomness is repeatable.
# (The train/test split in exam_data_load is seeded separately via random_state=2021.)
np.random.seed(1)

In [65]:
def exam_data_load(df, target, id_name="", null_name=""):
    """Split ``df`` into train/test feature frames and label frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset including the target column. The caller's frame is
        never modified.
    target : str
        Name of the label column.
    id_name : str, optional
        Name of an existing identifier column. When empty (default), the
        index is moved into a column via ``reset_index()`` and the resulting
        ``"index"`` column is renamed to ``"id"`` (this relies on the index
        being unnamed).
    null_name : str, optional
        Placeholder that marks a missing value (e.g. ``"?"``). When
        non-empty, every cell equal to it is replaced by ``NaN``.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Feature frames without the id and target columns.
    y_train, y_test : pandas.DataFrame
        Independent two-column frames ``[id_name, target]``.
    """
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = "id"

    if null_name != "":
        # mask() returns a new frame. The previous in-place assignment
        # `df[df == null_name] = np.nan` wrote into the caller's DataFrame
        # whenever id_name was supplied (no reset_index copy in that path).
        df = df.mask(df == null_name, np.nan)

    # Fixed 80/20 shuffled split; random_state pins the shuffle.
    train_part, test_part = train_test_split(df, test_size=0.2, shuffle=True, random_state=2021)
    label_cols = [id_name, target]

    # .copy() makes the label frames independent objects, so callers can assign
    # into them without pandas' SettingWithCopyWarning.
    y_train = train_part[label_cols].copy()
    X_train = train_part.drop(columns=label_cols)
    y_test = test_part[label_cols].copy()
    X_test = test_part.drop(columns=label_cols)
    return X_train, X_test, y_train, y_test

In [66]:
# Import the Adult census income dataset.
# The directory can be overridden with the ADULT_DATASET_DIR environment variable,
# so the notebook also runs on machines other than the original author's;
# the original local path is kept as the fallback.
dataset_dir = os.environ.get('ADULT_DATASET_DIR', r'F:\Google Drive\umich\eecs545_machine_learning\final_project')
df = pd.read_csv(os.path.join(dataset_dir, 'adult.csv'))
df

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K


In [67]:
# Split the raw frame into train/test features and labels, treating '?' as missing,
# then show the shape of each of the four pieces.
splits = exam_data_load(df, null_name='?', target='income')
X_train, X_test, y_train, y_test = splits
tuple(part.shape for part in splits)

((26048, 14), (6513, 14), (26048, 2), (6513, 2))

In [72]:
# Display the training labels: the id column next to the (still raw) income strings.
y_train

Unnamed: 0,id,income
21851,21851,>50K
7632,7632,<=50K
27878,27878,<=50K
14121,14121,<=50K
32345,32345,<=50K
...,...,...
2669,2669,>50K
17536,17536,<=50K
6201,6201,<=50K
27989,27989,<=50K


In [74]:
# Overview of dtypes and non-null counts
X_train.info()

# Missing values per column. The bare `X_train.isnull().sum()` expression used to sit
# in the middle of the cell, so its result was computed and silently discarded.
print(X_train.isnull().sum())

# The three columns whose non-null count is below the row count
cols_with_missing = ['workclass', 'occupation', 'native.country']

for col in cols_with_missing:
    print(X_train[col].unique())

# check the number of values in each category (train first, then test)
for col in cols_with_missing:
    print(X_train[col].value_counts())
    print(X_test[col].value_counts())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26048 entries, 21851 to 25716
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             26048 non-null  int64 
 1   workclass       24592 non-null  object
 2   fnlwgt          26048 non-null  int64 
 3   education       26048 non-null  object
 4   education.num   26048 non-null  int64 
 5   marital.status  26048 non-null  object
 6   occupation      24585 non-null  object
 7   relationship    26048 non-null  object
 8   race            26048 non-null  object
 9   sex             26048 non-null  object
 10  capital.gain    26048 non-null  int64 
 11  capital.loss    26048 non-null  int64 
 12  hours.per.week  26048 non-null  int64 
 13  native.country  25587 non-null  object
dtypes: int64(6), object(8)
memory usage: 3.0+ MB
['Private' 'State-gov' 'Self-emp-not-inc' 'Self-emp-inc' 'Local-gov'
 'Federal-gov' nan 'Never-worked' 'Without-pay']
['Craft-repair'

In [75]:
# Quite a few values are missing, so we impute them instead of dropping rows.
# - workclass / native.country: almost every value falls into the most frequent
#   category, so fill with the mode.
# - occupation: treat missing values as a category of their own, 'unknown'.
#
# The fill values are learned from the training set only and reused for the test
# set, so both splits are imputed identically and nothing is estimated from test data.
# The result is assigned back instead of calling `Series.fillna(..., inplace=True)`
# on a column selection: that is chained assignment, which no longer updates the
# parent frame under pandas Copy-on-Write.
fill_values = {
    'workclass': X_train['workclass'].mode()[0],
    'native.country': X_train['native.country'].mode()[0],
    'occupation': 'unknown',
}

X_train = X_train.fillna(value=fill_values)
X_test = X_test.fillna(value=fill_values)

X_train.isnull().sum()


age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
dtype: int64

In [76]:
# check if the education and education.num matches
def make_tuple(x):
    """Return the (education, education.num) pair of one row."""
    return (x['education'], x['education.num'])

# Guarded so the cell can be re-run after 'education' has already been dropped.
if 'education' in X_train.columns:
    # De-duplicate first: the row-wise apply then only touches the distinct pairs
    # (drop_duplicates keeps first occurrences, so the printed order is unchanged).
    edu_pairs = X_train[['education', 'education.num']].drop_duplicates()
    edu = edu_pairs.apply(make_tuple, axis=1)
    print(edu.unique())

    # Make the "perfect representation" claim explicit: every education label pairs
    # with exactly one education.num and vice versa.
    assert edu_pairs['education'].is_unique and edu_pairs['education.num'].is_unique, \
        'education and education.num are not in one-to-one correspondence'

# Since education.num represents the education perfectly, drop the education feature.
# errors='ignore' plus re-assignment (instead of inplace=True) keeps this idempotent.
X_train = X_train.drop(columns=['education'], errors='ignore')
X_test = X_test.drop(columns=['education'], errors='ignore')

[('Bachelors', 13) ('Masters', 14) ('Some-college', 10) ('HS-grad', 9)
 ('10th', 6) ('Assoc-voc', 11) ('Assoc-acdm', 12) ('5th-6th', 3)
 ('7th-8th', 4) ('Doctorate', 16) ('Prof-school', 15) ('11th', 7)
 ('12th', 8) ('9th', 5) ('1st-4th', 2) ('Preschool', 1)]


In [79]:
# Label-encode the categorical features. Each encoder is fitted on the training
# set only and then applied to both splits.
cat_features = ['workclass', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']
for cat_col in cat_features:
    le = LabelEncoder()
    le.fit(X_train[cat_col])

    X_train[cat_col] = le.transform(X_train[cat_col])

    # LabelEncoder.transform raises ValueError for labels it did not see during fit,
    # which happens as soon as a rare category lands only in the test split.
    # Encode such rows as -1 instead of aborting; seen labels keep their usual code.
    seen = X_test[cat_col].isin(le.classes_).to_numpy()
    test_codes = np.full(len(X_test), -1, dtype=int)
    test_codes[seen] = le.transform(X_test.loc[seen, cat_col])
    X_test[cat_col] = test_codes

# Target encoding - >50K: 1, <=50K: 0
# LabelEncoder sorts its classes, so '<=50K' maps to 0 and '>50K' to 1: the same
# values that OneHotEncoder(...).toarray()[:, 1] produced, without building a dense
# one-hot matrix and relying on a column position. The cast keeps the float dtype.
# assign() builds new frames instead of writing into y_train / y_test, which are
# column selections of the split and would trigger SettingWithCopyWarning.
income_encoder = LabelEncoder().fit(y_train['income'])
y_train = y_train.assign(income=income_encoder.transform(y_train['income']).astype(float))
y_test = y_test.assign(income=income_encoder.transform(y_test['income']).astype(float))

num_features = ['age', 'fnlwgt', 'capital.gain', 'capital.loss', 'hours.per.week']
# hours.per.week (the last entry) is left on its original scale
log_features = num_features[:-1]

# check skewness
for num_col in num_features:
    print(num_col, 'skewness: %.4f' % (X_train[num_col].skew()))

# Pick the transform for each column ONCE, from the training data: log1p when the
# column contains zeros, plain log otherwise. Previously the choice was re-made on
# the test split, so train and test could end up with different transforms for the
# same feature. Note that a zero appearing only in the test split of a plain-log
# column yields -inf.
log_funcs = {
    col: (np.log1p if (X_train[col] == 0).any() else np.log)
    for col in log_features
}

# Preview the skewness on the logarithmic scale
for num_col in log_features:
    scaled = log_funcs[num_col](X_train[num_col])
    print(num_col, 'skewness: %.4f' % (scaled.skew()))

# Transform both splits with the same function
for num_col in log_features:
    X_train[num_col] = log_funcs[num_col](X_train[num_col])
    X_test[num_col] = log_funcs[num_col](X_test[num_col])

# Finally z-score every numerical column; each scaler is fitted on the
# training split and then applied to both splits.
for col_name in num_features:
    train_col = X_train[[col_name]]
    scaler = StandardScaler().fit(train_col)
    X_train[col_name] = scaler.transform(train_col).ravel()
    X_test[col_name] = scaler.transform(X_test[[col_name]]).ravel()

X_train.head(3)

age skewness: 0.5541
fnlwgt skewness: 1.3462
capital.gain skewness: 11.9321
capital.loss skewness: 4.5455
hours.per.week skewness: 0.2383
age skewness: -0.1345
fnlwgt skewness: -0.8538
capital.gain skewness: 3.0846
capital.loss skewness: 4.2672


Unnamed: 0,age,workclass,fnlwgt,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
21851,-0.018736,3,0.65654,13,2,2,0,4,1,-0.300212,-0.222963,0.77541,38
7632,1.055717,3,-0.689793,14,0,9,1,4,0,-0.300212,-0.222963,-0.034014,38
27878,-1.794096,3,0.377046,10,4,12,1,4,0,-0.300212,-0.222963,-1.248149,38


In [82]:
# Old preprocessing, superseded by the preprocessing cells above (kept commented out for reference only)

# # preprocess data

# # remove incomplete samples
# df[df=='?']=np.nan  # replace ? with nan
# n_rows = len(df)
# df=df.dropna(axis=0) # drop any rows with nan values
# n_dropped = n_rows - len(df)
# print('dropped {} rows out of {}'.format(n_dropped, n_rows))

# # set classification column to binary
# df['income'].replace({'<=50K':0,'>50K':1},inplace=True)

# df = df.drop('education.num',axis=1) # remove education num column, since redundant
# df = df.drop('fnlwgt', axis=1) # shown to have negative correlation 
# df.shape

# # encode categorical features with label encoding, not one-hot encoding
# cat_features = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']
# for feature in cat_features:
#     labelencoder = LabelEncoder()
#     df[feature] = labelencoder.fit_transform(df[feature])
# df

# randomly generate train and test sets


# x_df = df.drop(['income'], axis=1)
# y_df = df['income']

# data_train, data_test, labels_train, labels_test = train_test_split(x_df.values, y_df.values, test_size=0.2, random_state=1)

In [91]:
# Train the XGBoost classifier on the preprocessed training set.
# NOTE(review): `xgboost2` is not imported anywhere in the visible notebook; its
# import must be added to the imports cell for a fresh-kernel run to work.
# The recorded run of this cell took about 48 minutes.
xgb_params = dict(min_num_leaf=5, boosting_rounds=5, max_depth=10, lr=0.8, reg=1.2)

model = xgboost2.XGBoostClassifier()
# perf_counter() is a monotonic clock meant for measuring elapsed time; time.time()
# follows the wall clock and can jump if the system time is adjusted mid-run.
start = time.perf_counter()
model.fit(X_train.values, y_train['income'].values, **xgb_params)
end = time.perf_counter()
print('training time: {} min'.format((end-start)/60) )

boosting round 0
boosting round 1
boosting round 2
boosting round 3
boosting round 4
training time: 47.77593679428101 min


In [93]:
# Evaluate on the held-out test set
pred = model.predict(X_test.values)
y_true = y_test['income'].values

acc = np.sum(pred == y_true)/len(pred)
print('accuracy = {}'.format(acc))

# Call the f1_score function imported at the top of the notebook and keep the result
# under its own name. The previous `f1_score = sklearn.metrics.f1_score(...)` needed
# a bare `sklearn` name that is never imported, and it rebound `f1_score` from the
# imported function to a float.
test_f1 = f1_score(y_true, pred)
print('f1_score: {}'.format(test_f1))

accuracy = 0.820205742361431
f1_score: 0.6871493454448304
