In [1]:
# import libs
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [2]:
# import data
columns = pd.read_csv('D:\MLExercise\MLExercise\census-income.columns', names=["column"], header=None)
col_name = []
for name in columns['column']:
    col_name.append(name)
income = pd.read_csv('D:\MLExercise\MLExercise\census-income.data', names=col_name, header=None)

In [3]:
income.head()

Unnamed: 0,age,class of worker,detailed industry recode,detailed occupation recode,education,wage per hour,enroll in edu inst last wk,marital stat,major industry code,major occupation code,...,country of birth father,country of birth mother,country of birth self,citizenship,own business or self employed,fill inc questionnaire for veteran's admin,veterans benefits,weeks worked in year,year,label
0,73,Not in universe,0,0,High school graduate,0,Not in universe,Widowed,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,95,- 50000.
1,58,Self-employed-not incorporated,4,34,Some college but no degree,0,Not in universe,Divorced,Construction,Precision production craft & repair,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,- 50000.
2,18,Not in universe,0,0,10th grade,0,High school,Never married,Not in universe or children,Not in universe,...,Vietnam,Vietnam,Vietnam,Foreign born- Not a citizen of U S,0,Not in universe,2,0,95,- 50000.
3,9,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.
4,10,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.


In [4]:
income.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199523 entries, 0 to 199522
Data columns (total 42 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   age                                         199523 non-null  int64  
 1   class of worker                             199523 non-null  object 
 2   detailed industry recode                    199523 non-null  int64  
 3   detailed occupation recode                  199523 non-null  int64  
 4   education                                   199523 non-null  object 
 5   wage per hour                               199523 non-null  int64  
 6   enroll in edu inst last wk                  199523 non-null  object 
 7   marital stat                                199523 non-null  object 
 8   major industry code                         199523 non-null  object 
 9   major occupation code                       199523 non-null  object 
 

In [5]:
# we can see that there are some object type data, so I need to convert these data later

In [6]:
# I noticed that there are many '?' in the data

In [7]:
# check number of '?'
income.apply(lambda x:np.sum(x=='?'))

age                                               0
class of worker                                   0
detailed industry recode                          0
detailed occupation recode                        0
education                                         0
wage per hour                                     0
enroll in edu inst last wk                        0
marital stat                                      0
major industry code                               0
major occupation code                             0
race                                              0
hispanic origin                                   0
sex                                               0
member of a labor union                           0
reason for unemployment                           0
full or part time employment stat                 0
capital gains                                     0
capital losses                                    0
dividends from stocks                             0
tax filer st

In [8]:
# delete columns that have too many '?', because it's useless to replace those '?' to some value
income.drop('migration code-change in msa',axis=1, inplace=True)
income.drop('migration code-change in reg',axis=1, inplace=True)
income.drop('migration code-move within reg',axis=1, inplace=True)
income.drop('migration prev res in sunbelt',axis=1, inplace=True)

In [9]:
# replace '?' with NA for further operation
income = income.replace(regex=[r'\?'],value=np.nan)

In [10]:
# replace NA with the most common values of each columns
values = {'hispanic origin': income['hispanic origin'].mode()[0], 'state of previous residence': income['state of previous residence'].mode()[0], 'country of birth father': income['country of birth father'].mode()[0], 'country of birth mother': income['country of birth mother'].mode()[0], 'country of birth self': income['country of birth self'].mode()[0]}

In [11]:
# convert object value to int
for feature in income.columns:
    if income[feature].dtype == 'object':
        income[feature] = pd.Categorical(income[feature]).codes

In [12]:
# check if the dataset is balance
income['label'].value_counts()

0    187141
1     12382
Name: label, dtype: int64

In [13]:
# this is not a balanced dataset, the ratio of the negative examples and positive examples is around 15:1

In [14]:
# split test data and train data
# stratify=income['label'] can make sure that the ratio of positive and negative examples in the test set and training set is consistent
Xtrain,Xtest,ytrain,ytest=train_test_split(income[income.columns[:37]],income[income.columns[37]],test_size=0.25,random_state=6, stratify=income['label'])
Xtest.drop('instance weight',axis=1, inplace=True)

In [15]:
# get the whole traing data for upsampling
whole_train = Xtrain
whole_train['label']=ytrain

In [16]:
# two common methods to deal with unbalanced data are upsampling and downsampling
# I use upsampling because of two reasons:
# 1. downsampling will lose a large amount of data and affect the accuracy of model prediction
# 2. we are more concerned about the prediction accuracy of the positive examples in this work

# do upsampling
def up_sample(df):
    df1=df[df['label']==1] # positive case
    df2=df[df['label']==0] # negative case
    df3=pd.concat([df1,df1,df1,df1,df1],ignore_index=True)
    return pd.concat([df2,df3],ignore_index=True)
new_income = up_sample(whole_train)
new_income['label'].value_counts()

0    140356
1     46430
Name: label, dtype: int64

In [17]:
# after upsampling, the ratio is around 3:1 which is much better, but I will modify it later to get the best performance

In [18]:
# split
X_train,X_test,y_train,y_test=train_test_split(new_income[new_income.columns[:37]],new_income[new_income.columns[37]],test_size=0.25,random_state=33, stratify=new_income['label'])

In [19]:
X_test.head()

Unnamed: 0,age,class of worker,detailed industry recode,detailed occupation recode,education,wage per hour,enroll in edu inst last wk,marital stat,major industry code,major occupation code,...,family members under 18,country of birth father,country of birth mother,country of birth self,citizenship,own business or self employed,fill inc questionnaire for veteran's admin,veterans benefits,weeks worked in year,year
56639,57,4,44,32,12,800,2,0,20,7,...,4,39,39,39,4,0,1,2,52,94
125503,38,3,0,0,1,0,2,2,14,6,...,4,39,39,39,4,0,1,2,0,94
139620,25,4,33,16,12,0,2,2,19,12,...,4,39,39,39,4,0,1,2,52,94
122029,0,3,0,0,10,0,2,4,14,6,...,0,39,39,39,4,0,1,0,0,94
166161,38,4,41,2,14,4000,2,2,9,2,...,4,39,39,39,4,0,1,2,52,95


In [20]:
# get the instance weight for training
weights = []
for weight in X_train['instance weight']:
    weights.append(float(weight))

In [21]:
X_train.drop('instance weight',axis=1, inplace=True)

In [22]:
# setup model, here I use xgboost as the model
params = {
'booster': 'gbtree',
'objective': 'binary:hinge', # because this is a binary classification problem, the output should be 0 or 1
'max_depth':5, # common default value
'min_child_weight':1, # becasue the data is unbalance, so I set a small number here
'gamma':0,  # common default value
'subsample':1, # common default value
'colsample_bytree':1, # common default value
'scale_pos_weight':3 ,# after up sample, the ratio is around 3:1
}

In [23]:
# after testing, 357 is the best num_boost_round value
xgtrain=xgb.DMatrix(data=X_train.values,label=y_train,weight=weights)
xgtest = xgb.DMatrix(data=Xtest.values)
xgbM = xgb.train(params=params, dtrain=xgtrain, num_boost_round=357,evals=[(xgtrain, 'train')],verbose_eval=True)
xgbc_y_predict=xgbM.predict(xgtest)

Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-error:0.74414
[1]	train-error:0.28271
[2]	train-error:0.18370
[3]	train-error:0.14121
[4]	train-error:0.13390
[5]	train-error:0.12528
[6]	train-error:0.12431
[7]	train-error:0.12407
[8]	train-error:0.12339
[9]	train-error:0.12182
[10]	train-error:0.12155
[11]	train-error:0.12125
[12]	train-error:0.12109
[13]	train-error:0.12101
[14]	train-error:0.12024
[15]	train-error:0.11976
[16]	train-error:0.11914
[17]	train-error:0.11871
[18]	train-error:0.11799
[19]	train-error:0.11748
[20]	train-error:0.11699
[21]	train-error:0.11637
[22]	train-error:0.11516
[23]	train-error:0.11456
[24]	train-error:0.11412
[25]	train-error:0.11386
[26]	train-error:0.11323
[27]	train-error:0.11286
[28]	train-error:0

In [24]:
print('Performance of XGBoost:')
print(classification_report(ytest.values,xgbc_y_predict, target_names=None))

Performance of XGBoost:
              precision    recall  f1-score   support

           0       0.98      0.94      0.96     46785
           1       0.45      0.71      0.55      3096

    accuracy                           0.93     49881
   macro avg       0.71      0.83      0.76     49881
weighted avg       0.95      0.93      0.94     49881



In [25]:
# we can see that the precision for 1 is not good even though the overall accuracy is good.
# but because the data is 1:15, even if the blind guess results are all negative, 
# the accuracy rate of 15/16=93.75% can be achieved.
# and we are more concerned about the prediction accuracy of the positive examples
# so I need to modify the upsampling function
# here we have a trade-off between recall and precision

In [26]:
def best_up_sample(df, n):
    df1=df[df['label']==1]
    df2=df[df['label']==0]
    multi_df=[]
    for i in range(1, n):
        multi_df.append(df1)
    df3=pd.concat(multi_df,ignore_index=True)
    return pd.concat([df2,df3],ignore_index=True)

In [27]:
# choose the best up sample value
from sklearn.metrics import precision_recall_curve
for i in range(2, 10):
    temp = income
    temp = best_up_sample(temp, i)
    # split
    X_train,X_test,y_train,y_test=train_test_split(temp[temp.columns[:37]],temp[temp.columns[37]],test_size=0.25,random_state=33, stratify=temp['label'])
    
    # get the instance weight
    weights = []
    for weight in X_train['instance weight']:
        weights.append(float(weight))
    X_train.drop('instance weight',axis=1, inplace=True)
    
    # setup model
    params = {
    'booster': 'gbtree',
    'objective': 'binary:hinge',
    'max_depth':5,
    'min_child_weight':1,
    'gamma':0,
    'subsample':1, 
    'colsample_bytree':1,
    'scale_pos_weight':(len(temp[temp['label'] == 0])//len(temp[temp['label'] == 1])) ,
    }

    xgtrain=xgb.DMatrix(data=X_train.values,label=y_train,weight=weights)
    xgtest = xgb.DMatrix(data=Xtest.values)
    xgbM = xgb.train(params=params, dtrain=xgtrain, num_boost_round=1000,evals=[(xgtrain, 'train')],verbose_eval=True)
    xgbc_y_predict=xgbM.predict(xgtest)
    print('Performance of XGBoost:')
    print(classification_report(ytest.values,xgbc_y_predict, target_names=None))

Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-error:0.93588
[1]	train-error:0.14734
[2]	train-error:0.08494
[3]	train-error:0.07337
[4]	train-error:0.05202
[5]	train-error:0.05163
[6]	train-error:0.05063
[7]	train-error:0.05099
[8]	train-error:0.05028
[9]	train-error:0.05008
[10]	train-error:0.04962
[11]	train-error:0.04975
[12]	train-error:0.04959
[13]	train-error:0.04956
[14]	train-error:0.04947
[15]	train-error:0.04933
[16]	train-error:0.04917
[17]	train-error:0.04904
[18]	train-error:0.04880
[19]	train-error:0.04864
[20]	train-error:0.04841
[21]	train-error:0.04836
[22]	train-error:0.04835
[23]	train-error:0.04813
[24]	train-error:0.04810
[25]	train-error:0.04806
[26]	train-error:0.04802
[27]	train-error:0.04793
[28]	train-error:0

In [28]:
# after testing, best_up_sample(temp, 3) has the best performance and the corresponding best num_boost_round=188

In [29]:
# final performance
temp = income
temp = best_up_sample(temp, 3)
# split
X_train,X_test,y_train,y_test=train_test_split(temp[temp.columns[:37]],temp[temp.columns[37]],test_size=0.25,random_state=33, stratify=temp['label'])

# get the instance weight
weights = []
for weight in X_train['instance weight']:
    weights.append(float(weight))
X_train.drop('instance weight',axis=1, inplace=True)

# setup model
params = {
'booster': 'gbtree',
'objective': 'binary:hinge',
'max_depth':5,
'min_child_weight':1,
'gamma':0,
'subsample':1, 
'colsample_bytree':1,
'scale_pos_weight':(len(temp[temp['label'] == 0])//len(temp[temp['label'] == 1])) ,
}

xgtrain=xgb.DMatrix(data=X_train.values,label=y_train,weight=weights)
xgtest = xgb.DMatrix(data=Xtest.values)
xgbM = xgb.train(params=params, dtrain=xgtrain, num_boost_round=188,evals=[(xgtrain, 'train')],verbose_eval=True)
xgbc_y_predict=xgbM.predict(xgtest)

Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-error:0.87948
[1]	train-error:0.24405
[2]	train-error:0.09602
[3]	train-error:0.09506
[4]	train-error:0.09408
[5]	train-error:0.09395
[6]	train-error:0.08912
[7]	train-error:0.08833
[8]	train-error:0.08822
[9]	train-error:0.08587
[10]	train-error:0.08577
[11]	train-error:0.08341
[12]	train-error:0.08220
[13]	train-error:0.08185
[14]	train-error:0.08073
[15]	train-error:0.08002
[16]	train-error:0.07953
[17]	train-error:0.07934
[18]	train-error:0.07883
[19]	train-error:0.07867
[20]	train-error:0.07830
[21]	train-error:0.07775
[22]	train-error:0.07744
[23]	train-error:0.07673
[24]	train-error:0.07640
[25]	train-error:0.07544
[26]	train-error:0.07480
[27]	train-error:0.07455
[28]	train-error:0

In [30]:
print('Performance of XGBoost:')
print(classification_report(ytest.values,xgbc_y_predict, target_names=None))

Performance of XGBoost:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98     46785
           1       0.66      0.61      0.63      3096

    accuracy                           0.96     49881
   macro avg       0.82      0.79      0.81     49881
weighted avg       0.95      0.96      0.96     49881

