# Census Income Data Set

## Extraction was done by Barry Becker from the 1994 Census database.

https://archive.ics.uci.edu/ml/datasets/census+income

![image.png](../Images/Census.png)

해당 데이터는 1994년 Census bureau database에서 추출된 것이다. 원본 데이터는 UCI 머신러닝 레포지토리에서 확인할 수 있으며,<br> 일부 변수를 제거하는 등 이를 실습용으로 수정한 데이터이다. 데이터는 사람들의 나이, 교육 수준, 결혼 여부 등의 특성과 연소득이 5만 달러를 넘는지 안 넘는지에 대한 정보를 담고 있다.<br>

주어진 학습용 데이터(Census_X_train.csv, Census_X_test.csv)를 활용하여 해당 사람이 연소득 5만달러를 넘는지 안넘는지 예측 모형을 만든 후,<br> 이를 평가용 데이터(Census_X_test.csv)에 적용하여 얻은 연소득 5만 달러 초과 여부(초과인 경우를 1로한다)를 .csv 파일로 저장한다.

### Library & Data Import

In [48]:
import pandas as pd
import numpy as np

In [49]:
X_test = pd.read_csv('../Datasets/Census_X_test.csv')
X_train = pd.read_csv('../Datasets/Census_X_train.csv')
y_train = pd.read_csv('../Datasets/Census_y_train.csv')

### 1. 데이터 탐색

In [50]:
X_test

Unnamed: 0,age,workclass,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,25,Private,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States
1,38,Private,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States
2,28,Local-gov,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
3,44,Private,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States
4,34,Private,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...
15055,33,Private,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States
15056,39,Private,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States
15057,38,Private,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States
15058,44,Private,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States


In [51]:
X_train

Unnamed: 0,age,workclass,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...
30157,27,Private,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States
30158,40,Private,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
30159,58,Private,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States
30160,22,Private,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States


In [52]:
y_train

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0
...,...
30157,0
30158,1
30159,0
30160,0


In [53]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30162 entries, 0 to 30161
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             30162 non-null  int64 
 1   workclass       30162 non-null  object
 2   education_num   30162 non-null  int64 
 3   marital_status  30162 non-null  object
 4   occupation      30162 non-null  object
 5   relationship    30162 non-null  object
 6   race            30162 non-null  object
 7   sex             30162 non-null  object
 8   capital_gain    30162 non-null  int64 
 9   capital_loss    30162 non-null  int64 
 10  hours_per_week  30162 non-null  int64 
 11  native_country  30162 non-null  object
dtypes: int64(5), object(7)
memory usage: 2.8+ MB


In [54]:
X_train.describe()

Unnamed: 0,age,education_num,capital_gain,capital_loss,hours_per_week
count,30162.0,30162.0,30162.0,30162.0,30162.0
mean,38.437902,10.121312,1092.007858,88.372489,40.931238
std,13.134665,2.549995,7406.346497,404.29837,11.979984
min,17.0,1.0,0.0,0.0,1.0
25%,28.0,9.0,0.0,0.0,40.0
50%,37.0,10.0,0.0,0.0,40.0
75%,47.0,13.0,0.0,0.0,45.0
max,90.0,16.0,99999.0,4356.0,99.0


In [55]:
print(X_train['capital_gain'].quantile([q/20 for q in range(15, 21)]))

0.75        0.0
0.80        0.0
0.85        0.0
0.90        0.0
0.95     5013.0
1.00    99999.0
Name: capital_gain, dtype: float64


In [56]:
print(X_train['capital_loss'].quantile([q/20 for q in range(15, 21)]))

0.75       0.0
0.80       0.0
0.85       0.0
0.90       0.0
0.95       0.0
1.00    4356.0
Name: capital_loss, dtype: float64


In [57]:
X_train['capital_gain_yn'] = np.where(X_train['capital_gain']>0, 1, 0)
X_train['capital_loss_yn'] = np.where(X_train['capital_loss']>0, 1, 0)

X_test['capital_gain_yn'] = np.where(X_test['capital_gain']>0, 1, 0)
X_test['capital_loss_yn'] = np.where(X_test['capital_loss']>0, 1, 0)

In [58]:
COL_DEL = []
COL_NUM = ['age', 'education_num', 'hours_per_week', 'capital_gain', 'capital_loss']
COL_CAT = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country', 'capital_gain_yn', 'capital_loss_yn']
COL_Y = ['target']

X_train = X_train.drop(COL_DEL, axis=1)
X_test = X_test.drop(COL_DEL, axis=1)

In [59]:
train_df = pd.concat([X_train, y_train], axis=1)

for _col in COL_NUM:
    print('-'*80)
    print(_col)
    print(train_df.groupby(COL_Y)[_col].describe(), end='\n\n')

--------------------------------------------------------------------------------
age
          count      mean        std   min   25%   50%   75%   max
target                                                            
0       22654.0  36.60806  13.464631  17.0  26.0  34.0  45.0  90.0
1        7508.0  43.95911  10.269633  19.0  36.0  43.0  51.0  90.0

--------------------------------------------------------------------------------
education_num
          count       mean       std  min   25%   50%   75%   max
target                                                           
0       22654.0   9.629116  2.413596  1.0   9.0   9.0  10.0  16.0
1        7508.0  11.606420  2.368423  2.0  10.0  12.0  13.0  16.0

--------------------------------------------------------------------------------
hours_per_week
          count       mean        std  min   25%   50%   75%   max
target                                                            
0       22654.0  39.348592  11.950774  1.0  38.0  40.0  

In [60]:
for _col in COL_CAT:
    print(train_df.groupby(_col, as_index=False)[COL_Y].mean().sort_values(by=COL_Y, ascending=False), end='\n\n')

          workclass    target
3      Self-emp-inc  0.558659
0       Federal-gov  0.387063
1         Local-gov  0.294630
4  Self-emp-not-inc  0.285714
5         State-gov  0.268960
2           Private  0.218792
6       Without-pay  0.000000

          marital_status    target
1      Married-AF-spouse  0.476190
2     Married-civ-spouse  0.454959
0               Divorced  0.107262
6                Widowed  0.096735
3  Married-spouse-absent  0.083784
5              Separated  0.070288
4          Never-married  0.048324

           occupation    target
3     Exec-managerial  0.485220
9      Prof-specialty  0.448489
10    Protective-serv  0.326087
12       Tech-support  0.304825
11              Sales  0.270647
2        Craft-repair  0.225310
13   Transport-moving  0.202926
0        Adm-clerical  0.133835
6   Machine-op-inspct  0.124619
4     Farming-fishing  0.116279
1        Armed-Forces  0.111111
5   Handlers-cleaners  0.061481
7       Other-service  0.041096
8     Priv-house-serv  0.00699

In [61]:
COL_CAT

['workclass',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native_country',
 'capital_gain_yn',
 'capital_loss_yn']

In [62]:
from sklearn.preprocessing import LabelEncoder

X = pd.concat([X_train, X_test])

for _col in COL_CAT:
    le = LabelEncoder()
    le.fit(X_train[_col])
    X_train[_col] = le.transform(X_train[_col])
    X_test[_col] = le.transform(X_test[_col])

In [63]:
from sklearn.model_selection import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.3, stratify=y_train)

In [64]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_tr[COL_NUM] = scaler.fit_transform(X_tr[COL_NUM])
X_val[COL_NUM] = scaler.transform(X_val[COL_NUM])
X_test[COL_NUM] = scaler.transform(X_test[COL_NUM])

In [65]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier()
model_rf.fit(X_tr, y_tr.values.ravel())

RandomForestClassifier()

In [66]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [67]:
from xgboost import XGBClassifier

model_xgb1 = XGBClassifier()
model_xgb1.fit(X_tr, y_tr.values.ravel())

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0, ...)

In [68]:
model_xgb2 = XGBClassifier(n_estimators=1000, learning_rate=0.1, max_depth=10)
model_xgb2.fit(X_tr, y_tr.values.ravel(), early_stopping_rounds=50, eval_metric='auc', eval_set=[(X_val, y_val)], verbose=10)

[0]	validation_0-auc:0.90765
[10]	validation_0-auc:0.91531




[20]	validation_0-auc:0.92022
[30]	validation_0-auc:0.92280
[40]	validation_0-auc:0.92574
[50]	validation_0-auc:0.92740
[60]	validation_0-auc:0.92821
[70]	validation_0-auc:0.92870
[80]	validation_0-auc:0.92922
[90]	validation_0-auc:0.92946
[100]	validation_0-auc:0.92943
[110]	validation_0-auc:0.92953
[120]	validation_0-auc:0.92932
[130]	validation_0-auc:0.92909
[140]	validation_0-auc:0.92914
[141]	validation_0-auc:0.92909


XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_bin=256,
              max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
              max_depth=10, max_leaves=0, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=0, ...)

In [69]:
from sklearn.metrics import roc_auc_score

y_pred_rf = model_rf.predict_proba(X_val)
y_pred_xgb1 = model_xgb1.predict_proba(X_val)

score_rf = roc_auc_score(y_val, y_pred_rf[:, 1])
score_xgb1 = roc_auc_score(y_val, y_pred_xgb1[:, 1]) 

print(score_rf)
print(score_xgb1)

0.8991093461199448
0.9289827151828294


In [70]:
print(pd.DataFrame({'feature': X_tr.columns, 'fi_rf': model_rf.feature_importances_, 'fi_xgb': model_xgb1.feature_importances_}))

            feature     fi_rf    fi_xgb
0               age  0.216220  0.034014
1         workclass  0.048988  0.024655
2     education_num  0.141609  0.112879
3    marital_status  0.077191  0.097532
4        occupation  0.088139  0.031910
5      relationship  0.099087  0.387727
6              race  0.017590  0.015562
7               sex  0.015010  0.031678
8      capital_gain  0.105869  0.152726
9      capital_loss  0.033607  0.068764
10   hours_per_week  0.108797  0.027661
11   native_country  0.018866  0.014892
12  capital_gain_yn  0.022234  0.000000
13  capital_loss_yn  0.006793  0.000000


In [71]:
COL_DEL = ['capital_gain_yn', 'capital_loss_yn']

X_tr = X_tr.drop(COL_DEL, axis=1)
X_val = X_val.drop(COL_DEL, axis=1)
X_test = X_test.drop(COL_DEL, axis=1)

In [72]:
from sklearn.model_selection import GridSearchCV

grid_params = { 
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_cv = GridSearchCV(estimator=model_rf, param_grid=grid_params, cv=5)
rf_cv.fit(X_train, y_train.values.ravel()) 

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [5, 10, 15],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [50, 100, 200]})

In [73]:
print(pd.DataFrame(rf_cv.cv_results_).head())

   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       0.306160      0.031966         0.022739        0.001163   
1       0.625527      0.048541         0.045477        0.004441   
2       1.200184      0.054786         0.086789        0.007764   
3       0.279852      0.002708         0.022396        0.000778   
4       0.583248      0.019425         0.042079        0.001471   

  param_max_depth param_min_samples_leaf param_min_samples_split  \
0               5                      1                       2   
1               5                      1                       2   
2               5                      1                       2   
3               5                      1                       5   
4               5                      1                       5   

  param_n_estimators                                             params  \
0                 50  {'max_depth': 5, 'min_samples_leaf': 1, 'min_s...   
1                100  {'max_depth': 5,

In [74]:
print(rf_cv.best_params_)

{'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}


In [75]:
model_rf2 = RandomForestClassifier(n_estimators=50
                                   , max_depth=15
                                   , min_samples_leaf=1
                                   , min_samples_split=5)
model_rf2.fit(X_tr, y_tr.values.ravel())

y_pred_rf2 = model_rf2.predict_proba(X_val)
score_rf2 = roc_auc_score(y_val, y_pred_rf2[:, 1])
print(score_rf2)

0.9198674135569683


In [76]:
grid_params = {'max_depth': [3, 5, 7, 10], 
               'min_child_weight': [1, 2], 
               'colsample_bytree': [0.6, 0.8],
               'subsample': [0.6, 0.8]}

xgb_cv = GridSearchCV(estimator=model_xgb1, param_grid=grid_params, cv=5)
xgb_cv.fit(X_tr, y_tr.values.ravel())

GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     callbacks=None, colsample_bylevel=1,
                                     colsample_bynode=1, colsample_bytree=1,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=0, gpu_id=-1,
                                     grow_policy='depthwise',
                                     importance_type=None,
                                     interaction_constraints='',
                                     learning_rate=0.300000012, max_bin=256,
                                     max_cat_threshold=64, max_cat_to_onehot=4,
                                     max_delta_step=0, max_depth=6,
                                     max_leaves=0, min_child_weight=1,
                                     missing=nan, mono

In [77]:
print(xgb_cv.best_params_)

{'colsample_bytree': 0.8, 'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.8}


In [78]:
params = {'colsample_bytree': 0.6,
          'max_depth': 7,
          'min_child_weight': 1,
          'subsample': 0.8}

model_xgb3 = XGBClassifier(n_estimators=1000, learning_rate=0.05)
model_xgb3.set_params(**params)

model_xgb3.fit(X_tr, y_tr, early_stopping_rounds=50, eval_metric='auc', eval_set=[(X_val, y_val)], verbose=10)

print(model_xgb3.best_score)

[0]	validation_0-auc:0.86840
[10]	validation_0-auc:0.91792




[20]	validation_0-auc:0.91997
[30]	validation_0-auc:0.92101
[40]	validation_0-auc:0.92198
[50]	validation_0-auc:0.92271
[60]	validation_0-auc:0.92359
[70]	validation_0-auc:0.92440
[80]	validation_0-auc:0.92613
[90]	validation_0-auc:0.92728
[100]	validation_0-auc:0.92790
[110]	validation_0-auc:0.92854
[120]	validation_0-auc:0.92920
[130]	validation_0-auc:0.92971
[140]	validation_0-auc:0.92987
[150]	validation_0-auc:0.93032
[160]	validation_0-auc:0.93061
[170]	validation_0-auc:0.93072
[180]	validation_0-auc:0.93088
[190]	validation_0-auc:0.93095
[200]	validation_0-auc:0.93102
[210]	validation_0-auc:0.93113
[220]	validation_0-auc:0.93115
[230]	validation_0-auc:0.93117
[240]	validation_0-auc:0.93122
[250]	validation_0-auc:0.93124
[260]	validation_0-auc:0.93126
[270]	validation_0-auc:0.93121
[280]	validation_0-auc:0.93124
[290]	validation_0-auc:0.93119
[293]	validation_0-auc:0.93116
0.9312953408292395


In [79]:
pred = model_xgb3.predict_proba(X_test)[:,1]
pd.DataFrame({'index': X_test.index, 'target': pred}).to_csv('./result.csv', index=False)