# 0. Setup

In [1]:
import numpy as np
import pandas as pd
import pickle

from src.config import HOLDOUT_PATH, HOLDOUT_PROCESSED, TARGET
from src.data_engineering import get_column_names

%load_ext autoreload
%autoreload 2

In [2]:
# Load the holdout file. It is read with header=None, so every line is treated
# as data and the column names come from get_column_names().
column_names = get_column_names()
raw_data = pd.read_csv(
    HOLDOUT_PATH,
    names=column_names,
    header=None,
)

In [3]:
## Double check the operation is correct (column_names lines up with the columns in the file).
# Comparing len(column_names) with raw_data.shape[1] on its own is a weak check:
# because `names=` was passed to read_csv, the frame gets one column per name.
# NOTE(review): to my understanding of read_csv, a file with MORE fields than names
# moves the leading fields into the index, and a file with FEWER fields leaves the
# trailing columns filled with NaN - confirm against the pandas docs. The two extra
# assertions below are meant to catch exactly those cases.
assert raw_data.shape[1] == len(column_names), "column count does not match column_names"
assert isinstance(raw_data.index, pd.RangeIndex), (
    "unexpected non-default index: the file may contain more fields than column_names"
)
assert not raw_data.iloc[:, -1].isna().all(), (
    "last column is entirely NaN: the file may contain fewer fields than column_names"
)
len(column_names), raw_data.shape

(42, (99762, 42))

In [4]:
# Preview the first five rows, transposed so that each column of the wide frame is shown as a row.
raw_data.head(5).T

Unnamed: 0,0,1,2,3,4
age,38,44,2,35,49
class_of_worker,Private,Self-employed-not incorporated,Not in universe,Private,Private
detailed_industry_recode,6,37,0,29,4
detailed_occupation_recode,36,12,0,3,34
education,1st 2nd 3rd or 4th grade,Associates degree-occup /vocational,Children,High school graduate,High school graduate
wage_per_hour,0,0,0,0,0
enroll_in_edu_inst_last_wk,Not in universe,Not in universe,Not in universe,Not in universe,Not in universe
marital_stat,Married-civilian spouse present,Married-civilian spouse present,Never married,Divorced,Divorced
major_industry_code,Manufacturing-durable goods,Business and repair services,Not in universe or children,Transportation,Construction
major_occupation_code,Machine operators assmblrs & inspctrs,Professional specialty,Not in universe,Executive admin and managerial,Precision production craft & repair


### Convert target to binary

In [5]:
# Work on a copy so that raw_data keeps the values exactly as they were loaded.
data = raw_data.copy()

In [6]:
# Encode the income label as a binary target (0 = below 50k, 1 = 50k and above).
# The keys are matched exactly as written, including their leading space.
mapping = {
    ' - 50000.': 0,
    ' 50000+.': 1,
}

# Map from raw_data rather than from data so the cell is idempotent: mapping the
# already-encoded 0/1 values a second time would turn the whole column into NaN.
# Assignment aligns on the index, so this also works if rows were dropped from data.
encoded_target = raw_data[TARGET].map(mapping)

# Series.map yields NaN for any value that is not a key of the dict; fail loudly
# instead of silently carrying missing targets forward.
unmapped = raw_data.loc[encoded_target.isna(), TARGET].unique()
if len(unmapped) > 0:
    raise ValueError(f'Unexpected {TARGET} labels: {list(unmapped)!r}')

data[TARGET] = encoded_target

### Convert encoded categorical columns to category dtype

In [7]:
# Columns whose values are codes rather than quantities; store them as pandas
# 'category' dtype so they are not treated as numeric.
encoded_cats = [
    'detailed_industry_recode',
    'detailed_occupation_recode',
    'own_business_or_self_employed',
    'veterans_benefits',
    'year',
]
# Cast all listed columns in one call; the other columns keep their dtypes.
data = data.astype({name: 'category' for name in encoded_cats})

### Feature engineering

In [8]:
# Bucket age into five 20-year bands labelled 0-4.
# Edges are [-1, 20, 40, 60, 80, 100]; with pd.cut's default right-closed
# intervals the bands are (-1, 20], (20, 40], ..., (80, 100].
# NOTE(review): ages above 100 would fall outside every band and become NaN - confirm
# that the data never exceeds that.
bins = [-1, *range(20, 101, 20)]
labels = list(range(5))
data['age_group'] = pd.cut(data['age'], bins=bins, labels=labels)

# 1. Data Overview

In [9]:
# (rows, columns) after the target encoding, dtype conversion and the added age_group column.
data.shape

(99762, 43)

In [10]:
# Per-column dtype and non-null count, to confirm the category conversions took effect.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99762 entries, 0 to 99761
Data columns (total 43 columns):
 #   Column                                      Non-Null Count  Dtype   
---  ------                                      --------------  -----   
 0   age                                         99762 non-null  int64   
 1   class_of_worker                             99762 non-null  object  
 2   detailed_industry_recode                    99762 non-null  category
 3   detailed_occupation_recode                  99762 non-null  category
 4   education                                   99762 non-null  object  
 5   wage_per_hour                               99762 non-null  int64   
 6   enroll_in_edu_inst_last_wk                  99762 non-null  object  
 7   marital_stat                                99762 non-null  object  
 8   major_industry_code                         99762 non-null  object  
 9   major_occupation_code                       99762 non-null  object  
 10

In [11]:
# Count missing values per column - none found.
# NOTE(review): isna() only counts NaN/None. Placeholder strings such as
# "Not in universe" (visible in the preview above) are not counted as missing.
data.isna().sum()

age                                           0
class_of_worker                               0
detailed_industry_recode                      0
detailed_occupation_recode                    0
education                                     0
wage_per_hour                                 0
enroll_in_edu_inst_last_wk                    0
marital_stat                                  0
major_industry_code                           0
major_occupation_code                         0
race                                          0
hispanic_origin                               0
sex                                           0
member_of_a_labor_union                       0
reason_for_unemployment                       0
full_or_part_time_employment_stat             0
capital_gains                                 0
capital_losses                                0
dividends_from_stocks                         0
tax_filer_stat                                0
region_of_previous_residence            

In [12]:
# Drop rows that are exact duplicates across all columns (the first occurrence is kept),
# then show the resulting shape.
# NOTE(review): the index is not reset, so it keeps gaps where rows were removed -
# confirm that downstream code does not rely on a contiguous 0..n-1 index.
data = data.drop_duplicates()
data.shape

(98879, 43)

In [13]:
# Check class balance - heavily imbalanced -> will need to subsample data + adjust class weights for model training
data[TARGET].value_counts(normalize=True)

income_level
0    0.937439
1    0.062561
Name: proportion, dtype: float64

### Save to disk

In [14]:
# Persist the processed holdout frame as a pickle at HOLDOUT_PROCESSED.
with open(HOLDOUT_PROCESSED, mode='wb') as out_file:
    pickle.dump(data, out_file)