# 0. Setup

In [1]:
import numpy as np
import pandas as pd
import pickle

from src.config import RAWDATA_PATH, TARGET, DATA_PATH, SUBSAMPLE_PATH
from src.data_engineering import get_column_names

%load_ext autoreload
%autoreload 2

In [3]:
# The column names come from the project helper; header=None tells pandas
# not to treat any row of the file as a header.
column_names = get_column_names()
raw_data = pd.read_csv(
    RAWDATA_PATH,
    names=column_names,
    header=None,
)

In [4]:
## Double check that column_names lines up with the fields in the raw file.
## With header=None and names=..., read_csv always returns exactly
## len(column_names) columns, so comparing the two counts cannot fail on its
## own. A real mismatch shows up differently: surplus leading fields are
## absorbed into the index, and missing trailing fields become all-NaN columns.
assert isinstance(raw_data.index, pd.RangeIndex), (
    "raw file has more fields per row than column_names; extra fields became the index"
)
assert raw_data.iloc[:, -1].notna().any(), (
    "raw file has fewer fields per row than column_names; last column is all NaN"
)
len(column_names), raw_data.shape

(42, (199523, 42))

In [5]:
# Transpose the first five rows so each column is displayed as a row
raw_data.head(5).T

Unnamed: 0,0,1,2,3,4
age,73,58,18,9,10
class_of_worker,Not in universe,Self-employed-not incorporated,Not in universe,Not in universe,Not in universe
detailed_industry_recode,0,4,0,0,0
detailed_occupation_recode,0,34,0,0,0
education,High school graduate,Some college but no degree,10th grade,Children,Children
wage_per_hour,0,0,0,0,0
enroll_in_edu_inst_last_wk,Not in universe,Not in universe,High school,Not in universe,Not in universe
marital_stat,Widowed,Divorced,Never married,Never married,Never married
major_industry_code,Not in universe or children,Construction,Not in universe or children,Not in universe or children,Not in universe or children
major_occupation_code,Not in universe,Precision production craft & repair,Not in universe,Not in universe,Not in universe


### Convert target to binary

In [6]:
# Work on a copy so raw_data is left untouched by the transformations below
data = raw_data.copy()

In [7]:
# Series.map matches keys exactly, so the leading space and trailing period
# in each label are significant.
mapping = {
    ' - 50000.':0,
    ' 50000+.':1
}
# Map from raw_data rather than data so re-running this cell is idempotent:
# mapping the already-converted 0/1 column would turn every value into NaN.
target_binary = raw_data[TARGET].map(mapping)

# Series.map returns NaN for any label missing from `mapping`; fail loudly
# instead of letting those rows silently drop out of later value_counts().
unmapped = raw_data.loc[target_binary.isna(), TARGET].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unexpected {TARGET} labels: {unmapped.tolist()!r}")

# Assignment aligns on the index, so this also works if data has fewer rows
# than raw_data.
data[TARGET] = target_binary

### Convert encoded categorical columns to category dtype

In [8]:
# Columns treated as encoded categories (see the section heading); cast them
# all to the pandas 'category' dtype in a single astype call.
encoded_cats = [
    'detailed_industry_recode',
    'detailed_occupation_recode',
    'own_business_or_self_employed',
    'veterans_benefits',
    'year',
]
data = data.astype({name: 'category' for name in encoded_cats})

# 1. Data Overview

In [9]:
# Row/column count before de-duplication (compare with the shape after drop_duplicates below)
data.shape

(199523, 42)

In [10]:
# Dtype and non-null count per column; also shows which columns are now 'category'
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199523 entries, 0 to 199522
Data columns (total 42 columns):
 #   Column                                      Non-Null Count   Dtype   
---  ------                                      --------------   -----   
 0   age                                         199523 non-null  int64   
 1   class_of_worker                             199523 non-null  object  
 2   detailed_industry_recode                    199523 non-null  category
 3   detailed_occupation_recode                  199523 non-null  category
 4   education                                   199523 non-null  object  
 5   wage_per_hour                               199523 non-null  int64   
 6   enroll_in_edu_inst_last_wk                  199523 non-null  object  
 7   marital_stat                                199523 non-null  object  
 8   major_industry_code                         199523 non-null  object  
 9   major_occupation_code                       199523 non-null

In [11]:
# Check missing values - no NaN entries are reported.
# NOTE(review): isna() only counts real NaN/None. String placeholders such as
# 'Not in universe' (visible in the head() preview above) are not counted
# here - confirm whether they, or any other sentinel token in the raw file,
# should be treated as missing.
data.isna().sum()

age                                           0
class_of_worker                               0
detailed_industry_recode                      0
detailed_occupation_recode                    0
education                                     0
wage_per_hour                                 0
enroll_in_edu_inst_last_wk                    0
marital_stat                                  0
major_industry_code                           0
major_occupation_code                         0
race                                          0
hispanic_origin                               0
sex                                           0
member_of_a_labor_union                       0
reason_for_unemployment                       0
full_or_part_time_employment_stat             0
capital_gains                                 0
capital_losses                                0
dividends_from_stocks                         0
tax_filer_stat                                0
region_of_previous_residence            

In [12]:
# Remove rows that exactly repeat an earlier row (the first occurrence is kept)
is_repeat = data.duplicated()
data = data[~is_repeat]
data.shape

(196294, 42)

In [13]:
# Check class imbalance - very imbalanced -> will need to subsample the data and adjust class weights for model training
data[TARGET].value_counts(normalize=True)

income_level
0    0.936921
1    0.063079
Name: proportion, dtype: float64

### Save to disk

In [14]:
# Serialize the cleaned, de-duplicated frame to DATA_PATH
with open(DATA_PATH, mode='wb') as out_file:
    pickle.dump(data, out_file)

# 2. Subsample data

In [15]:
# Absolute row count per class, as a reference for choosing the subsampling rate
data[TARGET].value_counts()

income_level
0    183912
1     12382
Name: count, dtype: int64

In [16]:
# Given the large dataset size, it's fine to keep only a fraction of the negative class
NEG_KEEP_FRAC = 0.1    # share of negative-class rows retained
SUBSAMPLE_SEED = 42    # fixed seed so the subsample is reproducible

is_pos = data[TARGET] == 1
is_neg = data[TARGET] == 0
# Every row must fall in exactly one class; a NaN or unexpected label would
# otherwise be dropped from the subsample without any warning.
assert (is_pos | is_neg).all(), f"{TARGET} contains values other than 0/1"

pos, neg = data.loc[is_pos], data.loc[is_neg]

# NOTE(review): the result is ordered positives-then-negatives (not shuffled);
# shuffle downstream if row order matters.
subsample_data = pd.concat([
    pos,
    neg.sample(frac=NEG_KEEP_FRAC, random_state=SUBSAMPLE_SEED)
])

In [17]:
# Check class balance of the subsampled dataset (positives should now make up a much larger share)
subsample_data[TARGET].value_counts(normalize=True)

income_level
0    0.597634
1    0.402366
Name: proportion, dtype: float64

### Save to disk

In [18]:
# Serialize the subsampled frame to SUBSAMPLE_PATH
with open(SUBSAMPLE_PATH, mode='wb') as out_file:
    pickle.dump(subsample_data, out_file)