## üè† Household Income Prediction

Given *data about households in Korea*, let's try to predict the **income** of a given household.

We will use various regression models to make our predictions.

Data source: https://www.kaggle.com/datasets/hongsean/korea-income-and-welfare

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Ridge, Lasso, HuberRegressor
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor

In [17]:
data = pd.read_csv('archive/Korea Income and Welfare.csv')
data

Unnamed: 0,id,year,wave,region,income,family_member,gender,year_born,education_level,marriage,religion,occupation,company_size,reason_none_worker
0,10101,2005,1,1,614.0,1,2,1936,2,2,2,,,8
1,10101,2011,7,1,896.0,1,2,1936,2,2,2,,,10
2,10101,2012,8,1,1310.0,1,2,1936,2,2,2,,,10
3,10101,2013,9,1,2208.0,1,2,1936,2,2,2,,,1
4,10101,2014,10,1,864.0,1,2,1936,2,2,2,,,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92852,98000701,2014,10,5,11600.0,6,1,1967,5,1,1,874,1,
92853,98000701,2015,11,5,8327.0,6,1,1967,5,1,1,874,1,
92854,98000701,2016,12,5,7931.0,6,1,1967,5,1,1,874,1,
92855,98000701,2017,13,5,8802.0,5,1,1967,5,1,1,874,1,


In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92857 entries, 0 to 92856
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  92857 non-null  int64  
 1   year                92857 non-null  int64  
 2   wave                92857 non-null  int64  
 3   region              92857 non-null  int64  
 4   income              92857 non-null  float64
 5   family_member       92857 non-null  int64  
 6   gender              92857 non-null  int64  
 7   year_born           92857 non-null  int64  
 8   education_level     92857 non-null  int64  
 9   marriage            92857 non-null  int64  
 10  religion            92857 non-null  int64  
 11  occupation          92857 non-null  object 
 12  company_size        92857 non-null  object 
 13  reason_none_worker  92857 non-null  object 
dtypes: float64(1), int64(10), object(3)
memory usage: 9.9+ MB


### Preprocessing

In [19]:
df = data.copy()

In [20]:
# Drop id column
df = df.drop('id', axis=1)

In [21]:
df

Unnamed: 0,year,wave,region,income,family_member,gender,year_born,education_level,marriage,religion,occupation,company_size,reason_none_worker
0,2005,1,1,614.0,1,2,1936,2,2,2,,,8
1,2011,7,1,896.0,1,2,1936,2,2,2,,,10
2,2012,8,1,1310.0,1,2,1936,2,2,2,,,10
3,2013,9,1,2208.0,1,2,1936,2,2,2,,,1
4,2014,10,1,864.0,1,2,1936,2,2,2,,,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
92852,2014,10,5,11600.0,6,1,1967,5,1,1,874,1,
92853,2015,11,5,8327.0,6,1,1967,5,1,1,874,1,
92854,2016,12,5,7931.0,6,1,1967,5,1,1,874,1,
92855,2017,13,5,8802.0,5,1,1967,5,1,1,874,1,


In [22]:
# Encode missing values properly
df = df.replace(' ', np.nan)

In [23]:
df

Unnamed: 0,year,wave,region,income,family_member,gender,year_born,education_level,marriage,religion,occupation,company_size,reason_none_worker
0,2005,1,1,614.0,1,2,1936,2,2,2,,,8
1,2011,7,1,896.0,1,2,1936,2,2,2,,,10
2,2012,8,1,1310.0,1,2,1936,2,2,2,,,10
3,2013,9,1,2208.0,1,2,1936,2,2,2,,,1
4,2014,10,1,864.0,1,2,1936,2,2,2,,,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
92852,2014,10,5,11600.0,6,1,1967,5,1,1,874,1,
92853,2015,11,5,8327.0,6,1,1967,5,1,1,874,1,
92854,2016,12,5,7931.0,6,1,1967,5,1,1,874,1,
92855,2017,13,5,8802.0,5,1,1967,5,1,1,874,1,


In [24]:
df.isna().sum()

year                      0
wave                      0
region                    0
income                    0
family_member             0
gender                    0
year_born                 0
education_level           0
marriage                  0
religion                  0
occupation            33643
company_size          33642
reason_none_worker    60710
dtype: int64

In [25]:
{column: df[column].unique() for column in df.select_dtypes('object').columns}

{'occupation': array([nan, '421', '411', '951', '513', '521', '942', '832', '862', '762',
        '415', '511', '512', '530', '855', '911', '999', '941', '713',
        '730', '311', '312', '245', '286', '151', '182', '281', '823',
        '510', '441', '922', '416', '432', '773', '783', '930', '721',
        '910', '531', '733', '913', '274', '873', '422', '915', '157',
        '842', '30', '315', '313', '399', '875', '772', '815', '316',
        '852', '113', '21', '991', '232', '314', '284', '912', '712',
        '753', '821', '412', '921', '139', '220', '761', '752', '851',
        '132', '780', '613', '612', '754', '742', '863', '744', '799',
        '899', '431', '617', '792', '153', '252', '234', '843', '992',
        '413', '442', '317', '254', '222', '224', '223', '323', '914',
        '259', '771', '141', '833', '135', '159', '953', '831', '841',
        '741', '285', '822', '322', '149', '751', '251', '235', '722',
        '246', '24', '320', '231', '731', '152', '171', '291

In [26]:
{column: len(df[column].unique()) for column in df.select_dtypes('object').columns}

{'occupation': 244, 'company_size': 13, 'reason_none_worker': 14}

In [27]:
def onehot_encode(df, column, prefix):
    df = df.copy()
    dummies = pd.get_dummies(df[column], prefix=prefix, dtype=int)
    df = pd.concat([df, dummies], axis=1)
    df = df.drop(column, axis=1)
    return df

In [28]:
# One-hot encode categorical variables
nominal_features = [
    ('region', 'reg'),
    ('marriage', 'mar'),
    ('religion', 'rel'),
    ('occupation', 'occ'),
    ('reason_none_worker', 'rsn')
]

for column, prefix in nominal_features:
    df = onehot_encode(df, column=column, prefix=prefix)

In [29]:
df

Unnamed: 0,year,wave,income,family_member,gender,year_born,education_level,company_size,reg_1,reg_2,...,rsn_11,rsn_2,rsn_3,rsn_4,rsn_5,rsn_6,rsn_7,rsn_8,rsn_9,rsn_99
0,2005,1,614.0,1,2,1936,2,,1,0,...,0,0,0,0,0,0,0,1,0,0
1,2011,7,896.0,1,2,1936,2,,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2012,8,1310.0,1,2,1936,2,,1,0,...,0,0,0,0,0,0,0,0,0,0
3,2013,9,2208.0,1,2,1936,2,,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2014,10,864.0,1,2,1936,2,,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92852,2014,10,11600.0,6,1,1967,5,1,0,0,...,0,0,0,0,0,0,0,0,0,0
92853,2015,11,8327.0,6,1,1967,5,1,0,0,...,0,0,0,0,0,0,0,0,0,0
92854,2016,12,7931.0,6,1,1967,5,1,0,0,...,0,0,0,0,0,0,0,0,0,0
92855,2017,13,8802.0,5,1,1967,5,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
df.loc[:, df.isna().sum() > 0]

Unnamed: 0,company_size
0,
1,
2,
3,
4,
...,...
92852,1
92853,1
92854,1
92855,1


In [34]:
# Fill company size missing values with 0
df['company_size'] = df['company_size'].fillna(0)

In [35]:
df

Unnamed: 0,year,wave,income,family_member,gender,year_born,education_level,company_size,reg_1,reg_2,...,rsn_11,rsn_2,rsn_3,rsn_4,rsn_5,rsn_6,rsn_7,rsn_8,rsn_9,rsn_99
0,2005,1,614.0,1,2,1936,2,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,2011,7,896.0,1,2,1936,2,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2012,8,1310.0,1,2,1936,2,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,2013,9,2208.0,1,2,1936,2,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2014,10,864.0,1,2,1936,2,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92852,2014,10,11600.0,6,1,1967,5,1,0,0,...,0,0,0,0,0,0,0,0,0,0
92853,2015,11,8327.0,6,1,1967,5,1,0,0,...,0,0,0,0,0,0,0,0,0,0
92854,2016,12,7931.0,6,1,1967,5,1,0,0,...,0,0,0,0,0,0,0,0,0,0
92855,2017,13,8802.0,5,1,1967,5,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
df.isna().sum().sum()

np.int64(0)

In [37]:
# Split df into X and y
y = df['income']
X = df.drop('income', axis=1)

In [38]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

In [39]:
X_train

Unnamed: 0,year,wave,family_member,gender,year_born,education_level,company_size,reg_1,reg_2,reg_3,...,rsn_11,rsn_2,rsn_3,rsn_4,rsn_5,rsn_6,rsn_7,rsn_8,rsn_9,rsn_99
38023,2015,11,2,1,1952,4,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
36465,2012,8,4,1,1961,5,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
29761,2016,12,3,1,1976,7,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52414,2015,11,2,1,1955,5,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
55985,2016,12,1,2,1936,2,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21440,2009,5,2,1,1934,2,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
73349,2009,5,4,1,1969,8,10,0,0,1,...,0,0,0,0,0,0,0,0,0,0
50057,2007,3,2,1,1932,5,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5192,2018,14,1,2,1930,7,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
y_train

38023    2958.0
36465    7447.0
29761    8113.0
52414    3434.0
55985     564.0
          ...  
21440    2420.0
73349    6132.0
50057    2830.0
5192      540.0
77708    2469.0
Name: income, Length: 64999, dtype: float64

In [41]:
# Scale X
scaler = StandardScaler()
scaler.fit(X_train)

X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [42]:
X_train

Unnamed: 0,year,wave,family_member,gender,year_born,education_level,company_size,reg_1,reg_2,reg_3,...,rsn_11,rsn_2,rsn_3,rsn_4,rsn_5,rsn_6,rsn_7,rsn_8,rsn_9,rsn_99
0,0.872427,0.872427,-0.373172,-0.651209,-0.056228,-0.303502,-0.374223,-0.428721,1.940264,-0.457082,...,-0.049206,-0.009608,-0.024815,-0.086975,-0.0592,-0.108697,-0.112545,-0.218353,-0.173479,-0.0269
1,0.124355,0.124355,1.171539,-0.651209,0.506712,0.295385,-0.374223,-0.428721,1.940264,-0.457082,...,-0.049206,-0.009608,-0.024815,-0.086975,-0.0592,-0.108697,-0.112545,-0.218353,-0.173479,-0.0269
2,1.121785,1.121785,0.399183,-0.651209,1.444945,1.493158,0.574542,-0.428721,-0.515394,-0.457082,...,-0.049206,-0.009608,-0.024815,-0.086975,-0.0592,-0.108697,-0.112545,-0.218353,-0.173479,-0.0269
3,0.872427,0.872427,-0.373172,-0.651209,0.131418,0.295385,-0.374223,-0.428721,-0.515394,-0.457082,...,-0.049206,-0.009608,-0.024815,-0.086975,-0.0592,-0.108697,-0.112545,-0.218353,-0.173479,-0.0269
4,1.121785,1.121785,-1.145528,1.535604,-1.057011,-1.501275,-0.057968,-0.428721,-0.515394,-0.457082,...,-0.049206,-0.009608,-0.024815,-0.086975,-0.0592,-0.108697,-0.112545,-0.218353,-0.173479,-0.0269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64994,-0.623718,-0.623718,-0.373172,-0.651209,-1.182109,-1.501275,-0.690478,-0.428721,1.940264,-0.457082,...,-0.049206,-0.009608,-0.024815,-0.086975,-0.0592,-0.108697,-0.112545,4.579735,-0.173479,-0.0269
64995,-0.623718,-0.623718,1.171539,-0.651209,1.007103,2.092044,2.472072,-0.428721,-0.515394,2.187792,...,-0.049206,-0.009608,-0.024815,-0.086975,-0.0592,-0.108697,-0.112545,-0.218353,-0.173479,-0.0269
64996,-1.122433,-1.122433,-0.373172,-0.651209,-1.307207,0.295385,-0.374223,-0.428721,-0.515394,-0.457082,...,-0.049206,-0.009608,-0.024815,-0.086975,-0.0592,-0.108697,-0.112545,-0.218353,-0.173479,-0.0269
64997,1.620500,1.620500,-1.145528,1.535604,-1.432305,1.493158,-0.690478,2.332519,-0.515394,-0.457082,...,-0.049206,-0.009608,-0.024815,-0.086975,-0.0592,-0.108697,-0.112545,-0.218353,-0.173479,-0.0269


### Training

In [43]:
models = {
    '                    LinearRegression': LinearRegression(),
    '    L2-Regularized Linear Regression': Ridge(),
    '   L-1 Regularized Linear Regression': Lasso(),
    '           Huber (Robust) Regression': HuberRegressor(),
    'Linear Kernel Support Vector Machine': LinearSVR(),
    '                       Decision Tree': DecisionTreeRegressor()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + ' trained.')

                    LinearRegression trained.
    L2-Regularized Linear Regression trained.
   L-1 Regularized Linear Regression trained.
           Huber (Robust) Regression trained.
Linear Kernel Support Vector Machine trained.
                       Decision Tree trained.


### Results

In [44]:
print("Model R^2 Scores (Test Set):")
for name, model in models.items():
    print(name + ": {:.4f}".format(model.score(X_test, y_test)))

Model R^2 Scores (Test Set):
                    LinearRegression: 0.2986
    L2-Regularized Linear Regression: 0.2986
   L-1 Regularized Linear Regression: 0.2985
           Huber (Robust) Regression: 0.2841
Linear Kernel Support Vector Machine: 0.2694
                       Decision Tree: -0.0336


#### Optimizing Regularization Strength of L2 and L1 Regression Models

In [47]:
l2_reg_strength = 1.0

l2_model = Ridge(alpha=l2_reg_strength)
l2_model.fit(X_train, y_train)

print("Ridge Regression Test R^2 (alpha={}): {:.5f}".format(l2_reg_strength, l2_model.score(X_test, y_test)))

Ridge Regression Test R^2 (alpha=1.0): 0.29861


In [52]:
l1_reg_strength = 0.1

l1_model = Lasso(alpha=l1_reg_strength)
l1_model.fit(X_train, y_train)

print("Lasso Regression Test R^2 (alpha={}): {:.5f}".format(l1_reg_strength, l1_model.score(X_test, y_test)))

Lasso Regression Test R^2 (alpha=0.1): 0.29859


  model = cd_fast.enet_coordinate_descent(
