# Task for Today
***
## Using Data about Korean Person, Predict his/her income.


## 1. Setting Up

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression

In [3]:
data = pd.read_csv('../input/korea-income-and-welfare/Korea Income and Welfare.csv')

In [4]:
data

Unnamed: 0,id,year,wave,region,income,family_member,gender,year_born,education_level,marriage,religion,occupation,company_size,reason_none_worker
0,10101,2005,1,1,614.0,1,2,1936,2,2,2,,,8
1,10101,2011,7,1,896.0,1,2,1936,2,2,2,,,10
2,10101,2012,8,1,1310.0,1,2,1936,2,2,2,,,10
3,10101,2013,9,1,2208.0,1,2,1936,2,2,2,,,1
4,10101,2014,10,1,864.0,1,2,1936,2,2,2,,,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92852,98000701,2014,10,5,11600.0,6,1,1967,5,1,1,874,1,
92853,98000701,2015,11,5,8327.0,6,1,1967,5,1,1,874,1,
92854,98000701,2016,12,5,7931.0,6,1,1967,5,1,1,874,1,
92855,98000701,2017,13,5,8802.0,5,1,1967,5,1,1,874,1,


## 2. Preprocessing

In [8]:
len(data.id.unique())

10046

In [11]:
def onehot_encode(df, column, prefix):
    df = df.copy()
    dummies = pd.get_dummies(df[column], prefix=prefix)
    df = pd.concat([df, dummies], axis=1)
    df = df.drop(column, axis=1)
    
    return df

In [53]:
df = data.copy()

# df = df.replace(" ", np.nan)

df.isna().sum()
# This shows that we can replace whitespace values with 0's
# as that is the appropriate replacement value for absence of occupation, absence of company_size
# and absence of reason_none_worker.

id                    0
year                  0
wave                  0
region                0
income                0
family_member         0
gender                0
year_born             0
education_level       0
marriage              0
religion              0
occupation            0
company_size          0
reason_none_worker    0
dtype: int64

In [54]:
df[df['company_size'] == ' ']

Unnamed: 0,id,year,wave,region,income,family_member,gender,year_born,education_level,marriage,religion,occupation,company_size,reason_none_worker
0,10101,2005,1,1,614.0,1,2,1936,2,2,2,,,8
1,10101,2011,7,1,896.0,1,2,1936,2,2,2,,,10
2,10101,2012,8,1,1310.0,1,2,1936,2,2,2,,,10
3,10101,2013,9,1,2208.0,1,2,1936,2,2,2,,,1
4,10101,2014,10,1,864.0,1,2,1936,2,2,2,,,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92843,97990701,2012,8,6,1591.0,2,1,1932,2,1,2,,,1
92845,97990701,2014,10,6,1612.0,2,1,1932,2,1,2,,,10
92846,97990701,2015,11,6,1899.0,2,1,1932,2,1,2,,,10
92847,97990701,2016,12,6,1770.3,2,1,1932,2,1,2,,,10


In [55]:
df.replace(' ', 0)
df[df['company_size'] == ' ']

Unnamed: 0,id,year,wave,region,income,family_member,gender,year_born,education_level,marriage,religion,occupation,company_size,reason_none_worker
0,10101,2005,1,1,614.0,1,2,1936,2,2,2,,,8
1,10101,2011,7,1,896.0,1,2,1936,2,2,2,,,10
2,10101,2012,8,1,1310.0,1,2,1936,2,2,2,,,10
3,10101,2013,9,1,2208.0,1,2,1936,2,2,2,,,1
4,10101,2014,10,1,864.0,1,2,1936,2,2,2,,,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92843,97990701,2012,8,6,1591.0,2,1,1932,2,1,2,,,1
92845,97990701,2014,10,6,1612.0,2,1,1932,2,1,2,,,10
92846,97990701,2015,11,6,1899.0,2,1,1932,2,1,2,,,10
92847,97990701,2016,12,6,1770.3,2,1,1932,2,1,2,,,10


In [60]:
def preprocessing_data(df):
    # Make copy of dataframe to not change the original
    df = df.copy()
    
    # Drop ID column for simplification
    df = df.drop('id', axis=1)

    # The only "N/A" values are whitespaces
    # Replace them with values of 0
    df = df.replace(" ", 0)
    
    # Make company_size from object into int type
    df.company_size = df.company_size.astype(int)
    
    # Create "Employed" Column
    df['employed'] = df.occupation != 0
    df['employed'] = df['employed'].astype(int)
    
    # Categorical columns and their prefixes
    nominal = [
        ('region', 'reg'),
        ('gender', 'gen'),
        ('marriage', 'marr'),
        ('occupation', 'occ'),
        ('reason_none_worker', 'non-work'),

    ]
    
    # One-Hot Encoding Categorical Features
    for column, prefix in nominal:
        df = onehot_encode(df, column, prefix)
    
    # Split data into input and label (X and y)
    y = df.income
    X = df.drop('income', axis=1)
    
    # Train and test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)
    
    return X_train, X_test, y_train, y_test

In [61]:
X_train, X_test, y_train, y_test = preprocessing_data(data)

In [62]:
X_train

Unnamed: 0,year,wave,family_member,year_born,education_level,religion,company_size,employed,reg_1,reg_2,...,non-work_11,non-work_2,non-work_3,non-work_4,non-work_5,non-work_6,non-work_7,non-work_8,non-work_9,non-work_99
34107,2007,3,2,1973,5,1,10,1,0,0,...,0,0,0,0,0,0,0,0,0,0
13407,2006,2,4,1961,6,1,3,1,1,0,...,0,0,0,0,0,0,0,0,0,0
24021,2013,9,1,1928,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
72605,2012,8,1,1931,2,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27945,2007,3,4,1965,6,2,10,1,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63206,2010,6,3,1968,5,1,2,1,0,0,...,0,0,0,0,0,0,0,0,0,0
61404,2007,3,2,1946,3,2,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
17730,2013,9,1,1983,7,1,10,1,0,0,...,0,0,0,0,0,0,0,0,0,0
28030,2017,13,2,1953,4,2,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0


## 3. Training

In [63]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [None]:
r_sq = model.score(X_test, y_test)
print("R^2 value for Lin Reg is {:.2f}".format(r_sq))