# Task for Today
***
## Using Data about Korean Person, Predict his/her income.


## 1. Setting Up

In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso, HuberRegressor
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor

In [None]:
data = pd.read_csv('../input/korea-income-and-welfare/Korea Income and Welfare.csv')

In [None]:
data

## 2. Preprocessing

In [None]:
len(data.id.unique())

In [None]:
def onehot_encode(df, column, prefix):
    df = df.copy()
    dummies = pd.get_dummies(df[column], prefix=prefix)
    df = pd.concat([df, dummies], axis=1)
    df = df.drop(column, axis=1)
    
    return df

In [None]:
df = data.copy()

# df = df.replace(" ", np.nan)

df.isna().sum()
# This shows that we can replace whitespace values with 0's
# as that is the appropriate replacement value for absence of occupation, absence of company_size
# and absence of reason_none_worker.

In [None]:
df[df['company_size'] == ' ']

In [None]:
df.replace(' ', 0)
df[df['company_size'] == ' ']

In [None]:
def preprocessing_data(df):
    # Make copy of dataframe to not change the original
    df = df.copy()
    
    # Drop ID column for simplification
    df = df.drop('id', axis=1)

    # The only "N/A" values are whitespaces
    # Replace them with values of 0
    df = df.replace(" ", 0)
    
    # Make company_size from object into int type
    df.company_size = df.company_size.astype(int)
    
    # Create "Employed" Column
    df['employed'] = df.occupation != 0
    df['employed'] = df['employed'].astype(int)
    
    # Categorical columns and their prefixes
    nominal = [
        ('region', 'reg'),
        ('gender', 'gen'),
        ('marriage', 'marr'),
        ('occupation', 'occ'),
        ('reason_none_worker', 'non-work'),

    ]
    
    # One-Hot Encoding Categorical Features
    for column, prefix in nominal:
        df = onehot_encode(df, column, prefix)
    
    # Split data into input and label (X and y)
    y = df.income
    X = df.drop('income', axis=1)
    
    # Train and test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)
    
    # Scale
    
    return X_train, X_test, y_train, y_test

In [None]:
X_train, X_test, y_train, y_test = preprocessing_data(data)

In [None]:
X_train

## 3. Training

In [81]:
models = {
    "              LinearRegression":LinearRegression(),
    "L2 Regularized RidgeRegression":Ridge(),
    "L1 Regularized LassoRegression":Lasso(),
    "                HuberRegressor":HuberRegressor(),
    "                     LinearSVR":LinearSVR(),
    "         DecisionTreeRegressor":DecisionTreeRegressor(),
}

In [82]:
models

{'              LinearRegression': LinearRegression(),
 'L2 Regularized RidgeRegression': Ridge(),
 'L1 Regularized LassoRegression': Lasso(),
 '                HuberRegressor': HuberRegressor(),
 '                     LinearSVR': LinearSVR(),
 '         DecisionTreeRegressor': DecisionTreeRegressor()}

In [83]:
models.items()

dict_items([('              LinearRegression', LinearRegression()), ('L2 Regularized RidgeRegression', Ridge()), ('L1 Regularized LassoRegression', Lasso()), ('                HuberRegressor', HuberRegressor()), ('                     LinearSVR', LinearSVR()), ('         DecisionTreeRegressor', DecisionTreeRegressor())])

In [84]:
for name, model in models.items():
    model.fit(X_train,y_train)
    print("The {} model is trained".format(name) )

The               LinearRegression model is trained
The L2 Regularized RidgeRegression model is trained
The L1 Regularized LassoRegression model is trained


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


The                 HuberRegressor model is trained




The                      LinearSVR model is trained
The          DecisionTreeRegressor model is trained


## 4. Results

In [85]:
r_sq = model.score(X_test, y_test)
print("R^2 value for All Models: ")

for name, model in models.items():
    score = model.score(X_test, y_test)
    print(name, " : {:.5f}".format(score) )

R^2 value for All Models: 
              LinearRegression  : 0.22286
L2 Regularized RidgeRegression  : 0.22301
L1 Regularized LassoRegression  : 0.22154
                HuberRegressor  : 0.19086
                     LinearSVR  : 0.20403
         DecisionTreeRegressor  : -0.01024


## 5. Optimizing Regularization Strength of L1 and L2 Regression Models

In [90]:
# default alpha value is 1.0
# 
# Not only minimizes the cost, but the square of the weights of each feature
#
# alpha value of 0 is same as normal linear regression
l2_model = Ridge(alpha=0.1)
l2_model.fit(X_train, y_train)
print("L2 Reg. Model Score {}".format(l2_model.score(X_test,y_test)) )

L2 Reg. Model Score 0.22288091025341905


In [80]:
# L1 allows automatic feature selection
# by lowering the absolute value of the weight
# lowering bigger weights is same as lowering smaller weights
# Basically a shift: The smaller weights can actually be set all the way down to 0
l1_model = Lasso(alpha=1.0)

In [None]:
Lin=LinearRegression()
Lin.fit(X_train,y_train)
print(Lin.score(X_test, y_test))