### Load the libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from joblib import dump

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae

from sklearn.linear_model import ElasticNet, LinearRegression

### Load the data

In [2]:
# Location of the raw training CSV, relative to the notebook's working directory.
raw_train_csv = '../data/raw/train.csv'

# Read the raw data; `df` is the untouched source frame that later cells copy from.
df = pd.read_csv(raw_train_csv)

# Preview the first rows as the cell's displayed output.
df.head()

FileNotFoundError: [Errno 2] File b'../data/raw/train.csv' does not exist: b'../data/raw/train.csv'

### Prepare the data

In [6]:
# Work on a copy so the raw frame `df` stays untouched and this cell can be re-run.
df_cleaned = df.copy()
# Strip the column names if they contain spaces
df_cleaned.columns = df_cleaned.columns.str.strip()

# Extract the target column (pop also removes it from the feature frame)
target = df_cleaned.pop('TARGET_5Yrs')

# Split BEFORE scaling so that the scaler never sees validation/test rows.
# Fitting the scaler on the full dataset leaks held-out statistics (mean/std)
# into training and makes the validation/test scores optimistic.
# First split: Data (train + validation) vs. Test.
X_data_raw, X_test_raw, y_data, y_test = train_test_split(
    df_cleaned, target, test_size=0.2, random_state=8
)

# Second split: Data -> Train vs. Validation.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X_data_raw, y_data, test_size=0.2, random_state=8
)

# Scaling: learn mean/std from the training split only, then apply the same
# transformation to every other split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
X_test = scaler.transform(X_test_raw)
# Kept for backward compatibility: scaled train + validation rows.
X_data = scaler.transform(X_data_raw)

# Save the scaler so the identical transformation can be applied at inference time
dump(scaler, '../models/scaler.joblib')

# Save the split data
np.save('../data/processed/X_train', X_train)
np.save('../data/processed/X_val', X_val)
np.save('../data/processed/X_test', X_test)

np.save('../data/processed/y_train', y_train)
np.save('../data/processed/y_val', y_val)
np.save('../data/processed/y_test', y_test)

### Baseline Model

In [7]:
## Baseline: predict the mean of the training target for every training row
y_mean = y_train.mean()

## Column vector with one copy of the mean per training row
n_train_rows = len(y_train)
y_base = np.full(shape=(n_train_rows, 1), fill_value=y_mean)

## Errors of the constant prediction - the reference any real model should beat
baseline_mse = mse(y_train, y_base)
baseline_mae = mae(y_train, y_base)
print("MSE", baseline_mse)
print("MAE", baseline_mae)

MSE 0.1363572692871094
MAE 0.27271453857421873


### 1. Build the ElasticNet Model

In [44]:
# Fit an ElasticNet with its default hyperparameters (fit() returns the estimator itself).
# NOTE(review): the recorded training MSE/MAE for this model are identical to the
# mean-baseline values printed earlier, which suggests the default regularisation
# shrinks every coefficient to zero - confirm via reg.coef_ and consider tuning alpha.
reg = ElasticNet().fit(X_train, y_train)

# Persist the fitted model; dump() returns the list of written file names, shown as cell output.
dump(reg, '../models/elasticnet_default.joblib')

['../models/elasticnet_default.joblib']

### Make Predictions

In [46]:
# Score the training and validation splits with the fitted ElasticNet, in that order.
y_train_prediction, y_val_prediction = (
    reg.predict(split) for split in (X_train, X_val)
)

### Check Model Error (MSE / MAE) on the Training Set

In [50]:
# ElasticNet errors on the training split (lower is better).
en_train_mse = mse(y_train, y_train_prediction)
en_train_mae = mae(y_train, y_train_prediction)
print("MSE: y-train", en_train_mse)
print("MAE: y-train", en_train_mae)

MSE: y-train 0.1363572692871094
MAE: y-train 0.27271453857421873


### Check Model Error (MSE / MAE) on the Validation Set

In [53]:
# ElasticNet errors on the held-out validation split (lower is better).
en_val_mse = mse(y_val, y_val_prediction)
en_val_mae = mae(y_val, y_val_prediction)
print("MSE: y-val", en_val_mse)
print("MAE: y-val", en_val_mae)

MSE: y-val 0.1529493713378906
MAE: y-val 0.289306640625


### 2. Build Polynomial Linear Regression

In [9]:
# Expand the training features with all polynomial terms up to degree 2.
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)

# Ordinary least squares on the expanded feature matrix.
lm_reg = LinearRegression()
lm_reg.fit(X_train_poly, y_train)

# Persist the fitted model; dump() returns the list of written file names, shown as cell output.
dump(lm_reg, '../models/linear_regression_default.joblib')

['../models/linear_regression_default.joblib']

### Make Prediction and Check Accuracy on Training Set

In [15]:
# Predict on the polynomial training features and report the training errors.
y_train_lm_prediction = lm_reg.predict(X_train_poly)
lm_train_mse = mse(y_train, y_train_lm_prediction)
lm_train_mae = mae(y_train, y_train_lm_prediction)
print("MSE: y-train", lm_train_mse)
print("MAE: y-train", lm_train_mae)

MSE: y-train 0.12006841630924078
MAE: y-train 0.24802284888370174


### Make Prediction and Check Accuracy on Validation Set

In [19]:
# Apply the polynomial expansion that was fitted on the training split.
# Use transform(), not fit_transform(): a fitted preprocessing step must never be
# re-fitted on validation/test data, otherwise the held-out split influences the
# transformation the model was trained with.
X_val_poly = poly.transform(X_val)

# Predict on the expanded validation features and report the validation errors.
y_val_lm_prediction = lm_reg.predict(X_val_poly)
print("MSE: y-val", mse(y_val, y_val_lm_prediction))
print("MAE: y-val", mae(y_val, y_val_lm_prediction))

MSE: y-val 0.14782696021184988
MAE: y-val 0.27599430507122025
