In [143]:
%matplotlib inline


# Numerical libraries
import numpy as np   

# Import Linear Regression machine learning library
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.metrics import r2_score

# to handle data in form of rows and columns 
import pandas as pd    

# importing ploting libraries
import matplotlib.pyplot as plt   

#importing seaborn for statistical plots
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [144]:
num_folds = 10
seed = 7

kfold = KFold(n_splits=num_folds, random_state=seed)



In [145]:
mpg_df = pd.read_csv("d:\gli\dse\data\car-mpg.csv")  
mpg_df = mpg_df.drop('car_name', axis=1)
mpg_df['origin'] = mpg_df['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'})
mpg_df = pd.get_dummies(mpg_df, columns=['origin'])
mpg_df = mpg_df.replace('?', np.nan)
mpg_df = mpg_df.apply(lambda x: x.fillna(x.median()),axis=0)

# separate independent and dependent variables

In [146]:
# Copy all the predictor variables into X dataframe. Since 'mpg' is dependent variable drop it
X = mpg_df.drop('mpg', axis=1)

# Copy the 'mpg' column alone into the y dataframe. This is the dependent variable
y = mpg_df[['mpg']]


In [147]:
from sklearn import preprocessing

# scale all the columns of the mpg_df. This will produce a numpy array
X_scaled = preprocessing.scale(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)  

# fit a simple linear model

In [155]:
regression_model = LinearRegression()
regression_model.fit(X_scaled, y)

for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))

The coefficient for cyl is 1.9982149205072222
The coefficient for disp is 2.206521786578064
The coefficient for hp is -1.2649722774362342
The coefficient for wt is -5.445976639924107
The coefficient for acc is 0.24602531758755442
The coefficient for yr is 2.784478311223781
The coefficient for car_type is 2.6950285164212056
The coefficient for origin_america is -0.6059775835828981
The coefficient for origin_asia is 0.4517275182026228
The coefficient for origin_europe is 0.29704658979519943


In [151]:
# Never rely on a single train testing strategy to assess your models.... Always use KFold cross validation

In [198]:
# Kfold of simple multivariate linear regression model
results = cross_val_score(regression_model, X_scaled, y, cv=kfold)
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

[0.65143127 0.8136909  0.63774139 0.84091741 0.72031897 0.8457063
 0.64446763 0.76018021 0.10178309 0.61261918]
Accuracy: 66.289% (20.472%)


# Create a regularized RIDGE model and note the coefficients

In [199]:
ridge = Ridge(alpha=.7)

In [200]:
# Kfold of simple multivariate linear regression model
results = cross_val_score(ridge, X_scaled, y, cv=kfold)
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

[0.65093463 0.81499393 0.63838428 0.8439618  0.72588363 0.84599201
 0.6412884  0.76217418 0.09514306 0.60909032]
Accuracy: 66.278% (20.730%)


# Create a regularized LASSO model and note the coefficients

In [202]:
lasso = Lasso(alpha=.07)

In [203]:
# Kfold of simple multivariate linear regression model
results = cross_val_score(lasso, X_scaled, y, cv=kfold)
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

[ 0.66096895  0.80539631  0.63812765  0.85342217  0.76330937  0.85165164
  0.56939956  0.78985129 -0.00233476  0.54871267]
Accuracy: 64.785% (24.106%)


# Let us generate polynomial models reflecting the non-linear interaction between some dimensions

In [204]:
from sklearn.preprocessing import PolynomialFeatures

In [205]:
poly = PolynomialFeatures(degree = 2, interaction_only=True)

In [206]:
X_poly = poly.fit_transform(X_scaled)

# Fit a simple non regularized linear model on poly features-

In [207]:
# Kfold of simple multivariate linear regression model
results = cross_val_score(regression_model, X_poly, y, cv=kfold)
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

[0.69788005 0.86258712 0.81480224 0.86582075 0.76576289 0.90859365
 0.72457321 0.82284647 0.41054372 0.50516157]
Accuracy: 73.786% (15.429%)


## Ridge Regularization of the polynomial model

In [287]:
# Kfold of Ridge
ridge = Ridge(alpha=.94 , solver ='saga')
results = cross_val_score(ridge, X_poly, y, cv=kfold)
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

[0.73605869 0.8673232  0.78854211 0.87925526 0.81136232 0.90569707
 0.72188521 0.83875615 0.36938742 0.60326004]
Accuracy: 75.215% (15.336%)


## Lasso Regularization of the polynomial model

In [278]:
# Kfold of Lasso
lasso = Lasso(alpha=.06, max_iter=1000,)
results = cross_val_score(lasso, X_poly, y, cv=kfold)
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

[0.77952513 0.86837539 0.73091964 0.89519569 0.82528781 0.91005789
 0.79786127 0.84921774 0.27774557 0.6583603 ]
Accuracy: 75.925% (17.616%)


In [None]:
# Since polynomial regression models are performing much better than the model on orginal data, let us do hyper parameter tuning
# on these models

## Hyper Parameter Tuning for polynomial feature based linear regression model

In [279]:
from scipy.stats import randint as sp_rand
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [280]:
ridge_param_dist = {"alpha": np.arange(start = 0.01, stop = 1, step =.01),
              "solver" : ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
             }


lasso_param_dist = {"alpha": np.arange(start = 0.01, stop = 1, step =.01)}

## Lasso Random Search 

In [281]:
# run randomized search
samples = 50  # number of random samples 
lasso_randomCV = RandomizedSearchCV(lasso, param_distributions=lasso_param_dist, n_iter=samples, cv=10)

In [282]:
lasso_randomCV.fit(X_poly, y)

print(lasso_randomCV.best_params_)

{'alpha': 0.060000000000000005}


## Ridge Random Search

In [283]:
# run randomized search
samples = 50  # number of random samples 
ridge_randomCV = RandomizedSearchCV(ridge, param_distributions=ridge_param_dist, n_iter=samples, cv=10)

In [284]:
ridge_randomCV.fit(X_poly, y)

print(ridge_randomCV.best_params_)

{'solver': 'saga', 'alpha': 0.9400000000000001}
