In [None]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit


In [None]:

# Load the Boston housing table from the CMU StatLib mirror.
data_url = "http://lib.stat.cmu.edu/datasets/boston"

# Raw string for the whitespace-separator regex: a plain "\s+" relies on an
# invalid escape sequence, which newer Python versions warn about.
# skiprows=22 skips the leading lines of the file before parsing starts.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

# The slicing below treats every pair of consecutive parsed rows as one record:
# all columns of the even row plus the first two columns of the odd row are the
# features, and the third column of the odd row is the target.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

In [None]:
# Assemble a single frame: the positional feature columns first, then the
# target as a trailing 'score' column.
feature_frame = pd.DataFrame(data)
score_frame = pd.DataFrame(target, columns=['score'])
df = pd.concat([feature_frame, score_frame], axis=1)

# Preview the first two rows.
display(df.head(2))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,score
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6


In [None]:

# Separate the regression target from the predictors.
y = df["score"]
X = df.drop(columns='score')

In [None]:

# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, every other column as numerical.
categorical_columns = [
    name for name in X.columns if X[name].dtypes == object
]
numerical_columns = [
    name for name in X.columns if not (X[name].dtypes == object)
]

# Report each group together with its size.
for group_label, group_columns in (
    ('categorical columns:\t ', categorical_columns),
    ('numerical columns:\t ', numerical_columns),
):
    print(group_label, group_columns, '\n len = ', len(group_columns))

categorical columns:	  [] 
 len =  0
numerical columns:	  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] 
 len =  13


In [None]:
# Categorical features: map each category to an integer code.
categorical_pipeline = Pipeline([
    ('encoder', OrdinalEncoder())
])

# Numerical features: expand to all degree-2 polynomial/interaction terms, then
# standardise each resulting column.
# include_bias=False drops the constant column of ones that PolynomialFeatures
# emits by default: StandardScaler would centre it to all zeros, leaving a dead
# feature, and the linear models used downstream fit their own intercept anyway.
numerical_pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler())
])

# Route each column group through its own pipeline and concatenate the outputs.
preprocessors = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_columns),
    ('cat', categorical_pipeline, categorical_columns)
])

In [None]:
# Regularisation strengths to search: 20 log-spaced values from 1e-1 to 1e2.
# The 'model__' prefix targets the pipeline step named 'model'.
ridge_params = dict(model__alpha=np.logspace(-1, 2, 20))
lasso_params = dict(model__alpha=np.logspace(-1, 2, 20))

# Single-step pipelines so the grid search can address the estimator by name.
pipeline_lasso = Pipeline(steps=[('model', Lasso())])
pipeline_ridge = Pipeline(steps=[('model', Ridge())])

In [None]:
# Hold out 20% of the rows for testing. The split is seeded (same seed as the
# ShuffleSplit below) so that Restart & Run All reproduces the same partition
# and therefore the same scores.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

# Fit the preprocessing on the training rows only, then apply the fitted
# transformation to the held-out rows.
X_train = preprocessors.fit_transform(X_train)
X_test = preprocessors.transform(X_test)

# Plain linear regression needs no tuning; Lasso and Ridge get their alpha
# chosen by a grid search over 5 seeded shuffle-split folds of the training set.
tuned_candidates = {
    'Lasso': (pipeline_lasso, lasso_params),
    'Ridge': (pipeline_ridge, ridge_params),
}

models = {'LR': LinearRegression()}
for model_name, (candidate_pipeline, candidate_grid) in tuned_candidates.items():
    search = GridSearchCV(estimator=candidate_pipeline,
                          param_grid=candidate_grid,
                          cv=ShuffleSplit(n_splits=5, random_state=42))
    search.fit(X_train, y_train)
    # best_estimator_ is the refitted single-step pipeline; keep only the
    # underlying estimator stored under its 'model' step.
    models[model_name] = search.best_estimator_['model']

In [None]:
# Bare last expression: the notebook renders the estimator stored under 'LR'.
models['LR']

In [None]:
# Refit every model on the training set and record its own train/test R^2.
results = {}
r2_train = []  # per-model training R^2, in iteration order of `models`
r2_test = []   # per-model test R^2, in iteration order of `models`

for model in models.values():
    model.fit(X_train, y_train)

    train_score = metrics.r2_score(y_train, model.predict(X_train))
    test_score = metrics.r2_score(y_test, model.predict(X_test))
    r2_train.append(train_score)
    r2_test.append(test_score)

    # Store this model's own scores. Averaging the accumulated lists here would
    # report a running mean over all models evaluated so far, not this model's
    # performance.
    results[model] = [train_score, test_score]

# One column per fitted estimator, rows for train and test scores.
display(pd.DataFrame(results, index=['Train', "Test"]))

Unnamed: 0,LinearRegression(),Lasso(alpha=0.1),Ridge(alpha=0.1)
Train,0.933388,0.882808,0.897838
Test,0.869495,0.866312,0.864134
