In [1]:
# MSDS 422, Section 58, Assignment 3, Alan Kessler
# Python 3.5 on Mac OS 10.13.5 edited in Atom
# Demonstrates use of regularized regression

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
from scipy.stats import norm
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error

# Suppress scipy runtime warning
warnings.filterwarnings(action='ignore', module='scipy', message='^internal gelsd')

# Set to get balance of size correctly
plt.rcParams['figure.dpi'] = 100
# Set random seed to reproduce results
seed = 2107
# Set number of cross-validation splits
splits = 5

# Load the data
train = pd.read_csv('boston.csv')

# Drop the neighborhood variable based on assignment instructions
train.drop(['neighborhood'], axis=1, inplace=True)

# Plot target to assess need for transformation
ax = sns.distplot(train['mv'], fit=norm, kde=False)
ax.set(xlabel='Median Home Value ($000)')
plt.title("Target Distribution with Normal Curve")
eda = PdfPages('target_visualization.pdf')
eda.savefig()
plt.close()

# Plot log-transformed target
ax = sns.distplot(np.log(train['mv']), fit=norm, kde=False)
ax.set(xlabel='Log Median Home Value ($000)')
plt.title("Log-Target Distribution with Normal Curve")
eda.savefig()
plt.close()
eda.close()

# Define scaled predictor variables as an array for sklearn use
X = StandardScaler().fit_transform(train.drop(['mv'], axis=1).values)

# Define log-target as an array for sklearn use
y = np.log(train['mv']).values

# Use KFold cross-validation to split data for training
cv = KFold(n_splits=splits, shuffle=False, random_state=seed)

# Construct a list of models to iterate over
models = [LinearRegression(), Ridge(), Lasso(), ElasticNet()]

# Initialize list to store stats from each fold and model
stats = []

# Iterate over the folds and models to save OoF RMSE for each
i = 0
for training, test in cv.split(X, y):
    fold = [i]
    for alg in models:
        scores = alg.fit(X[training], y[training]).predict(X[test])
        rmse = mean_squared_error(y[test], scores)**2
        fold.append(rmse)
    stats.append(fold)
    i += 1

# Create a dataframe of the OoF RMSE for each model
labels = ['RMSE Fold', 'Linear Regression', 'Ridge Regression',
          'Lasso Regression', 'Elastic Net Regression']
results = pd.DataFrame(stats, columns=labels)

# Add mean and standard deviation to summary
m = results.mean().tolist()
m[0] = "Mean"
s = results.std().tolist()
s[0] = "Std Dev"

results_summary = results.append([pd.Series(m, index=labels),
                                  pd.Series(s, index=labels)],
                                 ignore_index=True).set_index('RMSE Fold')

# Save CV results to a CSV
results_summary.to_csv("results_summary.csv")

# Calculate the coefficients from the ridge regression
full_model = Ridge().fit(X, y)
labels = train.columns[0:12].tolist()
ridge_coef = pd.DataFrame([full_model.coef_.tolist()], columns=labels)

# Save coefficients to a CSV
ridge_coef.to_csv("results_coef.csv")


In [2]:
results_summary

Unnamed: 0_level_0,Linear Regression,Ridge Regression,Lasso Regression,Elastic Net Regression
RMSE Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.000346,0.000339,0.004336,0.004336
1,0.001485,0.00144,0.01873,0.01873
2,0.001004,0.000992,0.055127,0.055127
3,0.009121,0.009175,0.061669,0.061669
4,0.003213,0.00321,0.075685,0.075685
Mean,0.003034,0.003031,0.043109,0.043109
Std Dev,0.003565,0.003596,0.030198,0.030198


In [3]:
ridge_coef

Unnamed: 0,crim,zn,indus,chas,nox,rooms,age,dis,rad,tax,ptratio,lstat
0,-0.091874,0.028801,0.014279,0.027505,-0.093775,0.058702,0.008935,-0.106709,0.111996,-0.107072,-0.077702,-0.217411
