In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error
import pickle
import csv
from sklearn.pipeline import Pipeline
# GridSearchCV lives in sklearn.model_selection (already used above for
# cross_val_score / KFold / train_test_split). The legacy sklearn.grid_search
# module was deprecated in scikit-learn 0.18 and removed in 0.20, so importing
# from it breaks on any current install.
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_regression
np.random.seed(42)
%matplotlib inline



# Importing my already scaled data

In [2]:
# Read the four pre-processed CSV files in one pass: the scaled feature
# matrices for the train/test splits followed by their matching targets.
# Each file is parsed with pandas' default read_csv settings.
X_train_sc, X_test_sc, y_train, y_test = map(
    pd.read_csv,
    (
        '../data/X_train_sc.csv',
        '../data/X_test_sc.csv',
        '../data/y_train.csv',
        '../data/y_test.csv',
    ),
)

# Instantiating and fitting my model

In [3]:
# Ridge regression with built-in cross-validation over the penalty strength.
# Candidate alphas: 200 log-spaced values from 10**0 to 10**5.
ridge = RidgeCV(alphas=np.logspace(start=0, stop=5, num=200))

In [5]:
# Fit on the scaled training data; RidgeCV selects the alpha from the grid
# supplied at construction. The fitted estimator's repr is the cell output.
ridge.fit(X=X_train_sc, y=y_train)

RidgeCV(alphas=array([1.00000e+00, 1.05956e+00, ..., 9.43788e+04, 1.00000e+05]),
    cv=None, fit_intercept=True, gcv_mode=None, normalize=False,
    scoring=None, store_cv_values=False)

# Scoring my model

In [6]:
# Score of the fitted model on the data it was trained on
# (for a scikit-learn regressor, score() reports R^2).
ridge.score(X=X_train_sc, y=y_train)

0.9291470853299533

In [15]:
# Score of the fitted model on the held-out test split
# (for a scikit-learn regressor, score() reports R^2).
ridge.score(X=X_test_sc, y=y_test)

0.9075382607320102

# Calculating RMSE for my model

In [7]:
# Predicted targets for both splits; these feed the RMSE cells below.
y_hat_train, y_hat_test = (
    ridge.predict(X_train_sc),
    ridge.predict(X_test_sc),
)

In [8]:
# Training RMSE: square root of the mean squared error, which puts the error
# back in the same units as the target.
pow(mean_squared_error(y_true=y_train, y_pred=y_hat_train), .5)

21175.304119324293

In [9]:
# Test RMSE: square root of the mean squared error on the held-out split,
# directly comparable with the training RMSE above.
pow(mean_squared_error(y_true=y_test, y_pred=y_hat_test), .5)

23840.519127081046

# Creating a DataFrame of features and their coefficient weights

In [10]:
# Restore the pickled object used below as the index (feature names) of the
# coefficient table.
# NOTE(review): pickle.load can execute arbitrary code, so only load pickle
# files that this project produced itself.
with open('../assets/columns.pkl', mode='rb') as columns_file:
    columns = pickle.load(columns_file)

In [19]:
# Build a one-column table of model weights indexed by `columns`.
# coef_ is transposed so the coefficients run down the rows of a single
# 'weight' column.
coef = pd.DataFrame(
    data=np.transpose(ridge.coef_),
    index=columns,
    columns=['weight'],
)
# Show the 20 largest (most positive) weights.
coef.sort_values('weight', ascending=False).head(n=20)

Unnamed: 0,weight
Gr Liv Area,5677.928829
Overall Qual_9,5012.600592
1st Flr SF,4951.475698
Total Bsmt SF,4647.959652
Exter Qual_Ex,4276.071843
Kitchen Qual_Ex,4067.78905
Neighborhood_StoneBr,3680.912243
TotRms AbvGrd,3467.828762
Garage Cars_3.0,3445.575074
Full Bath_3,3400.403867


In [20]:
# Show the 20 smallest (most negative) weights.
# `coef` is the weight table built in the preceding cell; it is reused here
# instead of being rebuilt from ridge.coef_, since nothing has changed it in
# between and a second identical construction is only copy-paste duplication.
coef.sort_values(by='weight', ascending=True).head(20)

Unnamed: 0,weight
Misc Feature_Elev,-5225.70447
Roof Matl_ClyTile,-4982.628583
Exter Qual_TA,-2326.49486
Overall Cond_3,-2150.482122
Overall Qual_5,-2127.205277
Misc Val,-2047.7298
Neighborhood_Edwards,-1994.896293
Kitchen Qual_TA,-1875.381031
BsmtFin Type 1_Unf,-1769.857547
Bsmt Exposure_No,-1642.229286


# Saving my model for future use

In [14]:
# Serialize the fitted RidgeCV estimator so it can be reloaded later without
# refitting. The file is only written here, never read back, so plain
# write-binary mode ('wb') is used rather than read/write ('wb+').
with open('../assets/ridge.pkl', 'wb') as f:
    pickle.dump(ridge, f)