In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import time
import statistics

In [29]:
from sklearn import model_selection, preprocessing, linear_model, metrics

In [30]:
from sklearn.model_selection import StratifiedShuffleSplit
from scipy import sparse

In [31]:
AmesClean = pd.read_csv('AmesCleanDataSet.csv')
AmesClean.drop(['Unnamed: 0'],axis=1,inplace=True)
print('Ames clean dataset has size of :' + str(AmesClean.shape))

Ames clean dataset has size of :(1156, 59)


In [32]:
AmesClean['salePriceCat'] = pd.cut(AmesClean.SalePrice,bins=10,labels = range(10)).values

In [33]:
categorical_features = ['MS_SubClass','MS_Zoning','Lot_Shape','Land_Contour','Lot_Config','Land_Slope']
categorical_features.extend(['Neighborhood','Condition_1','Bldg_Type','House_Style','Roof_Style'])
categorical_features.extend(['Mas_Vnr_Type','Exter_Qual','Exter_Cond','Foundation','Bsmt_Qual','Bsmt_Exposure'])
categorical_features.extend(['basement_type','Heating_QC','Garage_Finish','Mo_Sold','Sale_Type','Sale_Condition','Kitchen_Qual'])
categorical_features.extend(['exterior','Fireplace_Qu','Garage_Type','Garage_Qual'])
Ordinal_featues = ['Overall_Qual','Overall_Cond']
Continous_features = ['Lot_Frontage','Lot_Area','age','remodeled_age','Mas_Vnr_Area','basement_area']
Continous_features.extend(['Bsmt_Unf_SF','Total_Bsmt_SF','1st_Flr_SF','2nd_Flr_SF','Low_Qual_Fin_SF'])
Continous_features.extend(['Gr_Liv_Area','Bsmt_Full_Bath','Bsmt_Half_Bath','Full_Bath','Half_Bath'])
Continous_features.extend(['Bedroom_AbvGr','Kitchen_AbvGr','TotRms_AbvGrd','Fireplaces'])               
Continous_features.extend(['Garage_Cars','Garage_Area'])
Continous_features.extend(['Wood_Deck_SF','Open_Porch_SF','Enclosed_Porch','Screen_Porch','Pool_Area'])
Continous_features.extend(['SalePrice','garage_age'])

In [34]:
AmesCleanFinal = AmesClean.drop(categorical_features,axis=1)
AmesCleanFinalData = AmesCleanFinal.values
AmesCleanCol = AmesCleanFinal.columns

for ifeature in categorical_features:
    
    temp_df = pd.get_dummies(AmesClean.loc[:,ifeature],prefix=ifeature,prefix_sep='_')
    AmesCleanCol = list(AmesCleanCol) + list(temp_df.columns)
    temp_data = temp_df.values
    AmesCleanFinalData = np.hstack([AmesCleanFinalData,temp_data])
    
AmesCleanDumClean = pd.DataFrame(AmesCleanFinalData, columns = AmesCleanCol)

In [35]:
X = AmesCleanDumClean.drop(['SalePrice','salePriceCat'],axis=1)
Y = AmesCleanDumClean.SalePrice

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [37]:
scaler = StandardScaler()

In [38]:
X_scale = scaler.fit_transform(X)

In [39]:
X_train, X_test, Y_train, Y_test = train_test_split(X_scale,Y,test_size = 0.33)

In [40]:
from sklearn.model_selection import GridSearchCV

In [41]:
from sklearn.linear_model import SGDRegressor

In [42]:
sgd_grid = SGDRegressor()

In [43]:
sgd_grid.get_params()

{'alpha': 0.0001,
 'average': False,
 'early_stopping': False,
 'epsilon': 0.1,
 'eta0': 0.01,
 'fit_intercept': True,
 'l1_ratio': 0.15,
 'learning_rate': 'invscaling',
 'loss': 'squared_loss',
 'max_iter': 1000,
 'n_iter_no_change': 5,
 'penalty': 'l2',
 'power_t': 0.25,
 'random_state': None,
 'shuffle': True,
 'tol': 0.001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [44]:
?sgd_grid

In [45]:
alpha_values   = [10**i for i in np.arange(0,5,0.5)]
l1_ratio       = np.arange(0.1,0.9,0.1)
epsilon_values = [10**i for i in np.arange(-2,3,0.5)]

sgd_grid.set_params(max_iter=1e5,shuffle=True,random_state=144,early_stopping=True,average=True)

grid_param = {'loss':['squared_loss','huber','epsilon_insensitive','squared_epsilon_insensitive'],
             'penalty':['l1','l2'],'alpha':alpha_values,'epsilon':epsilon_values}

In [46]:
sgd_model_GCV = GridSearchCV(sgd_grid,grid_param,cv=5,return_train_score=True,n_jobs=-1)
%time sgd_model_GCV.fit(X_train,Y_train)

CPU times: user 6.24 s, sys: 709 ms, total: 6.95 s
Wall time: 16min 10s


GridSearchCV(cv=5,
             estimator=SGDRegressor(average=True, early_stopping=True,
                                    max_iter=100000.0, random_state=144),
             n_jobs=-1,
             param_grid={'alpha': [1.0, 3.1622776601683795, 10.0,
                                   31.622776601683793, 100.0,
                                   316.22776601683796, 1000.0,
                                   3162.2776601683795, 10000.0,
                                   31622.776601683792],
                         'epsilon': [0.01, 0.03162277660168379, 0.1,
                                     0.31622776601683794, 1.0,
                                     3.1622776601683795, 10.0,
                                     31.622776601683793, 100.0,
                                     316.22776601683796],
                         'loss': ['squared_loss', 'huber',
                                  'epsilon_insensitive',
                                  'squared_epsilon_insensitive'],
  

In [47]:
df = pd.DataFrame(sgd_model_GCV.cv_results_)
df.to_csv('SGDGridSearch.csv')

In [48]:
np.max(df.mean_train_score.values)

0.8327763584626823

In [49]:
np.min(df.mean_train_score.values)

-58279.549895212775

In [50]:
sgd_model_GCV.best_params_

{'alpha': 3.1622776601683795,
 'epsilon': 0.01,
 'loss': 'squared_epsilon_insensitive',
 'penalty': 'l2'}

In [51]:
sgd_model_GCV.best_score_

0.7844092195922248

In [52]:
sgd_model_GCV.best_estimator_.score(X_train,Y_train)

0.8298732325572342

In [53]:
sgd_model_GCV.best_estimator_.score(X_test,Y_test)

0.8118979244219118

In [54]:
sgd_model_GCV.best_params_

{'alpha': 3.1622776601683795,
 'epsilon': 0.01,
 'loss': 'squared_epsilon_insensitive',
 'penalty': 'l2'}

To do : <br>
1. Plot error for training and testing versus y <br>
2. Plot QQ plot <br>