You'll consider the types of models that would be appropriate to use given the features in your dataset and then dive right in and build two or three models. Please note: in addition to considering different algorithm types in your model selection, be sure to also consider applying model hyperparameter tuning operations. 

In [18]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
from scipy.stats import skew




#Model Building

from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC, LinearRegression,Ridge
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler,RobustScaler,LabelEncoder,PowerTransformer
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb


In [19]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, KFold,GridSearchCV
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,classification_report
from sklearn.preprocessing import StandardScaler,RobustScaler,LabelEncoder,PowerTransformer
from sklearn.ensemble import GradientBoostingRegressor,StackingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.model_selection import KFold, cross_val_score,  train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA




In [3]:
pd.set_option('display.max_rows', 300)

# Lets import the data 

In [4]:
train= pd.read_csv('data /train_4.csv',index_col=0)

In [5]:
train.drop(['Unnamed: 0.1'],axis=1)

Unnamed: 0,Id,LotFrontage,LotArea,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,MSSubClass_150
0,1,4.189655,9.042040,7,2003,2003,5.283204,6.561031,0.000000,5.017280,...,0,0,1,0,0,0,0,1,0,0
1,2,4.394449,9.169623,6,1976,1976,0.000000,6.886532,0.000000,5.652489,...,0,0,1,0,0,0,0,1,0,0
2,3,4.234107,9.328212,7,2001,2002,5.093750,6.188264,0.000000,6.075346,...,0,0,1,0,0,0,0,1,0,0
3,4,4.110874,9.164401,7,1915,1970,0.000000,5.379897,0.000000,6.293419,...,0,0,1,1,0,0,0,0,0,0
4,5,4.442651,9.565284,8,2000,2000,5.860786,6.486161,0.000000,6.196444,...,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,4.143135,8.976894,6,1999,2000,0.000000,0.000000,0.000000,6.860664,...,0,0,1,0,0,0,0,1,0,0
1456,1457,4.454347,9.486152,6,1978,1988,4.787492,6.673298,5.099866,6.380123,...,0,0,1,0,0,0,0,1,0,0
1457,1458,4.204693,9.109746,7,1941,2006,0.000000,5.620401,0.000000,6.777647,...,0,0,1,0,0,0,0,1,0,0
1458,1459,4.234107,9.181735,5,1950,1996,0.000000,3.912023,6.937314,0.000000,...,0,0,1,0,0,0,0,1,0,0


In [6]:
test = pd.read_csv('data /test_4.csv',index_col=0)

In [7]:
test.drop(['Unnamed: 0.1'],axis=1)

Unnamed: 0,Id,LotFrontage,LotArea,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,MSSubClass_150
0,1461,4.394449,9.360741,5,1961,1961,0.000000,6.150603,4.976734,5.602119,...,0,0,1,0,0,0,0,1,0,0
1,1462,4.406719,9.565775,6,1958,1958,4.691348,6.828712,0.000000,6.008813,...,0,0,1,0,0,0,0,1,0,0
2,1463,4.317488,9.534668,5,1997,1998,0.000000,6.674561,0.000000,4.927254,...,0,0,1,0,0,0,0,1,0,0
3,1464,4.369448,9.208238,6,1998,1998,3.044522,6.401917,0.000000,5.783825,...,0,0,1,0,0,0,0,1,0,0
4,1465,3.784190,8.518392,8,1992,1992,0.000000,5.575949,0.000000,6.925595,...,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,3.091042,7.568896,4,1970,1970,0.000000,0.000000,0.000000,6.304449,...,0,0,1,0,0,0,0,1,0,0
1455,2916,3.091042,7.546974,4,1970,1970,0.000000,5.533389,0.000000,5.686975,...,0,0,1,1,0,0,0,0,0,0
1456,2917,5.081404,9.903538,5,1960,1996,0.000000,7.110696,0.000000,0.000000,...,0,0,1,1,0,0,0,0,0,0
1457,2918,4.143135,9.253591,5,1992,1992,0.000000,5.823046,0.000000,6.356108,...,0,0,1,0,0,0,0,1,0,0


In [8]:


y=train['SalePrice']
X=train.drop(['Id','SalePrice'],axis=1)



In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
y_train

454     12.144203
577     12.010672
16      11.911708
918     12.380030
779     11.813037
          ...    
536     12.144203
1350    12.206078
12      11.877576
179     11.512935
1286    11.870607
Name: SalePrice, Length: 1092, dtype: float64

## Time to fit and evaluate what models work the best 

Going to be focused on using Linear Models and bagging/boosting 

# Linear Regression 

In [24]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform (X_test)


In [25]:
lr = LinearRegression()
model = lr.fit(X_train,y_train)
y_head = lr.predict(X_test)
print('-'*10+'Linear Regression'+'-'*10)
print('R square Accuracy: ',r2_score(y_test,y_head))
print('Mean Absolute Error Accuracy: ',mean_absolute_error(y_test,y_head))
print('Mean Squared Error Accuracy: ',mean_squared_error(y_test,y_head))

----------Linear Regression----------
R square Accuracy:  -1.8051556611805786e+21
Mean Absolute Error Accuracy:  4160941820.234528
Mean Squared Error Accuracy:  2.853394935973735e+20


We see that we got a 91% accuracy rating which is fantastic 

# Fine Tuning the model 

In [None]:
#Ridge 

In [20]:
lm_ridge=Ridge()
parameters= {'alpha':[x for x in [0.0005,0.001,0.01,0.1,0.2,0.4,0.5,0.7,0.8,1]]}

lm_ridge=GridSearchCV(lm_ridge, param_grid=parameters)
lm_ridge.fit(X_train,y_train)
print("The best value of Alpha is: ",lm_ridge.best_params_,lm_ridge.best_score_)


The best value of Alpha is:  {'alpha': 1} 0.8923488261498436


In [21]:
lm_ridge_mod=Ridge(alpha=1)
lm_ridge_mod.fit(X_train,y_train)
y_pred_train=lm_ridge_mod.predict(X_train)
y_pred_test=lm_ridge_mod.predict(X_test)

print('Root Mean Square Error train = ' + str(np.sqrt(mean_squared_error(y_train, y_pred_train))))
print('Root Mean Square Error test = ' + str(np.sqrt(mean_squared_error(y_test, y_pred_test)))) 

Root Mean Square Error train = 0.08209557462569812
Root Mean Square Error test = 0.1267777646531488


In [None]:
#Lasso

In [22]:
lm_lasso =Lasso()
parameters= {'alpha':[x for x in [0.0005,0.001,0.01,0.1,0.2,0.4,0.5,0.7,0.8,1]]}

lm_lasso=GridSearchCV(lm_lasso, param_grid=parameters)
lm_lasso.fit(X_train,y_train)
print("The best value of Alpha is: ",lm_lasso.best_params_,lm_lasso.best_score_)

The best value of Alpha is:  {'alpha': 0.001} 0.9175836039686616


In [23]:
lm_lasso_mod =Lasso(alpha=0.0005)
lm_lasso_mod.fit(X_train,y_train)
y_pred_train=lm_lasso_mod.predict(X_train)
y_pred_test=lm_lasso_mod.predict(X_test)

print('Root Mean Square Error train = ' + str(np.sqrt(mean_squared_error(y_train, y_pred_train))))
print('Root Mean Square Error test = ' + str(np.sqrt(mean_squared_error(y_test, y_pred_test)))) 


Root Mean Square Error train = 0.08350455225778239
Root Mean Square Error test = 0.1205441411610745


In [None]:
#Elastic Net 

In [None]:
lgb_regressor=lgb.LGBMRegressor(objective='regression', num_leaves=5, learning_rate=0.035, n_estimators=2177, max_bin=50, bagging_fraction=0.65,bagging_freq=5, bagging_seed=7, 
                                feature_fraction=0.201, feature_fraction_seed=7,n_jobs=-1)
lgb_regressor.fit(X_train, y_train)
y_head=lgb_regressor.predict(X_test)
print('-'*10+'LGBM'+'-'*10)
print('R square Accuracy: ',r2_score(y_test,y_head))
print('Mean Absolute Error Accuracy: ',mean_absolute_error(y_test,y_head))
print('Mean Squared Error Accuracy: ',mean_squared_error(y_test,y_head))

In [None]:
gb_reg = GradientBoostingRegressor(n_estimators=1992, learning_rate=0.03005, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=14, loss='huber', random_state =42)
gb_reg.fit(X_train, y_train)
y_head=gb_reg.predict(X_test)
print('-'*10+'GBR'+'-'*10)
print('R square Accuracy: ',r2_score(y_test,y_head))
print('Mean Absolute Error Accuracy: ',mean_absolute_error(y_test,y_head))
print('Mean Squared Error Accuracy: ',mean_squared_error(y_test,y_head))

In [None]:
#Ridge Regression with CV 
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

alphas=[1e-9,1e-8,1e-7,1e-6]

ridgecv_reg= make_pipeline(RidgeCV(alphas=alphas, cv=kfolds))
ridgecv_reg.fit(X_train, y_train)
y_head=ridgecv_reg.predict(X_test)
print('-'*10+'RidgeCV'+'-'*10)
print('R square Accuracy: ',r2_score(y_test,y_head))
print('Mean Absolute Error Accuracy: ',mean_absolute_error(y_test,y_head))
print('Mean Squared Error Accuracy: ',mean_squared_error(y_test,y_head))

In [None]:
#Elastic Regression with CV 
kfolds = KFold(n_splits=8, shuffle=True, random_state=42)

alphas=[0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006]
l1ratio=[0.87, 0.9,0.92, 0.95,0.97, 0.99, 1]

elasticv_reg= make_pipeline(ElasticNetCV(alphas=alphas, cv=kfolds, l1_ratio=l1ratio))
elasticv_reg.fit(X_train, y_train)
y_head=elasticv_reg.predict(X_test)
print('-'*10+'ElasticNetCV'+'-'*10)
print('R square Accuracy: ',r2_score(y_test,y_head))
print('Mean Absolute Error Accuracy: ',mean_absolute_error(y_test,y_head))
print('Mean Squared Error Accuracy: ',mean_squared_error(y_test,y_head))

In [None]:
#creating predict column and adding it to DF to show actual vs predicted
y_head=pd.DataFrame(y_head,columns=['Predict'])
y_test.reset_index(drop=True,inplace=True)
y_test_y_head=pd.concat([y_test,y_head],axis=1)
y_test_y_head.head()

for submission 

In [None]:
prediction= y_test_y_head.to_csv('data /prediction')

will add more documentation to explain flow and thought process but for now wanted to submit all work currently done in order for it to be reviewed. 

ElasticNet scored the highest R Squared accuracy and the lowest error accuracy as well. 

In [None]:
# This second draft has scored higher than the previous draft accross all models 

In [None]:
add stacking regressor. 