You'll consider the types of models that would be appropriate to use given the features in your dataset and then dive right in and build two or three models. Please note: in addition to considering different algorithm types in your model selection, be sure to also consider applying model hyperparameter tuning operations. 

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
from scipy.stats import skew




#Model Building

from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC, LinearRegression
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler,RobustScaler,LabelEncoder,PowerTransformer
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, KFold,GridSearchCV
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.preprocessing import StandardScaler,RobustScaler,LabelEncoder,PowerTransformer
from sklearn.ensemble import GradientBoostingRegressor,StackingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.model_selection import KFold, cross_val_score,  train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA




In [3]:
pd.set_option('display.max_rows', 300)

# Lets import the data 

In [4]:
train= pd.read_csv('data /train_4.csv',index_col=0)

In [5]:
train.drop(['Unnamed: 0.1'],axis=1)

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,Street,Alley,LotShape,LandSlope,OverallQual,OverallCond,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0.730463,2.885846,5.831328,19.212182,0.730463,0.730463,1.540963,0.0,2.440268,1.820334,...,0,0,0,1,0,0,0,0,1,0
1,1.194318,2.055642,6.221214,19.712205,0.730463,0.730463,1.540963,0.0,2.259674,2.440268,...,0,0,0,1,0,0,0,0,1,0
2,1.540963,2.885846,5.914940,20.347241,0.730463,0.730463,0.000000,0.0,2.440268,1.820334,...,0,0,0,1,0,0,0,0,1,0
3,1.820334,3.011340,5.684507,19.691553,0.730463,0.730463,0.000000,0.0,2.440268,1.820334,...,0,0,0,1,1,0,0,0,0,0
4,2.055642,2.885846,6.314735,21.325160,0.730463,0.730463,0.000000,0.0,2.602594,1.820334,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,13.213850,2.885846,5.744420,18.960528,0.730463,0.730463,1.540963,0.0,2.259674,1.820334,...,0,0,0,1,0,0,0,0,1,0
1456,13.215896,2.055642,6.337529,20.994868,0.730463,0.730463,1.540963,0.0,2.259674,2.055642,...,0,0,0,1,0,0,0,0,1,0
1457,13.217941,3.011340,5.859551,19.476345,0.730463,0.730463,1.540963,0.0,2.440268,2.602594,...,0,0,0,1,0,0,0,0,1,0
1458,13.219985,2.055642,5.914940,19.760176,0.730463,0.730463,1.540963,0.0,2.055642,2.055642,...,0,0,0,1,0,0,0,0,1,0


In [6]:
test = pd.read_csv('data /test_4.csv',index_col=0)

In [7]:
test.drop(['Unnamed: 0.1'],axis=1)

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,Street,Alley,LotShape,LandSlope,OverallQual,OverallCond,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,13.224069,2.055642,6.221214,20.479373,0.730463,0.730463,1.540963,0.000000,2.055642,2.055642,...,0,0,0,1,0,0,0,0,1,0
1,13.226109,2.055642,6.244956,21.327220,0.730463,0.730463,0.000000,0.000000,2.259674,2.055642,...,0,0,0,1,0,0,0,0,1,0
2,13.228148,2.885846,6.073289,21.196905,0.730463,0.730463,0.000000,0.000000,2.055642,1.820334,...,0,0,0,1,0,0,0,0,1,0
3,13.230186,2.885846,6.172972,19.865444,0.730463,0.730463,0.000000,0.000000,2.259674,2.055642,...,0,0,0,1,0,0,0,0,1,0
4,13.232223,0.000000,5.093857,17.257255,0.730463,0.730463,0.000000,0.000000,2.602594,1.820334,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,15.394418,1.194318,3.932510,14.081426,0.730463,0.730463,1.540963,0.000000,1.820334,2.259674,...,0,0,0,1,0,0,0,0,1,0
1455,15.395553,1.194318,3.932510,14.013314,0.730463,0.730463,1.540963,0.000000,1.820334,1.820334,...,0,0,0,1,1,0,0,0,0,0
1456,15.396687,2.055642,7.620056,22.782058,0.730463,0.730463,1.540963,0.000000,2.055642,2.259674,...,0,0,0,1,1,0,0,0,0,0
1457,15.397821,3.340760,5.744420,20.046557,0.730463,0.730463,1.540963,0.000000,2.055642,1.820334,...,0,0,0,1,0,0,0,0,1,0


In [8]:


y=train['SalePrice']
X=train.drop(['Id','SalePrice'],axis=1)



In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
y_train

860     3.145615
1215    3.098150
905     3.100875
1349    3.095354
508     3.127014
          ...   
564     3.183708
887     3.107401
403     3.179541
916     2.946235
1041    3.135125
Name: SalePrice, Length: 1092, dtype: float64

# Time to fit and evaluate what models work the best 

Going to be focused on using Linear Models and bagging/boosting 

In [10]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform (X_test)


In [11]:
lr = LinearRegression()
lr.fit(X_train,y_train)

LinearRegression()

In [12]:
y_head= lr.predict(X_test)
print('-'*10+'Linear Regression'+'-'*10)
print('R square Accuracy: ',r2_score(y_test,y_head))
print('Mean Absolute Error Accuracy: ',mean_absolute_error(y_test,y_head))
print('Mean Squared Error Accuracy: ',mean_squared_error(y_test,y_head))

----------Linear Regression----------
R square Accuracy:  -2.0991885801179615e+21
Mean Absolute Error Accuracy:  215248649.85853496
Mean Squared Error Accuracy:  3.841506515103224e+18


Very interesting. I didn't expect the answer to be negative considering the other complex models did so well. Maybe an error on my part ? 

In [13]:
# I believe this is a way to represent the strength of the correlation between the features and the target variable which is the sale price. 
coeff_df = pd.DataFrame(lr.coef_, X.columns, columns=['Coefficient']).sort_values(['Coefficient'],ascending=False)
coeff_df.head(10)
#print(coeff_df)

Unnamed: 0,Coefficient
BldgType_1Fam,3785749000.0
BldgType_TwnhsE,2726930000.0
BldgType_Duplex,1958634000.0
BldgType_Twnhs,1749096000.0
Condition2_Norm,1742404000.0
BldgType_2fmCon,1414929000.0
Condition2_Feedr,1164281000.0
Condition2_Artery,824027500.0
Condition2_PosA,582942700.0
Condition2_RRAn,582942700.0


In [14]:
lgb_regressor=lgb.LGBMRegressor(objective='regression', num_leaves=5, learning_rate=0.035, n_estimators=2177, max_bin=50, bagging_fraction=0.65,bagging_freq=5, bagging_seed=7, 
                                feature_fraction=0.201, feature_fraction_seed=7,n_jobs=-1)
lgb_regressor.fit(X_train, y_train)
y_head=lgb_regressor.predict(X_test)
print('-'*10+'LGBM'+'-'*10)
print('R square Accuracy: ',r2_score(y_test,y_head))
print('Mean Absolute Error Accuracy: ',mean_absolute_error(y_test,y_head))
print('Mean Squared Error Accuracy: ',mean_squared_error(y_test,y_head))

----------LGBM----------
R square Accuracy:  0.9251846320291255
Mean Absolute Error Accuracy:  0.008797972370895588
Mean Squared Error Accuracy:  0.0001369118173622158


In [15]:
gb_reg = GradientBoostingRegressor(n_estimators=1992, learning_rate=0.03005, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=14, loss='huber', random_state =42)
gb_reg.fit(X_train, y_train)
y_head=gb_reg.predict(X_test)
print('-'*10+'GBR'+'-'*10)
print('R square Accuracy: ',r2_score(y_test,y_head))
print('Mean Absolute Error Accuracy: ',mean_absolute_error(y_test,y_head))
print('Mean Squared Error Accuracy: ',mean_squared_error(y_test,y_head))

----------GBR----------
R square Accuracy:  0.9282867455467742
Mean Absolute Error Accuracy:  0.008108960278479415
Mean Squared Error Accuracy:  0.00013123496231379154


In [16]:
#Ridge Regression with CV 
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

alphas=[1e-9,1e-8,1e-7,1e-6]

ridgecv_reg= make_pipeline(RidgeCV(alphas=alphas, cv=kfolds))
ridgecv_reg.fit(X_train, y_train)
y_head=ridgecv_reg.predict(X_test)
print('-'*10+'RidgeCV'+'-'*10)
print('R square Accuracy: ',r2_score(y_test,y_head))
print('Mean Absolute Error Accuracy: ',mean_absolute_error(y_test,y_head))
print('Mean Squared Error Accuracy: ',mean_squared_error(y_test,y_head))

----------RidgeCV----------
R square Accuracy:  0.8869171661767622
Mean Absolute Error Accuracy:  0.009764841142281782
Mean Squared Error Accuracy:  0.00020694112334295007


In [18]:
#Elastic Regression with CV 
kfolds = KFold(n_splits=8, shuffle=True, random_state=42)

alphas=[0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006]
l1ratio=[0.87, 0.9,0.92, 0.95,0.97, 0.99, 1]

elasticv_reg= make_pipeline(ElasticNetCV(alphas=alphas, cv=kfolds, l1_ratio=l1ratio))
elasticv_reg.fit(X_train, y_train)
y_head=elasticv_reg.predict(X_test)
print('-'*10+'ElasticNetCV'+'-'*10)
print('R square Accuracy: ',r2_score(y_test,y_head))
print('Mean Absolute Error Accuracy: ',mean_absolute_error(y_test,y_head))
print('Mean Squared Error Accuracy: ',mean_squared_error(y_test,y_head))

----------ElasticNetCV----------
R square Accuracy:  0.9313594641741718
Mean Absolute Error Accuracy:  0.008223376644544318
Mean Squared Error Accuracy:  0.00012561189979428994


In [19]:
#creating predict column and adding it to DF to show actual vs predicted
y_head=pd.DataFrame(y_head,columns=['Predict'])
y_test.reset_index(drop=True,inplace=True)
y_test_y_head=pd.concat([y_test,y_head],axis=1)
y_test_y_head.head()

Unnamed: 0,SalePrice,Predict
0,3.150534,3.142652
1,3.16689,3.168636
2,3.096294,3.105577
3,3.15942,3.15856
4,3.115331,3.118967


for submission 

In [20]:
prediction= y_test_y_head.to_csv('data /prediction')

will add more documentation to explain flow and thought process but for now wanted to submit all work currently done in order for it to be reviewed. 

ElasticNet scored the highest R Squared accuracy and the lowest error accuracy as well. 

In [None]:
# This second draft has scored higher than the previous draft accross all models 