# Modelling

## Import Library

In [1]:
# import libraries

#maths
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
#from pandas_profiling import ProfileReport as pp
import seaborn as sns
import scipy.stats as stats
import os, fnmatch


sns.set_style('whitegrid')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, LassoCV, RidgeCV,Ridge

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score,mean_squared_error
# import warnings
# warnings.filterwarnings(action='once')

In [2]:
# Load the processed training and test sets from disk.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../datasets/train_processed.csv',
                     '../datasets/test_processed.csv')
)

In [3]:
# Report (rows, columns) for the training frame, then the test frame.
print(train.shape, test.shape, sep='\n')


(2047, 181)
(879, 180)


## Features to model

In [4]:
# Candidate predictors: every column except `saleprice` and `id`.
features = train.columns.drop(labels=['saleprice', 'id'])

In [5]:
# Ordinary least-squares estimator used for the first (baseline) model.
lr = LinearRegression()

In [6]:
# First attempt at a multiple linear regression: all candidate features,
# no transformation; the target is `saleprice`.
X, y = train[features], train['saleprice']

In [7]:
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=15
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((1637, 179), (410, 179), (1637,), (410,))

In [8]:
# Learn the scaling statistics from the training split only, then apply the
# same transform to both the training and the held-out split.
ss = StandardScaler().fit(X_train)
X_train_scaled = ss.transform(X_train)
X_test_scaled = ss.transform(X_test)

In [9]:
# Baseline: 10-fold cross-validated score of plain linear regression on the
# scaled (otherwise untransformed) features; the mean is kept for reporting.
linreg_scores = cross_val_score(estimator=lr, X=X_train_scaled, y=y_train, cv=10)
r2_score_ling = linreg_scores.mean()
# Fit the shared `lr` estimator on the whole training split as well.
lr.fit(X_train_scaled, y_train)


In [63]:
# Report the mean cross-validated score of the baseline model.
print('---Linear Regression---', 'R2:{}'.format(r2_score_ling), sep='\n')

---Linear Regression---
R2:-4.377466946216419e+19


In [10]:
# model=lr.fit(X,y)

In [11]:
# predictions = model.predict(X)
# score = model.score(X,y)
# print(score)

<br>This score also suggests that there may be collinear relationships among the features. We may need to revisit the features and reduce their number.<br>

In [12]:
# X_test_features=test.columns.drop(['id'])

In [13]:
# X_test=test[X_test_features]

In [14]:
# prediction=model.predict(X_test)

In [15]:
# df= pd.DataFrame(test['id'])

In [16]:
# df['saleprice']=prediction

In [17]:
# df.set_index(['id'],inplace=True)

In [18]:
# df.to_csv('../datasets/sub1.csv')

> First submission:
> <br>Private score: 29200.01198
> <br>Public score: 27163.23540

In [19]:
def sub_df(predict, filepath, ids=None):
    """Write a submission CSV with the columns ``id`` and ``saleprice``.

    Parameters
    ----------
    predict : array-like
        Predicted sale prices, one per identifier, in the same order as ``ids``.
    filepath : str or path-like
        Destination handed to ``DataFrame.to_csv``.
    ids : array-like, optional
        Identifiers to pair with the predictions. When omitted, the ``id``
        column of the notebook-level ``test`` frame is used, which is the
        behaviour of the original two-argument call.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of identifiers.
    """
    if ids is None:
        ids = test['id']

    # Work on plain arrays so predictions are paired with ids by position;
    # a Series with a different index would otherwise be aligned by label
    # and silently produce NaN prices.
    id_values = np.asarray(ids)
    predicted = np.asarray(predict).ravel()
    if len(predicted) != len(id_values):
        raise ValueError(
            'got {} predictions for {} ids'.format(len(predicted), len(id_values))
        )

    submission = pd.DataFrame({'id': id_values, 'saleprice': predicted}).set_index('id')
    submission.to_csv(filepath)

## Model Preparation: Train/Test Split

***Ridge or Lasso Regression***

In [20]:
# Lasso / Ridge: split the current X and y 80/20 with seed 15.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=15
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((1637, 179), (410, 179), (1637,), (410,))

In [21]:
# Standardise with statistics from the training split only, then transform
# both the training and the held-out split with the fitted scaler.
ss = StandardScaler().fit(X_train)
X_train_scaled = ss.transform(X_train)
X_test_scaled = ss.transform(X_test)


In [22]:
# linear


In [23]:
# Choose the Lasso penalty by cross-validation over 300 candidate alphas,
# then build an unfitted Lasso with that penalty and a tolerance of 0.1.
lasso_search = LassoCV(n_alphas=300).fit(X_train_scaled, y_train)
lasso = Lasso(alpha=lasso_search.alpha_, tol=0.1)



In [24]:
# 10-fold CV scores on the training split (stored in `score`; not printed in
# the following cell), then fit on the training split and score the hold-out.
score = cross_val_score(estimator=lasso, X=X_train_scaled, y=y_train, cv=10)
pred = lasso.fit(X_train_scaled, y_train).predict(X_test_scaled)
r2score_lasso = r2_score(y_true=y_test, y_pred=pred)
rmse_lasso = np.sqrt(mean_squared_error(y_true=y_test, y_pred=pred))

In [25]:
# Hold-out R^2 followed by hold-out RMSE for the Lasso model.
print('---Lasso---', r2score_lasso, rmse_lasso, sep='\n')

---Lasso---
0.8830015715950751
26453.620443281423


In [26]:
# Choose the Ridge penalty from 200 log-spaced candidates between 1e0 and 1e5,
# then build an unfitted Ridge with the selected penalty.
# `store_cv_values=True` was dropped: the stored per-alpha values are not read
# by any visible cell, and the keyword was renamed to `store_cv_results` in
# scikit-learn 1.5 (and later removed), so passing it breaks on newer versions.
# The selected `alpha_` does not depend on it.
ridge_search = RidgeCV(alphas=np.logspace(0, 5, 200))
ridge_search.fit(X_train_scaled, y_train)
ridge = Ridge(alpha=ridge_search.alpha_)

In [27]:
# 10-fold CV scores on the training split (stored in `score2`; not printed in
# the following cell), then fit on the training split and score the hold-out.
score2 = cross_val_score(estimator=ridge, X=X_train_scaled, y=y_train, cv=10)
pred = ridge.fit(X_train_scaled, y_train).predict(X_test_scaled)
r2score = r2_score(y_true=y_test, y_pred=pred)
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=pred))

In [28]:
# Hold-out R^2 followed by hold-out RMSE for the Ridge model.
print('---Ridge---', r2score, rmse, sep='\n')

---Ridge---
0.8857236890065557
26144.07074830242


In [29]:
# Map each feature name to its fitted Ridge coefficient.
ridge_coeff = {name: weight for name, weight in zip(features, ridge.coef_)}

In [30]:
# Tabulate the mapping: one row per feature with its coefficient.
df_ridge_coeff = pd.DataFrame(
    {
        'features': list(ridge_coeff.keys()),
        'coefficient': list(ridge_coeff.values()),
    }
)

In [31]:
# Coefficient magnitude, used to rank features regardless of sign.
df_ridge_coeff['abs_coefficient'] = df_ridge_coeff['coefficient'].abs()

In [32]:
# Names of the 20 features with the largest absolute Ridge coefficients.
feature_for_model2 = (
    df_ridge_coeff
    .sort_values(by='abs_coefficient', ascending=False)
    .iloc[:20]
    .loc[:, 'features']
    .to_list()
)

In [33]:
# Show the 20 rows with the largest absolute coefficients.
df_ridge_coeff.sort_values(by='abs_coefficient', ascending=False).iloc[:20]

Unnamed: 0,features,coefficient,abs_coefficient
22,gr_liv_area,12238.820523,12238.820523
32,total_sf,10430.062498,10430.062498
4,overall_qual,9883.549132,9883.549132
14,bsmtfin_sf_1,7365.789647,7365.789647
82,neighborhood_NridgHt,7142.794793,7142.794793
1,lot_area,6729.989421,6729.989421
8,exter_qual,5675.827043,5675.827043
88,neighborhood_StoneBr,5661.767532,5661.767532
25,kitchen_qual,5564.550609,5564.550609
6,year_built,5413.763822,5413.763822


In [34]:
# List the selected feature names that feed the reduced models below.
print(feature_for_model2)

['gr_liv_area', 'total_sf', 'overall_qual', 'bsmtfin_sf_1', 'neighborhood_NridgHt', 'lot_area', 'exter_qual', 'neighborhood_StoneBr', 'kitchen_qual', 'year_built', 'total_bsmt_sf', 'bsmt_exposure', 'overall_cond', '1st_flr_sf', 'sale_type_New', 'exterior_1st_BrkFace', 'neighborhood_NoRidge', 'totrms_abvgrd', 'bldg_type_TwnhsE', 'bsmt_cond']


In [35]:
# Reduced model: restrict the design matrix to the 20 selected features;
# the target is still `saleprice`.
X, y = train[feature_for_model2], train['saleprice']

In [36]:
# 80/20 split of the reduced feature set with seed 15.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=15
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(1637, 20) (410, 20) (1637,) (410,)


In [37]:
# Standardise the reduced feature set: statistics come from the training
# split and are re-used for the held-out split.
ss = StandardScaler().fit(X_train)
X_train_scale = ss.transform(X_train)
X_test_scale = ss.transform(X_test)



In [38]:
# Lasso on the reduced feature set: cross-validated penalty search over 300
# candidate alphas, then an unfitted Lasso with that penalty and tol=0.1.
lasso_search = LassoCV(n_alphas=300).fit(X_train_scale, y_train)
lasso = Lasso(alpha=lasso_search.alpha_, tol=0.1)




In [39]:
# 10-fold CV scores on the training split (stored in `score`; not printed in
# the following cell).
score = cross_val_score(estimator=lasso, X=X_train_scale, y=y_train, cv=10)

# Fit on the training split and evaluate on the held-out rows.
pred = lasso.fit(X_train_scale, y_train).predict(X_test_scale)
r2score_lasso_new = r2_score(y_true=y_test, y_pred=pred)
rmse_lasso_new = np.sqrt(mean_squared_error(y_true=y_test, y_pred=pred))

In [40]:
# Hold-out R^2 followed by hold-out RMSE for the reduced-feature Lasso.
print('---Lasso---', r2score_lasso_new, rmse_lasso_new, sep='\n')

---Lasso---
0.8592987340184035
29009.758038459237


In [41]:
# Ridge on the reduced feature set: choose the penalty from 200 log-spaced
# candidates between 1e0 and 1e5, then build an unfitted Ridge with it.
# `store_cv_values=True` was dropped: the stored per-alpha values are not read
# by any visible cell, and the keyword was renamed to `store_cv_results` in
# scikit-learn 1.5 (and later removed), so passing it breaks on newer versions.
# The selected `alpha_` does not depend on it.
ridge_search = RidgeCV(alphas=np.logspace(0, 5, 200))
ridge_search.fit(X_train_scale, y_train)

ridge = Ridge(alpha=ridge_search.alpha_)

In [42]:
# 10-fold CV scores on the training split (stored in `score`; not printed in
# the following cell).
score = cross_val_score(estimator=ridge, X=X_train_scale, y=y_train, cv=10)

# Fit on the training split and evaluate on the held-out rows.
pred = ridge.fit(X_train_scale, y_train).predict(X_test_scale)
r2_score_new_ridge = r2_score(y_true=y_test, y_pred=pred)
rmse_new_ridge = np.sqrt(mean_squared_error(y_true=y_test, y_pred=pred))

In [43]:
# Hold-out R^2 followed by hold-out RMSE for the reduced-feature Ridge.
print('---Ridge---', r2_score_new_ridge, rmse_new_ridge, sep='\n')

---Ridge---
0.8667139402621421
28234.978971784512


In [44]:
# Ridge on the 20 selected features for the Kaggle submission.
# NOTE(review): the cells that follow use only X_train / y_train from this
# split; the held-out 20% is not scored there. Consider fitting the submission
# model on all of `train` instead.
X, y = train[feature_for_model2], train['saleprice']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=50
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))




(1637, 20) (410, 20) (1637,) (410,)


In [45]:
# Fit the scaler on the training split and apply it to the selected columns
# of the `test` frame.
# Note: from this cell on, `X_test_scaled` holds the scaled `test` frame,
# not the held-out part of the train/test split.
ss = StandardScaler().fit(X_train)

X_train_scaled = ss.transform(X_train)
X_test_scaled = ss.transform(test[feature_for_model2])

In [46]:
# Choose the Ridge penalty for the submission model from 200 log-spaced
# candidates between 1e0 and 1e5, then build an unfitted Ridge with it.
# `store_cv_values=True` was dropped: the stored per-alpha values are not read
# by any visible cell, and the keyword was renamed to `store_cv_results` in
# scikit-learn 1.5 (and later removed), so passing it breaks on newer versions.
# The selected `alpha_` does not depend on it.
ridge_search = RidgeCV(alphas=np.logspace(0, 5, 200))
ridge_search.fit(X_train_scaled, y_train)
ridge = Ridge(alpha=ridge_search.alpha_)

In [47]:
# 10-fold CV scores on the training split (stored in `score`, not reported).
score = cross_val_score(estimator=ridge, X=X_train_scaled, y=y_train, cv=10)

# Fit on the training split and predict for the scaled `test` frame.
prediction = ridge.fit(X_train_scaled, y_train).predict(X_test_scaled)

In [48]:
# Write the Ridge predictions for the `test` frame to a submission CSV.
sub_df(prediction,'../datasets/ridge_sub_featured.csv')

Ridge model (Kaggle scores):
<br>Public score: 31534.15730
<br>Private score: 31990.36381

In [49]:
# Lasso on the 20 selected features for the Kaggle submission.
# NOTE(review): the cells that follow use only X_train / y_train from this
# split; the held-out 20% is not scored there. Consider fitting the submission
# model on all of `train` instead.
X, y = train[feature_for_model2], train['saleprice']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=50
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))



(1637, 20) (410, 20) (1637,) (410,)


In [50]:
# Fit the scaler on the training split and apply it to the selected columns
# of the `test` frame.
# Note: `X_test_scaled` holds the scaled `test` frame here, not the held-out
# part of the train/test split.
ss = StandardScaler().fit(X_train)
X_train_scaled = ss.transform(X_train)
X_test_scaled = ss.transform(test[feature_for_model2])

In [51]:
# Cross-validated choice of the Lasso penalty (300 candidate alphas), then an
# unfitted Lasso with that penalty and tol=0.1.
lasso_search = LassoCV(n_alphas=300).fit(X_train_scaled, y_train)
lasso = Lasso(alpha=lasso_search.alpha_, tol=0.1)

# 5-fold CV scores on the training split (stored in `score`, not reported).
score = cross_val_score(estimator=lasso, X=X_train_scaled, y=y_train, cv=5)

# Fit on the training split and predict for the scaled `test` frame.
prediction_lasso = lasso.fit(X_train_scaled, y_train).predict(X_test_scaled)



In [52]:
# Write the Lasso predictions for the `test` frame to a submission CSV.
sub_df(prediction_lasso,'../datasets/lasso_sub_featured.csv')

In [53]:
# Show the 20 largest coefficients by signed value (most positive first).
df_ridge_coeff.sort_values('coefficient', ascending=False).iloc[:20]

Unnamed: 0,features,coefficient,abs_coefficient
22,gr_liv_area,12238.820523,12238.820523
32,total_sf,10430.062498,10430.062498
4,overall_qual,9883.549132,9883.549132
14,bsmtfin_sf_1,7365.789647,7365.789647
82,neighborhood_NridgHt,7142.794793,7142.794793
1,lot_area,6729.989421,6729.989421
8,exter_qual,5675.827043,5675.827043
88,neighborhood_StoneBr,5661.767532,5661.767532
25,kitchen_qual,5564.550609,5564.550609
6,year_built,5413.763822,5413.763822


Lasso model (Kaggle scores):
<br>Public score: 31377.89468
<br>Private score: 32009.04247