In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import r2_score,mean_squared_error,make_scorer
import seaborn as sns

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Load the training data into `train`.
# NOTE(review): this absolute path raised FileNotFoundError the last time the cell ran
# (see the traceback right after this cell) — confirm where train.csv actually lives
# (e.g. under a dataset `input/` directory) before re-running.
train=pd.read_csv('/house-prices-advanced-regression-techniques/train.csv')

FileNotFoundError: [Errno 2] No such file or directory: '/house-prices-advanced-regression-techniques/train.csv'

In [None]:
print(train.shape)

Let's look at the 20 variables with the most null values.

In [None]:
train.isnull().sum().sort_values(ascending=False)[0:20]

Since a significant share of the values in the columns 'PoolQC', 'MiscFeature', 'Alley', 'Fence' and 'FireplaceQu' are null, it's better to remove those variables: with so little data they don't have a significant influence in predicting the sale price.

In [None]:
# Remove the five mostly-empty columns from `train` in place.
train.drop(
    columns=['PoolQC','MiscFeature','Alley','Fence','FireplaceQu'],
    inplace=True,
)

In [None]:
train.isnull().sum().sort_values(ascending=False)[0:20]

# Outlier treatment

It is said that there are outliers in the variable 'GrLivArea'.
The [documentation](http://ww2.amstat.org/publications/jse/v19n3/Decock/DataDocumentation.txt) for the Ames Housing Data indicates that there are outliers present in the training data.
Let's see.

In [None]:
plt.scatter(x=train['GrLivArea'],y=train['SalePrice'])
plt.xlabel('GrLivArea')
plt.ylabel('SalePrice')
plt.show()

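Four data points look like outliers in the plot. But the two points with a high sale price (over 700000) seem to follow the linear trend. However, we can easily conclude that the two points with the largest 'GrLivArea' and a low sale price are outliers, so we remove them below.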

In [None]:
train['GrLivArea'].sort_values(ascending=False)[:2]

In [None]:
train=train[(train['GrLivArea']<4500)]

# Data analysis

In [None]:
plt.scatter(x=train['GrLivArea'],y=train['SalePrice'])
plt.xlabel('GrLivArea')
plt.ylabel('SalePrice')
plt.show()

In [None]:
sns.distplot(train['SalePrice'])
print('skewness:',train['SalePrice'].skew())

We can easily conclude that Id has no significance in predicting the sale price. So we can drop it to avoid disturbing our model.

In [None]:
# Keep the Id column aside and remove it from the feature frame in one step.
train_id=train.pop('Id')

Now lets do some research on some variables.

In [None]:
train.columns

Majority of the houses comes under the categories 20,60,50. In other words, most of the houses are 1-STORY 1946 & NEWER ALL STYLES, 2-STORY 1946 & NEWER and 1-1/2 STORY FINISHED ALL AGES respectively.

In [None]:
train['MSSubClass'].value_counts()


Majority of the houses are classified as Residential Low Density.


In [None]:
train['MSZoning'].value_counts()

Now lets see the relationship between Lot size and sale price.

In [None]:
fig,ax=plt.subplots(figsize=(9,11))
sns.scatterplot(ax=ax,x=train['LotArea'],y=train['SalePrice'])

Well, we can't see any significant relationship between lot size and sale price. But we can say that the majority of the houses have a lot area in the range between 0 and 25000 square feet.

Lets analyse the densities within the categorical variables with respect to our target variable sale price.

In [None]:
fig,ax=plt.subplots(figsize=(9,11))
sns.violinplot(ax=ax,x=train['OverallQual'],y=train['SalePrice'])

Sale price increases with overall quality, which is expected. We can expect to see similar trends in the quality- and condition-based variables later on.

In [None]:
fig,ax=plt.subplots(figsize=(9,11))
sns.violinplot(ax=ax,x=train['OverallCond'],y=train['SalePrice'])

This is surprising. The majority of the sale prices are distributed similarly with respect to the overall condition of the house, although there are a few exceptions with higher sale prices for ratings above 5.

In [None]:
plt.scatter(data=train,x='YearBuilt',y='SalePrice')

We can see that the sale prices increases with the year built. Newer the house, higher the price.

Now lets display a pairplot by picking few numerical variables that seem important.

In [None]:
cols=train[['TotalBsmtSF','GrLivArea','TotRmsAbvGrd','YrSold','OverallQual','GarageArea','SalePrice']]
sns.pairplot(cols)

We can see a linear relationship between sale price and year built (from the earlier scatter plot), total basement area, ground living area, total number of rooms above ground, overall quality and garage area. However, the relationship with year sold is weak.

# Treating skewed data

Now lets see the skewness of our target variable

In [None]:
sns.distplot(train['SalePrice'])
print('skewness before transforming:',train['SalePrice'].skew())

The saleprice is right skewed. Thus using log transformation would do the job.

In [None]:
train['SalePrice']=np.log1p(train['SalePrice'])
sns.distplot(train['SalePrice'])
print('skewness after transform:',train['SalePrice'].skew())

# Treating missing values

In [None]:
train.isnull().sum().sort_values(ascending=False)[:30]

Lets begin with lot frontage variable first.

Since we have to fit values that are most likely, we need to see how  it is distributed. I'm taking sale price as second variable to see how it is related with target variable.

In [None]:
plt.scatter(data=train,x='SalePrice',y='LotFrontage')
plt.xlabel('sale price')
plt.ylabel('lotfrontage')

This is the function I've used to impute where I have fitted the average value in certain ranges of sale prices.

In [None]:
def impute_lot(cols):
    """Fill a missing LotFrontage with the mean LotFrontage of its SalePrice band.

    Meant to be used with ``DataFrame.apply(..., axis=1)`` on the columns
    ``['LotFrontage', 'SalePrice']`` (in that order).

    Parameters
    ----------
    cols : pandas.Series
        One row; position 0 is LotFrontage, position 1 is SalePrice.

    Returns
    -------
    float
        The row's own LotFrontage when it is present. Otherwise the mean
        LotFrontage of the rows of the module-level ``train`` frame whose
        SalePrice lies in the same band as this row:
        ``<= 11.0``, ``(11.0, 12.5]`` or ``> 12.5``.
    """
    # Use .iloc for positional access: plain cols[0] on a label-indexed Series
    # relies on a deprecated positional fallback.
    l=cols.iloc[0]
    s=cols.iloc[1]
    if not pd.isnull(l):
        return l
    price=train['SalePrice']
    if s<=11.0:
        band=price<=11.0
    elif s<=12.5:
        band=(price>11.0)&(price<=12.5)
    else:
        # Also reached when s is NaN, because both comparisons above are False.
        band=price>12.5
    return train.loc[band,'LotFrontage'].mean()
train['LotFrontage']=train[['LotFrontage','SalePrice']].apply(impute_lot,axis=1)

In [None]:
train.isnull().sum().sort_values(ascending=False)[:15]

Imputing garage year built variable

Assuming that the garage year built is related to the garage type, I'm plotting a scatterplot to check the distribution.

In [None]:
fig,ax=plt.subplots(figsize=(8,10))
sns.scatterplot(data=train,x='GarageYrBlt',y='SalePrice',hue='GarageType')
print(train['GarageType'].value_counts())

It looks like detached garage type has saleprice value lower than 12. Lets verify

In [None]:
print(train[train['SalePrice']>11.8]['GarageType'].value_counts())
print(train[train['SalePrice']<=11.8]['GarageType'].value_counts())

Since most of the values are either attached or detached, we can impute the missing entries with one of those two values based on the respective sale price.

In [None]:
def impute_gtype(cols):
    """Fill a missing GarageType from the row's SalePrice.

    Meant to be used with ``DataFrame.apply(..., axis=1)`` on the columns
    ``['GarageType', 'SalePrice']`` (in that order).

    Parameters
    ----------
    cols : pandas.Series
        One row; position 0 is GarageType, position 1 is SalePrice.

    Returns
    -------
    The existing GarageType when present; otherwise ``'Detchd'`` when
    SalePrice is at most 11.8 and ``'Attchd'`` in every other case.
    """
    # Use .iloc for positional access: plain cols[0] on a label-indexed Series
    # relies on a deprecated positional fallback.
    g=cols.iloc[0]
    s=cols.iloc[1]
    if not pd.isnull(g):
        return g
    return 'Detchd' if s<=11.8 else 'Attchd'
train['GarageType']=train[['GarageType','SalePrice']].apply(impute_gtype,axis=1)

In [None]:
train.isnull().sum().sort_values(ascending=False)[:15]

It's best to check which variables are highly correlated with the variable you're going to impute. This ensures that we're imputing a highly likely value.

In [None]:
# `train` still holds string-valued columns at this point (dummies are created later),
# and DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so restrict the
# correlation matrix to the numeric columns explicitly.
corr_data=train.select_dtypes(include=np.number).corr()
# Ten variables most positively correlated with GarageYrBlt (itself included).
corr_data['GarageYrBlt'].sort_values(ascending=False)[:10]

In [None]:
sns.scatterplot(data=train,x='GarageYrBlt',y='YearBuilt')

If you look closely, year built and garage year built are same at most of the occasions.

In [None]:
sns.scatterplot(data=train,x='GarageYrBlt',y='YearBuilt')
plt.xlim(1900,)
plt.ylim(1900,)

yes it is. So lets just equate them and impute.

In [None]:
def impute_gyear(cols):
    """Fill a missing GarageYrBlt with the row's YearBuilt.

    Meant to be used with ``DataFrame.apply(..., axis=1)`` on the columns
    ``['GarageYrBlt', 'YearBuilt']`` (in that order).

    Parameters
    ----------
    cols : pandas.Series
        One row; position 0 is GarageYrBlt, position 1 is YearBuilt.

    Returns
    -------
    The existing GarageYrBlt when present, otherwise YearBuilt.
    """
    # Use .iloc for positional access: plain cols[0] on a label-indexed Series
    # relies on a deprecated positional fallback.
    g=cols.iloc[0]
    y=cols.iloc[1]
    return y if pd.isnull(g) else g

In [None]:
train['GarageYrBlt']=train[['GarageYrBlt','YearBuilt']].apply(impute_gyear,axis=1)
train.isnull().sum().sort_values(ascending=False)[:15]

In [None]:
train['GarageFinish'].value_counts()

Well, all three values are distributed similarly. We need to dig deeper to see a pattern.

In [None]:
fig,ax=plt.subplots(figsize=(8,10))
sns.scatterplot(data=train,x='GarageYrBlt',y='SalePrice',hue='GarageFinish')

Great. We can see clearly that most of the garages that are unfinished have lower sale price.

In [None]:
print(train[train['SalePrice']<12]['GarageFinish'].value_counts())
print(train[(train['SalePrice']>12)&(train['SalePrice']<=12.5)]['GarageFinish'].value_counts())
print(train[train['SalePrice']>12.5]['GarageFinish'].value_counts())

Lets fill mode values of these respective ranges of sale price.

In [None]:
def impute_gfinish(cols):
    """Fill a missing GarageFinish from the row's SalePrice band.

    Meant to be used with ``DataFrame.apply(..., axis=1)`` on the columns
    ``['GarageFinish', 'SalePrice']`` (in that order).

    Parameters
    ----------
    cols : pandas.Series
        One row; position 0 is GarageFinish, position 1 is SalePrice.

    Returns
    -------
    The existing GarageFinish when present; otherwise ``'Unf'`` for
    SalePrice <= 12.0, ``'RFn'`` for 12.0 < SalePrice <= 12.5 and
    ``'Fin'`` in every other case.
    """
    # Use .iloc for positional access: plain cols[0] on a label-indexed Series
    # relies on a deprecated positional fallback.
    g=cols.iloc[0]
    s=cols.iloc[1]
    if not pd.isnull(g):
        return g
    if s<=12.0:
        return 'Unf'
    # s > 12.0 is already implied here (or s is NaN, which falls through to 'Fin').
    if s<=12.5:
        return 'RFn'
    return 'Fin'

In [None]:
train['GarageFinish']=train[['GarageFinish','SalePrice']].apply(impute_gfinish,axis=1)
train.isnull().sum().sort_values(ascending=False)[:15]

In [None]:
train['GarageQual'].value_counts()

Easy peasy. We can safely impute all missing values as 'TA'.

In [None]:
# Assign the filled column back: fillna(inplace=True) on a column selection is chained
# assignment and leaves `train` unchanged under pandas Copy-on-Write.
train['GarageQual']=train['GarageQual'].fillna('TA')

In [None]:
train.isnull().sum().sort_values(ascending=False)[:10]

In [None]:
train['GarageCond'].value_counts()

Similar situation again

In [None]:
# Assign the filled column back: fillna(inplace=True) on a column selection is chained
# assignment and leaves `train` unchanged under pandas Copy-on-Write.
train['GarageCond']=train['GarageCond'].fillna('TA')
train.isnull().sum().sort_values(ascending=False)[:10]

In [None]:
train['BsmtFinType2'].value_counts()

In [None]:
# Assign the filled column back: fillna(inplace=True) on a column selection is chained
# assignment and leaves `train` unchanged under pandas Copy-on-Write.
train['BsmtFinType2']=train['BsmtFinType2'].fillna('Unf')
train.isnull().sum().sort_values(ascending=False)[:10]

In [None]:
train['BsmtExposure'].value_counts()

In [None]:
# Assign the filled column back: fillna(inplace=True) on a column selection is chained
# assignment and leaves `train` unchanged under pandas Copy-on-Write.
train['BsmtExposure']=train['BsmtExposure'].fillna('No')
train.isnull().sum().sort_values(ascending=False)[:10]

In [None]:
train['BsmtQual'].value_counts()

In [None]:
sns.violinplot(data=train,y='SalePrice',x='BsmtQual')

In [None]:
print(train[train['SalePrice']>12]['BsmtQual'].value_counts())
print(train[train['SalePrice']<=12]['BsmtQual'].value_counts())

In [None]:
def impute_bqual(cols):
    """Fill a missing BsmtQual from the row's SalePrice.

    Meant to be used with ``DataFrame.apply(..., axis=1)`` on the columns
    ``['BsmtQual', 'SalePrice']`` (in that order).

    Parameters
    ----------
    cols : pandas.Series
        One row; position 0 is BsmtQual, position 1 is SalePrice.

    Returns
    -------
    The existing BsmtQual when present; otherwise ``'TA'`` when
    SalePrice is at most 12.0 and ``'Gd'`` in every other case.
    """
    # Use .iloc for positional access: plain cols[0] on a label-indexed Series
    # relies on a deprecated positional fallback.
    b=cols.iloc[0]
    s=cols.iloc[1]
    if not pd.isnull(b):
        return b
    # Branch on the sale price `s`. Comparing the (null) quality value `b` against
    # 12.0 is never true, which made every missing value come out as 'Gd'.
    return 'TA' if s<=12.0 else 'Gd'

In [None]:
train['BsmtQual']=train[['BsmtQual','SalePrice']].apply(impute_bqual,axis=1)
train.isnull().sum().sort_values(ascending=False)[:10]

In [None]:
train['BsmtCond'].value_counts()

In [None]:
# Assign the filled column back: fillna(inplace=True) on a column selection is chained
# assignment and leaves `train` unchanged under pandas Copy-on-Write.
train['BsmtCond']=train['BsmtCond'].fillna('TA')
train.isnull().sum().sort_values(ascending=False)[:10]

In [None]:
train['BsmtFinType1'].value_counts()

In [None]:
plt.subplots(figsize=(8,9))
sns.violinplot(data=train,y='SalePrice',x='BsmtFinType1')

In [None]:
print(train[train['SalePrice']>12]['BsmtFinType1'].value_counts())
print(train[train['SalePrice']<=12]['BsmtFinType1'].value_counts())

In [None]:
def impute_bfin1(cols):
    """Fill a missing BsmtFinType1 from the row's SalePrice.

    Meant to be used with ``DataFrame.apply(..., axis=1)`` on the columns
    ``['BsmtFinType1', 'SalePrice']`` (in that order).

    Parameters
    ----------
    cols : pandas.Series
        One row; position 0 is BsmtFinType1, position 1 is SalePrice.

    Returns
    -------
    The existing BsmtFinType1 when present; otherwise ``'Unf'`` when
    SalePrice is at most 12.0 and ``'GLQ'`` in every other case.
    """
    # Use .iloc for positional access: plain cols[0] on a label-indexed Series
    # relies on a deprecated positional fallback.
    b=cols.iloc[0]
    s=cols.iloc[1]
    if not pd.isnull(b):
        return b
    # Branch on the sale price `s`. Comparing the (null) finish type `b` against
    # 12.0 is never true, which made every missing value come out as 'GLQ'.
    return 'Unf' if s<=12.0 else 'GLQ'

In [None]:
train['BsmtFinType1']=train[['BsmtFinType1','SalePrice']].apply(impute_bfin1,axis=1)
train.isnull().sum().sort_values(ascending=False)[:10]

In [None]:
# Assign each filled column back: fillna(inplace=True) on a column selection is chained
# assignment and leaves `train` unchanged under pandas Copy-on-Write.
train['MasVnrType']=train['MasVnrType'].fillna('None')
# Fill the veneer area with 0 rather than the string 'None': a string would turn the
# column into object dtype, and pd.get_dummies would then one-hot encode every distinct
# area value. (Assumes MasVnrArea is numeric — confirm with train.dtypes.)
train['MasVnrArea']=train['MasVnrArea'].fillna(0)
train['Electrical']=train['Electrical'].fillna('SBrkr')
train.isnull().sum().sort_values(ascending=False)[:10]

Cheers.
We have treated all the missing values in the dataset. Now lets have a quick look at correlation within dataset.

In [None]:
# plt.subplots returns a (figure, axes) pair; unpack it so the heatmap can be drawn on
# an explicit axes object.
fig,ax=plt.subplots(figsize=(9,8))
# `train` still holds string-valued columns here (dummies are created in a later cell),
# and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
corr_data=train.select_dtypes(include=np.number).corr()
sns.heatmap(corr_data,ax=ax)

In order to train the model, it is important to make sure we don't have any categorical variables in the dataset. Thus we will create dummies for all categorical variables.

In [None]:
train=pd.get_dummies(train)

In [None]:
train.shape

In [None]:
scorer=make_scorer(mean_squared_error,greater_is_better=False)

readying the train,test datasets before training.

In [None]:
# Detach the target from the feature frame (pop removes the column from `train` in place),
# then hold out 20% of the rows with a fixed seed.
y=train.pop('SalePrice')
x_train,x_test,y_train,true_p=train_test_split(
    train,
    y,
    test_size=0.2,
    random_state=120,
)

In [None]:
print(x_train.shape,y_train.shape,x_test.shape,true_p.shape)

In [None]:
def rmse_cv_train(model):
    """Return the per-fold RMSE of `model` from 10-fold cross-validation on (x_train, y_train).

    `scorer` reports negated mean squared error, so the sign is flipped
    before taking the square root. The result has one entry per fold.
    """
    neg_mse=cross_val_score(model,x_train,y_train,scoring=scorer,cv=10)
    return np.sqrt(-neg_mse)
def rmse_cv_test(model):
    """Return the per-fold RMSE of `model` from 10-fold cross-validation on (x_test, true_p).

    `scorer` reports negated mean squared error, so the sign is flipped
    before taking the square root. The result has one entry per fold.

    NOTE(review): cross_val_score re-fits a clone of `model` on folds of the
    hold-out split, so this does not score the estimator fitted on x_train —
    confirm that this is the intended "test" metric.
    """
    neg_mse=cross_val_score(model,x_test,true_p,scoring=scorer,cv=10)
    return np.sqrt(-neg_mse)

In [None]:
from sklearn.linear_model import LinearRegression,RidgeCV,LassoCV

# Linear Regression

In [None]:
lreg=LinearRegression()
lreg.fit(x_train,y_train)
print('rmse value of train data:',rmse_cv_train(lreg).mean())
print('rmse value of test data:',rmse_cv_test(lreg).mean())

rmse value of train data looks weird here. Im not sure why. Lets see if that creates a problem later on.

In [None]:
train_pred=lreg.predict(x_train)
test_pred=lreg.predict(x_test)

In [None]:
# Residuals (prediction minus truth) against the predicted value, for both splits.
plt.scatter(x=train_pred,y=train_pred-y_train,c='blue',marker='s',label='train data')
plt.scatter(x=test_pred,y=test_pred-true_p,c='green',marker='s',label='test data')
plt.xlabel('predicted values')
plt.ylabel('residuals')
plt.title('linear regression')
plt.legend()  # the label= arguments above are only shown once a legend is drawn
plt.plot([10.0,13.5],[0.0,0.0],c='red')
plt.show()

# Predicted against real values; the red diagonal marks a perfect prediction.
plt.scatter(x=train_pred,y=y_train,c='blue',marker='s',label='train data')
plt.scatter(x=test_pred,y=true_p,c='green',marker='s',label='test data')
plt.xlabel('predicted values')
plt.ylabel('real values')
plt.title('linear regression')
plt.legend()
plt.plot([11,13.25],[11,13.25],c='red')
plt.show()

# r2_score takes (y_true, y_pred); R^2 is not symmetric, so the order matters.
print('accuracy:',r2_score(true_p,test_pred))

# Ridge regression

Accuracy looks pretty good though. Now lets start using regularised linear regression.

In [None]:
# Stage 1: coarse search over a wide alpha grid.
alphas=[0.01,0.03,0.07,0.1,0.3,0.6,1,3,5,7,10,30,60]
ridge=RidgeCV(alphas)
ridge.fit(x_train,y_train)
alpha=ridge.alpha_
print('best alpha:',alpha)

# Stage 2: refine around the coarse optimum (60% to 140% of it) with 10-fold CV.
alphas=[alpha]
alphas=(
    [alpha*factor for factor in (.6,.65,.7,.75,.8,.85,.9,.95)]
    +alphas
    +[alpha*factor for factor in (1.05,1.1,1.15,1.25,1.3,1.35,1.4)]
)
ridge=RidgeCV(alphas,cv=10)
ridge.fit(x_train,y_train)
alpha=ridge.alpha_
print('optimised alphas:',alpha)

print('rmse value of train data:',rmse_cv_train(ridge).mean())
print('rmse value of test data:',rmse_cv_test(ridge).mean())

Great!! rmse values of both train and test data have improved a lot.

In [None]:
train_pred=ridge.predict(x_train)
test_pred=ridge.predict(x_test)

In [None]:
# Residuals (prediction minus truth) against the predicted value, for both splits.
plt.scatter(x=train_pred,y=train_pred-y_train,c='blue',marker='s',label='train data')
plt.scatter(x=test_pred,y=test_pred-true_p,c='green',marker='s',label='test data')
plt.xlabel('predicted values')
plt.ylabel('residuals')
plt.title('ridge regression')
plt.legend()  # the label= arguments above are only shown once a legend is drawn
plt.plot([10.0,13.5],[0.0,0.0],c='red')
plt.show()

# Predicted against real values; the red diagonal marks a perfect prediction.
plt.scatter(x=train_pred,y=y_train,c='blue',marker='s',label='train data')
plt.scatter(x=test_pred,y=true_p,c='green',marker='s',label='test data')
plt.xlabel('predicted values')
plt.ylabel('real values')
plt.title('ridge regression')
plt.legend()
plt.plot([11,13.25],[11,13.25],c='red')
plt.show()

# r2_score takes (y_true, y_pred); R^2 is not symmetric, so the order matters.
print('accuracy:',r2_score(true_p,test_pred))

Now the accuracy has increased by roughly 3 percent. Let's move on to another regularised model, lasso regression, which eliminates insignificant variables and can improve the accuracy.

# Lasso Regression

In [None]:
# Stage 1: coarse search over a wide alpha grid.
lasso = LassoCV(
    alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1],
    max_iter=50000,
    cv=10,
)
lasso.fit(x_train, y_train)
alpha = lasso.alpha_
print('best alpha:',alpha)

# Stage 2: refine around the coarse optimum (60% to 140% of it).
lasso = LassoCV(
    alphas=(
        [alpha * factor for factor in (.6, .65, .7, .75, .8, .85, .9, .95)]
        + [alpha]
        + [alpha * factor for factor in (1.05, 1.1, 1.15, 1.25, 1.3, 1.35, 1.4)]
    ),
    max_iter=50000,
    cv=10,
)
lasso.fit(x_train, y_train)
alpha = lasso.alpha_
print('optimised alpha:',alpha)

print('rmse value of train data:',rmse_cv_train(lasso).mean())
print('rmse value of test data:',rmse_cv_test(lasso).mean())

As we can see, there is a very small improvement in the rmse values of the train and test data. Thus we can expect lasso regression to improve our model only minimally. Let's see.

In [None]:
train_pred=lasso.predict(x_train)
test_pred=lasso.predict(x_test)

In [None]:
# Residuals (prediction minus truth) against the predicted value, for both splits.
plt.scatter(x=train_pred,y=train_pred-y_train,c='blue',marker='s',label='train data')
plt.scatter(x=test_pred,y=test_pred-true_p,c='green',marker='s',label='test data')
plt.xlabel('predicted values')
plt.ylabel('residuals')
plt.title('lasso regression')
plt.legend()  # the label= arguments above are only shown once a legend is drawn
plt.plot([10.0,13.5],[0.0,0.0],c='red')
plt.show()

# Predicted against real values; the red diagonal marks a perfect prediction.
plt.scatter(x=train_pred,y=y_train,c='blue',marker='s',label='train data')
plt.scatter(x=test_pred,y=true_p,c='green',marker='s',label='test data')
plt.xlabel('predicted values')
plt.ylabel('real values')
plt.title('lasso regression')
plt.legend()
plt.plot([11,13.25],[11,13.25],c='red')
plt.show()

# r2_score takes (y_true, y_pred); R^2 is not symmetric, so the order matters.
print('accuracy:',r2_score(true_p,test_pred))

As we expected, there is a slight increase in the accuracy of the model.