# Loading the dataset from Kaggle
***House Prices dataset***

https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import os
os.environ['KAGGLE_CONFIG_DIR']='/content/gdrive/My Drive/RND4IMPACT'

In [8]:
%cd /content/gdrive/My Drive/RND4IMPACT

/content/gdrive/My Drive/RND4IMPACT


In [9]:
!ls

aisles.csv.zip					 products.csv.zip
data_description.txt				 sample_submission.csv
departments.csv.zip				 sample_submission.csv.zip
gender_submission.csv				 test.csv
house_price_model.sav				 test_split_titanic.csv
house_prices					 test_titanic.csv
house-prices-advanced-regression-techniques.zip  titanic_df_submission.csv
instacart-market-basket-analysis.zip		 titanic_model.sav
kaggle.json					 titanic.zip
order_products__prior.csv.zip			 train.csv
order_products__train.csv.zip			 train_titanic.csv
orders.csv.zip


In [10]:
!mkdir house_prices_folder

In [11]:
%cd house_prices_folder

/content/gdrive/MyDrive/RND4IMPACT/house_prices_folder


In [12]:
!kaggle competitions download -c house-prices-advanced-regression-techniques

Downloading house-prices-advanced-regression-techniques.zip to /content/gdrive/My Drive/RND4IMPACT/house_prices_folder
  0% 0.00/199k [00:00<?, ?B/s]
100% 199k/199k [00:00<00:00, 25.4MB/s]


In [13]:
!ls

house-prices-advanced-regression-techniques.zip


In [14]:
!unzip house-prices-advanced-regression-techniques.zip

Archive:  house-prices-advanced-regression-techniques.zip
  inflating: data_description.txt    
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [15]:
!ls

data_description.txt				 test.csv
house-prices-advanced-regression-techniques.zip  train.csv
sample_submission.csv


In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import KMeans
import seaborn as sns

In [45]:
sample_df = pd.read_csv('sample_submission.csv')

In [46]:
sample_df.head()

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977


In [17]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# Data wrangling

In [18]:
df.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [19]:
df[['Street','Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle']].isnull().sum()

Street             0
Alley           1369
LotShape           0
LandContour        0
Utilities          0
LotConfig          0
LandSlope          0
Neighborhood       0
Condition1         0
Condition2         0
BldgType           0
HouseStyle         0
dtype: int64

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [21]:
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [22]:
features_with_na=[features for features in df.columns if df[features].isnull().sum()>1]
for feature in features_with_na:
    print(feature, np.round(df[feature].isnull().mean(),4), '% missing values')

LotFrontage 0.1774 % missing values
Alley 0.9377 % missing values
MasVnrType 0.0055 % missing values
MasVnrArea 0.0055 % missing values
BsmtQual 0.0253 % missing values
BsmtCond 0.0253 % missing values
BsmtExposure 0.026 % missing values
BsmtFinType1 0.0253 % missing values
BsmtFinType2 0.026 % missing values
FireplaceQu 0.4726 % missing values
GarageType 0.0555 % missing values
GarageYrBlt 0.0555 % missing values
GarageFinish 0.0555 % missing values
GarageQual 0.0555 % missing values
GarageCond 0.0555 % missing values
PoolQC 0.9952 % missing values
Fence 0.8075 % missing values
MiscFeature 0.963 % missing values


# Data cleansing and wrangling

In [23]:
df['LotFrontage'].fillna(df['LotFrontage'].median(), inplace = True)

In [24]:
df['Alley'].fillna(method='ffill', inplace=True)
df['MasVnrType'].fillna(method='ffill', inplace=True)
df['MasVnrArea'].fillna(method='ffill', inplace=True)
df['BsmtQual'].fillna(method='ffill', inplace=True)
df['BsmtCond'].fillna(method='ffill', inplace=True)
df['BsmtExposure'].fillna(method='ffill', inplace=True) 
df['BsmtFinType1'].fillna(method='ffill', inplace=True) 
df['BsmtFinType2'].fillna(method='ffill', inplace=True) 
df['FireplaceQu'].fillna(method='ffill', inplace=True) 
df['GarageType'].fillna(method='ffill', inplace=True)
df['GarageYrBlt'].fillna(method='ffill', inplace=True)
df['GarageFinish'].fillna(method='ffill', inplace=True) 
df['GarageQual'].fillna(method='ffill', inplace=True)
df['GarageCond'].fillna(method='ffill', inplace=True) 
df['PoolQC'].fillna(method='ffill', inplace=True) 
df['Fence'].fillna(method='ffill', inplace=True) 
df['MiscFeature'].fillna(method='ffill', inplace=True) 

In [25]:
features_with_na=[features for features in df.columns if df[features].isnull().sum()>1]
for feature in features_with_na:
    print(feature, np.round(df[feature].isnull().mean(),4), '% missing values')

Alley 0.0144 % missing values
PoolQC 0.1349 % missing values
Fence 0.0034 % missing values
MiscFeature 0.0034 % missing values


In [26]:
for col in ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope']:
    df[col] = pd.Categorical(df[col])
    df[col] = df[col].cat.codes

In [27]:
num_features = df[['LotFrontage','LotArea','1stFlrSF','GrLivArea','SalePrice','MSZoning','Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope']]

# Model training

In [28]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [29]:
X = df[['LotFrontage','LotArea','1stFlrSF','GrLivArea','MSZoning','Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope']]
y = df[['SalePrice']]
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [30]:
model = LinearRegression()
model.fit(X_train,y_train)

LinearRegression()

In [31]:
model.score(X_train,y_train)

0.5818132443584558

In [32]:
model2 = DecisionTreeRegressor()
model2.fit(X_train,y_train)

DecisionTreeRegressor()

In [33]:
model2.score(X_train, y_train)

0.9997818004967488

In [34]:
import pickle
# save the model to disk
filename = 'house_price_model.sav'
pickle.dump(model2, open(filename, 'wb'))

In [49]:
test_hp_df = pd.read_csv('test.csv')

In [50]:
test_hp_df.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [51]:
features_with_na=[features for features in test_hp_df.columns if test_hp_df[features].isnull().sum()>1]
for feature in features_with_na:
    print(feature, np.round(test_hp_df[feature].isnull().mean(),4), '% missing values')

MSZoning 0.0027 % missing values
LotFrontage 0.1556 % missing values
Alley 0.9267 % missing values
Utilities 0.0014 % missing values
MasVnrType 0.011 % missing values
MasVnrArea 0.0103 % missing values
BsmtQual 0.0302 % missing values
BsmtCond 0.0308 % missing values
BsmtExposure 0.0302 % missing values
BsmtFinType1 0.0288 % missing values
BsmtFinType2 0.0288 % missing values
BsmtFullBath 0.0014 % missing values
BsmtHalfBath 0.0014 % missing values
Functional 0.0014 % missing values
FireplaceQu 0.5003 % missing values
GarageType 0.0521 % missing values
GarageYrBlt 0.0535 % missing values
GarageFinish 0.0535 % missing values
GarageQual 0.0535 % missing values
GarageCond 0.0535 % missing values
PoolQC 0.9979 % missing values
Fence 0.8012 % missing values
MiscFeature 0.965 % missing values


In [52]:
test_hp_df['MSZoning'].fillna(method='ffill', inplace = True)
test_hp_df['LotFrontage'].fillna(test_hp_df['LotFrontage'].median(), inplace = True)
test_hp_df['Alley'].fillna(method='ffill', inplace=True)
test_hp_df['MasVnrType'].fillna(method='ffill', inplace=True)
test_hp_df['MasVnrArea'].fillna(method='ffill', inplace=True)
test_hp_df['BsmtQual'].fillna(method='ffill', inplace=True)
test_hp_df['BsmtCond'].fillna(method='ffill', inplace=True)
test_hp_df['BsmtExposure'].fillna(method='ffill', inplace=True) 
test_hp_df['BsmtFinType1'].fillna(method='ffill', inplace=True) 
test_hp_df['BsmtFinType2'].fillna(method='ffill', inplace=True) 
test_hp_df['FireplaceQu'].fillna(method='ffill', inplace=True) 
test_hp_df['GarageType'].fillna(method='ffill', inplace=True)
test_hp_df['GarageYrBlt'].fillna(method='ffill', inplace=True)
test_hp_df['GarageFinish'].fillna(method='ffill', inplace=True) 
test_hp_df['GarageQual'].fillna(method='ffill', inplace=True)
test_hp_df['GarageCond'].fillna(method='ffill', inplace=True) 
test_hp_df['PoolQC'].fillna(method='ffill', inplace=True) 
test_hp_df['Fence'].fillna(method='ffill', inplace=True) 
test_hp_df['MiscFeature'].fillna(method='ffill', inplace=True) 
test_hp_df['FireplaceQu'].fillna(method='ffill', inplace=True) 
test_hp_df['PoolQC'].fillna(method='ffill', inplace=True) 

In [53]:
features_with_na=[features for features in test_hp_df.columns if test_hp_df[features].isnull().sum()>1]
for feature in features_with_na:
    print(feature, np.round(test_hp_df[feature].isnull().mean(),4), '% missing values')

Alley 0.0247 % missing values
Utilities 0.0014 % missing values
BsmtFullBath 0.0014 % missing values
BsmtHalfBath 0.0014 % missing values
Functional 0.0014 % missing values
FireplaceQu 0.0014 % missing values
PoolQC 0.3523 % missing values


In [54]:
for col in ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope']:
    test_hp_df[col] = pd.Categorical(test_hp_df[col])
    test_hp_df[col] = test_hp_df[col].cat.codes

In [55]:
test_hp_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,2,80.0,11622,1,,3,3,0,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,3,81.0,14267,1,,0,3,0,...,0,0,,MnPrv,Gar2,12500,6,2010,WD,Normal
2,1463,60,3,74.0,13830,1,,0,3,0,...,0,0,,MnPrv,Gar2,0,3,2010,WD,Normal
3,1464,60,3,78.0,9978,1,,0,3,0,...,0,0,,MnPrv,Gar2,0,6,2010,WD,Normal
4,1465,120,3,43.0,5005,1,,0,1,0,...,144,0,,MnPrv,Gar2,0,1,2010,WD,Normal


In [56]:
test_hp_df = test_hp_df[['Id','LotFrontage','LotArea','1stFlrSF','GrLivArea','MSZoning','Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope']]

In [57]:
test_hp_df.head()

Unnamed: 0,Id,LotFrontage,LotArea,1stFlrSF,GrLivArea,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope
0,1461,80.0,11622,896,896,2,1,3,3,0,4,0
1,1462,81.0,14267,1329,1329,3,1,0,3,0,0,0
2,1463,74.0,13830,928,1629,3,1,0,3,0,4,0
3,1464,78.0,9978,926,1604,3,1,0,3,0,4,0
4,1465,43.0,5005,1280,1280,3,1,0,1,0,4,0


In [58]:
X = StandardScaler().fit_transform(test_hp_df)

In [59]:
hp_preds = model2.predict(test_hp_df[['LotFrontage','LotArea','1stFlrSF','GrLivArea','MSZoning','Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope']])

  f"X has feature names, but {self.__class__.__name__} was fitted without"


In [64]:
hp_preds_linear = model2.predict(test_hp_df[['LotFrontage','LotArea','1stFlrSF','GrLivArea','MSZoning','Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope']])

  f"X has feature names, but {self.__class__.__name__} was fitted without"


In [60]:
test_hp_df['SalePrice'] = hp_preds
test_hp_df.head(5)

Unnamed: 0,Id,LotFrontage,LotArea,1stFlrSF,GrLivArea,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,SalePrice
0,1461,80.0,11622,896,896,2,1,3,3,0,4,0,184750.0
1,1462,81.0,14267,1329,1329,3,1,0,3,0,0,0,184750.0
2,1463,74.0,13830,928,1629,3,1,0,3,0,4,0,184750.0
3,1464,78.0,9978,926,1604,3,1,0,3,0,4,0,184750.0
4,1465,43.0,5005,1280,1280,3,1,0,1,0,4,0,184750.0


In [65]:
test_hp_df['SalePrice'] = hp_preds_linear
test_hp_df.head(5)

Unnamed: 0,Id,LotFrontage,LotArea,1stFlrSF,GrLivArea,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,SalePrice
0,1461,80.0,11622,896,896,2,1,3,3,0,4,0,184750.0
1,1462,81.0,14267,1329,1329,3,1,0,3,0,0,0,184750.0
2,1463,74.0,13830,928,1629,3,1,0,3,0,4,0,184750.0
3,1464,78.0,9978,926,1604,3,1,0,3,0,4,0,184750.0
4,1465,43.0,5005,1280,1280,3,1,0,1,0,4,0,184750.0


In [67]:
hp_preds_linear

array([184750., 184750., 184750., ..., 184750., 184750., 184750.])

In [66]:
test_hp_df.head(1)

Unnamed: 0,Id,LotFrontage,LotArea,1stFlrSF,GrLivArea,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,SalePrice
0,1461,80.0,11622,896,896,2,1,3,3,0,4,0,184750.0


In [62]:
sample_df.head(1)

Unnamed: 0,Id,SalePrice
0,1461,169277.052498


In [63]:
house_price_submission = test_hp_df[['Id','SalePrice']]
house_price_submission.head(5)

Unnamed: 0,Id,SalePrice
0,1461,184750.0
1,1462,184750.0
2,1463,184750.0
3,1464,184750.0
4,1465,184750.0
