# House Price Prediction

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

import plotly.express as px
import plotly.figure_factory as ff

In [2]:
df = pd.read_csv("data/houseprice.csv")

In [3]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
df.shape

(1460, 81)

In [5]:
df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [7]:
missing_value = pd.DataFrame(df.isnull().sum().sort_values(ascending=False), columns = ["missing_count"])

missing_value["missing_percent"] = missing_value["missing_count"].apply(lambda x : round(float(x) * 100/df.shape[0], 2))

missing_value = missing_value[missing_value["missing_count"] > 0]

missing_value

Unnamed: 0,missing_count,missing_percent
PoolQC,1453,99.52
MiscFeature,1406,96.3
Alley,1369,93.77
Fence,1179,80.75
MasVnrType,872,59.73
FireplaceQu,690,47.26
LotFrontage,259,17.74
GarageYrBlt,81,5.55
GarageCond,81,5.55
GarageType,81,5.55


In [8]:
missing_value[missing_value["missing_percent"] > 40]

Unnamed: 0,missing_count,missing_percent
PoolQC,1453,99.52
MiscFeature,1406,96.3
Alley,1369,93.77
Fence,1179,80.75
MasVnrType,872,59.73
FireplaceQu,690,47.26


In [9]:
#drop columns with high missing values

df = df.drop(["PoolQC", "MiscFeature", "Alley", "Fence", "MasVnrType", "FireplaceQu"], axis=1)

In [10]:
#drop rows with missing values

df.dropna(inplace=True)

In [11]:
df.shape

(1094, 75)

In [12]:
fig = px.histogram(df["SalePrice"], x = "SalePrice", title = "SalePrice Distribution")
fig.show()

In [23]:
df["log_sale_price"] = np.log(df["SalePrice"])
df.drop(["SalePrice"], axis = 1, inplace = True)

In [24]:
fig = px.histogram(df["log_sale_price"], x = "log_sale_price", title = "SalePrice Log Distribution")
fig.show()

In [27]:
y = df["log_sale_price"]

df_num = df.select_dtypes(include=["int64", "float64"])

X = df_num.drop(["log_sale_price"], axis = 1)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= .25, random_state= 3)

In [30]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

In [31]:
y_pred = lr_model.predict(X_test)

In [32]:
accuracy_score = lr_model.score(X_test, y_test)
print("Linear regression model accuracy : ", accuracy_score)

Linear regression model accuracy :  0.8866637114738749


In [33]:
lr_cv = cross_val_score(lr_model, X, y, cv = 5, scoring="r2")
print("Cross validation result : ", lr_cv)
print("R2 : ", lr_cv.mean() )

Cross validation result :  [0.88426462 0.83605032 0.86145344 0.89201551 0.6154792 ]
R2 :  0.8178526186867124
