In [216]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import cross_val_score, KFold

In [217]:
############# Load the housing dataset ##############
# The CSV is read from the notebook's working directory
DATA_PATH = "Housing.csv"
house_ds = pd.read_csv(DATA_PATH)
# Preview the first five rows
house_ds.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [218]:
############# Dataset info: column names, non-null counts and dtypes ##############
# Used to spot missing values and to see which columns are non-numeric (object) and need encoding
house_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [219]:
############# Compare the variance of the `price` and `area` columns ##############
# A large gap between the two indicates the columns live on very different scales
scale_check_columns = ["price", "area"]
house_ds.loc[:, scale_check_columns].var()

price    3.498544e+12
area     4.709512e+06
dtype: float64

In [220]:
############# Encode the yes/no columns as 0/1 ##############
# One loop over an explicit mapping replaces six copy-pasted LabelEncoder calls.
# The mapping pins "no" -> 0 and "yes" -> 1 regardless of which values happen to be
# present in a column, whereas a fitted encoder derives its codes from the data it sees.
BINARY_COLUMNS = [
    "guestroom",
    "mainroad",
    "basement",
    "hotwaterheating",
    "airconditioning",
    "prefarea",
]
YES_NO_CODES = {"no": 0, "yes": 1}

for column in BINARY_COLUMNS:
    # Fail loudly on anything that is not exactly "yes"/"no" (including missing values)
    # instead of silently producing NaN or an arbitrary code
    unexpected = set(house_ds[column].unique()) - set(YES_NO_CODES)
    if unexpected:
        raise ValueError(
            f"Column {column!r} contains values other than 'yes'/'no': {sorted(map(str, unexpected))}"
        )
    # New columns are named "<original>_binary" and appended in the order listed above
    house_ds[f"{column}_binary"] = house_ds[column].map(YES_NO_CODES).astype("int64")

In [221]:
############# Drop the original yes/no text columns ##############
# Their 0/1 counterparts (the "*_binary" columns) replace them from here on
encoded_source_columns = [
    "mainroad",
    "guestroom",
    "basement",
    "hotwaterheating",
    "airconditioning",
    "prefarea",
]
house_ds = house_ds.drop(columns=encoded_source_columns)

In [222]:
############# Inspect the first ten rows after encoding ##############
house_ds.head(n=10)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,furnishingstatus,guestroom_binary,mainroad_binary,basement_binary,hotwaterheating_binary,airconditioning_binary,prefarea_binary
0,13300000,7420,4,2,3,2,furnished,0,1,0,0,1,1
1,12250000,8960,4,4,4,3,furnished,0,1,0,0,1,0
2,12250000,9960,3,2,2,2,semi-furnished,0,1,1,0,0,1
3,12215000,7500,4,2,2,3,furnished,0,1,1,0,1,1
4,11410000,7420,4,1,2,2,furnished,1,1,1,0,1,0
5,10850000,7500,3,3,1,2,semi-furnished,0,1,1,0,1,1
6,10150000,8580,4,3,4,2,semi-furnished,0,1,0,0,1,1
7,10150000,16200,5,3,2,0,unfurnished,0,1,0,0,0,0
8,9870000,8100,4,1,2,2,furnished,1,1,1,0,1,1
9,9800000,5750,3,2,4,1,unfurnished,1,1,0,0,1,1


In [223]:
############# One-hot encode the multi-valued "furnishingstatus" column ##############
# One 0/1 indicator column is created per category and appended to the frame by index.
# NOTE: every category keeps its own column (no level is dropped), so the indicator
# columns always sum to 1 on each row.
furnishing_dummies = pd.get_dummies(house_ds["furnishingstatus"], dtype=int)
house_ds = house_ds.join(furnishing_dummies)

In [224]:
# Remove the original text column now that its indicator columns exist
house_ds = house_ds.drop(columns="furnishingstatus")

In [225]:
# Final dataset: every remaining column is numeric
house_ds.head(n=10)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,guestroom_binary,mainroad_binary,basement_binary,hotwaterheating_binary,airconditioning_binary,prefarea_binary,furnished,semi-furnished,unfurnished
0,13300000,7420,4,2,3,2,0,1,0,0,1,1,1,0,0
1,12250000,8960,4,4,4,3,0,1,0,0,1,0,1,0,0
2,12250000,9960,3,2,2,2,0,1,1,0,0,1,0,1,0
3,12215000,7500,4,2,2,3,0,1,1,0,1,1,1,0,0
4,11410000,7420,4,1,2,2,1,1,1,0,1,0,1,0,0
5,10850000,7500,3,3,1,2,0,1,1,0,1,1,0,1,0
6,10150000,8580,4,3,4,2,0,1,0,0,1,1,0,1,0
7,10150000,16200,5,3,2,0,0,1,0,0,0,0,0,0,1
8,9870000,8100,4,1,2,2,1,1,1,0,1,1,1,0,0
9,9800000,5750,3,2,4,1,1,1,0,0,1,1,0,0,1


In [226]:
# Target is the sale price; every remaining column is used as a feature
y = house_ds["price"]
X = house_ds.drop(columns="price")

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [227]:
# Train a linear regression on the training split, then score it on the held-out test split
reg = LinearRegression().fit(X_train, y_train)
reg_score = reg.score(X_test, y_test)
print(f"Linear Regression score : {round(reg_score, 2)}")

Linear Regression score : 0.66


In [228]:
# Cross-validate on the training split only: 6 shuffled folds with a fixed seed,
# each fold scored with the estimator's default scorer
kf = KFold(n_splits=6, shuffle=True, random_state=42)
cv_results = cross_val_score(estimator=reg, X=X_train, y=y_train, cv=kf)

# Summarise the per-fold scores
cv_mean = round(cv_results.mean(), 2)
cv_best = round(cv_results.max(), 2)
print("Cross-validation Results : ", np.array(cv_results))
print("Cross-validation mean : ", cv_mean)
print("Cross-validation highest score : ", cv_best)

Cross-validation Results :  [0.61387507 0.7071425  0.57283541 0.71764229 0.63991889 0.67314766]
Cross-validation mean :  0.65
Cross-validation highest score :  0.72


In [229]:
# Root-mean-squared error of the linear regression on the held-out test split,
# expressed in the same units as `price`
y_pred = reg.predict(X_test)
mse_reg = MSE(y_test, y_pred)
rmse_reg = np.sqrt(mse_reg)
# The original `*_knn` names were misleading (the model is a LinearRegression, not KNN);
# they are kept as aliases so any cell that still refers to them keeps working
mse_knn, rmse_knn = mse_reg, rmse_reg
# Label the metric like the other printed results instead of emitting a bare number
print(f"Linear Regression RMSE : {rmse_reg:,.2f}")

1245949.129409
