In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import normalize
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# Read the housing dataset into a DataFrame and preview the first rows.
# NOTE(review): absolute path, presumably a Colab working directory — adjust when running elsewhere.
housing_csv_path = '/content/Housing.csv'
house_data = pd.read_csv(housing_csv_path)
house_data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [None]:
# Show the dataset dimensions as (rows, columns).
house_data.shape

(545, 13)

Normalizing all numeric fields in the dataset.


In [None]:
# Numeric columns that get rescaled; note this includes the target column 'price'.
columns_to_norm = [
    'price',
    'area',
    'bedrooms',
    'bathrooms',
    'stories',
    'parking',
]

In [None]:
# Work on a deep copy so the raw frame is preserved.
house_data_norm = house_data.copy(deep=True)
# With axis=0 each listed column is divided by its own L2 norm.
# NOTE(review): the norms are computed over every row, before any train/test split,
# and the target column 'price' is rescaled as well — confirm this is intended.
house_data_norm[columns_to_norm] = normalize(
    house_data[columns_to_norm], norm='l2', axis=0
)
house_data_norm.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,0.111273,0.056876,0.056077,0.062047,0.064165,yes,no,no,no,yes,0.077498,yes,furnished
1,0.102488,0.06868,0.056077,0.124094,0.085553,yes,no,no,no,yes,0.116248,no,furnished
2,0.102488,0.076345,0.042058,0.062047,0.042776,yes,no,yes,no,no,0.077498,yes,semi-furnished
3,0.102195,0.057489,0.056077,0.062047,0.042776,yes,no,yes,no,yes,0.116248,yes,furnished
4,0.09546,0.056876,0.056077,0.031024,0.042776,yes,yes,yes,no,yes,0.077498,no,furnished


In [None]:
# One-hot encoding categorical features.
# pd.get_dummies replaces each string column with one indicator column per level
# (e.g. mainroad_no / mainroad_yes) and leaves the numeric columns unchanged.
# NOTE(review): every level is kept, so the indicators of one feature always sum to 1
# and are perfectly collinear; consider drop_first=True if coefficients will be interpreted.
house_data_norm_encoded = pd.get_dummies(house_data_norm)
house_data_norm_encoded.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad_no,mainroad_yes,guestroom_no,guestroom_yes,basement_no,basement_yes,hotwaterheating_no,hotwaterheating_yes,airconditioning_no,airconditioning_yes,prefarea_no,prefarea_yes,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,0.111273,0.056876,0.056077,0.062047,0.064165,0.077498,False,True,True,False,True,False,True,False,False,True,False,True,True,False,False
1,0.102488,0.06868,0.056077,0.124094,0.085553,0.116248,False,True,True,False,True,False,True,False,False,True,True,False,True,False,False
2,0.102488,0.076345,0.042058,0.062047,0.042776,0.077498,False,True,True,False,False,True,True,False,True,False,False,True,False,True,False
3,0.102195,0.057489,0.056077,0.062047,0.042776,0.116248,False,True,True,False,False,True,True,False,False,True,False,True,True,False,False
4,0.09546,0.056876,0.056077,0.031024,0.042776,0.077498,False,True,False,True,False,True,True,False,False,True,True,False,True,False,False


In [None]:
# Casting bool to int for linear regression.
# Build a new frame with DataFrame.astype instead of writing the converted values back
# through `.iloc[...] = ...`: that assignment sets integers into columns that are still
# bool, which is what raises pandas' "dtype incompatible with bool" warning.
# Selecting the columns by dtype also removes the hard-coded 6:21 positions and keeps
# the cell safe to re-run (a second run simply finds no bool columns left).
dummy_columns = house_data_norm_encoded.select_dtypes(include='bool').columns
house_data_norm_encoded = house_data_norm_encoded.astype(
    {column: np.int64 for column in dummy_columns}
)
house_data_norm_encoded.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad_no,mainroad_yes,guestroom_no,guestroom_yes,basement_no,basement_yes,hotwaterheating_no,hotwaterheating_yes,airconditioning_no,airconditioning_yes,prefarea_no,prefarea_yes,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,0.111273,0.056876,0.056077,0.062047,0.064165,0.077498,0,1,1,0,1,0,1,0,0,1,0,1,1,0,0
1,0.102488,0.06868,0.056077,0.124094,0.085553,0.116248,0,1,1,0,1,0,1,0,0,1,1,0,1,0,0
2,0.102488,0.076345,0.042058,0.062047,0.042776,0.077498,0,1,1,0,0,1,1,0,1,0,0,1,0,1,0
3,0.102195,0.057489,0.056077,0.062047,0.042776,0.116248,0,1,1,0,0,1,1,0,0,1,0,1,1,0,0
4,0.09546,0.056876,0.056077,0.031024,0.042776,0.077498,0,1,0,1,0,1,1,0,0,1,1,0,1,0,0


Ran across an error when running .astype(int) to convert bool to int so we can run linear regression.

"dtype: int64' has dtype incompatible with bool, please explicitly cast to a compatible dtype first."

Found out that changing the dtype from int to np.int64 fixed this. Honestly, I have no idea why that worked. The documentation for .astype() mentions explicit and implicit casting, and np.int64 is the equivalent dtype of int for explicit typecasting, so I tried np.int64 as the input for .astype() and it worked.

So I'm guessing that numpy's int64 is needed for the pandas .astype() function rather than python's int type.


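Edit: Even though it has been "fixed" in the sense that it works, I still get the error the first time I run the astype cell. From my research, numpy types are a compatible dtype, so the dtype passed to .astype() is probably not the cause. The message most likely comes from the `.iloc[...] = ...` assignment, which writes the integer values back into columns that are still bool; on a second run those columns are already int64, which would explain why the message only appears once. I don't think I fully understand what is going on. But, it works for now.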

In [None]:
# Separate the target (price) from the feature columns; the train/test split follows later.
house_data_Y = house_data_norm_encoded['price']
house_data_X = house_data_norm_encoded.drop(columns=['price'])
house_data_X.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,parking,mainroad_no,mainroad_yes,guestroom_no,guestroom_yes,basement_no,basement_yes,hotwaterheating_no,hotwaterheating_yes,airconditioning_no,airconditioning_yes,prefarea_no,prefarea_yes,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,0.056876,0.056077,0.062047,0.064165,0.077498,0,1,1,0,1,0,1,0,0,1,0,1,1,0,0
1,0.06868,0.056077,0.124094,0.085553,0.116248,0,1,1,0,1,0,1,0,0,1,1,0,1,0,0
2,0.076345,0.042058,0.062047,0.042776,0.077498,0,1,1,0,0,1,1,0,1,0,0,1,0,1,0
3,0.057489,0.056077,0.062047,0.042776,0.116248,0,1,1,0,0,1,1,0,0,1,0,1,1,0,0
4,0.056876,0.056077,0.031024,0.042776,0.077498,0,1,0,1,0,1,1,0,0,1,1,0,1,0,0


In [None]:
# Preview the target series (the rescaled price values).
house_data_Y.head()

Unnamed: 0,price
0,0.111273
1,0.102488
2,0.102488
3,0.102195
4,0.09546


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    house_data_X,
    house_data_Y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Check the feature matrix sizes of the train and test splits.
print(X_train.shape, X_test.shape)

(436, 20) (109, 20)


In [None]:
# Check that the target splits have the same row counts as the feature splits.
print(Y_train.shape, Y_test.shape)

(436,) (109,)


In [None]:
# Fit an ordinary least-squares linear regression on the training split only.
model = LinearRegression()
model.fit(X_train, Y_train)

In [None]:
# Predict on both splits with the fitted model (train first, then test).
Y_train_pred, Y_test_pred = map(model.predict, (X_train, X_test))

In [None]:
# Score each split with R^2 and mean squared error.
# The MSE values are in units of the rescaled price, not the original currency.
r2_train, mse_train = (
    r2_score(Y_train, Y_train_pred),
    mean_squared_error(Y_train, Y_train_pred),
)
r2_test, mse_test = (
    r2_score(Y_test, Y_test_pred),
    mean_squared_error(Y_test, Y_test_pred),
)

In [None]:
# Report the four metrics, one per line, in the order train R^2, test R^2, train MSE, test MSE.
for metric_label, metric_value in (
    ("r2_train: ", r2_train),
    ("r2_test: ", r2_test),
    ("mse_train: ", mse_train),
    ("mse_test: ", mse_test),
):
    print(metric_label, metric_value)

r2_train:  0.6918324307049805
r2_test:  0.629851860887223
mse_train:  7.491229594870638e-05
mse_test:  9.195254538603205e-05
