In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.metrics import mean_squared_error, r2_score

In [14]:
df = pd.read_csv("../res/houseprice.csv")
df

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished


In [15]:
label_encoder = LabelEncoder()
cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']

for col in cols:
    df[col] = label_encoder.fit_transform(df[col])

df

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,1,0,1,0,0,2,0,2
541,1767150,2400,3,1,1,0,0,0,0,0,0,0,1
542,1750000,3620,2,1,1,1,0,0,0,0,0,0,2
543,1750000,2910,3,1,1,0,0,0,0,0,0,0,0


In [16]:
x = df.drop("price", axis = 1)
x

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,7420,4,2,3,1,0,0,0,1,2,1,0
1,8960,4,4,4,1,0,0,0,1,3,0,0
2,9960,3,2,2,1,0,1,0,0,2,1,1
3,7500,4,2,2,1,0,1,0,1,3,1,0
4,7420,4,1,2,1,1,1,0,1,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
540,3000,2,1,1,1,0,1,0,0,2,0,2
541,2400,3,1,1,0,0,0,0,0,0,0,1
542,3620,2,1,1,1,0,0,0,0,0,0,2
543,2910,3,1,1,0,0,0,0,0,0,0,0


In [17]:
y = df["price"]
y

0      13300000
1      12250000
2      12250000
3      12215000
4      11410000
         ...   
540     1820000
541     1767150
542     1750000
543     1750000
544     1750000
Name: price, Length: 545, dtype: int64

In [18]:
scaler = RobustScaler()
x = scaler.fit_transform(x)
x

array([[ 1.02173913,  1.        ,  1.        , ...,  2.        ,
         1.        , -0.5       ],
       [ 1.57971014,  1.        ,  3.        , ...,  3.        ,
         0.        , -0.5       ],
       [ 1.94202899,  0.        ,  1.        , ...,  2.        ,
         1.        ,  0.        ],
       ...,
       [-0.35507246, -1.        ,  0.        , ...,  0.        ,
         0.        ,  0.5       ],
       [-0.61231884,  0.        ,  0.        , ...,  0.        ,
         0.        , -0.5       ],
       [-0.27173913,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.5       ]])

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 40)

In [20]:
model = LinearRegression()
model.fit(x_train, y_train)

In [21]:
y_pred = model.predict(x_test)
y_pred

array([7154236.434521  , 6473033.12762162, 7967353.36422651,
       3250053.54966786, 5041218.60579336, 2976073.5609929 ,
       4937665.82889842, 3883826.62741169, 6268804.63250383,
       2907287.6074124 , 6460451.13918714, 3286358.39831609,
       3275477.64306414, 3065101.06146755, 7948917.26201535,
       2492806.62448377, 5254217.74779315, 7418086.14841512,
       4264026.94795389, 2729071.62316378, 3403605.7563744 ,
       3132342.46355617, 3394199.53426823, 5598580.84996309,
       2822869.50961042, 2271685.66727904, 4274338.37067982,
       5116740.00667758, 3508335.2154458 , 3885646.47445228,
       4251680.67490509, 6440889.92804155, 6927347.98608497,
       3227734.71928694, 6871357.8712789 , 3422524.70343752,
       3390125.08812476, 5617687.34840424, 3919707.55979289,
       2934616.32414119, 5824539.15538491, 4501489.1744983 ,
       7116285.52783017, 5881343.20169433, 6671920.72585857,
       5159462.03470177, 3285881.50887597, 6621470.53673986,
       6235750.13830626,

In [22]:
coefficients = model.coef_
intercept = model.intercept_

In [23]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared (R2):", r2)

Mean Squared Error (MSE): 1466037860959.4512
Root Mean Squared Error (RMSE): 1210800.5041952415
R-squared (R2): 0.6591408850872544
