In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [3]:
df = pd.read_csv('realest.csv')
df.head()

Unnamed: 0,Price,Bedroom,Space,Room,Lot,Tax,Bathroom,Garage,Condition
0,53.0,2.0,967.0,5.0,39.0,652.0,1.5,0.0,0.0
1,55.0,2.0,815.0,5.0,33.0,1000.0,1.0,2.0,1.0
2,56.0,3.0,900.0,5.0,35.0,897.0,1.5,1.0,0.0
3,58.0,3.0,1007.0,6.0,24.0,964.0,1.5,2.0,0.0
4,64.0,3.0,1100.0,7.0,50.0,1099.0,1.5,1.5,0.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Price      156 non-null    float64
 1   Bedroom    156 non-null    float64
 2   Space      146 non-null    float64
 3   Room       156 non-null    float64
 4   Lot        146 non-null    float64
 5   Tax        147 non-null    float64
 6   Bathroom   156 non-null    float64
 7   Garage     156 non-null    float64
 8   Condition  156 non-null    float64
dtypes: float64(9)
memory usage: 11.2 KB


In [5]:
df.describe()

Unnamed: 0,Price,Bedroom,Space,Room,Lot,Tax,Bathroom,Garage,Condition
count,156.0,156.0,146.0,156.0,146.0,147.0,156.0,156.0,156.0
mean,56.474359,3.166667,1097.246575,6.5,32.808219,911.707483,1.480769,0.846154,0.230769
std,12.875307,1.348037,462.540698,1.675247,8.457859,443.26343,0.529408,0.808454,0.422682
min,32.0,1.0,539.0,4.0,24.0,418.0,1.0,0.0,0.0
25%,46.0,2.0,805.25,5.0,25.0,652.5,1.0,0.0,0.0
50%,55.0,3.0,965.5,6.0,30.0,821.0,1.5,1.0,0.0
75%,65.0,4.0,1220.5,7.0,37.0,1012.5,2.0,1.5,0.0
max,90.0,8.0,2295.0,12.0,50.0,2752.0,3.0,2.0,1.0


In [6]:
df.isna().sum()

Price         1
Bedroom       1
Space        11
Room          1
Lot          11
Tax          10
Bathroom      1
Garage        1
Condition     1
dtype: int64

In [7]:
#dropping single value using dropna
df_new = df.copy()
cols = ['Price','Bedroom','Room','Bathroom','Garage','Condition']
df_new = df_new.dropna(subset=cols)
df_new.isna().sum()

Price         0
Bedroom       0
Space        10
Room          0
Lot          10
Tax           9
Bathroom      0
Garage        0
Condition     0
dtype: int64

In [8]:
col = ['Space','Lot','Tax']
df_final = df_new.copy()
df_final[col] = df_final[col].fillna(-999)


In [9]:
df_final.isna().sum()

Price        0
Bedroom      0
Space        0
Room         0
Lot          0
Tax          0
Bathroom     0
Garage       0
Condition    0
dtype: int64

In [12]:
#finding covariance with Price column to understand the realtionship with other features in dataframe

price_corr = df_final.corr()['Price']
price_corr_sorted = price_corr.sort_values(ascending=False)
price_corr_sorted

Price        1.000000
Room         0.578480
Garage       0.553105
Bathroom     0.538485
Space        0.443076
Tax          0.395465
Bedroom      0.302451
Condition    0.079336
Lot          0.062433
Name: Price, dtype: float64

In [13]:
## We found out that Room,Garage,Bathroom & Space are the most correlated features to the traget variable as Prices of chicago houses.

In [14]:
## Train - test split & fiting linear regression model 
features = ['Room','Garage','Bathroom','Space','Tax','Bedroom','Condition','Lot']
X = df_final.loc[:,features]
y = df_final.loc[:,['Price']]


In [15]:
df_final_shuffle = df_final.sample(frac=1, random_state=42).reset_index(drop=True)
split_ratio = 0.8
split_index = int(len(df_final_shuffle) * split_ratio)

X_train = X.iloc[:split_index,:]
X_test = X.iloc[:split_index,:]
y_train = y.iloc[:split_index,:]
y_test = y.iloc[:split_index,:]

# Display the shapes of the datasets
print("Training set X:", X_train.shape, "y:", y_train.shape)
print("Testing set X:", X_test.shape, "y:", y_test.shape)

Training set X: (124, 8) y: (124, 1)
Testing set X: (124, 8) y: (124, 1)


In [19]:
# fiting the model

model = LinearRegression()
model.fit(X_train,y_train)

y_pred = model.predict(X_test)

residuals = y_test.values.flatten() - y_pred.flatten()

mse = np.mean(residuals ** 2)



Mean Squared Error (manual calculation with NumPy): 55.34


In [18]:
# Print the Mean Squared Error
print(f"Mean Squared Error (manual calculation): {mse:.2f}")

# Print predictions and actual values in a formatted way
print("\nPredictions vs Actual Values:")
print(f"{'Predicted Price':<20} {'Actual Price':<20} {'Difference':<20}")
print("-" * 60)

# Print each predicted value alongside the actual value and their difference
for predicted, actual in zip(y_pred.flatten(), y_test.values.flatten()):
    difference = predicted - actual
    print(f"{predicted:<20.2f} {actual:<20.2f} {difference:<20.2f}")

Mean Squared Error (manual calculation): 55.34

Predictions vs Actual Values:
Predicted Price      Actual Price         Difference          
------------------------------------------------------------
49.81                53.00                -3.19               
58.21                55.00                3.21                
51.94                56.00                -4.06               
62.13                58.00                4.13                
65.00                64.00                1.00                
60.27                44.00                16.27               
52.19                49.00                3.19                
71.36                70.00                1.36                
62.89                72.00                -9.11               
75.49                82.00                -6.51               
81.78                85.00                -3.22               
46.91                45.00                1.91                
46.18                47.00                