In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, train_test_split

In [11]:
# Load the raw housing table; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='ParisHousing.csv')

In [13]:
# Preview the first five rows to sanity-check column names and value ranges.
data.head(n=5)

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price
0,75523,3,0,1,63,9373,3,8,2005,0,1,4313,9005,956,0,7,7559081.5
1,80771,39,1,1,98,39381,8,6,2015,1,0,3653,2436,128,1,2,8085989.5
2,55712,58,0,1,19,34457,6,8,2021,0,0,2937,8852,135,1,9,5574642.1
3,32316,47,0,0,6,27939,10,4,2012,0,1,659,7141,359,0,3,3232561.2
4,70429,19,1,1,90,38045,3,7,1990,1,0,8435,2429,292,1,4,7055052.0


In [15]:
# (rows, columns) of the loaded frame.
data.shape

(10000, 17)

In [17]:
# Per-column non-null counts and dtypes, printed to stdout.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   squareMeters       10000 non-null  int64  
 1   numberOfRooms      10000 non-null  int64  
 2   hasYard            10000 non-null  int64  
 3   hasPool            10000 non-null  int64  
 4   floors             10000 non-null  int64  
 5   cityCode           10000 non-null  int64  
 6   cityPartRange      10000 non-null  int64  
 7   numPrevOwners      10000 non-null  int64  
 8   made               10000 non-null  int64  
 9   isNewBuilt         10000 non-null  int64  
 10  hasStormProtector  10000 non-null  int64  
 11  basement           10000 non-null  int64  
 12  attic              10000 non-null  int64  
 13  garage             10000 non-null  int64  
 14  hasStorageRoom     10000 non-null  int64  
 15  hasGuestRoom       10000 non-null  int64  
 16  price              10000 non-null  float64

In [40]:
# Count missing values in each column.
data.isna().sum()

squareMeters         0
numberOfRooms        0
hasYard              0
hasPool              0
floors               0
cityCode             0
cityPartRange        0
numPrevOwners        0
made                 0
isNewBuilt           0
hasStormProtector    0
basement             0
attic                0
garage               0
hasStorageRoom       0
hasGuestRoom         0
price                0
dtype: int64

In [42]:
# Number of rows that exactly repeat an earlier row (the first occurrence is not counted).
data.duplicated(keep='first').sum()

0

In [19]:
# Separate the regression target (`price`) from the feature matrix (every other column).
Y = data['price']
X = data.drop('price', axis='columns')

In [21]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

# Display the shapes of both feature partitions as the cell output.
(X_train.shape, X_test.shape)

((8000, 16), (2000, 16))

# Linear Regression

In [56]:
# Baseline: ordinary least squares fitted on the training split,
# scored with R^2 on the held-out split.
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
print("R2_score: ", r2_score(y_true=y_test, y_pred=y_pred))

R2_score:  0.99999960149123


# Lasso Regression

In [32]:
# L1-penalised linear model (alpha=0.1), evaluated on the same held-out split.
# NOTE(review): no feature-scaling step is visible before this fit, so the
# penalty acts on coefficients of raw-scale columns — confirm this is intended.
L1 = Lasso(alpha=0.1).fit(X_train, y_train)
y_pred2 = L1.predict(X_test)
print("R2_score: ", r2_score(y_true=y_test, y_pred=y_pred2))

R2_score:  0.9999996014889334


# Ridge Regression

In [35]:
# L2-penalised linear model (alpha=0.1), evaluated on the same held-out split.
# NOTE(review): no feature-scaling step is visible before this fit, so the
# penalty acts on coefficients of raw-scale columns — confirm this is intended.
R1 = Ridge(alpha=0.1).fit(X_train, y_train)
y_pred3 = R1.predict(X_test)
print("R2_score: ", r2_score(y_true=y_test, y_pred=y_pred3))

R2_score:  0.9999996014906777


# ElasticNet

In [38]:
# Elastic net: an equal mix of L1 and L2 penalties (l1_ratio=0.5) at alpha=0.1,
# evaluated on the same held-out split.
# NOTE(review): no feature-scaling step is visible before this fit, so the
# penalty acts on coefficients of raw-scale columns — confirm this is intended.
E1 = ElasticNet(alpha=0.1, l1_ratio=0.5).fit(X_train, y_train)
y_pred4 = E1.predict(X_test)
print("R2_score: ", r2_score(y_true=y_test, y_pred=y_pred4))

R2_score:  0.9999995854955284


In [44]:
# 5-fold cross-validated R^2 for the plain linear model, computed on the full
# dataset (X, Y) rather than on the single train/test split used above.
# `cross_val_score` comes from the notebook's top import cell, so this cell and
# the ones after it do not depend on an import hidden mid-notebook.
scores = cross_val_score(reg, X, Y, cv=5, scoring='r2')
print("Cross-validation scores:", scores)
print("Average score:", scores.mean())

Cross-validation scores: [0.99999953 0.99999956 0.99999959 0.99999958 0.99999956]
Average score: 0.9999995636099099


In [46]:
# Repeat the 5-fold R^2 evaluation for the elastic-net model on the full dataset.
# This rebinds `scores`, replacing any value it held before.
scores = cross_val_score(estimator=E1, X=X, y=Y, scoring='r2', cv=5)
print("Cross-validation scores:", scores)
print("Average score:", scores.mean())

Cross-validation scores: [0.99999951 0.99999955 0.99999957 0.99999956 0.99999955]
Average score: 0.9999995485914426
