In [3]:
# In this notebook we will try to solve the prolem of overfitting by using L1 and L2 regularization technique.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge


In [4]:
df = pd.read_csv(r'C:\Users\Utente\Documents\GitHub\Magaldi_DepositoCorso\CORSO_PYTHON_Itconsulting\03-12-2025\Melbourne_housing.csv'
)
print("Shape iniziale:", df.shape)
print(df.nunique())

Shape iniziale: (34857, 22)
Suburb             351
Address          34009
Rooms               12
Type                 3
Method               9
SellerG            388
Date                78
Distance           215
Postcode           211
Bedroom             15
Bathroom            11
Car                 15
Landsize          1684
BuildingArea       994
YearBuilt          160
CouncilArea         33
Latitude         13402
Longtitude       14524
Regionname           8
Propertycount      342
ParkingArea          8
Price             2871
dtype: int64


  df = pd.read_csv(r'C:\Users\Utente\Documents\GitHub\Magaldi_DepositoCorso\CORSO_PYTHON_Itconsulting\03-12-2025\Melbourne_housing.csv'


In [5]:
# now lets observe our data and try to drop some columns that are not very uselful in our data analysis. I am just dropping some columns
# like date, latitute etc that are not very meaningful for our analysis. 
columns_to_use = ['Suburb', 'Rooms', 'Type', 'Method', 'SellerG', 'Regionname', 'Propertycount', 'Distance', 'CouncilArea', 'Bedroom', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'Price']
df_new = df[columns_to_use].copy()
df_new.head()

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom,Bathroom,Car,Landsize,BuildingArea,Price
0,Abbotsford,2,h,SS,Jellis,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,126.0,inf,
1,Airport West,3,t,PI,Nelson,Western Metropolitan,3464.0,13.5,Moonee Valley City Council,3.0,2.0,1.0,303.0,225.0,840000.0
2,Albert Park,2,h,S,hockingstuart,Southern Metropolitan,3280.0,3.3,Port Phillip City Council,2.0,1.0,0.0,120.0,82.0,1275000.0
3,Albert Park,2,h,S,Thomson,Southern Metropolitan,3280.0,3.3,Port Phillip City Council,2.0,1.0,0.0,159.0,inf,1455000.0
4,Alphington,3,h,SN,McGrath,Northern Metropolitan,2211.0,6.4,Darebin City Council,3.0,2.0,1.0,174.0,122.0,


In [6]:
# now lets do some data cleaning
df_new.isna().sum()

Suburb               0
Rooms                0
Type                 0
Method               0
SellerG              0
Regionname           0
Propertycount        3
Distance             1
CouncilArea          3
Bedroom           8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21097
Price             7610
dtype: int64

In [7]:
# so we have several columns with NaN values so we need to handle these columns. We can actually fill some of these column's NaN 
# values just by 0 and some other columns might need some other treatment based on their nature for example price.
# lets first handle the columns where we need to fill only 0.
columns_to_fill_0 = ['Car', 'Bathroom', 'Bedroom', 'Distance', 'Propertycount']
df_new[columns_to_fill_0] = df_new[columns_to_fill_0].fillna(0)
df_new.head()

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom,Bathroom,Car,Landsize,BuildingArea,Price
0,Abbotsford,2,h,SS,Jellis,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,126.0,inf,
1,Airport West,3,t,PI,Nelson,Western Metropolitan,3464.0,13.5,Moonee Valley City Council,3.0,2.0,1.0,303.0,225.0,840000.0
2,Albert Park,2,h,S,hockingstuart,Southern Metropolitan,3280.0,3.3,Port Phillip City Council,2.0,1.0,0.0,120.0,82.0,1275000.0
3,Albert Park,2,h,S,Thomson,Southern Metropolitan,3280.0,3.3,Port Phillip City Council,2.0,1.0,0.0,159.0,inf,1455000.0
4,Alphington,3,h,SN,McGrath,Northern Metropolitan,2211.0,6.4,Darebin City Council,3.0,2.0,1.0,174.0,122.0,


In [8]:
# Now lets fill the columns named landsize and building area with mean of the whole respective columns
df_new['Landsize'] = pd.to_numeric(df_new['Landsize'], errors='coerce')
df_new['BuildingArea'] = pd.to_numeric(df_new['BuildingArea'], errors='coerce')
df_new['Landsize'].fillna(df_new['Landsize'].mean(), inplace=True)
df_new['BuildingArea'].fillna(df_new['BuildingArea'].mean(), inplace=True)
df_new = df_new.replace([np.inf, -np.inf], np.nan).dropna()
print("Valori mancanti dopo pulizia:")
print(df_new.isna().sum())
# in the buildingArea column there are some infitly large valuea and the model was not training because of that that is why 
# I had to come back here and drop those inf values as well.


Valori mancanti dopo pulizia:
Suburb           0
Rooms            0
Type             0
Method           0
SellerG          0
Regionname       0
Propertycount    0
Distance         0
CouncilArea      0
Bedroom          0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
Price            0
dtype: int64


In [9]:
# now we are good to go with out cleaned data. Now we are going to make dummy variables for our whole dataset.
df_new = pd.get_dummies(df_new, drop_first=True)
print("Shape after one-hot encoding:", df_new.shape)

Shape after one-hot encoding: (10656, 644)


In [10]:
# Now we can jump into our machine learning model and lets first use the train_test_split method
X = df_new.drop('Price', axis=1)
y = df_new['Price']
x_train, x_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=2
)
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)
print("Train set X:", x_train.shape)
print("Train set y:", y_train.shape)
print("Test set X:", x_test.shape)
print("Test set y:", y_test.shape)


Shape of X: (10656, 643)
Shape of y: (10656,)
Train set X: (8524, 643)
Train set y: (8524,)
Test set X: (2132, 643)
Test set y: (2132,)


In [11]:
model = LinearRegression()
model.fit(x_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [12]:
# so we can see that our model is facing the problem of overfitting because on training dataset it scores higher and on the
# testing dataset it score lower. In simple words our model is overfit to the training dataset and underfit to the testing dataset.
# We can solve the problem of overfitting by using L1 0r L2 regularization.  
lasso_model = Lasso()
lasso_model.fit(x_train, y_train)

# the L1 regularization or the Lasso model will add an absolute θ value in the mean squared error

  model = cd_fast.enet_coordinate_descent(


0,1,2
,alpha,1.0
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [13]:
# Now we will use the L2 regularization tehnique
ridge_model = Ridge(alpha=50, max_iter=100, tol=0.1)
ridge_model.fit(x_train, y_train)

0,1,2
,alpha,50
,fit_intercept,True
,copy_X,True
,max_iter,100
,tol,0.1
,solver,'auto'
,positive,False
,random_state,


In [14]:
from sklearn.linear_model import LinearRegression,Lasso, Ridge

from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold


model = Ridge(alpha=1, random_state=42)

# --- 3.K-Fold ---
# Vogliamo 5 round di validazione.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# --- 4. Esecuzione della Cross-Validation ---
scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_squared_error')

# --- Conversione in Positivo e Calcolo RMSE ---
mse_scores = -scores # Togliamo il segno meno
rmse_scores = np.sqrt(mse_scores) # Facciamo la radice quadrata per avere l'errore

print("\n--- Risultati Cross-Validation ---")
for i, mse in enumerate(mse_scores):
    print(f"Fold {i+1}: MSE = {mse:,.0f} | RMSE = {np.sqrt(mse):,.0f}")

print("-" * 40)
print(f"MSE Medio: {mse_scores.mean():,.0f}")
print(f"RMSE Medio: {rmse_scores.mean():,.0f}")
print(f"Stabilità (Std RMSE): +/- {rmse_scores.std():,.0f}")


--- Risultati Cross-Validation ---
Fold 1: MSE = 262,937,464,621 | RMSE = 512,774
Fold 2: MSE = 143,288,855,884 | RMSE = 378,535
Fold 3: MSE = 152,451,195,569 | RMSE = 390,450
Fold 4: MSE = 170,047,353,448 | RMSE = 412,368
Fold 5: MSE = 112,037,291,415 | RMSE = 334,720
----------------------------------------
MSE Medio: 168,152,432,187
RMSE Medio: 405,769
Stabilità (Std RMSE): +/- 59,191
