# In this notebook we will try to solve the problem of overfitting by using the L1 and L2 regularization techniques.

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression


In [None]:
# Load the housing data from a CSV file located in the working directory
df = pd.read_csv('Melbourne_housing.csv')
# Bare expression: the notebook renders the DataFrame as the cell output
df

# Numero di valori nulli per ogni colonna

In [None]:
# Count the missing (null) values in each column
print(df.isnull().sum())

# PULIZIA

In [None]:
# Drop the columns that contain too many missing values
df = df.drop(["BuildingArea", "YearBuilt"], axis="columns")

# Collect the names of the numeric (float64 / int64) columns
numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns

# Fill the NaNs of each numeric column with that column's mean.
# NOTE(review): the means are computed on the full dataset, before the
# train/test split further down, and every numeric column is imputed —
# presumably including the target; confirm this is intended.
numeric_means = df[numeric_cols].mean()
df[numeric_cols] = df[numeric_cols].fillna(numeric_means)

# Show how many missing values remain in each column
print(df.isnull().sum())

Le colonne non numeriche (come CouncilArea, ParkingArea, ecc.) non vengono toccate -> mantengono eventuali NaN

In [None]:
# dropna only recognises real NaN values, so turn empty strings "" into NaN first
df = df.replace("", np.nan)

# Names of the object-dtype (string) columns
string_cols = df.select_dtypes(include="object").columns

# Discard every row that is missing a value in at least one string column
# ("any" is dropna's default behaviour)
df = df.dropna(subset=string_cols)

# Show how many missing values remain in each column
print(df.isnull().sum())

In [None]:
# Number of distinct values in each column
df.nunique()

In [None]:
# (rows, columns) of the DataFrame at this point in the notebook
df.shape

## Colonne che usiamo

In [None]:
# Keep only the columns that are useful for the analysis; columns such as the
# date or the latitude are left out because they are not very meaningful here.
columns_to_use = [
    'Suburb',
    'Rooms',
    'Type',
    'Method',
    'SellerG',
    'Regionname',
    'Propertycount',
    'Distance',
    'CouncilArea',
    'Bedroom',
    'Bathroom',
    'Car',
    'Landsize',
    'Price',
    'ParkingArea',
]
df_new = df[columns_to_use]
# Bare expression: display the reduced DataFrame as the cell output
df_new

In [None]:
# (rows, columns) of the reduced DataFrame
df_new.shape

## Variabili dummy

In [None]:
# The data is now cleaned, so encode the categorical columns as dummy (one-hot) variables.
# drop_first=True drops the first level of each encoded column (k-1 dummies for k levels),
# which is a shortcut to avoid the dummy variable trap.
df_new = pd.get_dummies(df_new, drop_first=True)
# Bare expression: display the encoded DataFrame as the cell output
df_new

# MACHINE LEARNING MODEL

In [None]:
# Feature matrix: every column except the regression target
X = df_new.drop(columns='Price')
# Target vector: the sale price
y = df_new['Price']

## Split

In [None]:
# Split the data into a training part (80%) and a held-out test part;
# random_state is fixed so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=2)

## LinearRegression

In [None]:
from sklearn.linear_model import LinearRegression
# Baseline: ordinary linear regression with default settings, fitted on the training split
model = LinearRegression()
model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import mean_squared_error
# Predict on the held-out test split
y_pred_lin = model.predict(X_test)

# RMSE is the square root of the mean squared error; score() returns R2 for regressors
rmse_lin = np.sqrt(mean_squared_error(y_test, y_pred_lin))
r2_lin = model.score(X_test, y_test)
print(f"TEST - [Linear Regr.] RMSE: {rmse_lin:.4f} | R2: {r2_lin:.4f}")
# Author's observation: the model is so overfit to the training data that its score
# on the test data comes out negative.

In [None]:
from sklearn.metrics import mean_squared_error

# Evaluate the same linear model on the data it was fitted on, to compare
# against the test-set metrics.
y_pred_lin = model.predict(X_train)

# Store the training RMSE under the name that the print statement reads.
# It used to be assigned to `mse_lin`, so the print showed the stale test-set
# RMSE under a "TRAIN" label.
rmse_lin = np.sqrt(mean_squared_error(y_train, y_pred_lin))
# Author's observation: the model scores very well on the training data.
r2_lin = model.score(X_train, y_train)
print(f"TRAIN - [Linear Regr.] RMSE: {rmse_lin:.4f} | R2: {r2_lin:.4f}")

## Lasso

In [None]:
# The linear model scores higher on the training data than on the test data,
# i.e. it is overfit to the training set and generalizes poorly to unseen data.
# Overfitting can be reduced with L1 or L2 regularization.
# scikit-learn's Lasso regression implements L1 regularization.
from sklearn.linear_model import Lasso
# Default hyper-parameters are used
lasso_model = Lasso()
lasso_model.fit(X_train, y_train)
# L1 regularization (Lasso) adds a penalty proportional to the sum of the absolute
# values of the coefficients θ to the squared-error loss.

In [None]:
from sklearn.metrics import mean_squared_error
# Predict on the held-out test split with the Lasso model
y_pred_lasso = lasso_model.predict(X_test)

# RMSE is the square root of the mean squared error; score() returns R2 for regressors
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
r2_lasso = lasso_model.score(X_test, y_test)
print(f"TEST - [Lasso Regr.] RMSE: {rmse_lasso:.4f} | R2: {r2_lasso:.4f}")
# Author's observation: the score went from -48 percent to 70 percent, so the model
# is much better after applying L1 regularization.

In [None]:
from sklearn.metrics import mean_squared_error
# Predict on the training split, to compare against the test-set metrics.
# Note: this overwrites the test-set values of y_pred_lasso, rmse_lasso and r2_lasso.
y_pred_lasso = lasso_model.predict(X_train)

rmse_lasso = np.sqrt(mean_squared_error(y_train, y_pred_lasso))
r2_lasso = lasso_model.score(X_train, y_train)
print(f"TRAIN - [Lasso Regr.] RMSE: {rmse_lasso:.4f} | R2: {r2_lasso:.4f}")

## Ridge

In [None]:
# Now use the L2 regularization technique (Ridge regression)
from sklearn.linear_model import Ridge
# alpha sets the regularization strength.
# NOTE(review): max_iter and tol are documented as applying to Ridge's iterative
# solvers — confirm they have any effect with the default solver choice.
ridge_model = Ridge(alpha=50, max_iter=100, tol=0.1)
ridge_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error
# Predict on the held-out test split with the Ridge model
y_pred_ridge = ridge_model.predict(X_test)

# RMSE is the square root of the mean squared error; score() returns R2 for regressors
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
r2_ridge = ridge_model.score(X_test,y_test)
print(f"TEST - [Ridge Regr.] RMSE: {rmse_ridge:.4f} | R2: {r2_ridge:.4f}")

# Author's observation: with L2 regularization the model is also much better, but
# L1 regularization seems slightly better than L2 in this case.

In [None]:
from sklearn.metrics import mean_squared_error
# Predict on the training split, to compare against the test-set metrics.
# Note: this overwrites the test-set values of y_pred_ridge, rmse_ridge and r2_ridge.
y_pred_ridge = ridge_model.predict(X_train)

rmse_ridge = np.sqrt(mean_squared_error(y_train, y_pred_ridge))
r2_ridge = ridge_model.score(X_train,y_train)
print(f"TRAIN - [Ridge Regr.] RMSE: {rmse_ridge:.4f} | R2: {r2_ridge:.4f}")

## Ridge e K-Fold

### Configurazione del Modello

In [None]:
from sklearn.linear_model import Ridge
# Use a Ridge regressor (L2 regularization) for the cross-validation.
# Note: this rebinds `model`, which previously held the fitted LinearRegression.
# NOTE(review): random_state is documented as used only by the 'sag'/'saga'
# solvers — confirm it has any effect here.
model = Ridge(alpha=1, random_state=42)

### K-Fold

In [None]:
from sklearn.model_selection import KFold
# We want 5 rounds of validation.
# shuffle=True mixes the rows before they are cut into folds;
# random_state makes the shuffling reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

### Esecuzione della Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score
# scoring='neg_mean_squared_error': this is a regression task, so each fold is scored
# with the MSE; scikit-learn returns it negated so that higher always means better.
# Note: the cross-validation runs on the full X, y, not only on the training split.
scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_squared_error')


### Conversione in positivo e calcolo RMSE

In [None]:
import numpy as np
# The scores are negated MSE values: flip the sign to get the positive MSE per fold
mse_scores = np.negative(scores)
# RMSE per fold is the square root of the fold's MSE
rmse_scores = np.sqrt(mse_scores)

In [None]:
# Per-fold report: MSE and the matching RMSE, with folds numbered from 1
print("\n--- Risultati Cross-Validation (5 Folds) ---")
for fold_no, fold_mse in enumerate(mse_scores, start=1):
    print(f"Fold {fold_no}: MSE = {fold_mse:.4f} | RMSE = {np.sqrt(fold_mse):.4f}")

# Summary: averages across folds and the spread of the RMSE as a stability measure
print("-" * 40)
print(f"MSE medio: {mse_scores.mean():.4f}")
print(f"RMSE medio: {rmse_scores.mean():.4f}")
print(f"Stabilità (std RMSE): +/- {rmse_scores.std():.4f}")