In [110]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score

from xgboost import XGBRegressor

In [111]:
df = pd.read_csv('Melbourne_housing.csv')
df.head()

  df = pd.read_csv('Melbourne_housing.csv')


Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,Bedroom,...,Landsize,BuildingArea,YearBuilt,CouncilArea,Latitude,Longtitude,Regionname,Propertycount,ParkingArea,Price
0,Abbotsford,68 Studley St,2,h,SS,Jellis,3/9/2016,2.5,3067.0,2.0,...,126.0,inf,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0,Carport,
1,Airport West,154 Halsey Rd,3,t,PI,Nelson,3/9/2016,13.5,3042.0,3.0,...,303.0,225.0,2016.0,Moonee Valley City Council,-37.718,144.878,Western Metropolitan,3464.0,Detached Garage,840000.0
2,Albert Park,105 Kerferd Rd,2,h,S,hockingstuart,3/9/2016,3.3,3206.0,2.0,...,120.0,82.0,1900.0,Port Phillip City Council,-37.8459,144.9555,Southern Metropolitan,3280.0,Attached Garage,1275000.0
3,Albert Park,85 Richardson St,2,h,S,Thomson,3/9/2016,3.3,3206.0,2.0,...,159.0,inf,,Port Phillip City Council,-37.845,144.9538,Southern Metropolitan,3280.0,Indoor,1455000.0
4,Alphington,30 Austin St,3,h,SN,McGrath,3/9/2016,6.4,3078.0,3.0,...,174.0,122.0,2003.0,Darebin City Council,-37.7818,145.0198,Northern Metropolitan,2211.0,Parkade,


# Numero di valori nulli per ogni colonna

In [112]:
print(df.isnull().sum())

Suburb               0
Address              0
Rooms                0
Type                 0
Method               0
SellerG              0
Date                 0
Distance             1
Postcode             1
Bedroom           8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21097
YearBuilt        19306
CouncilArea          3
Latitude          7976
Longtitude        7976
Regionname           0
Propertycount        3
ParkingArea          0
Price             7610
dtype: int64


# PULIZIA

In [113]:
# Elimino le colonne con troppi valori nulli
df = df.drop(columns=["BuildingArea", "YearBuilt"])

# Calcolo la media delle colonne numeriche
numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns

# Riempio i NaN delle colonne numeriche con la media
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

print(df.isnull().sum())

Suburb           0
Address          0
Rooms            0
Type             0
Method           0
SellerG          0
Date             0
Distance         0
Postcode         0
Bedroom          0
Bathroom         0
Car              0
Landsize         0
CouncilArea      3
Latitude         0
Longtitude       0
Regionname       0
Propertycount    0
ParkingArea      0
Price            0
dtype: int64


Le colonne non numeriche (come CouncilArea, ParkingArea, ecc.) non vengono toccate -> mantengono eventuali NaN

In [114]:
# Sostituisco stringhe vuote "" con NaN (altrimenti dropna non le vede)
df = df.replace("", np.nan)

# Elimino le righe dove qualsiasi colonna stringa è vuota
string_cols = df.select_dtypes(include=["object"]).columns

df = df.dropna(subset=string_cols, how="any")

print(df.isnull().sum())

Suburb           0
Address          0
Rooms            0
Type             0
Method           0
SellerG          0
Date             0
Distance         0
Postcode         0
Bedroom          0
Bathroom         0
Car              0
Landsize         0
CouncilArea      0
Latitude         0
Longtitude       0
Regionname       0
Propertycount    0
ParkingArea      0
Price            0
dtype: int64


In [115]:
df.nunique()

Suburb             350
Address          34006
Rooms               12
Type                 3
Method               9
SellerG            388
Date                78
Distance           215
Postcode           211
Bedroom             16
Bathroom            12
Car                 16
Landsize          1685
CouncilArea         33
Latitude         13403
Longtitude       14525
Regionname           8
Propertycount      342
ParkingArea          8
Price             2872
dtype: int64

In [116]:
df.shape

(34854, 20)

## Colonne che usiamo

In [117]:
# now lets observe our data and try to drop some columns that are not very uselful in our data analysis. I am just dropping some columns
# like date, latitute etc that are not very meaningful for our analysis. 
columns_to_use = ['Suburb', 'Rooms', 'Type', 'Method', 'SellerG', 'Regionname', 'Propertycount', 'Distance', 'CouncilArea', 'Bedroom', 'Bathroom', 'Car', 'Landsize', 'Price', 'ParkingArea']
df_new = df[columns_to_use]
df_new

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom,Bathroom,Car,Landsize,Price,ParkingArea
0,Abbotsford,2,h,SS,Jellis,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,126.000000,1.050173e+06,Carport
1,Airport West,3,t,PI,Nelson,Western Metropolitan,3464.0,13.5,Moonee Valley City Council,3.0,2.0,1.0,303.000000,8.400000e+05,Detached Garage
2,Albert Park,2,h,S,hockingstuart,Southern Metropolitan,3280.0,3.3,Port Phillip City Council,2.0,1.0,0.0,120.000000,1.275000e+06,Attached Garage
3,Albert Park,2,h,S,Thomson,Southern Metropolitan,3280.0,3.3,Port Phillip City Council,2.0,1.0,0.0,159.000000,1.455000e+06,Indoor
4,Alphington,3,h,SN,McGrath,Northern Metropolitan,2211.0,6.4,Darebin City Council,3.0,2.0,1.0,174.000000,1.050173e+06,Parkade
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34852,Reservoir,3,u,SP,RW,Northern Metropolitan,21650.0,12.0,Darebin City Council,3.0,1.0,1.0,593.598993,4.750000e+05,Parkade
34853,Roxburgh Park,4,h,S,Raine,Northern Metropolitan,5833.0,20.6,Hume City Council,4.0,2.0,2.0,593.598993,5.910000e+05,Underground
34854,Springvale South,4,h,PI,Barry,South-Eastern Metropolitan,4054.0,22.2,Greater Dandenong City Council,4.0,2.0,2.0,534.000000,1.050173e+06,Carport
34855,Springvale South,3,h,S,Harcourts,South-Eastern Metropolitan,4054.0,22.2,Greater Dandenong City Council,3.0,2.0,1.0,544.000000,7.805000e+05,Detached Garage


In [118]:
df_new.shape

(34854, 15)

## Variabili dummy

In [119]:
# now we are good to go with out cleaned data. Now we are going to make dummy variables for our whole dataset.
df_new = pd.get_dummies(df_new, drop_first=True) # it is a short cut to avoid dummy variable trap it is just dropping the main column whose dummies we have produced. 
df_new

Unnamed: 0,Rooms,Propertycount,Distance,Bedroom,Bathroom,Car,Landsize,Price,Suburb_Aberfeldie,Suburb_Airport West,...,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council,ParkingArea_Carport,ParkingArea_Detached Garage,ParkingArea_Indoor,ParkingArea_Outdoor Stall,ParkingArea_Parkade,ParkingArea_Parking Pad,ParkingArea_Underground
0,2,4019.0,2.5,2.0,1.0,1.0,126.000000,1.050173e+06,False,False,...,False,True,False,True,False,False,False,False,False,False
1,3,3464.0,13.5,3.0,2.0,1.0,303.000000,8.400000e+05,False,True,...,False,False,False,False,True,False,False,False,False,False
2,2,3280.0,3.3,2.0,1.0,0.0,120.000000,1.275000e+06,False,False,...,False,False,False,False,False,False,False,False,False,False
3,2,3280.0,3.3,2.0,1.0,0.0,159.000000,1.455000e+06,False,False,...,False,False,False,False,False,True,False,False,False,False
4,3,2211.0,6.4,3.0,2.0,1.0,174.000000,1.050173e+06,False,False,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34852,3,21650.0,12.0,3.0,1.0,1.0,593.598993,4.750000e+05,False,False,...,False,False,False,False,False,False,False,True,False,False
34853,4,5833.0,20.6,4.0,2.0,2.0,593.598993,5.910000e+05,False,False,...,False,False,False,False,False,False,False,False,False,True
34854,4,4054.0,22.2,4.0,2.0,2.0,534.000000,1.050173e+06,False,False,...,False,False,False,True,False,False,False,False,False,False
34855,3,4054.0,22.2,3.0,2.0,1.0,544.000000,7.805000e+05,False,False,...,False,False,False,False,True,False,False,False,False,False


# MACHINE LEARNING MODEL

In [120]:
X = df_new.drop('Price', axis='columns')
y = df_new.Price

X = X.to_numpy()
y = y.to_numpy()

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
y_scaled = (y - y.mean()) / y.std()

## Funzione di valutazione

Valuta un modello di regressione usando:
- KFold cross-validation se cv è fornito
- Train/Test split se train_test_split_ratio è fornito (es. 0.8)
    
Restituisce un dizionario con:
- MSE_mean
- MSE_std
- RMSE_mean
- RMSE_std
- R2_mean
- R2_std

### Esempio d'uso

Con KFold:
- kf = KFold(n_splits=5, shuffle=True, random_state=42)
- scores = evaluate_model_reg(model, X, y, cv=kf)

Con train/test split 80/20:
- scores_split = evaluate_model_reg(model, X, y, train_test_split_ratio=0.8)

In [121]:
def evaluate_model_reg(model, X_scaled, y_scaled, cv=None, train_test_split_ratio=None, random_state=42):
    scores_mse, scores_rmse, scores_r2 = [], [], []

    # Se X_scaled è fornito, usalo; altrimenti scala dentro la funzione
    if X_scaled is not None and y_scaled is not None:
        X_to_use = X_scaled
        y_to_use = y_scaled
    else:
        scaler = StandardScaler()
        X_to_use = scaler.fit_transform(X)
        y_to_use = (y - y.mean()) / y.std()

    if cv is not None:
        
        for train_idx, test_idx in cv.split(X_to_use):
            X_train, X_test = X_to_use[train_idx], X_to_use[test_idx]
            y_train, y_test = y_to_use[train_idx], y_to_use[test_idx]

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            scores_mse.append(mean_squared_error(y_test, y_pred))
            scores_rmse.append(np.sqrt(mean_squared_error(y_test, y_pred)))
            scores_r2.append(r2_score(y_test, y_pred))

    elif train_test_split_ratio is not None:
        
        train_idx, test_idx = train_test_split(np.arange(len(y_to_use)), train_size=train_test_split_ratio, random_state=random_state)
        
        X_train, X_test = X_to_use[train_idx], X_to_use[test_idx]
        y_train, y_test = y_to_use[train_idx], y_to_use[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        scores_mse.append(mean_squared_error(y_test, y_pred))
        scores_rmse.append(np.sqrt(mean_squared_error(y_test, y_pred)))
        scores_r2.append(r2_score(y_test, y_pred))

    else:
        raise ValueError("Devi fornire cv oppure train_test_split_ratio")

    return {
        "MSE_mean": np.mean(scores_mse),
        "MSE_std": np.std(scores_mse),
        "RMSE_mean": np.mean(scores_rmse),
        "RMSE_std": np.std(scores_rmse),
        "R2_mean": np.mean(scores_r2),
        "R2_std": np.std(scores_r2)
    }


## Definizione CV

In [122]:
kf = KFold(n_splits=3, shuffle=True, random_state=42)

## Lista modelli

In [123]:
models = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(alpha=0.1, max_iter=5000, random_state=42),
    "Ridge": Ridge(alpha=1.0, max_iter=5000, random_state=42)
}

## Valutazione modelli lineari

In [124]:
results = []

for name, model in models.items():
    # Train/Test Split
    scores_split = evaluate_model_reg(model, X_scaled, y_scaled, train_test_split_ratio=0.8)
    scores_split["model"] = name
    scores_split["setting"] = "Train/Test Split 80/20"
    results.append(scores_split)

    # KFold
    scores_kf = evaluate_model_reg(model, X_scaled, y_scaled, cv=kf)
    scores_kf["model"] = name
    scores_kf["setting"] = "KFold 3-fold"
    results.append(scores_kf)

## XGBoost con combinazioni iperparametri

In [125]:
xgb_param_grid = [
    {"n_estimators": 50, "max_depth": 3},
    {"n_estimators": 100, "max_depth": 3},
    {"n_estimators": 100, "max_depth": 5},
]

for params in xgb_param_grid:
    model = XGBRegressor(**params, random_state=42, n_jobs=-1, verbosity=0)

    # Train/Test Split
    scores_split = evaluate_model_reg(model, X_scaled, y_scaled, train_test_split_ratio=0.8)
    scores_split["model"] = f"XGBoost (n={params['n_estimators']}, d={params['max_depth']})"
    scores_split["setting"] = "Train/Test Split 80/20"
    results.append(scores_split)

    # KFold
    scores_kf = evaluate_model_reg(model, X_scaled, y_scaled, cv=kf)
    scores_kf["model"] = f"XGBoost (n={params['n_estimators']}, d={params['max_depth']})"
    scores_kf["setting"] = "KFold 3-fold"
    results.append(scores_kf)

# TABELLA RISULTATI

In [126]:
df_results = pd.DataFrame(results)
df_results = df_results.sort_values(by="RMSE_mean")
display(df_results.style.hide(axis="index"))

MSE_mean,MSE_std,RMSE_mean,RMSE_std,R2_mean,R2_std,model,setting
0.456743,0.015572,0.675729,0.011582,0.543155,0.004079,"XGBoost (n=100, d=5)",KFold 3-fold
0.475469,0.020085,0.689387,0.014675,0.524506,0.00929,"XGBoost (n=100, d=3)",KFold 3-fold
0.478935,0.0,0.692052,0.0,0.542867,0.0,"XGBoost (n=100, d=5)",Train/Test Split 80/20
0.485999,0.020499,0.696979,0.014805,0.514039,0.00515,"XGBoost (n=50, d=3)",KFold 3-fold
0.493886,0.0,0.70277,0.0,0.528597,0.0,"XGBoost (n=100, d=3)",Train/Test Split 80/20
0.505332,0.0,0.710867,0.0,0.517672,0.0,"XGBoost (n=50, d=3)",Train/Test Split 80/20
0.52691,0.027065,0.725644,0.018746,0.473342,0.007027,Ridge,KFold 3-fold
0.526967,0.027061,0.725683,0.018743,0.473284,0.007033,LinearRegression,KFold 3-fold
0.548652,0.0,0.74071,0.0,0.476324,0.0,Ridge,Train/Test Split 80/20
0.548666,0.0,0.74072,0.0,0.47631,0.0,LinearRegression,Train/Test Split 80/20
