In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from math import sqrt
import matplotlib.pyplot as plt
import seaborn as sns

# Chargement des données nettoyées depuis le notebook 2
df = pd.read_csv("train_ready.csv")
print("Shape:", df.shape)
df.head()


Shape: (22234, 63)


Unnamed: 0,log_price,accommodates,bathrooms,cleaning_fee,host_has_profile_pic,host_identity_verified,host_response_rate,instant_bookable,latitude,longitude,...,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60,city_Chicago,city_DC,city_LA,city_NYC,city_SF,n_amenities,desc_length,host_duration
0,4.317488,3,1.0,False,t,f,,t,33.782712,-118.13441,...,False,False,False,False,True,False,False,15,669,3492.0
1,4.007333,4,2.0,False,t,t,100%,t,40.705468,-73.909439,...,False,False,False,False,False,True,False,25,1000,3164.0
2,7.090077,6,2.0,False,t,f,,t,38.917537,-77.031651,...,False,False,False,True,False,False,False,20,494,3127.0
3,3.555348,1,1.0,True,t,t,100%,f,40.736001,-73.924248,...,False,False,False,False,False,True,False,30,1000,4717.0
4,5.480639,4,1.0,True,t,t,100%,f,37.744896,-122.430665,...,False,False,False,False,False,False,True,24,1000,4362.0


In [2]:
# Séparation features / target
X = df.drop("log_price", axis=1)
y = df["log_price"]

# 🔹 Étape 1 : Supprimer les colonnes textuelles ou inutiles restantes (si présentes)
drop_cols = ['neighbourhood', 'name', 'zipcode', 'description', 'amenities', 'first_review', 'last_review', 'host_since']
for col in drop_cols:
    if col in X.columns:
        X.drop(col, axis=1, inplace=True)

# 🔹 Étape 2 : Convertir les colonnes booléennes ou avec 't'/'f'
bool_cols = ['cleaning_fee', 'host_has_profile_pic', 'host_identity_verified', 'instant_bookable']
for col in bool_cols:
    if col in X.columns:
        X[col] = X[col].map({'t': 1, 'f': 0, 'True': 1, 'False': 0, True: 1, False: 0}).fillna(0)

# 🔹 Étape 3 : Nettoyer les pourcentages (host_response_rate)
if 'host_response_rate' in X.columns:
    X['host_response_rate'] = X['host_response_rate'].str.rstrip('%').astype(float)

# 🔹 Étape 4 : Forcer tous les types en float32
X = X.astype(np.float32)

# Vérification finale
print("Types de colonnes :", X.dtypes.value_counts())


Types de colonnes : float32    61
Name: count, dtype: int64


In [6]:
# 🔄 Remplacer les NaN restants par la moyenne de chaque colonne
X = X.fillna(X.mean())


In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
print("Train shape :", X_train.shape)
print("Validation shape :", X_val.shape)


Train shape : (17787, 61)
Validation shape : (4447, 61)


In [4]:
baseline_pred = [y_train.mean()] * len(y_val)
baseline_rmse = sqrt(mean_squared_error(y_val, baseline_pred))
print(f"Baseline RMSE (mean predictor) : {baseline_rmse:.4f}")


Baseline RMSE (mean predictor) : 0.7159


In [8]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_val)

rmse_lr = sqrt(mean_squared_error(y_val, y_pred_lr))
print(f"Linear Regression RMSE : {rmse_lr:.4f}")


Linear Regression RMSE : 0.5840


In [9]:
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
rmse_ridge = sqrt(mean_squared_error(y_val, ridge.predict(X_val)))

lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
rmse_lasso = sqrt(mean_squared_error(y_val, lasso.predict(X_val)))

print(f"Ridge RMSE : {rmse_ridge:.4f}")
print(f"Lasso RMSE : {rmse_lasso:.4f}")


Ridge RMSE : 0.4770
Lasso RMSE : 0.5834


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [10]:
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_val)

rmse_rf = sqrt(mean_squared_error(y_val, y_pred_rf))
print(f"Random Forest RMSE : {rmse_rf:.4f}")


Random Forest RMSE : 0.4117


In [11]:
xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42, n_jobs=-1)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_val)

rmse_xgb = sqrt(mean_squared_error(y_val, y_pred_xgb))
print(f"XGBoost RMSE : {rmse_xgb:.4f}")


XGBoost RMSE : 0.4016


In [12]:
results = pd.DataFrame({
    'Modèle': ['Baseline', 'LinearRegression', 'Ridge', 'Lasso', 'RandomForest', 'XGBoost'],
    'RMSE': [baseline_rmse, rmse_lr, rmse_ridge, rmse_lasso, rmse_rf, rmse_xgb]
})

results = results.sort_values('RMSE').reset_index(drop=True)
results.style.background_gradient(cmap="viridis", subset=["RMSE"])


Unnamed: 0,Modèle,RMSE
0,XGBoost,0.401558
1,RandomForest,0.411729
2,Ridge,0.476998
3,Lasso,0.583391
4,LinearRegression,0.584001
5,Baseline,0.715868
