In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.impute import SimpleImputer

In [7]:
# Load the data
DATA_PATH = "total_out_clean.csv"
data = pd.read_csv(DATA_PATH)
data.shape


(753, 13)

In [11]:
from sklearn.linear_model import LinearRegression

# --- Feature engineering ---
# Parse both period bounds as datetimes, then derive the number of days
# between the start and the end of each period.
for date_col in ('period_start', 'period_end'):
    data[date_col] = pd.to_datetime(data[date_col])
data['duration'] = (data['period_end'] - data['period_start']).dt.days

# --- Column groups, feature matrix and target ---
numeric_features = ['old_price', 'voyageurs', 'rooms', 'bed', 'bathroom', 'duration', 'new_rating']
categorical_features = ['Type', 'pays', 'region']

X = data[[
    'old_price', 'Type', 'pays', 'region', 'voyageurs',
    'rooms', 'bed', 'bathroom', 'duration', 'new_rating',
]]
y = data['price']

# --- Preprocessing ---
# Numeric columns are only standardised. Categorical columns first get their
# missing values replaced by the most frequent category and are then one-hot
# encoded; categories not seen during fit are ignored at transform time.
numeric_transformer = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# --- Model ---
lr = LinearRegression()
model = Pipeline([('preprocessor', preprocessor), ('linear_regression', lr)])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

# --- Fit and evaluate ---
model.fit(X_train, y_train)

# R² on the held-out rows.
score_lr = model.score(X_test, y_test)
print(f'Linear Regression Model R² score: {score_lr}')

Linear Regression Model R² score: 0.5092203332892862


In [12]:
from xgboost import XGBRegressor

# Base estimator with a fixed seed.
xgb = XGBRegressor(random_state=42)

# The preprocessing lives INSIDE the estimator that is cross-validated.
# If GridSearchCV were instead the last step of a pipeline, the preprocessor
# would be fit on all of X_train before the folds are made, so every validation
# fold would leak into the scaler / imputer / one-hot statistics used to pick
# the hyperparameters. Here GridSearchCV clones and refits the whole pipeline
# on the training part of each fold.
xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('xgb', xgb)])

# Hyperparameters to tune. The 'xgb__' prefix routes each entry to the
# pipeline step named 'xgb'.
param_grid = {
    'xgb__n_estimators': [50, 100],
    'xgb__max_depth': [3, 5],
    'xgb__learning_rate': [0.1, 0.01],
}

# 4-fold grid search scored with R², using all available cores.
grid_search = GridSearchCV(estimator=xgb_pipeline, param_grid=param_grid, cv=4, n_jobs=-1, scoring='r2')

# Same arguments as the split used for the linear regression, so both models
# are evaluated on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train
grid_search.fit(X_train, y_train)

# With the default refit=True, best_estimator_ is the full pipeline refit on
# all of X_train with the best parameters. Binding it to `model` keeps
# `model.predict(...)` / `model.score(...)` working for later cells.
# NOTE: `model.named_steps['grid_search']` no longer exists; use `grid_search`
# directly for the search results.
model = grid_search.best_estimator_

# Strip the step prefix so the report shows the plain XGBoost parameter names.
best_params = {
    name.split('__', 1)[1]: value
    for name, value in grid_search.best_params_.items()
}
print(f'Best parameters for XGBoost: {best_params}')

# R² of the tuned pipeline on the held-out rows.
score_xgb = model.score(X_test, y_test)
print(f'XGBoost Model R² score: {score_xgb}')

Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
XGBoost Model R² score: 0.7451998839185947


In [13]:
# Predict the target for the first row of the test set.
# `model` is whatever the most recently executed training cell bound to that
# name, so run the intended training cell right before this one.
sample_row = X_test.head(1)
PRIX = model.predict(sample_row)

message = "Cet appartement Coute : {} euros".format(PRIX[0])
print(message)

Cet appartement Coute : 793.7935791015625 euros


# Analyse

Régression Linéaire :
R² Score : 0.5092
Le modèle de régression linéaire explique environ 51% de la variance des prix des appartements. Ce score indique que le modèle a une performance modérée, mais il laisse encore beaucoup de variance non expliquée.

XGBoost :
R² Score : 0.7452
Le modèle XGBoost explique environ 75% de la variance des prix des appartements. Ce score est nettement supérieur à celui de la régression linéaire, ce qui indique que XGBoost capture beaucoup mieux les relations complexes dans les données.

Cela montre que XGBoost est mieux adapté à ce jeu de données et qu'il est capable de modéliser des relations plus complexes entre les variables indépendantes et le prix des appartements.