# Model training script

***

### Loading dataset and cleaning it

In [1]:
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
from tqdm import tqdm
import pickle
import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv('transactions.csv')

all_columns = df.columns.tolist()
print("Columns in the dataset:", all_columns)


Columns in the dataset: ['Unnamed: 0', 'id_transaction', 'date_transaction', 'prix', 'departement', 'id_ville', 'ville', 'code_postal', 'adresse', 'type_batiment', 'vefa', 'n_pieces', 'surface_habitable', 'id_parcelle_cadastre', 'latitude', 'longitude', 'surface_dependances', 'surface_locaux_industriels', 'surface_terrains_agricoles', 'surface_terrains_sols', 'surface_terrains_nature']


# Data Preprocessing and Feature Engineering for Paris Real Estate Price Prediction

In [2]:
# Creating a new feature 'prix_m2'
df['prix_m2'] = df['prix'] / df['surface_habitable']

# Filter for Paris data in 2022
idf_df = df[(df['departement'].isin([75, 77, 78, 91, 92, 93, 94, 95])) & (df['date_transaction'].str.startswith('2022-'))].copy()

# Modify the features to exclude the 'date_transaction' column
X = idf_df[['code_postal', 'n_pieces', 'surface_habitable', 'latitude', 'longitude']]

# Add one-hot encoding for the 'date_transaction' column
X = pd.concat([X, pd.get_dummies(idf_df['date_transaction'], prefix='date')], axis=1)

# Define target variable
y = idf_df['prix_m2']


    # Identify categorical and numerical columns
categorical_cols = [col for col in X.columns if X[col].dtype == 'object']
numerical_cols = [col for col in X.columns if X[col].dtype in ['int64', 'float64']]

    # Create transformers for numerical and categorical data
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

    # Create a column transformer to apply transformations to the appropriate columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

    # Create a pipeline that first transforms the data and then fits the model
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

    # Preprocess the features
X_processed = model_pipeline.fit_transform(X)

    # Identify categorical and numerical columns
categorical_cols = [col for col in X.columns if X[col].dtype == 'object']
numerical_cols = [col for col in X.columns if X[col].dtype in ['int64', 'float64']]

    # Create transformers for numerical and categorical data
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

    # Create a column transformer to apply transformations to the appropriate columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
     ])

    # Create a pipeline that first transforms the data and then fits the model
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

    # Preprocess the features
X_processed = model_pipeline.fit_transform(X)



In [3]:
# Preprocess the features
X_processed = model_pipeline.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.8, random_state=42)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.8, random_state=42)

In [4]:
if 'Unnamed: 0' in df.columns:
   df = df.drop('Unnamed: 0', axis=1)
    
df['prix_m2'] = df['prix'] / (df['surface_habitable'])

idf_df = df[(df.departement == 75) & (df.n_pieces == 4) & (df.date_transaction.str.startswith('2022-'))]
surface_cols = [c for c in idf_df.columns if 'surface_' in c and c != 'surface_habitable']
for c in surface_cols:
     idf_df[c + '_sum'] = idf_df[c].apply(lambda x: sum(eval(x)) if 'NULL' not in x else 0)
idf_df = idf_df[idf_df[[c + '_sum' for c in surface_cols]].sum(axis=1) == 0]

### Data split

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8)

### Import models + training, outil de prediction , de maniere lineaire 

In [6]:
import pickle
import pandas as pd

# Chargez le modèle depuis le fichier .pkl en mode binaire
with open('regression_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Exemple de nouvelles données pour les prédictions
new_data = pd.DataFrame({
    'code_postal': [75001],
    'n_pieces': [3],
    'surface_habitable': [80],
    'latitude': [48.8566],
    'longitude': [2.3522],
    'date_transaction': ['2022-01-01']
})

# Appliquez le même prétraitement sur les nouvelles données
new_data_processed = model_pipeline.transform(new_data)

# Faites des prédictions sur les nouvelles données à l'aide du modèle chargé
predicted_price_per_m2 = loaded_model.predict(new_data_processed)

print(f'Predicted Price per m2 for the new data: {predicted_price_per_m2[0]}')




Predicted Price per m2 for the new data: 8324.14119041518


In [7]:
params_grid = {
    'DTR': {
        'model': DecisionTreeRegressor(),
        'params': {
            'max_depth': (1, 101, 10),
            'min_samples_split': (2, 21, 2)
        }
    },
    'KNN': {
        'model': KNeighborsRegressor(),
        'params': {
            'n_neighbors': (1, 51, 5)
        }
    },
    'LR': {
        'model': LinearRegression(),
        'params': {
            'fit_intercept': [True, False],
            'positive': [True, False]
        }}}

for model_name, model_config in tqdm(params_grid.items()):
    gs = GridSearchCV(estimator=model_config['model'], param_grid=model_config['params'])
    # gs = GridSearchCV(estimator=model_config['model'], param_grid=model_config['params'], n_jobs=-1)
    gs.fit(X_train, y_train)
    best_model = gs.best_estimator_
    best_params = gs.best_params_
    train_rmse = np.sqrt(mean_squared_error(y_train, best_model.predict(X_train)))
    test_rmse = np.sqrt(mean_squared_error(y_test, best_model.predict(X_test)))
    score = best_model.score(X_test, y_test)

    print(f"Model: {model_name}")
    print(f"Optimal params: {best_params}") 
    print(f"Train RMSE: {train_rmse}")
    print(f"Test RMSE: {test_rmse}")
    print(f"Model Score: {score}")
    print()

 33%|███▎      | 1/3 [01:04<02:08, 64.18s/it]

Model: DTR
Optimal params: {'max_depth': 1, 'min_samples_split': 2}
Train RMSE: 26419.737018819014
Test RMSE: 18480.923721835952
Model Score: 0.019721920908795654



 67%|██████▋   | 2/3 [04:58<02:44, 164.15s/it]

Model: KNN
Optimal params: {'n_neighbors': 51}
Train RMSE: 26086.0212611758
Test RMSE: 18689.18914151
Model Score: -0.0024964896546602056



100%|██████████| 3/3 [06:55<00:00, 138.66s/it]

Model: LR
Optimal params: {'fit_intercept': True, 'positive': False}
Train RMSE: 26396.75607753689
Test RMSE: 18730.929715018374
Model Score: -0.0069794563922978






API 

In [8]:
from fastapi import FastAPI
import uvicorn
app = FastAPI()

@app.get("/prediction_immobiliere")  # Corriger le chemin en supprimant les espaces
async def predict_params(code_postal=75001, n_pieces=3, surface_habitable=80, latitude=48.8566, longitude=2.3522, date_transaction='2022-01-01'):
    # Votre code pour prédire avec les paramètres donnés
    return {"message": "Placeholder pour la prédiction immobilière"}  # Placeholder pour le code réel

if __name__ == "__main__":
    uvicorn.run(app, host="127.0.0.1", port=8002)


RuntimeError: asyncio.run() cannot be called from a running event loop

In [None]:

from fastapi import FastAPI, HTTPException, Depends
import sqlite3, uvicorn
from datetime import datetime
app = FastAPI()

In [None]:
@app.get("/Prédiciton immobiliere ")
async def predict_params(code_postal=75001, n_pieces=3, surface_habitable=80, latitude=48.8566, longitude=2.3522, date_transaction='2022-01-01'):
    # Votre code pour prédire avec les paramètres donnés
    pass  # Placeholder pour le code réel
predict_params()


In [None]:
if __name__ == "__main__":
    uvicorn.run(app, host="127.0.0.1", port=8002)