# Training a housing price prediction model

Dataset source: https://www.data.gouv.fr/fr/datasets/demandes-de-valeurs-foncieres-geolocalisees/

In [1]:
import gzip
import io
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

from pathlib import Path
from matplotlib_inline.backend_inline import set_matplotlib_formats
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Ask the inline matplotlib backend to emit figures as SVG.
set_matplotlib_formats("svg")

## Extract dataset

In [2]:
def fetch_dataset(department: int = 38) -> Path:
    """Download (once) and cache the 2021 geo-DVF CSV of one department.

    The gzip archive is fetched from files.data.gouv.fr, decompressed in
    memory and stored as ``.datasets/dvf/<department>.csv`` relative to the
    current working directory. If that file already exists, no network
    request is made.

    Args:
        department: Department code. Integers are zero-padded to two digits
            (``1`` -> ``"01"``); any other value is used as-is via ``str()``.

    Returns:
        Path to the cached, decompressed CSV file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    # Remote files are named after two-character department codes, so a bare
    # single-digit integer would point at a non-existent file.
    department_code = f"{department:02d}" if isinstance(department, int) else str(department)
    dataset_url = f"https://files.data.gouv.fr/geo-dvf/latest/csv/2021/departements/{department_code}.csv.gz"
    dataset_base_path = Path(".datasets") / "dvf"
    dataset_csv_path = dataset_base_path / f"{department_code}.csv"
    dataset_base_path.mkdir(parents=True, exist_ok=True)

    if not dataset_csv_path.exists():
        # Bound the wait and fail loudly on 4xx/5xx instead of trying to
        # gunzip an HTML error page.
        response = requests.get(dataset_url, timeout=60)
        response.raise_for_status()

        # Write to a sibling file first and rename afterwards, so that an
        # interrupted write never leaves a truncated CSV that the existence
        # check above would treat as a valid cache entry.
        partial_path = dataset_base_path / f"{department_code}.csv.part"
        partial_path.write_bytes(gzip.decompress(response.content))
        partial_path.replace(dataset_csv_path)

    return dataset_csv_path

In [3]:
# Fetch (or reuse the cached copy of) the data for department 38 and show where it lives.
dataset_path = fetch_dataset(38)
dataset_path

PosixPath('.datasets/dvf/38.csv')

## Quick look at the dataset

In [4]:
def load_dataset(filepath: Path) -> pd.DataFrame:
    """Read a DVF CSV file into a DataFrame.

    Args:
        filepath: Location of the (uncompressed) CSV file.

    Returns:
        The file contents as a DataFrame with pandas-inferred dtypes.
    """
    # low_memory=False makes pandas infer each column's dtype from the whole
    # file rather than chunk by chunk, which avoids mixed-type columns and the
    # warning pandas emits for them.
    return pd.read_csv(filepath, low_memory=False)

In [5]:
# Load the cached CSV into memory; every later cell works from this frame.
dataset = load_dataset(dataset_path)

  return pd.read_csv(filepath)


In [6]:
# Column names, dtypes and non-null counts, to see which columns are mostly empty.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62513 entries, 0 to 62512
Data columns (total 40 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id_mutation                   62513 non-null  object 
 1   date_mutation                 62513 non-null  object 
 2   numero_disposition            62513 non-null  int64  
 3   nature_mutation               62513 non-null  object 
 4   valeur_fonciere               62426 non-null  float64
 5   adresse_numero                42050 non-null  float64
 6   adresse_suffixe               2281 non-null   object 
 7   adresse_nom_voie              61944 non-null  object 
 8   adresse_code_voie             61944 non-null  object 
 9   code_postal                   61942 non-null  float64
 10  code_commune                  62513 non-null  int64  
 11  nom_commune                   62513 non-null  object 
 12  code_departement              62513 non-null  int64  
 13  a

In [7]:
# Peek at the first rows to see what a single record looks like.
dataset.head()

Unnamed: 0,id_mutation,date_mutation,numero_disposition,nature_mutation,valeur_fonciere,adresse_numero,adresse_suffixe,adresse_nom_voie,adresse_code_voie,code_postal,...,type_local,surface_reelle_bati,nombre_pieces_principales,code_nature_culture,nature_culture,code_nature_culture_speciale,nature_culture_speciale,surface_terrain,longitude,latitude
0,2021-616370,2021-01-08,1,Vente,125000.0,7.0,,RUE DE LA FONTAINE,100,38610.0,...,Appartement,61.0,4.0,,,,,,5.798621,45.183397
1,2021-616371,2021-01-06,1,Vente,140000.0,24.0,,RUE DE STALINGRAD,6430,38100.0,...,Appartement,70.0,3.0,,,,,,5.726149,45.180546
2,2021-616372,2021-01-05,1,Vente,50000.0,12.0,,RUE ETIENNE MARCEL,2630,38000.0,...,Appartement,12.0,1.0,,,,,,5.717246,45.186591
3,2021-616373,2021-01-05,1,Vente,78000.0,87.0,,CRS JEAN JAURES,3820,38000.0,...,Appartement,21.0,1.0,,,,,,5.718199,45.184014
4,2021-616374,2021-01-06,1,Vente,103700.0,5.0,,RUE PIERRE DUPONT,5694,38000.0,...,Appartement,54.0,2.0,,,,,,5.71552,45.181276


In [8]:
# Summary statistics of the numeric columns (ranges, quartiles, extreme values).
dataset.describe()

Unnamed: 0,numero_disposition,valeur_fonciere,adresse_numero,code_postal,code_commune,code_departement,ancien_code_commune,ancien_nom_commune,ancien_id_parcelle,numero_volume,...,lot4_surface_carrez,lot5_numero,lot5_surface_carrez,nombre_lots,code_type_local,surface_reelle_bati,nombre_pieces_principales,surface_terrain,longitude,latitude
count,62513.0,62426.0,42050.0,61942.0,62513.0,62513.0,0.0,0.0,0.0,110.0,...,32.0,120.0,5.0,62513.0,40544.0,22824.0,40501.0,32529.0,61916.0,61916.0
mean,1.060084,534873.8,652.571082,38357.144635,38290.828548,38.0,,,,17.436364,...,78.559687,44.75,39.556,0.625694,2.34121,104.70439,1.741587,1634.701,5.541594,45.314073
std,0.266578,2507998.0,1798.144733,250.903457,166.829879,0.0,,,,65.201667,...,45.710658,76.695211,23.169066,0.829924,0.877463,493.36417,2.030322,10428.24,0.326891,0.2086
min,1.0,1.0,1.0,38000.0,38001.0,38.0,,,,1.0,...,8.32,2.0,12.79,0.0,1.0,1.0,0.0,1.0,4.758502,44.722616
25%,1.0,95512.5,9.0,38140.0,38171.0,38.0,,,,2.0,...,59.025,8.0,21.79,0.0,2.0,50.0,0.0,200.0,5.30414,45.174389
50%,1.0,180000.0,29.0,38300.0,38253.0,38.0,,,,3.0,...,68.04,20.0,37.0,0.0,2.0,74.0,1.0,563.0,5.663217,45.255687
75%,1.0,297500.0,139.0,38530.0,38440.0,38.0,,,,9.0,...,77.1,44.0,63.1,1.0,3.0,100.0,3.0,1200.0,5.743618,45.486939
max,5.0,52801610.0,9999.0,38980.0,38567.0,38.0,,,,638.0,...,200.05,557.0,63.1,45.0,4.0,35998.0,20.0,1353650.0,6.25627,45.877033


In [9]:
# Count records per transaction kind; only "Vente" rows are kept for modelling below.
dataset["nature_mutation"].value_counts()

Vente                                 56676
Vente en l'état futur d'achèvement     5226
Echange                                 406
Vente terrain à bâtir                   109
Adjudication                             96
Name: nature_mutation, dtype: int64

In [10]:
# Count records per property type; only "Appartement" and "Maison" are kept for modelling below.
dataset["type_local"].value_counts()

Dépendance                                  17601
Appartement                                 11892
Maison                                       8623
Local industriel. commercial ou assimilé     2428
Name: type_local, dtype: int64

## Preparation

In [11]:
# Columns fed to the model and the column it has to predict.
features = ["type_local", "surface_reelle_bati", "nombre_pieces_principales"]
label = ["valeur_fonciere"]

# Row masks: a known price, a plain sale, and a flat or a house.
filter_labels = dataset["valeur_fonciere"].notna()
filter_sells = dataset["nature_mutation"].eq("Vente")
filter_type = dataset["type_local"].isin(["Appartement", "Maison"])

# Apply all masks at once, keep only the modelling columns and renumber rows from 0.
dataset_filtered = (
    dataset.loc[filter_labels & filter_sells & filter_type, features + label]
    .reset_index(drop=True)
)

In [12]:
# Check the filtered frame: remaining row count and missing values per column.
dataset_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19988 entries, 0 to 19987
Data columns (total 4 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   type_local                 19988 non-null  object 
 1   surface_reelle_bati        19987 non-null  float64
 2   nombre_pieces_principales  19987 non-null  float64
 3   valeur_fonciere            19988 non-null  float64
dtypes: float64(3), object(1)
memory usage: 624.8+ KB


In [13]:
# Separate inputs from the target, then hold out a quarter of the rows for evaluation.
X, y = dataset_filtered[features], dataset_filtered[label]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,  # fixed seed keeps the split reproducible across runs
)

## Training

In [14]:
# Column groups handled by the two preprocessing branches.
numeric_features = ["surface_reelle_bati", "nombre_pieces_principales"]
categorical_features = ["type_local"]

# Missing numeric values are replaced by the constant 0 (not by a median).
# The outer ColumnTransformer already hands this branch only the numeric
# columns, so a plain imputer is enough here.
imputer_pipeline = SimpleImputer(strategy="constant", fill_value=0)

# Numeric branch: impute, then standardise to zero mean / unit variance.
num_pipeline = Pipeline(
    [
        ("imputer", imputer_pipeline),
        ("scaler", StandardScaler()),
    ]
)

transformers = ColumnTransformer(
    [
        ("num", num_pipeline, numeric_features),
        # handle_unknown="ignore" encodes a category never seen during fit as
        # all zeros instead of raising, so the serialized pipeline does not
        # crash when asked about another property type.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

# Full model: preprocessing followed by an ordinary least-squares regression.
pipeline = Pipeline(
    [
        ("transformers", transformers),
        ("model", LinearRegression()),
    ]
)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Evaluate on the held-out rows.
# NOTE(review): the recorded run reports r2 close to 0; the describe() output
# shows valeur_fonciere ranging from 1 to ~52.8M, so extreme prices presumably
# dominate the fit - TODO investigate before relying on this model.
pred_r2 = r2_score(y_test, y_pred)
pred_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"r2: {pred_r2}, rmse: {pred_rmse}")

r2: 0.0012885170693097692, rmse: 2288943.143726355


## Model serialization

In [15]:
def serialize_model(model, filepath: Path) -> None:
    """Persist a fitted model to disk with joblib.

    Args:
        model: Any object joblib can serialize (here, the fitted pipeline).
        filepath: Destination file. Missing parent directories are created.
    """
    target = Path(filepath)
    # joblib.dump does not create directories; make sure the destination
    # folder exists so saving into a fresh location does not fail.
    target.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, target)

In [16]:
# Save the fitted pipeline in the current working directory and show the resulting path.
model_path = Path(".") / "model.joblib"
serialize_model(pipeline, model_path)
model_path

PosixPath('model.joblib')