# Training a housing prediction model

Dataset source: https://www.data.gouv.fr/fr/datasets/demandes-de-valeurs-foncieres-geolocalisees/

In [47]:
import gzip
import io
import os
import glob
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

from pathlib import Path
from matplotlib_inline.backend_inline import set_matplotlib_formats
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

set_matplotlib_formats("svg")

## Extract dataset

In [2]:
def fetch_dataset(department: "int | str") -> Path:
    """Download and unpack the 2021 geo-DVF CSV of one department.

    The gzip archive is fetched from files.data.gouv.fr, decompressed in
    memory and cached as ``.datasets/<department>.csv``. When that file is
    already present, no request is made and the cached path is returned.

    Args:
        department: Department code as it appears in the remote file name.
            Integers are zero-padded to two digits (``1`` -> ``"01"``);
            strings are used unchanged.

    Returns:
        Path of the cached CSV file.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    # Callers in this notebook pass zero-padded strings; accept plain ints too.
    if isinstance(department, int):
        department = f"{department:02d}"

    dataset_url = f"https://files.data.gouv.fr/geo-dvf/latest/csv/2021/departements/{department}.csv.gz"
    dataset_base_path = Path(".datasets")
    dataset_csv_path = dataset_base_path / f"{department}.csv"
    dataset_base_path.mkdir(parents=True, exist_ok=True)

    if not dataset_csv_path.exists():
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(dataset_url, timeout=60)
        # Fail here on 4xx/5xx instead of feeding an error page to gzip.
        response.raise_for_status()
        with gzip.open(io.BytesIO(response.content), "rb") as fgz:
            dataset_csv_path.write_bytes(fgz.read())

    return dataset_csv_path

In [3]:
# Fetch every department's file and clean it in place.
for i in range(1, 96):
    # These department numbers are skipped (reason not stated in this
    # notebook -- presumably no file exists under that name; TODO confirm).
    if i in (20, 57, 67, 68):
        continue

    # The remote files are named with a two-digit, zero-padded code.
    dataset_path = fetch_dataset(f"{i:02d}")

    df = pd.read_csv(dataset_path)

    # Keep only rows that have both a postal code and a sale value.
    df.dropna(subset=["code_postal", "valeur_fonciere"], axis=0, inplace=True)

    # Normalise postal codes to 5-character, zero-padded strings.
    postal_codes = df["code_postal"].map(int).map(str)
    df["code_postal"] = postal_codes.str.zfill(5)

    df["Departement"] = i
    df.to_csv(dataset_path, index=False)

  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.read_csv(dataset_path)
  df = pd.

## Load the dataset and take a quick look

In [19]:
def load_dataset(filepath) -> pd.DataFrame:
    """Read one CSV file into a DataFrame.

    ``code_postal`` and ``code_commune`` are parsed with ``str`` so that
    their values are kept as text (leading zeros are not stripped).

    Args:
        filepath: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        The parsed DataFrame.
    """
    text_columns = {"code_postal": str, "code_commune": str}
    frame = pd.read_csv(filepath, converters=text_columns)
    return frame

In [91]:
# Collect every per-department CSV written by the download step.
# Sorting makes the row order of the combined frame reproducible, since
# glob returns files in arbitrary, platform-dependent order.
all_files = sorted(glob.glob(os.path.join(os.getcwd(), ".datasets", "*.csv")))

if not all_files:
    # pd.concat on an empty list raises an opaque ValueError; fail clearly.
    raise FileNotFoundError("No CSV files found in .datasets; run the download cell first.")

# Read through load_dataset so code_postal / code_commune stay strings.
# A plain read_csv would re-infer them as numbers and drop the zero-padding
# applied when the files were cleaned.
li = [load_dataset(filename) for filename in all_files]

dataset = pd.concat(li, axis=0, ignore_index=True)

  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.

In [39]:
#dataset.head()

In [93]:
#dataset.info()

In [36]:
#dataset.head()

In [31]:
#dataset.describe()

In [32]:
#dataset["nature_mutation"].value_counts()

In [34]:
#dataset["type_local"].value_counts()

## Preparation

In [28]:
# Columns kept for modelling: candidate inputs and the target.
features = [
    "type_local",
    "surface_reelle_bati",
    "nombre_pieces_principales",
    "code_postal",
    "adresse_nom_voie",
]
label = ["valeur_fonciere"]

# Row filters: a known price, an actual sale, and a flat or a house.
filter_labels = ~dataset["valeur_fonciere"].isna()
filter_sells = dataset["nature_mutation"].eq("Vente")
filter_type = dataset["type_local"].isin(["Appartement", "Maison"])

# Apply all three filters, keep only the selected columns, renumber rows.
dataset_filtered = dataset.loc[
    filter_labels & filter_sells & filter_type, features + label
].reset_index(drop=True)

In [94]:
#dataset_filtered.info()

In [13]:
# Separate inputs from the target; `label` is a list, so y is a one-column frame.
X, y = dataset_filtered[features], dataset_filtered[label]

# Hold out a quarter of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

## Training

In [14]:
# Column groups used by the preprocessing steps below.
numeric_features = ["surface_reelle_bati", "nombre_pieces_principales"]
categorical_features = ["type_local"]

# Missing numeric values are replaced by the constant 0.
# NOTE(review): the step is named "median" although the strategy is "constant".
imputer_pipeline = ColumnTransformer(
    transformers=[
        ("median", SimpleImputer(strategy="constant", fill_value=0), numeric_features),
    ]
)

# Numeric branch: impute, then standardise.
num_pipeline = Pipeline(
    steps=[
        ("imputer", imputer_pipeline),
        ("scaler", StandardScaler()),
    ]
)

# Full preprocessing: numeric branch plus one-hot encoding of the property type.
transformers = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, numeric_features),
        ("cat", OneHotEncoder(), categorical_features),
    ]
)

# Preprocessing followed by an ordinary least-squares regression.
pipeline = Pipeline(
    steps=[
        ("transformers", transformers),
        ("model", LinearRegression()),
    ]
)

# Fit on the training split and score on the held-out split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

pred_r2 = r2_score(y_test, y_pred)
pred_mse = mean_squared_error(y_test, y_pred)
pred_rmse = np.sqrt(pred_mse)
print(f"r2: {pred_r2}, rmse: {pred_rmse}")

r2: 0.020112605079169343, rmse: 2835001.3053178713


## Model serialization

In [15]:
def serialize_model(model, filepath: Path) -> None:
    """Persist ``model`` to ``filepath`` with joblib.

    Args:
        model: Object to store; in this notebook, the fitted pipeline.
        filepath: Destination file for the serialized model.
    """
    joblib.dump(value=model, filename=filepath)

In [16]:
# Store the fitted pipeline in the current directory and echo the path.
model_path = Path("model.joblib")
serialize_model(pipeline, model_path)
model_path

WindowsPath('model.joblib')