In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

# Modelling

Tester les différents modèles et les différents hyperparamètres pour obtenir le meilleur modèle de prédiction possible.

## Dataset

In [2]:
# Load the raw dataset; the path is relative, so it is resolved against the
# kernel's current working directory.
file_path = 'data/immodataParis.csv'
data = pd.read_csv(file_path)

## Fonctions

In [3]:
def preprocess_dates_rows(data, date_column, format='%d/%m/%Y'):
    """Drop the rows whose date does not match ``format`` and parse the rest.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is left untouched; a new frame is returned.
    date_column : label
        Name of the column that holds the dates.
    format : str, default '%d/%m/%Y'
        ``strptime``-style pattern passed to ``pandas.to_datetime``. Every
        value is converted to ``str`` before being checked against it.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` without the rejected rows, in which ``date_column``
        has been converted with ``pandas.to_datetime``.

    Notes
    -----
    Every rejected raw value is printed so it can be inspected in the
    notebook output.
    """
    # Convert once instead of re-converting the whole column on every iteration.
    raw_values = data[date_column].astype(str)
    abnormal_values = []

    for value in raw_values:
        # A value that parses with the expected format is kept as is.
        try:
            pd.to_datetime(value, format=format)
        # Otherwise it is recorded as abnormal so that its row can be removed.
        # Only parsing errors are caught: a bare ``except`` would also swallow
        # KeyboardInterrupt / SystemExit.
        except (ValueError, TypeError, OverflowError):
            print(value)
            abnormal_values.append(value)

    # Positional boolean mask of the rows to keep.
    keep = ~raw_values.isin(abnormal_values).to_numpy()
    data_removed_rows = data.loc[keep].copy()

    # Use ``date_column`` here (not a hard-coded column name) so the function
    # works for any date column.
    data_removed_rows[date_column] = pd.to_datetime(
        data_removed_rows[date_column].astype(str), format=format
    )
    return data_removed_rows


def encode_cat_data(data, columns, encoder=None):
    """Return a copy of ``data`` whose given column(s) are replaced by encoded values.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is copied, never modified.
    columns : label or list of labels
        A ``list`` is processed column by column; any other value is used as
        a single column key.
    encoder : object exposing ``fit_transform``, optional
        Encoder that is re-fitted on each column in turn. When omitted, a new
        ``LabelEncoder`` is created for this call. The previous default was a
        single instance built at definition time and shared (and re-fitted)
        across every call.

    Returns
    -------
    pandas.DataFrame
        The encoded copy.
    """
    if encoder is None:
        encoder = LabelEncoder()

    data_copy = data.copy()
    column_list = columns if isinstance(columns, list) else [columns]
    for column in column_list:
        data_copy[column] = encoder.fit_transform(data_copy[column])

    return data_copy

def remove_outliers_rows(data, contamination=0.005, random_state=42):
    """Drop the rows that an Isolation Forest flags as outliers.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame the model is fitted on and predicts on. It is not modified.
    contamination : float, default 0.005
        Forwarded to ``IsolationForest``; adjust it to flag more or fewer rows.
    random_state : int, default 42
        Forwarded to ``IsolationForest``.

    Returns
    -------
    pandas.DataFrame
        ``data`` without the rows predicted as outliers.

    Notes
    -----
    The number of flagged rows is printed.
    """
    # Isolation Forest model
    isolation_forest = IsolationForest(contamination=contamination, random_state=random_state)
    isolation_forest.fit(data)
    outlier_predictions = isolation_forest.predict(data)

    # A prediction of -1 marks the row as an outlier.
    is_outlier = outlier_predictions == -1
    outliers_count = is_outlier.sum()
    print(f"Total outliers identified: {outliers_count}")

    # Select the labels from ``data`` itself. The previous code indexed a
    # global ``df``, which only worked when such a variable happened to exist
    # in the kernel with the same index.
    rows_to_delete = data.index[is_outlier]

    return data.drop(rows_to_delete, axis=0)

def remove_zeros_rows(data, columns):
    """Return a copy of ``data`` without the rows holding a 0 in the given column(s).

    ``columns`` may be a list of column labels, handled one after the other,
    or any other value, which is then used as a single column key.
    The input frame is never modified.
    """
    frame = data.copy()
    names = columns if type(columns) == list else [columns]
    for name in names:
        # Labels of the rows whose value in this column is exactly 0.
        zero_labels = frame[frame[name] == 0].index
        frame = frame.drop(zero_labels)
    return frame

## Preprocessing

In [4]:
# df     : original, human-readable values; only the sale dates are parsed.
# df_bis : numeric counterpart with Arrondissement, Type and Date de vente encoded.

df = (
    data
    .drop(columns=['Adresse', 'Ville'])
    .pipe(preprocess_dates_rows, 'Date de vente')
    .pipe(remove_zeros_rows, ['Prix (€)', 'Prix mensuel (€)'])
)

df_bis = encode_cat_data(df, ['Arrondissement', 'Type'])
df_bis['Date de vente'] = pd.to_numeric(df['Date de vente'])
df_bis = remove_outliers_rows(df_bis)

# Keep in df only the rows that survived the outlier removal done on df_bis.
df = df.loc[df_bis.index]
df.head()


59
414
Total outliers identified: 9


Unnamed: 0,Arrondissement,Type,Prix (€),Prix mensuel (€),Pièce(s),Surface (m2),Date de vente
0,75001,Appartement,750400,14431,2,52,2023-05-22
1,75001,Appartement,330000,14348,1,23,2023-04-28
2,75001,Appartement,360100,15657,1,23,2023-03-29
3,75001,Appartement,286123,11005,1,26,2023-02-10
4,75001,Appartement,411636,12864,2,32,2022-12-28


In [5]:
# Compact schema overview: row 0 holds the column names, row 1 their dtypes.
pd.DataFrame([df.columns, df.dtypes.values])

Unnamed: 0,0,1,2,3,4,5,6
0,Arrondissement,Type,Prix (€),Prix mensuel (€),Pièce(s),Surface (m2),Date de vente
1,int64,object,int64,int64,int64,int64,datetime64[ns]


## Modelling