In [43]:
"""
auteur:Alexandre
date:2024/09/03

Preprocessing des données pur les rendre utilisable par l'algorithme de machine learning
"""

"\nauteur:Alexandre\ndate:2024/09/03\n\nPreprocessing des données pur les rendre utilisable par l'algorithme de machine learning\n"

# **Preprocessing des données**

In [44]:
import preprocessing as pp
import params as prm
import pandas as pd
import numpy as np
import os

## Elections législatives

In [45]:
# Load the two election datasets. Each CSV file name is the dataset name
# configured in `params` with a ".csv" extension appended.
first_round_csv = f"{prm.dataset_1er_tour}.csv"
data_1 = pp.load_data(file_name=first_round_csv)

second_round_csv = f"{prm.dataset_2nd_tour}.csv"
data_2 = pp.load_data(file_name=second_round_csv)

# Quick sanity check on the size of both frames.
print(data_1.shape, data_2.shape)

(70102, 190) (61615, 55)


In [46]:
# First round: no encoder exists yet, so `None` is passed. The second value
# returned by `pp.prepare` is kept in `encodeur`.
df_1, encodeur = pp.prepare(
    data_1,
    name=prm.dataset_1er_tour,
    encodeur=None,
)

# Second round: hand over the encoder obtained from the first call.
# NOTE(review): presumably this keeps both rounds encoded the same way —
# confirm in `preprocessing.prepare`.
df_2, encodeur = pp.prepare(
    data_2,
    name=prm.dataset_2nd_tour,
    encodeur=encodeur,
)

print(df_1.shape, df_2.shape)

Loading the already preprocessed dataset !
Loading the already preprocessed dataset !
(70102, 25) (61615, 20)


In [47]:
# Run the cleaning step on both prepared frames at once; the dataset names
# are forwarded as `name1` / `name2`.
# NOTE(review): the recorded output shows both results with the same number
# of rows, so this step presumably aligns the two rounds — confirm in
# `preprocessing.clear`.
df_1f, df_2f = pp.clear(df_1, df_2, name1=prm.dataset_1er_tour, name2=prm.dataset_2nd_tour)

print(df_1f.shape, df_2f.shape)

The cleaning phase has already been completed !
(61593, 23) (61593, 18)


## Adresses

In [48]:
# Location of the cleaned copy of the communes / departments / regions table.
path_pp_adresses = os.path.join(prm.datasets_pp_path, "communes-departement-region.csv")

if not os.path.exists(path_pp_adresses):

    print("We have to drop the NaN !")

    # Load the raw data
    adresses = pd.read_csv(
        os.path.join(prm.datasets_raw_path, "communes-departement-region.csv")
    )

    # Columns in which more than 90% of the values are missing
    nan_share = adresses.isna().sum().div(adresses.shape[0])
    col_drop = nan_share.index[nan_share > 0.90]

    # Rows lacking a longitude or a latitude (computed before any column is
    # removed, so both coordinate columns are still present here)
    no_coordinates = adresses["longitude"].isna() | adresses["latitude"].isna()

    # Remove those rows and columns, then cache the result for later runs
    adresses = adresses.loc[~no_coordinates].drop(columns=col_drop)
    adresses.to_csv(path_pp_adresses, index=False)

else:

    # A cleaned copy already exists on disk: just reload it
    print("The preprocessing phase has already been completed !")
    adresses = pd.read_csv(path_pp_adresses)

print(adresses.shape)

The preprocessing phase has already been completed !
(38932, 13)


## Informations Socio-Economiques

In [49]:
# DNB results per school: reuse the cleaned copy if present, else build it.
file_name = "fr-en-dnb-par-etablissement.csv"
path_pp_dnb = os.path.join(prm.datasets_pp_path, file_name)

if not os.path.exists(path_pp_dnb):

    print("We have to drop the NaN !")

    # Load the raw data (semicolon-separated file)
    dnb_resultats = pd.read_csv(os.path.join(prm.datasets_raw_path, file_name), sep=";")

    # Keep only rows whose department code is not the '-' placeholder and
    # whose "patronyme" is filled in
    is_real_department = dnb_resultats["code_departement"] != '-'
    has_patronyme = dnb_resultats["patronyme"].notna()

    # Order by department code and cache the result for later runs
    dnb_resultats = (
        dnb_resultats
        .loc[is_real_department & has_patronyme]
        .sort_values(by="code_departement")
    )
    dnb_resultats.to_csv(path_pp_dnb, index=False)

else:

    # A cleaned copy already exists on disk: just reload it
    print("The preprocessing phase has already been completed !")
    dnb_resultats = pd.read_csv(path_pp_dnb, low_memory=False)

print(dnb_resultats.shape)

The preprocessing phase has already been completed !
(139168, 21)


In [50]:
# Scholarship holders per department: reuse the cleaned copy if present,
# otherwise build it from the raw file and cache it.
file_name = "fr-en-boursiers-par-departement.csv"
path_pp_bourse = os.path.join(prm.datasets_pp_path, file_name)

if os.path.exists(path_pp_bourse):

    print("The preprocessing phase has already been completed !")
    # Fix: the cached table must be bound to `boursier_departement`. It was
    # previously assigned to `dnb_resultats`, which overwrote the DNB results
    # and left `boursier_departement` undefined for the final print.
    boursier_departement = pd.read_csv(
        path_pp_bourse
    )

else :

    print("We have to drop the NaN !")

    # Load the raw data (semicolon-separated file)
    boursier_departement = pd.read_csv(
        os.path.join(prm.datasets_raw_path, file_name),
        sep=";"
    )

    # Drop every column containing at least one NaN, then order the rows by
    # department number and cache the result for later runs
    boursier_departement = (
        boursier_departement
        .dropna(axis="columns")
        .sort_values(by="numero_departement")
    )
    boursier_departement.to_csv(path_pp_bourse, index=False)

print(boursier_departement.shape)

We have to drop the NaN !
(983, 8)


In [53]:
# Print the execution watermark (timestamp, Python / IPython versions, OS and
# hardware details, as seen in the recorded output) so the environment of
# this run is kept alongside the notebook.
prm.WaTer()

Last updated: 2024-09-11T20:56:18.355836+02:00

Python implementation: CPython
Python version       : 3.11.9
IPython version      : 8.24.0

Compiler    : MSC v.1938 64 bit (AMD64)
OS          : Windows
Release     : 10
Machine     : AMD64
Processor   : AMD64 Family 25 Model 116 Stepping 1, AuthenticAMD
CPU cores   : 16
Architecture: 64bit

