In [1]:
# Modules imports 

from urllib.request import urlopen
from zipfile import ZipFile 
from io import BytesIO
import geopandas as gpd
import pandas as pd
import numpy as np 
import os 
from shapely import wkt


In [2]:
# Download the file and extract the corresponding csv file 

# Download URL of the zip archive for each region.
# The keys double as the names of the local directories the archives are extracted into.
dict_regions_url = dict(
    region_75='https://www.data.gouv.fr/fr/datasets/r/ffea2b26-a321-4c46-a581-16086009f786',
    region_93='https://www.data.gouv.fr/fr/datasets/r/6cdf9269-3c4e-471e-aa62-d4f40bc268b7',
)


def extract_files_from_url(url, directory_extract_name):
    """Download a zip archive and extract only its CSV files.

    Parameters
    ----------
    url : str
        Location of the zip archive; anything ``urllib.request.urlopen`` accepts.
    directory_extract_name : str
        Directory the CSV files are extracted into. It is created if missing.

    Notes
    -----
    Only archive members whose name ends with ``.csv`` are written to disk.
    Nothing is deleted afterwards, so files that already exist in the target
    directory are left alone and sub-directories inside the archive cannot
    make the function fail.
    """
    # Close the HTTP response as soon as the payload is buffered in memory.
    with urlopen(url) as resp:
        archive_buffer = BytesIO(resp.read())

    # Keep the behaviour of always ending up with an existing target directory,
    # even when the archive contains no CSV file at all.
    os.makedirs(directory_extract_name, exist_ok=True)

    with ZipFile(archive_buffer) as myzip:
        # Directory entries end with "/" and are therefore never selected here.
        csv_members = [name for name in myzip.namelist() if name.endswith(".csv")]
        myzip.extractall(path=directory_extract_name, members=csv_members)


# Fetch every configured archive; each region's CSV files land in a
# directory named after the region key.
for region_name in dict_regions_url:
    url = dict_regions_url[region_name]
    extract_files_from_url(url, region_name)


Load the saved CSV files with pandas: we choose this format because it makes pre-processing and feature engineering on the dataset easy, and the datasets are small enough to be handled with pandas DataFrames. We also concatenate the DataFrames of the two regions.
Afterwards, we convert these DataFrames to GeoPandas GeoDataFrames, which are better suited to geometry columns (points, polygons, ...).

In [3]:

def _read_region_csv(region_dir, file_name):
    """Read one ';'-separated CSV file stored inside a region directory."""
    # os.path.join uses the separator of the current OS; the previous
    # hard-coded backslash paths only resolved on Windows.
    return pd.read_csv(os.path.join(region_dir, file_name), sep=';', low_memory=False)


# Per-region tables: addresses, buildings, and the address <-> building relation.
df_adresse_region_75 = _read_region_csv('region_75', 'adresse.csv')
df_batiment_region_75 = _read_region_csv('region_75', 'batiment.csv')
df_rel_adresse_batiment_opendata_region_75 = _read_region_csv(
    'region_75', 'rel_adresse_batiment_opendata.csv')

df_adresse_region_93 = _read_region_csv('region_93', 'adresse.csv')
df_batiment_region_93 = _read_region_csv('region_93', 'batiment.csv')
df_rel_adresse_batiment_opendata_region_93 = _read_region_csv(
    'region_93', 'rel_adresse_batiment_opendata.csv')


# Stack the two regions of each table and rebuild a clean 0..n-1 index.
df_adresse_all_regions = pd.concat(
    [df_adresse_region_75, df_adresse_region_93], ignore_index=True)
df_batiments_all_regions = pd.concat(
    [df_batiment_region_75, df_batiment_region_93], ignore_index=True)
df_rel_adresse_batiment_opendata_all_regions = pd.concat(
    [df_rel_adresse_batiment_opendata_region_75, df_rel_adresse_batiment_opendata_region_93],
    ignore_index=True)


In [4]:
# Finally, we convert the pandas DataFrames to GeoPandas GeoDataFrames.

def _parse_wkt_or_none(value):
    """Turn a WKT string into a shapely geometry.

    Missing values (None, or the float NaN pandas uses for empty CSV cells)
    become None. Any other non-string value is returned unchanged, so running
    this cell a second time keeps geometries that were already parsed instead
    of failing or wiping them.
    """
    if isinstance(value, str):
        return wkt.loads(value)
    if value is None or isinstance(value, float):
        return None
    return value


df_adresse_all_regions['etaban202111_geomadr'] = (
    df_adresse_all_regions['etaban202111_geomadr'].apply(_parse_wkt_or_none)
)
gdf_adresse_all_regions = gpd.GeoDataFrame(
    df_adresse_all_regions, crs='epsg:4326', geometry='etaban202111_geomadr')

# Rows with a missing geometry are NOT dropped: they are kept with a None
# geometry so that the conversion succeeds (the original note reported 241 such items).

df_batiments_all_regions['geombui'] = (
    df_batiments_all_regions['geombui'].apply(_parse_wkt_or_none)
)
gdf_batiments_all_regions = gpd.GeoDataFrame(
    df_batiments_all_regions, crs='epsg:4326', geometry='geombui')

df_rel_adresse_batiment_opendata_all_regions['st_transform'] = (
    df_rel_adresse_batiment_opendata_all_regions['st_transform'].apply(_parse_wkt_or_none)
)
gdf_rel_adresse_batiment_opendata_all_regions = gpd.GeoDataFrame(
    df_rel_adresse_batiment_opendata_all_regions, crs='epsg:4326', geometry='st_transform')

In [6]:
# Preview the first five rows of the address GeoDataFrame

gdf_adresse_all_regions.head(n=5)

Unnamed: 0,etaban202111_geomadr,etaban202111_id,etaban202111_numero,etaban202111_voie,etaban202111_code_postal,etaban202111_code_insee,etaban202111_ville,insee_code_dept,fiabilite_niv_1,fiabilite_niv_2,etaban202111_latitude,etaban202111_longitude
0,POINT (2.33599 48.86235),75101,,Paris 1er Arrondissement,75001,75101,Paris 1er Arrondissement,75,mauvais géocodage,géocodage municipality,48.862351,2.335987
1,POINT (2.32950 48.86505),75101_0191_00003,3.0,Rue d’Alger,75001,75101,Paris,75,fiable BDNB,mono bat BDNB,48.865052,2.329503
2,POINT (2.32956 48.86488),75101_0191_00004,4.0,Rue d’Alger,75001,75101,Paris,75,fiable FF,mono bat FF,48.864881,2.329557
3,POINT (2.32962 48.86521),75101_0191_00005,5.0,Rue d’Alger,75001,75101,Paris,75,fiable FF,mono bat FF,48.865213,2.329623
4,POINT (2.32965 48.86501),75101_0191_00006,6.0,Rue d’Alger,75001,75101,Paris,75,fiable FF,mono bat FF,48.865006,2.329649


In [8]:
# Report the (rows, columns) shape of each GeoDataFrame created above

for _gdf_name, _gdf in (
    ('gdf_adresse_all_regions', gdf_adresse_all_regions),
    ('gdf_batiments_all_regions', gdf_batiments_all_regions),
    ('gdf_rel_adresse_batiment_opendata_all_regions', gdf_rel_adresse_batiment_opendata_all_regions),
):
    print(f' {_gdf_name} shape : ', _gdf.shape)

 gdf_adresse_all_regions shape :  (280850, 12)
 gdf_batiments_all_regions shape :  (246746, 170)
 gdf_rel_adresse_batiment_opendata_all_regions shape :  (295894, 11)


Remark: for more efficient storage, especially with bigger datasets, we could save the data in Parquet format instead of CSV, which
reduces the storage needed for large datasets.