In [5]:
import re
import numpy as np
from collections import namedtuple

In [6]:
import pandas as pd
df = pd.read_csv("listings.csv", names=["id", "type", "nb_piece", "loc", "surface", "prix"], header=0).drop_duplicates()

In [7]:
#Modification des types
df['id'] = df['id'].astype(str)
df['type'] = df['type'].astype(str)
df['nb_piece'] = pd.to_numeric(df['nb_piece'], errors='coerce').astype('Int64')
df['loc'] = df['loc'].astype(str)
df['surface'] = pd.to_numeric(df['surface'], errors='coerce').astype('Int64')
df['prix'] =pd.to_numeric(df['prix'].str.replace(' ', '').str.replace('€', ''), errors='coerce').astype('Int64')

In [8]:
df.dtypes

id          object
type        object
nb_piece     Int64
loc         object
surface      Int64
prix         Int64
dtype: object

In [9]:
#Gerer les NAN
df.loc[(df['type'] == 'Studio') & (pd.isnull(df['nb_piece'])), 'nb_piece'] = 1 # Si studio, 1 chambre
df

Unnamed: 0,id,type,nb_piece,loc,surface,prix
0,apimo-85473173,Appartement,1,75001 Paris 1er (Châtelet - Les Halles),29,364000
1,hektor-Les-4-Quartiers-383,Appartement,1,75001 Paris 1er (Châtelet - Les Halles),23,301000
2,hektor-PARISLUXURYHOMES-4007,Appartement,5,75002 Paris 2e (Palais Royal),106,2490000
3,ag754594-448898324,Studio,1,75001 Paris 1er (Saint-Germain - L'Auxerrois),27,365000
5,keller-williams-1-34_1_53-180645,Appartement,2,75001 Paris 1er (Vendôme),67,1190000
...,...,...,...,...,...,...
19726,ag755800-412953689,Appartement,1,75020 Paris 20e (Père Lachaise - Réunion),35,239600
19753,century-21-202_480_27885,Appartement,3,75020 Paris 20e (Plaine - Lagny),63,539000
19758,netty-company34605jrw-appt-3212,Appartement,1,75020 Paris 20e (Gambetta),27,215000
19767,demathieu-bard-immobilier-PR5E0076,GAMBETTA,,75020 Paris 20e (Gambetta),,


In [10]:
#Supprimer les lignes restantes avec des NaN qui sont ,ici, des lignes avec soit nb_piece, surface, prix None
df=df.dropna()

In [11]:
#Filtration pour ne pas avoir des types de batiments mals renseigné
df["count"] = df['type'].map(df['type'].value_counts()) #Enlever les types avec trop peu d'apparitions (qui sont des erreurs)
df=df[df["count"] > 10]
df=df.drop("count", axis=1)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["count"] = df['type'].map(df['type'].value_counts()) #Enlever les types avec trop peu d'apparitions (qui sont des erreurs)


Unnamed: 0,id,type,nb_piece,loc,surface,prix
0,apimo-85473173,Appartement,1,75001 Paris 1er (Châtelet - Les Halles),29,364000
1,hektor-Les-4-Quartiers-383,Appartement,1,75001 Paris 1er (Châtelet - Les Halles),23,301000
2,hektor-PARISLUXURYHOMES-4007,Appartement,5,75002 Paris 2e (Palais Royal),106,2490000
3,ag754594-448898324,Studio,1,75001 Paris 1er (Saint-Germain - L'Auxerrois),27,365000
5,keller-williams-1-34_1_53-180645,Appartement,2,75001 Paris 1er (Vendôme),67,1190000
...,...,...,...,...,...,...
19720,apimo-83966258,Appartement,1,75020 Paris 20e (Père Lachaise - Réunion),22,239000
19726,ag755800-412953689,Appartement,1,75020 Paris 20e (Père Lachaise - Réunion),35,239600
19753,century-21-202_480_27885,Appartement,3,75020 Paris 20e (Plaine - Lagny),63,539000
19758,netty-company34605jrw-appt-3212,Appartement,1,75020 Paris 20e (Gambetta),27,215000


In [12]:
# Uniformiser le format des localisation
#pd.set_option('display.max_rows', None)
pd.set_option('display.max_rows', 10)
df_ = df.copy()

# Extraire les informations entre parenthèses (quartiers) et les garder dans loc2
df_['loc1'] = df_['loc'].str.extract(r'\((.*?)\)')

# Extraire la première partie correspondant au quartier, si elle est présente
df_['loc2'] = df_['loc1'].str.extract(r'^(.*?)\s-\s')
df_['loc2'] = df_['loc2'].fillna(df_['loc1'])

# Extraire les arrondissements (750XX) dans loc3
df_['loc3'] = df_["loc"].str.extract(r'(750\d{2} [A-Za-z0-9 ]+ \d{1,2}[a-zA-Z]+)')

# Prioriser loc1 (quartiers) et, si absent, utiliser loc3 (arrondissements)
df_['loc1'] = df_['loc1'].fillna(df_['loc3'])

df_=df_.dropna(subset=['loc1', 'loc3'], how='all')


df_zones=df_[["loc1", "loc2", 'loc3']].drop_duplicates().copy()
df=df_.copy()

In [13]:
df[~df["loc3"].astype(str).str.startswith("750", na=False)]

Unnamed: 0,id,type,nb_piece,loc,surface,prix,loc1,loc2,loc3
1033,immo-facile-56299289,Maison,4,78110 LE VESINET (Notre Dame des Champs),68,895000,Notre Dame des Champs,Notre Dame des Champs,
3733,apimo-84671032,Appartement,5,Paris 13e (Bibliothèque - Dunois - Jeanne d'Arc),147,1388000,Bibliothèque - Dunois - Jeanne d'Arc,Bibliothèque,
3778,hektor-ipf-conseils-2233,Appartement,3,Paris 13e (Bièvre Sud - Tolbiac),78,640000,Bièvre Sud - Tolbiac,Bièvre Sud,
4619,hektor-LACLEF2022-907,Appartement,1,75000 Paris (Vaugirard - Parc des Expositions),20,299000,Vaugirard - Parc des Expositions,Vaugirard,
4687,ag750949-445231655,Maison,12,75000 Paris (Pasteur - Montparnasse),500,1500000,Pasteur - Montparnasse,Pasteur,
...,...,...,...,...,...,...,...,...,...
15908,ag755232-450177044,Appartement,7,75116 Paris 16e (Chaillot),175,3200000,Chaillot,Chaillot,
16440,ag751877-439629902,Appartement,3,75116 Paris 16e (Muette-Nord - Trocadéro),87,1930000,Muette-Nord - Trocadéro,Muette-Nord,
16723,immo-facile-55067266,Appartement,6,75116 Paris 16e (Chaillot),166,1921000,Chaillot,Chaillot,
16844,apimo-82263995,Appartement,3,75116 Paris 16e (Dauphine),89,950000,Dauphine,Dauphine,


Puisque les adresse ne sont pas toujours écrite de la meilleur des manières, il nous faut tester plusieures options (dans l'ordre loc1,loc2,loc3) dans geopy pour avoir une coordonnée la plus précise possible.

In [14]:
from geopy.geocoders import Nominatim
from functools import partial
geolocator = Nominatim(user_agent="testing")
geocode = partial(geolocator.geocode, language="fr")
geocode_with_paris = lambda query: geolocator.geocode("%s, Paris FR" % query)

In [15]:
for zone in df_zones.sort_values(by="loc1").itertuples(index=False):
    print(zone)

Pandas(loc1='75001 Paris 1er', loc2=nan, loc3='75001 Paris 1er')
Pandas(loc1='75002 Paris 2e', loc2=nan, loc3='75002 Paris 2e')
Pandas(loc1='75003 Paris 3e', loc2=nan, loc3='75003 Paris 3e')
Pandas(loc1='75004 Paris 4e', loc2=nan, loc3='75004 Paris 4e')
Pandas(loc1='75005 Paris 5e', loc2=nan, loc3='75005 Paris 5e')
Pandas(loc1='75006 Paris 6e', loc2=nan, loc3='75006 Paris 6e')
Pandas(loc1='75007 Paris 7e', loc2=nan, loc3='75007 Paris 7e')
Pandas(loc1='75008 Paris 8e', loc2=nan, loc3='75008 Paris 8e')
Pandas(loc1='75009 Paris 9e', loc2=nan, loc3='75009 Paris 9e')
Pandas(loc1='75010 Paris 10e', loc2=nan, loc3='75010 Paris 10e')
Pandas(loc1='75011 Paris 11e', loc2=nan, loc3='75011 Paris 11e')
Pandas(loc1='75012 Paris 12e', loc2=nan, loc3='75012 Paris 12e')
Pandas(loc1='75013 Paris 13e', loc2=nan, loc3='75013 Paris 13e')
Pandas(loc1='75014 Paris 14e', loc2=nan, loc3='75014 Paris 14e')
Pandas(loc1='75015 Paris 15e', loc2=nan, loc3='75015 Paris 15e')
Pandas(loc1='75016 Paris 16e', loc2=nan, 

In [16]:
coordonnees_adresse={} #Dictionnaire liant un lieu à une coordonnée
dict_zone={} # dicttionnaire liant au mieux du possible le triplet loc1,lo2,lo3 à un triplet lotissement/rue, quartier, arrondissement

#D'abord les arrondissements

zone_arrondissement=df_zones[df_zones["loc1"]==df_zones["loc3"]]

for zone in zone_arrondissement.itertuples(index=False):
    try:
        location = geolocator.geocode(zone[0])
        print(location)
        arrondissement=location.address
        arrondissement_match=re.search(r"75\d+", arrondissement)
        arrondissement= arrondissement_match.group(0) if arrondissement_match else None
        
        
        coordonnees_adresse[arrondissement]=(location.address,location.latitude,location.longitude)
        dict_zone[zone]=(None,None,arrondissement)
    except:
        dict_zone[zone]=(None,None,None)


Paris 1er Arrondissement, Paris, Île-de-France, France métropolitaine, 75001, France
Paris 3e Arrondissement, Paris, Île-de-France, France métropolitaine, 75003, France
Paris 4e Arrondissement, Paris, Île-de-France, France métropolitaine, 75004, France
Paris 6e Arrondissement, Paris, Île-de-France, France métropolitaine, 75006, France
Paris 8e Arrondissement, Paris, Île-de-France, France métropolitaine, 75008, France
Paris 9e Arrondissement, Paris, Île-de-France, France métropolitaine, 75009, France
Paris 10e Arrondissement, Paris, Île-de-France, France métropolitaine, 75010, France
Paris 11e Arrondissement, Paris, Île-de-France, France métropolitaine, 75011, France
Paris 12e Arrondissement, Paris, Île-de-France, France métropolitaine, 75012, France
Paris 13e Arrondissement, Paris, Île-de-France, France métropolitaine, 75013, France
Paris 14e Arrondissement, Paris, Île-de-France, France métropolitaine, 75014, France
Paris 15e Arrondissement, Paris, Île-de-France, France métropolitaine,

In [17]:
# Declaring namedtuple()
Pandas = namedtuple('Location', ['loc1', 'loc2', 'loc3'])

# Adding values
L = Pandas('75016 Paris 16', np.nan, '75016 Paris 16')


#Ajout 75016
print(df)
dict_zone[L]=(None,None,'75016')
coordonnees_adresse["75016"]=coordonnees_adresse["75116"]

#Modif 75116
L = Pandas('75116 Paris 16', np.nan, '75116 Paris 16')

dict_zone[L]=(None,None,'75116')



                                     id         type  nb_piece  \
0                        apimo-85473173  Appartement         1   
1            hektor-Les-4-Quartiers-383  Appartement         1   
2          hektor-PARISLUXURYHOMES-4007  Appartement         5   
3                    ag754594-448898324       Studio         1   
5      keller-williams-1-34_1_53-180645  Appartement         2   
...                                 ...          ...       ...   
19720                    apimo-83966258  Appartement         1   
19726                ag755800-412953689  Appartement         1   
19753          century-21-202_480_27885  Appartement         3   
19758   netty-company34605jrw-appt-3212  Appartement         1   
19776                ag933277-381424162  Appartement         1   

                                                 loc  surface     prix  \
0            75001 Paris 1er (Châtelet - Les Halles)       29   364000   
1            75001 Paris 1er (Châtelet - Les Halles)       

In [18]:
dict_zone

{Pandas(loc1='75001 Paris 1er', loc2=nan, loc3='75001 Paris 1er'): (None,
  None,
  '75001'),
 Pandas(loc1='75002 Paris 2e', loc2=nan, loc3='75002 Paris 2e'): (None,
  None,
  None),
 Pandas(loc1='75003 Paris 3e', loc2=nan, loc3='75003 Paris 3e'): (None,
  None,
  '75003'),
 Pandas(loc1='75004 Paris 4e', loc2=nan, loc3='75004 Paris 4e'): (None,
  None,
  '75004'),
 Pandas(loc1='75006 Paris 6e', loc2=nan, loc3='75006 Paris 6e'): (None,
  None,
  '75006'),
 Pandas(loc1='75008 Paris 8e', loc2=nan, loc3='75008 Paris 8e'): (None,
  None,
  '75008'),
 Pandas(loc1='75009 Paris 9e', loc2=nan, loc3='75009 Paris 9e'): (None,
  None,
  '75009'),
 Pandas(loc1='75010 Paris 10e', loc2=nan, loc3='75010 Paris 10e'): (None,
  None,
  '75010'),
 Pandas(loc1='75011 Paris 11e', loc2=nan, loc3='75011 Paris 11e'): (None,
  None,
  '75011'),
 Pandas(loc1='75012 Paris 12e', loc2=nan, loc3='75012 Paris 12e'): (None,
  None,
  '75012'),
 Pandas(loc1='75013 Paris 13e', loc2=nan, loc3='75013 Paris 13e'): (None,
 

In [19]:
coordonnees_adresse

{'75001': ('Paris 1er Arrondissement, Paris, Île-de-France, France métropolitaine, 75001, France',
  48.8646144,
  2.334396),
 '75003': ('Paris 3e Arrondissement, Paris, Île-de-France, France métropolitaine, 75003, France',
  48.864212,
  2.360936),
 '75004': ('Paris 4e Arrondissement, Paris, Île-de-France, France métropolitaine, 75004, France',
  48.8562021,
  2.3556193),
 '75006': ('Paris 6e Arrondissement, Paris, Île-de-France, France métropolitaine, 75006, France',
  48.8504333,
  2.3329507),
 '75008': ('Paris 8e Arrondissement, Paris, Île-de-France, France métropolitaine, 75008, France',
  48.8737284,
  2.3103932),
 '75009': ('Paris 9e Arrondissement, Paris, Île-de-France, France métropolitaine, 75009, France',
  48.876019,
  2.339962),
 '75010': ('Paris 10e Arrondissement, Paris, Île-de-France, France métropolitaine, 75010, France',
  48.876225,
  2.3595209),
 '75011': ('Paris 11e Arrondissement, Paris, Île-de-France, France métropolitaine, 75011, France',
  48.858416,
  2.379703

In [20]:
adress=geocode_with_paris("Ternes - Maillot").address

pattern = r"Paris\s\d+er\sArrondissement|Paris\s\d+e\sArrondissement"
matches = re.findall(pattern, adress)
print(adress)
print(matches)

Maillot-Ternes, Paris, Île-de-France, France métropolitaine, 75116, France
[]


In [21]:
#Sauvegarde intermédiaire
coordonnees_adresse_1=coordonnees_adresse.copy() 
dict_zone_1=dict_zone.copy()

In [22]:
# Ensure coordonnees_adresse and dict_zone are initialized properly
coordonnees_adresse = coordonnees_adresse_1.copy()
dict_zone = dict_zone_1.copy()

# Iterate through the sorted zone_lieux DataFrame
for zone in zone_lieux.sort_values(by="loc1").itertuples(index=False):
    try:
        location = None  # Initialize location as None to handle missing addresses
        adresse = None
        quartier_match = None
        arrondissement_match = None

        # Check if zone[0] is already in coordonnees_adresse
        if zone[0] in coordonnees_adresse:
            adresse = coordonnees_adresse[zone[0]][0]
            
        else:
            location = geocode_with_paris(zone[0])
            
            if location is not None:
                coordonnees_adresse[zone[0]] = (location.address, location.latitude, location.longitude)
                adresse = location.address

        # Use regex to extract quartier and arrondissement
        quartier_match = re.search(quartier_pattern, adresse)
        quartier = quartier_match.group(0) if quartier_match else None

        arrondissement_match = re.search(arrondissement_pattern, adresse)
        arrondissement = arrondissement_match.group(0) if arrondissement_match else None

        # Update dict_zone with the extracted information
        dict_zone[zone] = (zone[0], quartier, arrondissement)
        print((zone[0], quartier, arrondissement))

        # Check if we have the quartier and if it's not in coordonnees_adresse
        if quartier and quartier not in coordonnees_adresse:
            location_ = geocode_with_paris(quartier)
            if location_ is None:
                dict_zone[zone] = (zone[0], None, arrondissement)
            else:
                coordonnees_adresse[quartier] = (location_.address, location_.latitude, location_.longitude)

    except Exception as e:
        if "expected string or bytes-like object, got 'Nonetype'" != str(e):
            #Erreur qui n'est pas adresse non trouvée
            print(f"Error processing zone {zone[0]}: {e}")
        # Handle fallback if zone[1] is the alternative name to process
        try:
            if zone[1] in coordonnees_adresse:
                adresse = coordonnees_adresse[zone[1]][0]
            else:
                location = geocode_with_paris(zone[1])

                if location is None:
                    dict_zone[zone] = (None, None, None)
                    print((None, None, None))
                    continue
                else:
                    coordonnees_adresse[zone[1]] = (location.address, location.latitude, location.longitude)
                    adresse = location.address

            # Extract quartier and arrondissement again
            quartier_match = re.search(quartier_pattern, adresse)
            quartier = quartier_match.group(0) if quartier_match else None

            arrondissement_match = re.search(arrondissement_pattern, adresse)
            arrondissement = arrondissement_match.group(0) if arrondissement_match else None

            # Update dict_zone with the information
            dict_zone[zone] = (zone[1], quartier, arrondissement)
            print((zone[1], quartier, arrondissement))

            # Check quartier geocoding if needed
            if quartier and quartier not in coordonnees_adresse:
                location_ = geocode_with_paris(quartier)
                if location_ is None:
                    dict_zone[zone] = (zone[1], None, arrondissement)
                else:
                    coordonnees_adresse[quartier] = (location_.address, location_.latitude, location_.longitude)

        except Exception as e:
            print(f"Error handling fallback zone {zone[1]}: {e}")


NameError: name 'zone_lieux' is not defined

In [None]:
next(iter(coordonnees_adresse.items()))

('75001',
 ('Paris 1er Arrondissement, Paris, Île-de-France, France métropolitaine, 75001, France',
  48.8646144,
  2.334396))

In [None]:
next(iter(dict_zone.items()))

(Pandas(loc1='75001 Paris 1er', loc2=nan, loc3='75001 Paris 1er'),
 (None, None, '75001'))

In [None]:
adress=geocode_with_paris("Muette").address

pattern = r"75\d+"
matches = re.findall(pattern, adress)
print(adress)
print(matches)

GeocoderRateLimited: Non-successful status code 429

In [None]:
#Quel tag n'a uniquement pas d'arrondissement (loc3)
{key: value for key, value in dict_zone.items() if  (value[2] is  None)}

{Pandas(loc1='Auteuil-Nord - Jasmin - Mirabeau', loc2='Auteuil-Nord', loc3='75016 Paris 16e'): ('Auteuil-Nord',
  None,
  None),
 Pandas(loc1='Auteuil-Nord - Jasmin - Mirabeau', loc2='Auteuil-Nord', loc3=nan): ('Auteuil-Nord',
  None,
  None),
 Pandas(loc1='Bièvre Sud - Tolbiac', loc2='Bièvre Sud', loc3='75013 Paris 13e'): ('Bièvre Sud',
  None,
  None),
 Pandas(loc1='Bièvre Sud - Tolbiac', loc2='Bièvre Sud', loc3=nan): ('Bièvre Sud',
  None,
  None),
 Pandas(loc1='Canal Saint-Martin - Jemmapes', loc2='Canal Saint-Martin', loc3='75010 Paris 10e'): ('Canal Saint-Martin',
  None,
  None),
 Pandas(loc1='Chaillot', loc2='Chaillot', loc3=nan): ('Chaillot', None, None),
 Pandas(loc1='Chaillot', loc2='Chaillot', loc3='75016 Paris 16e'): ('Chaillot',
  None,
  None),
 Pandas(loc1='Chaillot', loc2='Chaillot', loc3='75017 Paris 17e'): ('Chaillot',
  None,
  None),
 Pandas(loc1='Hoche Friedland', loc2='Hoche Friedland', loc3='75008 Paris 8e'): (None,
  None,
  None),
 Pandas(loc1='Muette-Nord - T

In [None]:
coordonnees_adresse["Auteuil-Nord"]

("Avenue d'Auteuil, Le Touquet-Paris-Plage, Montreuil-sur-Mer, Pas-de-Calais, Hauts-de-France, France métropolitaine, 62520, France",
 50.513172677628695,
 1.6147425435237512)

In [None]:
print(len(df_zones))
print(len(dict_zone)) 
#Normal car ajout de 75116

223
224


In [None]:
#Save
df_2=df.copy()
df_zones_2=df_zones.copy()
dict_zone_2=dict_zone.copy()
coordonnees_adresse_2=coordonnees_adresse.copy()

In [None]:
df=df_2.copy()
df_zones=df_zones_2.copy()
dict_zone=dict_zone_2.copy()
coordonnees_adresse=coordonnees_adresse_2.copy()

# Iterate over the dictionary and suprres the first value
for key, value in coordonnees_adresse.items():
    if isinstance(value, tuple) and len(value) == 3:
        coordonnees_adresse[key] = value[1:]  # Replace (a, b, c) with (b, c)

next(iter(coordonnees_adresse.items()))

('75001', (48.8646144, 2.334396))

In [None]:
import pandas as pd
from collections import namedtuple

LocationKey = namedtuple('Pandas', ['loc1', 'loc2', 'loc3'])

# Step 6: Define a function to retrieve coordinates from the dictionary
def get_coordinates(row):
    try:
        # Ensure the row contains the correct columns
#       print((row['loc1'], row['loc2'], row['loc3']))
        loc1 = None if pd.isna(row['loc1']) else row['loc1']
        loc2 = None if pd.isna(row['loc2']) else row['loc2']
        loc3 = None if pd.isna(row['loc3']) else row['loc3']
        loc_key = LocationKey(loc1=loc1, loc2=loc2, loc3=loc3)
        
        # Return coordinates from dict_zone or (None, None, None) if not found
        return dict_zone.get(loc_key, (None, None, None))
    except KeyError as e:
        print(f"KeyError: {e}")  # Debugging step
        return (None, None, None)

# Step 7: Apply the function to each row of the DataFrame to add the (x, y, z) coordinates
df[['loc1', 'loc2', 'loc3']] = df.apply(get_coordinates, axis=1).apply(pd.Series)

# Resulting DataFrame with (x, y, z) columns
df

Unnamed: 0,id,Type,Nb_piece,loc,surface,prix,loc1,loc2,loc3
0,apimo-85473173,Appartement,1,75001 Paris 1er (Châtelet - Les Halles),29,364000,Châtelet - Les Halles,Quartier Les Halles,75001
1,hektor-Les-4-Quartiers-383,Appartement,1,75001 Paris 1er (Châtelet - Les Halles),23,301000,Châtelet - Les Halles,Quartier Les Halles,75001
2,hektor-PARISLUXURYHOMES-4007,Appartement,5,75002 Paris 2e (Palais Royal),106,2490000,Palais Royal,Quartier du Palais Royal,75001
3,ag754594-448898324,Studio,1,75001 Paris 1er (Saint-Germain - L'Auxerrois),27,365000,Saint-Germain,,75006
5,keller-williams-1-34_1_53-180645,Appartement,2,75001 Paris 1er (Vendôme),67,1190000,Vendôme,Quartier Vendôme,75001
...,...,...,...,...,...,...,...,...,...
9762,apimo-6951400,Appartement,4,75020 Paris 20e (Plaine - Lagny),89,599000,Plaine,Quartier de Charonne,75020
9763,apimo-6810785,Appartement,2,75020 Paris 20e (Gambetta),40,302000,Gambetta,Quartier du Père-Lachaise,75020
9764,immo-facile-49166437,Appartement,3,75020 Paris 20e (Père Lachaise - Réunion),67,769000,Père Lachaise,Quartier du Père-Lachaise,75020
9765,immo-facile-48762452,Appartement,3,75020 Paris 20e (Télégraphe - Pelleport - Sain...,57,519000,Télégraphe,Quartier d'Amérique,75019


In [None]:
# Step 6: Define a function to retrieve coordinates from the dictionary
def get_coordinates_2(row, loc):
    try:
        # Ensure the row contains the correct columns
#       print((row['loc1'], row['loc2'], row['loc3']))
        locx= None if pd.isna(row[loc]) else row[loc]
        
        # Return coordinates from dict_zone or (None, None, None) if not found
        return coordonnees_adresse.get(locx, (None, None))
    except KeyError as e:
        print(f"KeyError: {e}")  # Debugging step
        return (None, None)

# Step 7: Apply the function to each row of the DataFrame to add the (x, y, z) coordinates
df[['loc1_x', 'loc1_y']] = df.apply(lambda row: pd.Series(get_coordinates_2(row, "loc1")), axis=1)
df[['loc2_x', 'loc2_y']] = df.apply(lambda row: pd.Series(get_coordinates_2(row, "loc2")), axis=1)
df[['loc3_x', 'loc3_y']] = df.apply(lambda row: pd.Series(get_coordinates_2(row, "loc3")), axis=1)

# Resulting DataFrame with (x, y) columns
df

Unnamed: 0,id,Type,Nb_piece,loc,surface,prix,loc1,loc2,loc3,loc1_x,loc1_y,loc2_x,loc2_y,loc3_x,loc3_y
0,apimo-85473173,Appartement,1,75001 Paris 1er (Châtelet - Les Halles),29,364000,Châtelet - Les Halles,Quartier Les Halles,75001,48.862509,2.346443,48.862373,2.345313,48.864614,2.334396
1,hektor-Les-4-Quartiers-383,Appartement,1,75001 Paris 1er (Châtelet - Les Halles),23,301000,Châtelet - Les Halles,Quartier Les Halles,75001,48.862509,2.346443,48.862373,2.345313,48.864614,2.334396
2,hektor-PARISLUXURYHOMES-4007,Appartement,5,75002 Paris 2e (Palais Royal),106,2490000,Palais Royal,Quartier du Palais Royal,75001,48.863585,2.336204,48.864603,2.336049,48.864614,2.334396
3,ag754594-448898324,Studio,1,75001 Paris 1er (Saint-Germain - L'Auxerrois),27,365000,Saint-Germain,,75006,48.856553,2.333331,,,48.850433,2.332951
5,keller-williams-1-34_1_53-180645,Appartement,2,75001 Paris 1er (Vendôme),67,1190000,Vendôme,Quartier Vendôme,75001,48.866900,2.328549,48.866900,2.328549,48.864614,2.334396
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9762,apimo-6951400,Appartement,4,75020 Paris 20e (Plaine - Lagny),89,599000,Plaine,Quartier de Charonne,75020,48.850152,2.402190,48.855145,2.397638,48.865042,2.398929
9763,apimo-6810785,Appartement,2,75020 Paris 20e (Gambetta),40,302000,Gambetta,Quartier du Père-Lachaise,75020,48.864933,2.398054,48.863752,2.395302,48.865042,2.398929
9764,immo-facile-49166437,Appartement,3,75020 Paris 20e (Père Lachaise - Réunion),67,769000,Père Lachaise,Quartier du Père-Lachaise,75020,48.861130,2.394009,48.863752,2.395302,48.865042,2.398929
9765,immo-facile-48762452,Appartement,3,75020 Paris 20e (Télégraphe - Pelleport - Sain...,57,519000,Télégraphe,Quartier d'Amérique,75019,48.875496,2.398965,48.882048,2.394619,48.889343,2.384360


In [None]:
# Supprimer les lignes sans arrondissement
df_cleaned = df.dropna(subset=['loc3'], how='all')
print(len(df_cleaned))

7809


In [None]:
pip install dill

Collecting dill
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Downloading dill-0.3.9-py3-none-any.whl (119 kB)
Installing collected packages: dill
Successfully installed dill-0.3.9
Note: you may need to restart the kernel to use updated packages.


In [None]:
#Save dictionnary in case of

import dill as pickle
with open("coordonnees_adresse.pkl", "wb") as file:
    pickle.dump(coordonnees_adresse, file)
with open("dict_zone.pkl", "wb") as file:
    pickle.dump(dict_zone, file)    

ModuleNotFoundError: No module named 'dill'

In [None]:
import s3fs
import pandas as pd

# Initialize the S3 filesystem with your endpoint
fs = s3fs.S3FileSystem(client_kwargs={"endpoint_url": "https://minio.lab.sspcloud.fr"})

# Define your bucket and target path
MY_BUCKET = "jhajjar"
target_path = f"{MY_BUCKET}/Diffusion/clean_data.csv"

# Assuming 'df' is your DataFrame
try:
    # Open the target path in write mode using s3fs
    with fs.open(target_path, 'w') as f:
        # Write the DataFrame directly to S3 as a CSV
        df_cleaned.to_csv(f, index=False)
    print(f"DataFrame successfully uploaded to {target_path}")
except Exception as e:
    print(f"Error uploading DataFrame: {e}")


Error uploading DataFrame: name 'df_cleaned' is not defined
