In [13]:
import pandas as pd
# Load the raw listings. header=0 marks the file's first row as a header row so
# that the labels in `names` replace it; fully identical rows are then dropped.
listing_columns = ["id", "Type", "Nb_chambre", "loc", "surface", "prix"]
df = pd.read_csv(
    "listings.csv",
    header=0,
    names=listing_columns,
).drop_duplicates()

In [14]:
# Normalise the column types.

# Text columns: cast to str but keep missing values missing. A plain
# .astype(str) turns NaN into the literal string "nan", which the later
# dropna() would no longer recognise as a missing value.
for text_column in ['id', 'Type', 'loc']:
    original_values = df[text_column]
    df[text_column] = original_values.astype(str).where(original_values.notna())

# Room count and surface: values that cannot be parsed become <NA>
# (nullable integer dtype).
df['Nb_chambre'] = pd.to_numeric(df['Nb_chambre'], errors='coerce').astype('Int64')
df['surface'] = pd.to_numeric(df['surface'], errors='coerce').astype('Int64')

# Price: strip every whitespace character (plain, non-breaking or narrow
# thousands separators) and the euro sign before parsing. Casting to str first
# keeps this working even if the column was read as numeric.
price_text = df['prix'].astype(str).str.replace(r'[\s€]', '', regex=True)
df['prix'] = pd.to_numeric(price_text, errors='coerce').astype('Int64')


In [15]:
# Display the dtype of every column to check the result of the type conversion.
df.dtypes

id            object
Type          object
Nb_chambre     Int64
loc           object
surface        Int64
prix           Int64
dtype: object

In [16]:
# Handle missing values: a studio without a recorded room count gets 1 room.
is_studio = df['Type'].eq('Studio')
room_count_missing = df['Nb_chambre'].isna()
df.loc[is_studio & room_count_missing, 'Nb_chambre'] = 1
df

Unnamed: 0,id,Type,Nb_chambre,loc,surface,prix
0,apimo-85473173,Appartement,1,75001 Paris 1er (Châtelet - Les Halles),29,364000
1,hektor-Les-4-Quartiers-383,Appartement,1,75001 Paris 1er (Châtelet - Les Halles),23,301000
2,hektor-PARISLUXURYHOMES-4007,Appartement,5,75002 Paris 2e (Palais Royal),106,2490000
3,ag754594-448898324,Studio,1,75001 Paris 1er (Saint-Germain - L'Auxerrois),27,365000
5,keller-williams-1-34_1_53-180645,Appartement,2,75001 Paris 1er (Vendôme),67,1190000
...,...,...,...,...,...,...
9762,apimo-6951400,Appartement,4,75020 Paris 20e (Plaine - Lagny),89,599000
9763,apimo-6810785,Appartement,2,75020 Paris 20e (Gambetta),40,302000
9764,immo-facile-49166437,Appartement,3,75020 Paris 20e (Père Lachaise - Réunion),67,769000
9765,immo-facile-48762452,Appartement,3,75020 Paris 20e (Télégraphe - Pelleport - Sain...,57,519000


In [17]:
# Drop every remaining row that has at least one missing value.
df = df.dropna(axis=0, how='any')

In [18]:
# Display the frame after the rows with missing values were dropped.
df

Unnamed: 0,id,Type,Nb_chambre,loc,surface,prix
0,apimo-85473173,Appartement,1,75001 Paris 1er (Châtelet - Les Halles),29,364000
1,hektor-Les-4-Quartiers-383,Appartement,1,75001 Paris 1er (Châtelet - Les Halles),23,301000
2,hektor-PARISLUXURYHOMES-4007,Appartement,5,75002 Paris 2e (Palais Royal),106,2490000
3,ag754594-448898324,Studio,1,75001 Paris 1er (Saint-Germain - L'Auxerrois),27,365000
5,keller-williams-1-34_1_53-180645,Appartement,2,75001 Paris 1er (Vendôme),67,1190000
...,...,...,...,...,...,...
9762,apimo-6951400,Appartement,4,75020 Paris 20e (Plaine - Lagny),89,599000
9763,apimo-6810785,Appartement,2,75020 Paris 20e (Gambetta),40,302000
9764,immo-facile-49166437,Appartement,3,75020 Paris 20e (Père Lachaise - Réunion),67,769000
9765,immo-facile-48762452,Appartement,3,75020 Paris 20e (Télégraphe - Pelleport - Sain...,57,519000


##### Standardise the location format
# Draft normalisation that rewrites each location as "<postal code> <district>".
# NOTE(review): the next cell rebuilds df_ from df, so the result of this draft
# is not used afterwards — confirm whether it is still needed.
df_ = df.copy()

# Pass 1: "<5-digit postal code> ... (<district>)" -> "<postal code> <district>".
# Both capture groups are mandatory, so a row yields either both parts or none;
# rows that do not match keep their original text.
parsed = df_['loc'].str.extract(r'(\d{5}).*?\((.*?)\)')
df_['loc'] = (parsed[0] + ' ' + parsed[1]).where(parsed[1].notna(), df_['loc'])

# Pass 2: "... <N>e ... (<district>)", where only the arrondissement number is
# present. The postal code is built as "75" + the number zero-padded to three
# digits (1 -> 75001, 20 -> 75020). Padding the strings element-wise avoids
# slicing the Series itself, which would drop rows rather than characters.
parsed = df_['loc'].str.extract(r'(\d{1,2})e.*?\((.*?)\)')
postal_code = '75' + parsed[0].str.zfill(3)
df_['loc'] = (postal_code + ' ' + parsed[1]).where(parsed[1].notna(), df_['loc'])

# Keep only the distinct location labels and display them.
df_ = df_["loc"].drop_duplicates()
df_

In [19]:
# Standardise the location format: reduce every location to a short zone label.
# Show at most 10 rows when a frame is displayed (use None to show every row).
pd.set_option('display.max_rows', 10)


def _capture_or_keep(values, pattern):
    """Replace each entry by the first capture group of `pattern`.

    Entries in which `pattern` does not match are returned unchanged.
    """
    captured = values.str.extract(pattern, expand=False)
    return captured.where(captured.notna(), values)


df_ = df.copy()
# Step 1: when the location has a parenthesised part, keep only that part.
df_['loc'] = _capture_or_keep(df_['loc'], r'\((.*?)\)')
# Step 2: when the label contains " -", keep only the text in front of it.
df_['loc'] = _capture_or_keep(df_['loc'], r'^(.*?)\s*\s-')

# Distinct zone labels, and the working frame now carrying the shortened labels.
df_zones = df_["loc"].drop_duplicates().copy()
df = df_.copy()

In [20]:
from geopy.geocoders import Nominatim
from functools import partial
# Nominatim geocoding client.
# NOTE(review): "testing" looks like a placeholder user agent — confirm that it
# is acceptable for the geocoding service being used.
geolocator = Nominatim(user_agent="testing")

# Variant of geolocator.geocode with language="fr" pre-filled.
geocode = partial(geolocator.geocode, language="fr")


def geocode_with_paris(query):
    """Geocode `query` after appending ", Paris FR" to it.

    This calls geolocator.geocode directly, so the language="fr" argument
    pre-filled in `geocode` is not applied here.
    """
    return geolocator.geocode("%s, Paris FR" % query)

In [21]:
# Print every distinct zone label, one per line.
for zone_label in df_zones.tolist():
    print(zone_label)

Châtelet
Palais Royal
Saint-Germain
Vendôme
Gaillon
Saint-Thomas d'Aquin
75001 Paris 1er
Sentier
Montorgueil
75002 Paris 2e
Arts-et-Métiers
Sainte-Avoye
Archives
75003 Paris 3e
Enfants Rouges
République
Saint-Merri
Arsenal
75004 Paris 4e
Les Iles
Saint-Gervais
75000 Paris
Monge
Jardin des Plantes
Val de Grâce
Sorbonne
Rennes
Montparnasse
Saint-Germain des Près
Odéon
Saint-Placide
Notre Dame des Champs
75006 Paris 6e
Saint-Michel
Hoche Friedland
Elysée
Monceau
Triangle d'Or
Europe
Saint-Philippe du Roule
Mairie
75008 Paris 8e
Clichy
Grands Boulevards
Lorette
Opéra
Trudaine
Porte Saint-Martin
75009 Paris 9e
Saint-Vincent de Paul
Château d'Eau
Goncourt
Canal Saint-Martin
Louis Blanc
Belleville
75010 Paris 10e
Nation
Léon-Blum
75011 Paris 11e
Bastille
75012 Paris 12e
Bel-Air
Aligre
Vallée de Fecamp
Jardin de Reuilly
Bercy
Olympiades
Croulebarbe
Bièvre Sud
Patay
Nationale
Salpétrière
Butte-aux-Cailles
Bibliothèque
75013 Paris 13e
94200 Ivry-sur-Seine
Pernety
Jean Moulin
Didot
Montsouris
750

In [22]:
#Gerer les champs non conventionnels
df.loc[df['loc'].str.contains('Châtelet - Les Halles', case=False, na=False), 'loc'] = '75001 Châtelet - Les Halles'
df.loc[df['loc'].str.contains('Aligre - Gare de Lyon', case=False, na=False), 'loc'] = '75012 Aligre - Gare de Lyon'


In [23]:
# Among the distinct location labels, take the text that follows a 5-digit
# postal code and list the texts shared by more than one label.
df_ = df["loc"].drop_duplicates()
# df_ is a Series, so the extracted text is kept in its own variable instead of
# being stored inside df_ under a 'pattern' label. expand=False makes
# str.extract return a Series for the single capture group.
postal_suffixes = df_.str.extract(r'\d{5}\s(.*)', expand=False)
pattern_counts = postal_suffixes.value_counts()
pattern_counts[pattern_counts > 1]

0        
Paris 16e    2
Name: count, dtype: int64

In [24]:
# Among the locations that still contain an opening parenthesis, extract the
# parenthesised text and list the texts that occur more than once.
has_parenthesis = df['loc'].str.contains(r'\(', case=False, na=False)
df_ = df.loc[has_parenthesis, "loc"]
# df_ is a Series, so the extracted text is kept in its own variable instead of
# being stored inside df_ under a 'pattern' label. expand=False makes
# str.extract return a Series for the single capture group.
parenthesised_text = df_.str.extract(r'\((.*?)\)', expand=False)
pattern_counts = parenthesised_text.value_counts()
pattern_counts[pattern_counts > 1]

Series([], Name: count, dtype: int64)

In [25]:
# Rows whose location is NOT of the form "<5 digits> ... (<text>)", followed by
# the distinct location labels found in those rows.
has_postal_and_parenthesis = df['loc'].str.contains(
    r'\d{5}.*?\(.*?\)', na=False, regex=True
)
non_matching_rows = df.loc[~has_postal_and_parenthesis]
non_matching_rows["loc"].drop_duplicates()

0              Châtelet
2          Palais Royal
3         Saint-Germain
5               Vendôme
14              Gaillon
             ...       
9098    75020 Paris 20e
9107       Ménilmontant
9121      Père Lachaise
9124             Plaine
9141       Saint-Blaise
Name: loc, Length: 140, dtype: object

In [26]:
# Display the frame as it stands before it is written to disk.
df

Unnamed: 0,id,Type,Nb_chambre,loc,surface,prix
0,apimo-85473173,Appartement,1,Châtelet,29,364000
1,hektor-Les-4-Quartiers-383,Appartement,1,Châtelet,23,301000
2,hektor-PARISLUXURYHOMES-4007,Appartement,5,Palais Royal,106,2490000
3,ag754594-448898324,Studio,1,Saint-Germain,27,365000
5,keller-williams-1-34_1_53-180645,Appartement,2,Vendôme,67,1190000
...,...,...,...,...,...,...
9762,apimo-6951400,Appartement,4,Plaine,89,599000
9763,apimo-6810785,Appartement,2,Gambetta,40,302000
9764,immo-facile-49166437,Appartement,3,Père Lachaise,67,769000
9765,immo-facile-48762452,Appartement,3,Télégraphe,57,519000


In [29]:
# Write the post-processed listings to a CSV file, without the index column.
output_csv = "post_processed_listings.csv"
df.to_csv(output_csv, index=False)


In [30]:
import s3fs

# Upload the post-processed CSV to the bucket behind the MinIO endpoint.
fs = s3fs.S3FileSystem(client_kwargs={"endpoint_url": "https://minio.lab.sspcloud.fr"})

MY_BUCKET = "jhajjar"
target_path = f"{MY_BUCKET}/Diffusion/"
# Upload the file from the same relative path it was written to, so the cell
# does not depend on one machine's absolute directory layout.
local_csv = "post_processed_listings.csv"
try:
    fs.put(local_csv, target_path)
    print(f"File uploaded to {target_path}")
except Exception as e:
    # Best effort: report the failure without interrupting the notebook.
    print(f"Error uploading file: {e}")

File uploaded to jhajjar/Diffusion/
