In [1]:
import urllib3
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from geopy.geocoders import IGNFrance
from tqdm import tqdm

In [2]:
page = urllib3.PoolManager().request('GET', 'https://www.auditionconseil.fr/recherche-centre/')
soup = BeautifulSoup(page.data, features='html.parser')

print(soup.prettify())

<!DOCTYPE html>
<html class="no-js" lang="fr-FR">
 <head>
  <meta charset="utf-8"/>
  <title>
   Audioprothésistes, centres auditifs en France
  </title>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width, initial-scale=1, minimum-scale=1, maximum-scale=1" name="viewport"/>
  <link href="https://www.auditionconseil.fr/wp-content/themes/audition_conseil/img/icons/favicon_new.ico" rel="shortcut icon" type="image/x-icon"/>
  <link href="https://www.auditionconseil.fr/wp-content/themes/audition_conseil/img/icons/apple-touch-icon.png" rel="apple-touch-icon"/>
  <link href="https://www.auditionconseil.fr/wp-content/themes/audition_conseil/img/icons/apple-touch-icon-57x57.png" rel="apple-touch-icon" sizes="57x57"/>
  <link href="https://www.auditionconseil.fr/wp-content/themes/audition_conseil/img/icons/apple-touch-icon-72x72.png" rel="apple-touch-icon" sizes="72x72"/>
  <link href="https://www.auditionconseil.fr/wp-content/themes/audition_co

In [3]:
names_list = []
address_list = []
region_links = [tag.get('href') for tag in soup.find_all('a', {'class':"region_link"})]

for link in tqdm(region_links):
    page = urllib3.PoolManager().request('GET', link)
    soup = BeautifulSoup(page.data, features='html.parser')
    province_links = [tag.get('href') for tag in soup.find_all('a', {'class':"region_link"})]

    for link2 in province_links:
        page = urllib3.PoolManager().request('GET', link2)
        soup = BeautifulSoup(page.data, features='html.parser')
        shop_links = [tag.get('href') for tag in soup.find_all('a', {'class':"centre_link"})]
        
        for shop in shop_links:
            page = urllib3.PoolManager().request('GET', shop)
            soup = BeautifulSoup(page.data, features='html.parser')

            name_comps = [elem.get_text().strip(' \n ').split('\n') for elem in soup.find_all('h1', {'itemprop':"name"})][0]
            name = ' '.join([elem.strip() for elem in name_comps])
            address = [elem.get_text().strip('\n').replace('\n', ' ') for elem in soup.find_all('div', {'class':"adresse"})][0]

            names_list.append(name)
            address_list.append(address)

100%|██████████| 18/18 [13:35<00:00, 45.29s/it] 


In [4]:
df = pd.DataFrame({'shop_name': names_list, 'address': address_list})
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,shop_name,address
0,Correction Auditive à Bischwiller,3 Place de la Liberté 67240 Bischwiller
1,Laboratoire auditif à Molsheim,8B Allée Jean Pierre Carl 67120 Molsheim
2,Vos audioprothésistes de Obernai,8 Rue du Général Leclerc 67210 Obernai Réside...
3,Laboratoire audition à Schirmeck,5 Place du Bergopré 67130 Schirmeck
4,Centre auditif à Strasbourg Robertsau,96 Rue Boecklin 67000 Strasbourg
...,...,...
315,Audioprothésiste à Ajaccio Napoléon,Audition Conseil A...
316,Audioprothésiste à Ajaccio Stiletto,Pôle d’activité du Stiletto 20090 Ajaccio Pôl...
317,Audioprothésiste à Bastia – Rue Santa Madalena,Audition Conseil\r...
318,Audioprothésiste à Bastia – Boulevard Paoli,26 boulevard Paoli 20200 BASTIA


In [5]:
def adjust_address(str):
    str = str.strip().lower().replace('\xa0', ' ')
    str = ' '.join(str.split())
    if str.startswith('audition conseil'):
        out = ''.join(str.split('\r')[1:])
    else:
        out = str
    return out

In [6]:
df['address'] = df['address'].apply(adjust_address)

In [7]:
# geolocate shops
geolocator = IGNFrance()

df['latitude'] = np.nan
df['longitude'] = np.nan
df['city'] = np.nan
df['postcode'] = np.nan

for i in df.index:

    if ((i>0) and (i%50==0)):
        print(f'{i}/{len(df)}')

    try:
        #tries fetch address from geopy
        location = geolocator.geocode(df['address'][i], timeout=None)
        
        #append lat/long to column using dataframe location
        df.loc[i,'latitude'] = float(location.latitude)
        df.loc[i,'longitude'] = float(location.longitude)
        df.loc[i,'city'] = location.raw['commune']
        df.loc[i,'postcode'] = location.raw['postal_code']
        
    except:
        continue

50/320
100/320
150/320
200/320
250/320
300/320


In [8]:
# check
df.head()

Unnamed: 0,shop_name,address,latitude,longitude,city,postcode
0,Correction Auditive à Bischwiller,3 place de la liberté 67240 bischwiller,48.767795,7.856959,Bischwiller,67240
1,Laboratoire auditif à Molsheim,8b allée jean pierre carl 67120 molsheim,48.539357,7.491551,Molsheim,67120
2,Vos audioprothésistes de Obernai,8 rue du général leclerc 67210 obernai résiden...,48.46317,7.488554,Obernai,67210
3,Laboratoire audition à Schirmeck,5 place du bergopré 67130 schirmeck,48.480718,7.219577,Schirmeck,67130
4,Centre auditif à Strasbourg Robertsau,96 rue boecklin 67000 strasbourg,48.598584,7.777689,Strasbourg,67000


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 320 entries, 0 to 319
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   shop_name  320 non-null    object 
 1   address    320 non-null    object 
 2   latitude   289 non-null    float64
 3   longitude  289 non-null    float64
 4   city       289 non-null    object 
 5   postcode   289 non-null    object 
dtypes: float64(2), object(4)
memory usage: 15.1+ KB


In [10]:
df.to_csv('../output/auditionconseil_shops_geocoded.csv', index=False)