In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import urllib3
from bs4 import BeautifulSoup
from geopy.geocoders import IGNFrance

In [2]:
# Download the shop index page; a browser-like User-Agent header is sent with the request.
page = urllib3.PoolManager().request('GET', 'https://centre.auditionsante.fr/index.html', headers={"User-Agent": "Mozilla/5.0"})

# Fail fast on an error response instead of silently parsing an error page.
if page.status != 200:
    raise RuntimeError(f'Unexpected HTTP status {page.status} while fetching the shop index page')

# Parse the raw response bytes with the standard-library HTML parser.
soup = BeautifulSoup(page.data, features='html.parser')

print(soup.prettify())

<!DOCTYPE html>
<html id="html" lang="fr">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <link href="//www.yext-pixel.com" rel="dns-prefetch"/>
  <link href="//a.cdnmktg.com" rel="dns-prefetch"/>
  <link href="//a.mktgcdn.com" rel="dns-prefetch"/>
  <link href="//dynl.mktgcdn.com" rel="dns-prefetch"/>
  <link href="//dynm.mktgcdn.com" rel="dns-prefetch"/>
  <link href="//www.google-analytics.com" rel="dns-prefetch"/>
  <meta content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no" name="viewport"/>
  <meta content="telephone=no" name="format-detection"/>
  <link href="//dynl.mktgcdn.com/p/arR32-aAF-PzG86sJtDT2DjDkMd-_bG0dy2URhG63FE/32x32.png" rel="shortcut icon"/>
  <meta content="Venez rencontrer les audioprothésistes Audition Santé et leur équipes. Nous répondrons à toutes vos questions sur les acouphènes, les aides auditives et les bilans auditifs. Contactez-nous dès m

In [3]:
# Collect the text of every <span class="LocationName-brand"> (shop name)
# and every <address class="c-address"> (postal address) found in the page.
names = [
    tag.get_text()
    for tag in soup.find_all('span', {'class':"LocationName-brand"})
]
addresses = [
    tag.get_text()
    for tag in soup.find_all('address', {'class':"c-address"})
]

# Pair names with addresses, drop exact duplicate rows and renumber from 0.
df = (
    pd.DataFrame({'shop_name': names, 'address': addresses})
    .drop_duplicates()
    .reset_index(drop=True)
)
df

Unnamed: 0,shop_name,address
0,Audioprothésiste Abbeville Audition Santé,7 rue du Pont d'Amour 80100 ABBEVILLE FR
1,Audioprothésiste Amiens Audition Santé,106 rue Alexandre DUMAS 80000 AMIENS FR
2,Audioprothésiste ARRAS Pasteur Audition Santé,15 rue Pasteur 62000 ARRAS FR
3,Audioprothésiste ARRAS Gambetta Audition Santé,39 rue Gambetta 62000 ARRAS FR
4,Audioprothésiste Aixe-sur-Vienne Audition Santé,2 rue Gambetta 87700 Aixe-sur-Vienne FR
...,...,...
276,Audioprothésiste Villeneuve-lez-Avignon Auditi...,128 avenue du Général Leclerc\nRond-Point Bell...
277,Audioprothésiste Vitrolles Audition Santé,Place de la Victoire 13127 Vitrolles FR
278,Audioprothésiste Vénissieux Audition Santé,136 boulevard Irène Joliot - Centre Commercial...
279,Audioprothésiste Wimereux Audition Santé,90 ter rue Carnot 62930 Wimereux FR


In [4]:
# Geocode every shop address with the IGN France geocoder and store the
# coordinates, commune and postcode it returns.
geolocator = IGNFrance()

# (Re)create the result columns so the cell can be re-run from scratch.
# The text columns use object dtype so strings can be written into them
# without a dtype mismatch; rows that cannot be geocoded keep NaN.
df['latitude'] = np.nan
df['longitude'] = np.nan
df['city'] = pd.Series(np.nan, index=df.index, dtype=object)
df['postcode'] = pd.Series(np.nan, index=df.index, dtype=object)

# index label -> short reason, for every row that could not be geocoded
failed = {}

for i in df.index:

    # progress report every 50 rows
    if ((i>0) and (i%50==0)):
        print(f'{i}/{len(df)}')

    try:
        # timeout=None is kept from the original call.
        # NOTE(review): in geopy 2.x this disables the request timeout;
        # consider a finite value - confirm against the installed version.
        location = geolocator.geocode(df.at[i, 'address'], timeout=None)

        # the geocoder returns None when it finds no match
        if location is None:
            failed[i] = 'no match'
            continue

        latitude = float(location.latitude)
        longitude = float(location.longitude)

    except Exception as exc:
        # Best effort: one bad address must not abort the whole run. Catching
        # Exception (rather than a bare except) still lets KeyboardInterrupt
        # and SystemExit stop the loop.
        failed[i] = f'{type(exc).__name__}: {exc}'
        continue

    # write the results at the row's index label
    df.loc[i,'latitude'] = latitude
    df.loc[i,'longitude'] = longitude
    # a missing key in the raw payload leaves NaN instead of raising
    df.loc[i,'city'] = location.raw.get('commune', np.nan)
    df.loc[i,'postcode'] = location.raw.get('postal_code', np.nan)

# Surface failures instead of hiding them; prints nothing when every row succeeded.
if failed:
    print(f'{len(failed)}/{len(df)} addresses could not be geocoded:')
    for row_label, reason in failed.items():
        print(f'  row {row_label}: {reason}')

50/281
100/281
150/281
200/281
250/281


In [5]:
# Sanity check: preview the first five rows with the new geocoding columns.
df.head(n=5)

Unnamed: 0,shop_name,address,latitude,longitude,city,postcode
0,Audioprothésiste Abbeville Audition Santé,7 rue du Pont d'Amour 80100 ABBEVILLE FR,50.104848,1.832333,Abbeville,80100
1,Audioprothésiste Amiens Audition Santé,106 rue Alexandre DUMAS 80000 AMIENS FR,49.873833,2.289563,Amiens,80000
2,Audioprothésiste ARRAS Pasteur Audition Santé,15 rue Pasteur 62000 ARRAS FR,50.289046,2.780821,Arras,62000
3,Audioprothésiste ARRAS Gambetta Audition Santé,39 rue Gambetta 62000 ARRAS FR,50.289272,2.775238,Arras,62000
4,Audioprothésiste Aixe-sur-Vienne Audition Santé,2 rue Gambetta 87700 Aixe-sur-Vienne FR,45.795714,1.135355,Aixe-sur-Vienne,87700


In [6]:
# Show dtypes and non-null counts per column. Rows the geocoding loop skipped
# keep NaN, so a non-null count below the row count for latitude, longitude,
# city or postcode means some addresses were not geocoded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 281 entries, 0 to 280
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   shop_name  281 non-null    object 
 1   address    281 non-null    object 
 2   latitude   281 non-null    float64
 3   longitude  281 non-null    float64
 4   city       281 non-null    object 
 5   postcode   281 non-null    object 
dtypes: float64(2), object(4)
memory usage: 13.3+ KB


In [7]:
# Write the geocoded shops to CSV without the index column.
# The output directory is created first so the export also works on a fresh
# checkout where ../output does not exist yet.
output_path = Path('../output/auditionsante_shops_geocoded.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)