In [1]:
import urllib3
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from geopy.geocoders import IGNFrance
from tqdm import tqdm
from itertools import compress

In [2]:
# Download the directory page that lists every hearing centre.
# A browser-like User-Agent is sent with the request.
page = urllib3.PoolManager().request('GET', 'https://www.laboratoires-unisson.com/centres-auditifs', headers={"User-Agent": "Mozilla/5.0"})

# Fail loudly on a non-200 answer: otherwise an error/block page would be parsed
# silently and the link extraction further down would just come back empty.
if page.status != 200:
    raise RuntimeError(f'Unexpected HTTP status {page.status} while fetching the shop directory page')

soup = BeautifulSoup(page.data, features='html.parser')

# Dump the parsed markup for inspection of the page structure.
print(soup.prettify())

<!DOCTYPE html>
<!--[if lt IE 9]><html class="no-js no-svg ie lt-ie9 lt-ie8 lt-ie7" lang="fr-FR"> <![endif]-->
<!--[if IE 9]><html class="no-js no-svg ie ie9 lt-ie9 lt-ie8" lang="fr-FR"> <![endif]-->
<!--[if gt IE 9]><!-->
<html class="no-js no-svg" lang="fr-FR">
 <!--<![endif]-->
 <head>
  <meta charset="utf-8"/>
  <link href="https://www.laboratoires-unisson.com/wp-content/themes/unisson/dist/css/main.css" media="screen" rel="stylesheet" type="text/css"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <script type="text/javascript">
   (window.NREUM||(NREUM={})).init={privacy:{cookies_enabled:true},ajax:{deny_list:["bam.eu01.nr-data.net"]},distributed_tracing:{enabled:true}};(window.NREUM||(NREUM={})).loader_config={agentID:"468585500",accountID:"3270769",trustKey:"3270769",xpid:"VwQAUVFVARAEUllWDwgBUVY=",licenseKey:"NRJS-d3dcc51cd8730908341",applicationID:"468489672"};;(()=>{var e,t,r={9071:(e,t,r)=>{

In [3]:
# Gather the target of every anchor on the page. Anchors without an href
# attribute make `.get('href')` return None, which would crash the
# `startswith` test below, so they are dropped here.
all_links = [
    href
    for href in (elem.get('href') for elem in soup.find_all('a'))
    if href is not None
]

# Keep only links that point below the "centres-auditifs" section
# (i.e. the individual shop pages, not the directory page itself).
mask = [link.startswith('https://www.laboratoires-unisson.com/centres-auditifs/') for link in all_links]
links = list(compress(all_links, mask))

In [4]:
# Visit every shop page and collect its display name and postal address.
names = []
addresses = []

# One connection pool for the whole crawl instead of a new one per request.
http = urllib3.PoolManager()

for link in tqdm(links):

    # Use loop-local names: rebinding `page`/`soup` here would clobber the
    # directory page parsed earlier and break a re-run of the link extraction.
    shop_page = http.request('GET', link, headers={"User-Agent": "Mozilla/5.0"})
    shop_soup = BeautifulSoup(shop_page.data, features='html.parser')
    store_item = shop_soup.find('div', {'class': "inside-content"})

    # Any of the lookups can return None when a page does not follow the
    # expected layout; resolve both tags before appending so the two lists
    # always stay the same length.
    name_tag = store_item.find('p', {'class': "content-title"}) if store_item is not None else None
    address_tag = store_item.find('p', {'class': "text-content"}) if store_item is not None else None

    if name_tag is None or address_tag is None:
        # tqdm.write prints without breaking the progress bar.
        tqdm.write(f'Skipping {link}: shop name/address block not found')
        continue

    names.append(name_tag.get_text())
    # Addresses span several lines in the markup; flatten them to one line.
    addresses.append(address_tag.get_text().replace('\n', ' '))

df = pd.DataFrame({'shop_name': names, 'address': addresses})
df

100%|██████████| 40/40 [02:35<00:00,  3.89s/it]


Unnamed: 0,shop_name,address
0,Unisson Bordeaux,28 allées d'Orléans 33000 Bordeaux
1,Unisson Lille,"9-11 rue Léon Trulin 59000, Lille"
2,Unisson Lyon,"3, cours Charlemagne Entrée place des Archives..."
3,Maison de l'Appareil Auditif Lyon : partenaire...,"31 cours Lafayette, 69006 Lyon"
4,Unisson Marseille,"86 rue de Rome, 13006 Marseille"
5,Unisson Nice,148 avenue Guynemer Entrée sous l'arche - 2ème...
6,Unisson Paris 8 - Saint-Lazare,"61 rue de l’arcade entrée sur rue, via le nouv..."
7,Maison de l'Appareil Auditif Paris 16 : parten...,156 avenue de Versailles 75016 Paris
8,Maison de l'Appareil Auditif Paris 20 - Nation...,36 boulevard de Charonne 75020 Paris
9,Unisson Toulouse,1 rue Gabriel Péri Le Télégramme - 2ème étage ...


In [5]:
# geolocate shops
# Adds latitude/longitude/city/postcode columns to `df` by geocoding each
# address with the IGN France geocoder. Rows that cannot be geocoded keep
# their missing values and are reported instead of being skipped silently.
geolocator = IGNFrance()

df['latitude'] = np.nan
df['longitude'] = np.nan
# Text columns start as None (object dtype) rather than float NaN, so that
# writing strings into them is not an incompatible-dtype assignment.
df['city'] = None
df['postcode'] = None

for i in tqdm(df.index):

    try:
        # tries fetch address from geopy
        location = geolocator.geocode(df.loc[i, 'address'], timeout=None)
    except Exception as exc:
        # Catch Exception (not a bare except) so Ctrl-C still interrupts the
        # loop, and say which row failed and why.
        tqdm.write(f'Geocoding failed for row {i} ({df.loc[i, "address"]!r}): {exc!r}')
        continue

    # No match for this address: leave the row empty and report it.
    if location is None:
        tqdm.write(f'No geocoding result for row {i} ({df.loc[i, "address"]!r})')
        continue

    # append lat/long to column using dataframe location
    df.loc[i, 'latitude'] = float(location.latitude)
    df.loc[i, 'longitude'] = float(location.longitude)
    # .get leaves the field empty when the raw answer lacks the key, instead
    # of raising after the coordinates were already written.
    df.loc[i, 'city'] = location.raw.get('commune')
    df.loc[i, 'postcode'] = location.raw.get('postal_code')

In [6]:
# Print the column dtypes and non-null counts of the geocoded frame; a
# non-null count below the row count reveals rows the geocoding left unfilled.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   shop_name  40 non-null     object 
 1   address    40 non-null     object 
 2   latitude   40 non-null     float64
 3   longitude  40 non-null     float64
 4   city       40 non-null     object 
 5   postcode   40 non-null     object 
dtypes: float64(2), object(4)
memory usage: 2.0+ KB


In [7]:
# Persist the geocoded shop table as CSV, without writing the index column.
output_csv = '../output/unisson_shops_geocoded.csv'
df.to_csv(output_csv, index=False)