In [1]:
import urllib3
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from geopy.geocoders import IGNFrance

In [2]:
# Download listing page 2 of the shop directory, parse it, and print the
# pretty-printed HTML so the page structure can be inspected.
page = urllib3.PoolManager().request(
    method='GET',
    url='https://audition.optical-center.fr/fr?page=2',
)
soup = BeautifulSoup(markup=page.data, features="html.parser")

print(soup.prettify())

<!DOCTYPE html>
<html dir="ltr" lang="fr">
 <head>
  <meta charset="utf-8"/>
  <meta content="ie=edge" http-equiv="x-ua-compatible"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <meta content="HgeEavIQVdE8GJw1FWs56DHVvN-zm9HAP6a7NoZ8sJ8" name="google-site-verification"/>
  <title>
   Centre audition France : appareils et prothèses auditives - Page 2 de 35
  </title>
  <meta content="Centre audition France : appareils et prothèses auditives - Page 2 de 35" name="title"/>
  <meta content="Optical Center centre audition France : appareils auditifs et prothèses auditives. Trouvez l'aide auditive adaptée à vos troubles de l'audition parmi les plus grandes marques de prothèse auditive : Siemens, Oticon, Widex... - Page 2 de 35" name="description"/>
  <link href="https://audition.optical-center.fr/fr" rel="prev"/>
  <link href="https://audition.optical-center.fr/fr?page=3" rel="next"/>
  <link href="https://audition.optical-center.fr/fr?page=2" hreflang="x-defaul

In [3]:
def address_from_tag(address_tags):
    """Build a one-line address string from a shop's address element.

    The first child of every ``<div>`` found under ``address_tags``, except
    the first ``<div>``, is cleaned (non-breaking spaces become plain spaces,
    surrounding spaces are stripped) and the pieces are joined with single
    spaces.

    Parameters
    ----------
    address_tags : object exposing ``find_all`` (e.g. a BeautifulSoup tag), or None
        Element containing the address ``<div>`` children. ``None`` is
        accepted, which is what a failed ``find`` lookup hands over.

    Returns
    -------
    str or float
        The joined address, or ``np.nan`` when it cannot be extracted
        (``address_tags`` is ``None``, a ``<div>`` has no children, or a
        first child is not a string).
    """
    try:
        divs = address_tags.find_all('div')
        # The first <div> is deliberately skipped (presumably a wrapper or
        # label rather than an address line -- confirm against the page markup).
        address_components = [
            div.contents[0].replace(u'\xa0', u' ').strip(' ')
            for div in divs[1:]
        ]
        address = ' '.join(address_components)
    except (AttributeError, IndexError, TypeError):
        # Only "the markup is not what we expected" failures map to NaN.
        # Anything else (KeyboardInterrupt, SystemExit, unrelated bugs) is
        # allowed to propagate instead of being silently turned into NaN.
        address = np.nan
    return address

In [4]:
# Scrape every listing page and collect one (shop_name, address) row per shop card.
main_link_pre = 'https://audition.optical-center.fr/fr?page='
# Number of listing pages; the site reported "Page 2 de 35" when this was written.
n_pages = 35
# CSS class strings identifying a shop card and the address element inside it.
shop_class = "lf-location-default__content__main__container lf-geo-divisions__main__locations__list__location__content__main__container"
address_class = "lf-parts-address lf-location-default__content__main__container__address lf-geo-divisions__main__locations__list__location__content__container__address"
df_list = []

# One pool for the whole crawl so connections to the host are reused instead
# of being set up from scratch for each page.
http = urllib3.PoolManager()

for i in range(1, n_pages + 1):

    if i % 5 == 0:
        print(f'Scraping page {i}/{n_pages}...')

    main_link = main_link_pre + str(i)
    page = http.request('GET', main_link)
    if page.status != 200:
        # Parsing still goes ahead (best effort), but a page that yields no
        # shops because of an HTTP error is now visible in the output.
        print(f'Warning: page {i} returned HTTP status {page.status}')
    soup = BeautifulSoup(page.data, features="html.parser")

    tags = soup.find_all('div', {'class': shop_class})
    names = []
    addresses = []
    for tag in tags:
        # Shop name: first child of the <span> inside the card's <h2>.
        names.append(tag.find('h2').find('span').contents[0])
        # address_from_tag returns NaN when the address cannot be extracted.
        addresses.append(address_from_tag(tag.find('address', {'class': address_class})))

    df_list.append(pd.DataFrame({'shop_name': names, 'address': addresses}))

# ignore_index gives the combined frame unique 0..n-1 labels rather than
# repeating each page's own 0..k labels.
df = pd.concat(df_list, ignore_index=True)

Scraping page 5/35...
Scraping page 10/35...
Scraping page 15/35...
Scraping page 20/35...
Scraping page 25/35...
Scraping page 30/35...
Scraping page 35/35...


In [5]:
# Keep only the rows whose shop_name starts with 'Audioprothésiste'.
# na=False makes a missing or non-string name count as "no match"; without it
# the mask would contain NaN and the boolean indexing below would raise.
mask = df['shop_name'].str.startswith('Audioprothésiste', na=False)
df = df[mask].reset_index(drop=True)
df.head()

Unnamed: 0,shop_name,address
0,Audioprothésiste CHAMANT Optical Center,"6, Avenue du Poteau 60300 Chamant"
1,Audioprothésiste CHÂTEAURENARD Optical Center,"698, Boulevard Ernest Genevet 13160 Chateaurenard"
2,Audioprothésiste AUBIÈRE Optical Center,127 Avenue de Cournon 63170 Aubière
3,Audioprothésiste DRANCY Optical Center,"140, Avenue Henri Barbusse 93700 Drancy"
4,Audioprothésiste DINARD - PLEURTUIT Optical C...,ZAC de la Ville Es Meniers 35730 Pleurtuit


In [6]:
# geolocate shops: look up every address and store coordinates, city and postcode
geolocator = IGNFrance()

df['latitude'] = np.nan
df['longitude'] = np.nan
# city/postcode receive strings, so they start as object columns (None).
# Starting them as float NaN columns forces an implicit dtype change on the
# first string assignment, which newer pandas versions warn about.
df['city'] = None
df['postcode'] = None

# index label -> reason, for every row that could not be (fully) geocoded
geocoding_failures = {}

for i in df.index:

    if i > 0 and i % 50 == 0:
        print(f'{i}/{len(df)}')

    address = df.at[i, 'address']
    if not isinstance(address, str):
        # e.g. NaN left by a failed address extraction: nothing to look up
        geocoding_failures[i] = 'no address'
        continue

    try:
        # NOTE(review): timeout=None is kept from the original; it presumably
        # disables geopy's request timeout -- confirm, a finite value would
        # stop a single stalled request from hanging the loop.
        location = geolocator.geocode(address, timeout=None)

        if location is None:
            geocoding_failures[i] = 'no result'
            continue

        # Coordinates are written first so they are kept even if the raw
        # payload lacks the 'commune' / 'postal_code' keys.
        df.loc[i, 'latitude'] = float(location.latitude)
        df.loc[i, 'longitude'] = float(location.longitude)
        df.loc[i, 'city'] = location.raw['commune']
        df.loc[i, 'postcode'] = location.raw['postal_code']

    except Exception as err:
        # Best effort: one bad row must not stop the run, but the failure is
        # recorded instead of vanishing. Catching Exception (not a bare
        # except) lets an interrupt of the kernel stop the loop.
        geocoding_failures[i] = repr(err)
        continue

if geocoding_failures:
    print(f'{len(geocoding_failures)} row(s) not fully geocoded: {geocoding_failures}')

50/596
100/596
150/596
200/596
250/596
300/596
350/596
400/596
450/596
500/596
550/596


In [7]:
# check: preview the first rows, which should now carry the latitude,
# longitude, city and postcode columns next to shop_name and address
df.head()

Unnamed: 0,shop_name,address,latitude,longitude,city,postcode
0,Audioprothésiste CHAMANT Optical Center,"6, Avenue du Poteau 60300 Chamant",49.216698,2.595482,Chamant,60300
1,Audioprothésiste CHÂTEAURENARD Optical Center,"698, Boulevard Ernest Genevet 13160 Chateaurenard",43.887916,4.85368,Châteaurenard,13160
2,Audioprothésiste AUBIÈRE Optical Center,127 Avenue de Cournon 63170 Aubière,45.755981,3.149182,Aubière,63170
3,Audioprothésiste DRANCY Optical Center,"140, Avenue Henri Barbusse 93700 Drancy",48.920326,2.452644,Drancy,93700
4,Audioprothésiste DINARD - PLEURTUIT Optical C...,ZAC de la Ville Es Meniers 35730 Pleurtuit,48.614237,-2.060204,Pleurtuit,35730


In [8]:
# Summary of the frame: row count plus dtype and non-null count per column,
# a quick way to spot rows left empty by the geocoding step
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 596 entries, 0 to 595
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   shop_name  596 non-null    object 
 1   address    596 non-null    object 
 2   latitude   596 non-null    float64
 3   longitude  596 non-null    float64
 4   city       596 non-null    object 
 5   postcode   596 non-null    object 
dtypes: float64(2), object(4)
memory usage: 28.1+ KB


In [9]:
# Write the geocoded shop table to CSV, leaving out the DataFrame index.
df.to_csv(
    path_or_buf='../output/opticalcenter_shops_geocoded.csv',
    index=False,
)