In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import urllib3
from bs4 import BeautifulSoup
from geopy.geocoders import IGNFrance

In [2]:
# parse web page with links to lists of amplifon shops
page = urllib3.PoolManager().request('GET', 'https://www.amplifon.com/fr/nous-trouver')

# An HTTP error status still comes back as a normal response object, so check
# it explicitly: parsing an error page would silently yield no links at all.
if page.status != 200:
    raise RuntimeError(f'Unexpected HTTP status {page.status} for the shop index page')

soup = BeautifulSoup(page.data, features="html.parser")

In [1]:
# Debugging aid: uncomment to inspect the parsed HTML held in `soup`.
#print(soup.prettify())

In [4]:
# The links to the regional shop lists sit inside the table rows of the page.
raw_links = soup.find_all('tr')

# Keep the first link of each row. Rows without any link (for example a header
# row) are skipped: find() returns None for them, and indexing None would
# raise a TypeError.
anchors = (row.find('a', href=True) for row in raw_links)
links = [anchor['href'] for anchor in anchors if anchor is not None]
links

['https://www.amplifon.com/fr/nous-trouver/audioprothesiste-auvergne-rhone-alpes',
 'https://www.amplifon.com/fr/nous-trouver/audioprothesiste-bourgogne-franche-comte',
 'https://www.amplifon.com/fr/nous-trouver/audioprothesiste-bretagne',
 'https://www.amplifon.com/fr/nous-trouver/audioprothesiste-centre-val-de-loire',
 'https://www.amplifon.com/fr/nous-trouver/audioprothesiste-grand-est',
 'https://www.amplifon.com/fr/nous-trouver/audioprothesiste-hauts-de-france',
 'https://www.amplifon.com/fr/nous-trouver/audioprothesiste-ile-de-france',
 'https://www.amplifon.com/fr/nous-trouver/audioprothesiste-normandie',
 'https://www.amplifon.com/fr/nous-trouver/audioprothesiste-nouvelle-aquitaine',
 'https://www.amplifon.com/fr/nous-trouver/audioprothesiste-occitanie',
 'https://www.amplifon.com/fr/nous-trouver/audioprothesiste-pays-de-la-loire',
 'https://www.amplifon.com/fr/nous-trouver/audioprothesiste-provence-alpes-cote-d-azur-corse']

In [5]:
# scrape each link
df_list = []

# One pool for all regional pages, so connections to the host can be reused
# instead of being re-created on every iteration.
http = urllib3.PoolManager()

for link in links:

    # Loop-local names keep the notebook-level `page` / `soup` (the index page)
    # intact, so the link-extraction cell can still be re-run afterwards.
    region_page = http.request('GET', link)
    if region_page.status != 200:
        raise RuntimeError(f'Unexpected HTTP status {region_page.status} for {link}')
    region_soup = BeautifulSoup(region_page.data, features="html.parser")

    shop_name = [x.get_text().strip() for x in region_soup.find_all('span', class_ = 'item-h4 value-bold pb-10-xs')]
    address = [x.get_text().strip() for x in region_soup.find_all('p', class_ = 'body-copy grey-text pb-10-xs')]

    # Names and addresses are paired by position; a count mismatch means the
    # page layout differs from what the selectors expect, so fail with context.
    if len(shop_name) != len(address):
        raise ValueError(
            f'{link}: found {len(shop_name)} shop names but {len(address)} addresses'
        )

    df_list.append(pd.DataFrame({'shop_name': shop_name, 'address': address}))

# Stack the per-region frames and renumber the rows 0..n-1.
df = pd.concat(df_list, axis=0).reset_index(drop=True)

In [6]:
# geolocate shops
geolocator = IGNFrance()

# Coordinates are numeric, so NaN is the natural placeholder. City and postcode
# receive strings: initialise them with None so the columns are object dtype
# from the start rather than float64 columns that must be upcast on first write.
df['latitude'] = np.nan
df['longitude'] = np.nan
df['city'] = None
df['postcode'] = None

# Index labels of rows that could not be geocoded; reported after the loop.
geocode_failures = []

for i in df.index:

    # lightweight progress indicator, one line every 50 rows
    if ((i>0) and (i%50==0)):
        print(f'{i}/{len(df)}')

    try:
        # ask the geocoding service for this shop's address
        location = geolocator.geocode(df.loc[i, 'address'], timeout=None)

        # geocode() yields None when the service has no match for the query
        if location is None:
            geocode_failures.append(i)
            continue

        # store the result on the same row
        df.loc[i,'latitude'] = float(location.latitude)
        df.loc[i,'longitude'] = float(location.longitude)
        df.loc[i,'city'] = location.raw['commune']
        df.loc[i,'postcode'] = location.raw['postal_code']

    except Exception:
        # Best effort: one failing address must not abort the whole run.
        # `Exception` (not a bare except) lets KeyboardInterrupt through, so
        # the loop can still be interrupted from the notebook.
        geocode_failures.append(i)
        continue

if geocode_failures:
    print(f'{len(geocode_failures)} address(es) could not be geocoded: {geocode_failures}')

50/710
100/710
150/710
200/710
250/710
300/710
350/710
400/710
450/710
500/710
550/710
600/710
650/710
700/710


In [7]:
# Preview the first five rows to sanity-check the scraped and geocoded columns.
df.head(n=5)

Unnamed: 0,shop_name,address,latitude,longitude,city,postcode
0,Amplifon Audioprothésiste Aix les Bains,11 avenue De Verdun 73100 Aix les Bains,45.690718,5.910551,Aix-les-Bains,73100
1,Amplifon Audioprothésiste Albertville,57 rue de la République 73200 Albertville,45.676216,6.390093,Albertville,73200
2,Amplifon Audioprothésiste Ambérieu,18 rue Alexandre Bérard 01500 Ambérieu en Bugey,45.958924,5.358467,Ambérieu-en-Bugey,1500
3,Amplifon Audioprothésiste Ambert,2 rue de la République 63600 Ambert,45.549808,3.740725,Ambert,63600
4,Amplifon Audioprothésiste Amilly,802 C Avenue d’Antibes 45200 Amilly,47.980067,2.732994,Amilly,45200


In [8]:
# Print the column dtypes and non-null counts; any missing geocoding results
# would show up here as a non-null count below the number of rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 710 entries, 0 to 709
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   shop_name  710 non-null    object 
 1   address    710 non-null    object 
 2   latitude   710 non-null    float64
 3   longitude  710 non-null    float64
 4   city       710 non-null    object 
 5   postcode   710 non-null    object 
dtypes: float64(2), object(4)
memory usage: 33.4+ KB


In [9]:
# Write the geocoded shops to CSV. The output folder is created first because
# to_csv does not create missing parent directories and would fail otherwise.
output_path = Path('../output/amplifon_shops_geocoded.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)