In [1]:
from itertools import compress
from pathlib import Path

import numpy as np
import pandas as pd
import urllib3
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim, IGNFrance, BANFrance

In [2]:
# Download the Alain Afflelou Acousticien listing page for France and parse
# the returned bytes with the built-in HTML parser.
page = urllib3.PoolManager().request(
    method='GET',
    url='https://www.alainafflelou-acousticien.fr/audioprothesiste/france',
)
soup = BeautifulSoup(markup=page.data, features="html.parser")

In [3]:
# Preview only the beginning of the parsed document: printing a whole HTML
# page floods the notebook output and bloats the saved file.
print(soup.prettify()[:1000])

<!DOCTYPE html>
<html class="c-page js-page c-page--userDisconnected" lang="fr">
 <head>
  <meta charset="utf-8"/>
  <title>
   Audioprothésistes ALAIN AFFLELOU Acousticien France
  </title>
  <meta content="    Vous recherchez un Opticien AFFLELOU en france ? Localisez l\'Opticien Afflelou le plus proche avec ses horaires, ses coordonnées et toutes ses infos pratiques.
" name="description"/>
  <link href="https://www.alainafflelou-acousticien.fr/audioprothesiste/france" rel="canonical"/>
  <meta content="index,follow" name="robots"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <meta content="Alain Afflelou Acousticien" property="og:title"/>
  <meta content="https://www.alainafflelou-acousticien.fr/img/social/aaa-share.jpg" property="og:image"/>
  <link href="/main.18ed2c6f.css" rel="stylesheet"/>
  <link href="/favicon.ico" rel="icon" type="image/x-icon"/>
  <link href="https://fonts.gstatic.com" rel="preconnect"/>
  <link href="https://fonts.googleapis

In [4]:
# For every entry of the listing, collect the link to follow and a flag that
# is later passed to find_address() to select how the linked page is parsed.
links = []
flags = []

shops = soup.find_all('li', {'class':'c-list__element mt-0'})
for shop in shops:
    anchor = shop.a
    href = anchor.get('href') if anchor is not None else None
    if not href:
        # No usable link: skip the entry so that links and flags stay aligned.
        continue
    links.append(href)

    # An entry is flagged when it has a <span> with at least one child;
    # a missing or an empty <span> yields False instead of raising.
    span = shop.span
    flags.append(span is not None and bool(span.contents))

In [5]:
# One shared pool so that successive requests can reuse connections instead
# of building a fresh PoolManager for every page.
_HTTP_POOL = urllib3.PoolManager()


def _first_text(tag):
    """Return the stripped text of a tag's first child, or '' if the tag is empty."""
    if not tag.contents:
        return ''
    first = tag.contents[0]
    # When the first child is a nested tag rather than a text node, use its
    # rendered text (a tag has no str.strip()).
    text = first if isinstance(first, str) else first.get_text()
    return text.strip()


def find_address(link, flag):
    """Scrape the postal address(es) shown on the page at ``link``.

    Parameters
    ----------
    link : str
        URL of the page to download.
    flag : bool
        Selects the parsing strategy.
        If truthy, every ``div`` with class ``d-flex flex-column fw-bold``
        gives one address, built from the div's leading text followed by the
        upper-cased last path segment of ``link`` and ``FR``.
        If falsy, the leading texts of all ``span`` elements carrying a
        non-empty ``itemprop`` attribute are joined into a single address.

    Returns
    -------
    list of str
        The addresses found; empty when the page yields no address text.
    """
    page = _HTTP_POOL.request('GET', link)
    soup = BeautifulSoup(page.data, features="html.parser")

    if flag:
        # rstrip('/') keeps the city segment usable for links with a trailing slash.
        city = link.rstrip('/').split('/')[-1].upper()
        tags = soup.find_all('div', {'class':'d-flex flex-column fw-bold'})
        streets = (_first_text(tag) for tag in tags)
        # Blocks without text are dropped rather than emitted as ' CITY FR'.
        return [f'{street} {city} FR' for street in streets if street]

    tags = [elem for elem in soup.find_all('span') if elem.get('itemprop')]
    address_components = [text for text in map(_first_text, tags) if text]
    # No component found: return nothing instead of a list holding an empty string.
    return [' '.join(address_components)] if address_components else []

In [6]:
# Takes about 8 minutes: every collected link is downloaded and parsed in turn.
address_list = []

for idx, (link, flag) in enumerate(zip(links, flags)):
    # Progress message every 20 links (not for the very first one).
    if idx > 0 and idx % 20 == 0:
        print(f'Scraping link {idx}/{len(links)}....')
    address_list += find_address(link, flag)

# One row per scraped address.
df = pd.DataFrame(data={'address': address_list})
df.head()

Scraping link 20/322....
Scraping link 40/322....
Scraping link 60/322....
Scraping link 80/322....
Scraping link 100/322....
Scraping link 120/322....
Scraping link 140/322....
Scraping link 160/322....
Scraping link 180/322....
Scraping link 200/322....
Scraping link 220/322....
Scraping link 240/322....
Scraping link 260/322....
Scraping link 280/322....
Scraping link 300/322....
Scraping link 320/322....


Unnamed: 0,address
0,17 cours Jean Nicoli 20090 AJACCIO FR
1,C.C. E.Leclerc - rue des Portes d'Albi ALBI FR
2,97 Avenue de Saint-Juéry ALBI FR
3,ZAC Les Portes de Bretagne 61000 ALENÇON FR
4,16 Place des Martyrs de la Résistance 30100 AL...


In [7]:
# geolocate shops
geolocator = IGNFrance()

# Coordinates start as float NaN; the text columns start as None (object
# dtype) so that storing strings in them matches the column dtype.
df['latitude'] = np.nan
df['longitude'] = np.nan
df['city'] = None
df['postcode'] = None

# (index, address, reason) for every address that could not be geocoded.
geocoding_failures = []

for i in df.index:

    # Progress message every 50 rows.
    if i > 0 and i % 50 == 0:
        print(f'{i}/{len(df)}')

    address = df.loc[i, 'address']
    try:
        # timeout=None is kept from the original call.
        location = geolocator.geocode(address, timeout=None)

        if location is None:
            # The geocoder found no match for this address.
            geocoding_failures.append((i, address, 'no match'))
            continue

        df.loc[i, 'latitude'] = float(location.latitude)
        df.loc[i, 'longitude'] = float(location.longitude)
        # .get() leaves the cell empty when the raw response lacks the key.
        df.loc[i, 'city'] = location.raw.get('commune')
        df.loc[i, 'postcode'] = location.raw.get('postal_code')

    except Exception as exc:
        # Best effort: one failing address must not abort the whole run, but
        # the failure is recorded instead of being silently swallowed.
        # KeyboardInterrupt is not an Exception and still stops the loop.
        geocoding_failures.append((i, address, repr(exc)))

if geocoding_failures:
    print(f'{len(geocoding_failures)}/{len(df)} addresses could not be geocoded')

50/355
100/355
150/355
200/355
250/355
300/355
350/355


In [8]:
# Spot-check the first geocoded rows.
# NOTE(review): in the saved output, row 0 (an Ajaccio address with postcode
# 20090) is resolved to city "Ur" / postcode 66760, so geocoder matches should
# be validated against the postcode contained in the address.
df.head(n=5)

Unnamed: 0,address,latitude,longitude,city,postcode
0,17 cours Jean Nicoli 20090 AJACCIO FR,42.462065,1.937261,Ur,66760
1,C.C. E.Leclerc - rue des Portes d'Albi ALBI FR,43.923305,2.149627,Albi,81000
2,97 Avenue de Saint-Juéry ALBI FR,43.935898,2.179227,Albi,81000
3,ZAC Les Portes de Bretagne 61000 ALENÇON FR,48.432597,0.069882,Alençon,61000
4,16 Place des Martyrs de la Résistance 30100 AL...,44.126526,4.079116,Alès,30100


In [9]:
# Summarise the columns of the geocoded frame (dtypes and non-null counts).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 355 entries, 0 to 354
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   address    355 non-null    object 
 1   latitude   355 non-null    float64
 2   longitude  355 non-null    float64
 3   city       355 non-null    object 
 4   postcode   355 non-null    object 
dtypes: float64(2), object(3)
memory usage: 14.0+ KB


In [10]:
# Write the geocoded shops to CSV without the index column. The target folder
# is created first so that the export cannot fail on a missing directory
# after the long scraping and geocoding steps.
output_path = Path('../output/affleolu_shops_geocoded.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)