In [1]:
import numpy as np
import pandas as pd
import urllib3
from bs4 import BeautifulSoup
from geopy.exc import GeopyError
from geopy.geocoders import IGNFrance

In [2]:
# Download the Audilab store-locator page with an explicit browser-like
# User-Agent header (presumably the site rejects the default client UA — confirm).
page = urllib3.PoolManager().request('GET', 'https://www.audilab.fr/centres-audilab/', headers={"User-Agent": "Mozilla/5.0"})

# urllib3 returns error responses (4xx/5xx) instead of raising, so fail fast
# here rather than silently parsing an error page into an empty shop list.
if page.status != 200:
    raise RuntimeError(f'Unexpected HTTP status {page.status} while fetching the Audilab store list')

soup = BeautifulSoup(page.data, features='html.parser')

# Show only the beginning of the document: enough to confirm that the right
# page was fetched without flooding the notebook output with the full HTML.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html lang="fr-FR">
 <head>
  <meta charset="utf-8"/>
  <meta content="ie=edge" http-equiv="x-ua-compatible"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <meta content="index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1" name="robots">
   <!-- This site is optimized with the Yoast SEO plugin v20.1 - https://yoast.com/wordpress/plugins/seo/ -->
   <title>
    Centres Audilab - Centre auditif audioprothésiste et bilan auditif - Audilab
   </title>
   <link href="https://www.audilab.fr/centres-audilab/" rel="canonical"/>
   <meta content="fr_FR" property="og:locale">
    <meta content="article" property="og:type">
     <meta content="Centres Audilab - Centre auditif audioprothésiste et bilan auditif - Audilab" property="og:title"/>
     <meta content="https://www.audilab.fr/centres-audilab/" property="og:url"/>
     <meta content="Centre auditif audioprothésiste et bilan auditif - Audilab" property="og:site_name"/>
 

In [3]:
# Collect every <div class="store-item"> element; each one describes a shop.
store_tags = soup.find_all('div', attrs={'class': 'store-item'})

def decode_store_item(tag):
    """Extract the shop name and address from one store-item tag.

    Args:
        tag: an element exposing ``find(name, attrs)``; it must contain a
            ``div`` with class ``store-item__title`` and another with class
            ``store-item__address``.

    Returns:
        A ``(name, address)`` tuple of strings. Every ``/`` character is
        removed from the name; the address text is returned unchanged.

    Raises:
        AttributeError: if either ``div`` is missing (``find`` returned None).
    """
    title_text = tag.find('div', {'class': 'store-item__title'}).get_text()
    # Only the slashes are dropped; the whitespace around them is kept as is.
    name = title_text.replace('/', '')
    address_node = tag.find('div', {'class': 'store-item__address'})
    address = address_node.get_text()
    return name, address

# Sanity check: decode the first store tag and display the resulting tuple.
decode_store_item(store_tags[0])

('Audilab  Audioprothésiste 09', '38, allée des Pins 13009 Marseille')

In [4]:
# Decode every store tag into a (name, address) pair, then tabulate the pairs
# with one row per shop.
res = list(map(decode_store_item, store_tags))
df = pd.DataFrame(data=res, columns=['shop_name', 'address'])

In [5]:
# Geolocate every shop address with the IGN France geocoder and store the
# coordinates, city and postcode alongside the scraped data.
geolocator = IGNFrance()

# Coordinates are numeric, so NaN is the natural "missing" placeholder.
df['latitude'] = np.nan
df['longitude'] = np.nan
# City and postcode receive strings: start them as object columns (None) rather
# than float64 NaN columns, because writing a string into a float column is
# deprecated in recent pandas and is slated to become an error.
df['city'] = None
df['postcode'] = None

# (row label, reason) for every address that could not be geocoded, so that
# failures are reported instead of being silently skipped.
geocode_failures = []

for i in df.index:

    # Lightweight progress indicator every 50 rows.
    if i > 0 and i % 50 == 0:
        print(f'{i}/{len(df)}')

    try:
        # timeout=None is kept from the original call.
        # NOTE(review): in geopy 2.x this presumably disables the timeout — confirm.
        location = geolocator.geocode(df.at[i, 'address'], timeout=None)
    except GeopyError as exc:
        # Best effort: a geocoder/service error on one address must not abort
        # the whole run, but it is recorded. Unlike a bare except, this lets
        # KeyboardInterrupt and programming errors propagate.
        geocode_failures.append((i, repr(exc)))
        continue

    # geocode() returns None when the service finds no match.
    if location is None:
        geocode_failures.append((i, 'no match'))
        continue

    df.loc[i, 'latitude'] = float(location.latitude)
    df.loc[i, 'longitude'] = float(location.longitude)
    # .get() leaves the cell empty if the raw payload lacks the key.
    df.loc[i, 'city'] = location.raw.get('commune')
    df.loc[i, 'postcode'] = location.raw.get('postal_code')

if geocode_failures:
    print(f'{len(geocode_failures)} address(es) could not be geocoded:')
    for row_label, reason in geocode_failures:
        print(f'  row {row_label}: {reason}')

50/277
100/277
150/277
200/277
250/277


In [6]:
# Check column dtypes and non-null counts to spot rows that were not geocoded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277 entries, 0 to 276
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   shop_name  277 non-null    object 
 1   address    277 non-null    object 
 2   latitude   277 non-null    float64
 3   longitude  277 non-null    float64
 4   city       277 non-null    object 
 5   postcode   277 non-null    object 
dtypes: float64(2), object(4)
memory usage: 13.1+ KB


In [7]:
# Preview the first rows of the geocoded table.
df.head()

Unnamed: 0,shop_name,address,latitude,longitude,city,postcode
0,Audilab Audioprothésiste 09,"38, allée des Pins 13009 Marseille",43.253827,5.417374,Marseille,13009
1,Audilab Audioprothésiste Agde,Boulevard René Cassin 34300 Agde,43.305552,3.488563,Agde,34300
2,Audilab Audioprothésiste Aigrefeuille d'Aunis,12 passage des Halles 17290 Aigrefeuille-d'Aunis,46.117235,-0.935158,Aigrefeuille-d'Aunis,17290
3,Audilab Audioprothésiste Aix-les-Bains,213 avenue Marie de Solms 73100 Aix-les-Bains,45.688,5.910355,Aix-les-Bains,73100
4,Audilab Audioprothésiste Aizenay,1 bis place de l'Aire Buron 85190 Aizenay,46.739594,-1.607899,Aizenay,85190


In [8]:
# Write the geocoded shops to CSV, omitting the DataFrame index column.
# NOTE(review): assumes the ../output directory already exists — confirm.
df.to_csv('../output/audilab_shops_geocoded.csv', index=False)