In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import urllib3
from bs4 import BeautifulSoup
from geopy.geocoders import IGNFrance

In [2]:
# Download the store-locator page, sending a browser-like User-Agent header.
page = urllib3.PoolManager().request(
    'GET',
    'https://www.vivason.fr/centre-audition#zone:tous',
    headers={"User-Agent": "Mozilla/5.0"},
)

# urllib3 hands back error responses (4xx/5xx) without raising, so check the
# status explicitly: otherwise an error page would be parsed silently and the
# following cells would simply find no stores.
if page.status != 200:
    raise RuntimeError(f'Unexpected HTTP status {page.status} while fetching the store list')

soup = BeautifulSoup(page.data, features='html.parser')

# Dump the parsed document to inspect its structure.
print(soup.prettify())

<!DOCTYPE html>
<html dir="ltr" lang="fr" prefix="og: https://ogp.me/ns#">
 <head>
  <meta charset="utf-8"/>
  <link href="https://www.vivason.fr/centre-audition" rel="canonical"/>
  <meta content="Drupal 9 (https://www.drupal.org); Commerce 2" name="Generator"/>
  <meta content="width" name="MobileOptimized"/>
  <meta content="true" name="HandheldFriendly"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <script>
   (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
      'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
})(window,document,'script','dataLayer','GTM-TQV7BFP');
  </script>
  <script type="application/ld+json">
   {
    "@context": "https://schema.org",
    "@type": "BreadcrumbList",
    "itemListElement": [
        {
            "@type": "ListItem",
          

In [3]:
# Collect the <article> elements carrying the "boutique" class; each one is
# decoded as a single store entry below.
store_tags = soup.find_all('article', class_='boutique')

def decode_store_item(tag):
    """Extract the shop name and postal address from one store tag.

    Args:
        tag: a parsed element exposing ``find`` (e.g. a BeautifulSoup tag)
            that contains an ``<h2 class="title">`` heading and the address
            ``<div>``.

    Returns:
        A ``(name, address)`` tuple of strings. The name has surrounding
        whitespace removed; the address is collapsed onto a single line with
        single spaces between its parts.

    Raises:
        ValueError: if the heading or the address element is not found in
            ``tag``.
    """
    name_tag = tag.find('h2', {'class': 'title'})
    address_tag = tag.find('div', {'class':"field field--name-field-adresse field--type-address field--label-hidden field__item"})

    # Fail with an explicit message rather than an AttributeError on None
    # when an expected element is absent from the tag.
    if name_tag is None or address_tag is None:
        missing = 'title' if name_tag is None else 'address'
        raise ValueError(f'store tag has no {missing} element')

    name = name_tag.get_text().strip()
    # split() without argument drops empty pieces, so leading/trailing or
    # repeated line breaks do not leave stray or doubled spaces.
    address = ' '.join(address_tag.get_text().split())
    return name, address

# Sanity check: decode the first store tag and display the resulting
# (name, address) tuple.
decode_store_item(store_tags[0])

('01 - Vivason Bourg-en-Bresse',
 '3 rue René Cassin 01000 Bourg-en-Bresse France')

In [4]:
# Decode every store tag into a (name, address) pair, then tabulate the pairs
# with one row per shop.
res = list(map(decode_store_item, store_tags))
df = pd.DataFrame(data=res, columns=['shop_name', 'address'])

In [5]:
# geolocate shops
geolocator = IGNFrance()

# Result columns are pre-filled with NaN so rows that cannot be geocoded stay
# empty. The text columns are created with object dtype: writing strings into
# a float64 column triggers pandas' incompatible-dtype FutureWarning.
df['latitude'] = np.nan
df['longitude'] = np.nan
df['city'] = pd.Series(np.nan, index=df.index, dtype=object)
df['postcode'] = pd.Series(np.nan, index=df.index, dtype=object)

# Index labels of the rows for which geocoding failed.
failed_rows = []

for i in df.index:

    # progress report every 50 rows
    if ((i>0) and (i%50==0)):
        print(f'{i}/{len(df)}')

    address = df.loc[i, 'address']

    try:
        # tries to fetch the address from geopy (timeout=None kept from the
        # original call)
        location = geolocator.geocode(address, timeout=None)

        # geocode() returns None when the service finds no match
        if location is None:
            raise LookupError('no geocoding result')

        # write lat/long and the locality fields into the row
        df.loc[i,'latitude'] = float(location.latitude)
        df.loc[i,'longitude'] = float(location.longitude)
        df.loc[i,'city'] = location.raw['commune']
        df.loc[i,'postcode'] = location.raw['postal_code']

    except Exception as exc:
        # Best effort: keep processing the remaining rows, but report the
        # failure instead of hiding it. Catching Exception rather than using
        # a bare except lets KeyboardInterrupt stop the loop.
        failed_rows.append(i)
        print(f'Geocoding failed for row {i} ({address!r}): {exc!r}')

if failed_rows:
    print(f'{len(failed_rows)}/{len(df)} addresses could not be geocoded')

50/70


In [6]:
# Preview the first five rows of the geocoded frame.
df.head(n=5)

Unnamed: 0,shop_name,address,latitude,longitude,city,postcode
0,01 - Vivason Bourg-en-Bresse,3 rue René Cassin 01000 Bourg-en-Bresse France,46.206821,5.226367,Bourg-en-Bresse,1000
1,06 - Vivason Nice,20 avenue Notre-Dame 06000 Nice France,43.704077,7.268075,Nice,6000
2,06 - Vivason Cagnes-sur-Mer,83 avenue de la gare 06800 Cagnes-sur-Mer France,43.662502,7.149372,Cagnes-sur-Mer,6800
3,10 - Vivason Troyes,24 Rue de la République 10000 Troyes France,48.29771,4.074422,Troyes,10000
4,13 - Vivason Marseille,10 boulevard baille 13006 Marseille France,43.286254,5.385543,Marseille,13006


In [7]:
# Print the column dtypes and non-null counts of the geocoded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70 entries, 0 to 69
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   shop_name  70 non-null     object 
 1   address    70 non-null     object 
 2   latitude   70 non-null     float64
 3   longitude  70 non-null     float64
 4   city       70 non-null     object 
 5   postcode   70 non-null     object 
dtypes: float64(2), object(4)
memory usage: 3.4+ KB


In [8]:
# Write the geocoded shops to CSV without the index column. The target folder
# is created first so the export cannot fail on a missing directory at the
# very end of the run.
output_path = Path('../output/vivason_shops_geocoded.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)