In [1]:
from PyPDF2 import PdfReader
import re
import pandas as pd
import numpy as np
from geopy.geocoders import IGNFrance

In [2]:
reader = PdfReader('../lista_negozi_audio2000.pdf')
pages_text = [page.extract_text() for page in reader.pages]
full_text = ''.join(pages_text).split('AUDIO 2000')

In [3]:
def process_string(str):

    str = str.strip()
    letter_match = re.search(r'\b[A-Z][A-Z\s]+\b', str)
    number_match = re.search('\d{5}', str)
    
    if letter_match and number_match:
        parts = str[:letter_match.end()], str[letter_match.end():number_match.start()]
        cleaned_parts = [part.strip() for part in parts]
        return ['audio2000 '+ cleaned_parts[0].lower(), cleaned_parts[1]]
    else:
        return [str.strip()]

process_string(full_text[1])

['audio2000 aire sur la lys', 'Centre commercial Val de Lys / Rue du bois']

In [4]:
res = [process_string(line) for line in full_text]
df = pd.DataFrame(res, columns=['shop_name', 'address']).dropna()
df

Unnamed: 0,shop_name,address
1,audio2000 aire sur la lys,Centre commercial Val de Lys / Rue du bois
2,audio2000 ajaccio,Résidence Espace Alban Cours Napoléon
3,audio2000 albert,14 avenue Georges Clémenceau
4,audio2000 angers,Rue du Grand Launay Centre commercial Grand Maine
5,audio2000 anneyron,Maison de Santé - 2 rue du Levant
...,...,...
168,audio2000 vincennes,61 rue de Fontenay
169,audio2000 vitrolles,Centre commercial CARREFOUR
170,audio2000 voiron,13 Avenue Dugueyt Jouvin
171,audio2000 wasselonne,5 place du Général Leclerc


In [5]:
# manual adjust
df.loc[df['shop_name']=='audio2000 taravao bp', 'address'] = 'BP 40026 Route de Varao PK 60'
df.loc[df['shop_name']=='audio2000 taravao bp', 'shop_name'] = 'audio2000 taravao' 

df.loc[df['shop_name']=='audio2000 papeete  tahiti bp', 'address'] = 'BP 40026 Fare Tony'
df.loc[df['shop_name']=='audio2000 papeete  tahiti bp', 'shop_name'] = 'audio2000 papeete tahiti' 

In [6]:
# geolocate shops
geolocator = IGNFrance()

df['latitude'] = np.nan
df['longitude'] = np.nan
df['city'] = np.nan
df['postcode'] = np.nan

for i in df.index:

    if ((i>0) and (i%50==0)):
        print(f'{i}/{len(df)}')

    try:
        #tries fetch address from geopy
        location = geolocator.geocode(df['address'][i], timeout=None)
        
        #append lat/long to column using dataframe location
        df.loc[i,'latitude'] = float(location.latitude)
        df.loc[i,'longitude'] = float(location.longitude)
        df.loc[i,'city'] = location.raw['commune']
        df.loc[i,'postcode'] = location.raw['postal_code']
        
    except:
        continue

50/170
100/170
150/170


In [7]:
df.head()

Unnamed: 0,shop_name,address,latitude,longitude,city,postcode
1,audio2000 aire sur la lys,Centre commercial Val de Lys / Rue du bois,45.464,-0.64109,Bois,17240
2,audio2000 ajaccio,Résidence Espace Alban Cours Napoléon,46.118162,4.33917,Cours,69470
3,audio2000 albert,14 avenue Georges Clémenceau,47.246745,4.88234,Clémencey,21220
4,audio2000 angers,Rue du Grand Launay Centre commercial Grand Maine,48.384712,5.486892,Grand,88350
5,audio2000 anneyron,Maison de Santé - 2 rue du Levant,46.088919,4.94888,Relevant,1990


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170 entries, 1 to 172
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   shop_name  170 non-null    object 
 1   address    170 non-null    object 
 2   latitude   155 non-null    float64
 3   longitude  155 non-null    float64
 4   city       155 non-null    object 
 5   postcode   155 non-null    object 
dtypes: float64(2), object(4)
memory usage: 13.4+ KB


In [9]:
mask = df['latitude'].isna() 
df[mask]

Unnamed: 0,shop_name,address,latitude,longitude,city,postcode
6,audio2000 antony,4 Avenue Aristide Briand,,,,
12,audio2000 balma,7 avenue Pierre Coupeau,,,,
13,audio2000 bastia,40 Boulevard Paoli,,,,
61,audio2000 grandvillars,16 Rue du Général Leclerc,,,,
64,audio2000 guebwiller,8 rue Théodore Deck,,,,
75,audio2000 la ville du bois,Centre commercial Carrefour,,,,
90,audio2000 manosque,1 Avenue Jean Giono,,,,
98,audio2000 montlucon,13 rue Albert Einstein,,,,
103,audio2000 mulhouse,7 Avenue Auguste Wicky,,,,
105,audio2000 munster,18 Rue Martin Hilti – ZI,,,,


In [10]:
def adjust_nas(df):
    
    for i in df.index:
        if np.isnan(df['latitude'][i]):
            address2 = df['address'][i] + ' ' + df['shop_name'][i].split()[1]
            #print(i, address2)
            try:
                location = geolocator.geocode(address2, timeout=None)

                df.loc[i,'address'] = address2
                df.loc[i,'latitude'] = float(location.latitude)
                df.loc[i,'longitude'] = float(location.longitude)
                df.loc[i,'city'] = location.raw['commune']
                df.loc[i,'postcode'] = location.raw['postal_code']
  
            except:
                continue
    return df

df = adjust_nas(df)

In [11]:
df[mask]

Unnamed: 0,shop_name,address,latitude,longitude,city,postcode
6,audio2000 antony,4 Avenue Aristide Briand antony,48.753629,2.305221,Antony,92160
12,audio2000 balma,7 avenue Pierre Coupeau balma,43.61073,1.497938,Balma,31130
13,audio2000 bastia,40 Boulevard Paoli bastia,42.701016,9.450367,Bastia,20200
61,audio2000 grandvillars,16 Rue du Général Leclerc grandvillars,47.539271,6.968402,Grandvillars,90600
64,audio2000 guebwiller,8 rue Théodore Deck guebwiller,47.901156,7.217302,Guebwiller,68500
75,audio2000 la ville du bois,Centre commercial Carrefour la,47.299217,-1.492362,Carquefou,44470
90,audio2000 manosque,1 Avenue Jean Giono manosque,43.831481,5.784263,Manosque,4100
98,audio2000 montlucon,13 rue Albert Einstein montlucon,46.347976,2.601651,Montluçon,3100
103,audio2000 mulhouse,7 Avenue Auguste Wicky mulhouse,47.743531,7.340363,Mulhouse,68100
105,audio2000 munster,18 Rue Martin Hilti – ZI munster,48.043023,7.160553,Munster,68140


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170 entries, 1 to 172
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   shop_name  170 non-null    object 
 1   address    170 non-null    object 
 2   latitude   170 non-null    float64
 3   longitude  170 non-null    float64
 4   city       170 non-null    object 
 5   postcode   170 non-null    object 
dtypes: float64(2), object(4)
memory usage: 13.4+ KB


In [13]:
df.to_csv('../output/audio2000_shops_geocoded.csv', index=False)