In [18]:
import pandas
import json
import requests
import re
import asyncio
from urllib.parse import urlencode
import pathlib
from google_comments import MEDIA_PATH

In [2]:
# to_delete = pathlib.Path(MEDIA_PATH / 'aprium/files_to_flatten').glob('**/*_comments.json')
# for file in to_delete:
#     if file.is_file() and file.exists():
#         file.unlink()

In [3]:
# files = list(pathlib.Path(MEDIA_PATH / 'aprium').glob('**/*.json'))

In [4]:
# def filter_func(path):
#     result = re.search(r'^[a-zA-Z0-9]+\_[\d+\-]+\_\d+\.json$', path.name)
#     if result:
#         return True
#     return False


# filtered_files = list(filter(filter_func, files))

## List files

In [5]:
# Recursively collect every JSON export under the folder to flatten.
# The paths are sorted because glob order depends on the filesystem: a stable
# load order keeps the later drop_duplicates (which keeps the first row it
# meets) reproducible from one run to the next.
files = sorted(pathlib.Path(MEDIA_PATH / 'aprium/files_to_flatten').glob('**/*.json'))

In [6]:
# Sanity check: number of JSON files picked up by the recursive glob.
print(f'Found {len(files)} files')

Found 470 files


In [7]:
def read_files(items):
    """Load every JSON file in ``items`` and merge the contents in one list.

    A file whose top-level JSON value is a list contributes each of its
    elements; any other top-level value is reported on stdout and added as a
    single element.

    Args:
        items: iterable of file paths readable as UTF-8 JSON.

    Returns:
        A flat list with the decoded content of all the files, in input order.
    """
    collected = []
    for path in items:
        with open(path, mode='r', encoding='utf-8') as handle:
            payload = json.load(handle)

        if isinstance(payload, list):
            collected.extend(payload)
        else:
            print(f'{path} is not a list')
            collected.append(payload)
    return collected


# Decode all the collected files into one flat list of records.
initial_data = read_files(files)

In [8]:
# Sanity check: number of records loaded across all the files.
print(f'Contains {len(initial_data)} items')

Contains 470 items


In [9]:
def flatten(data):
    """Expand business records into one row per review.

    Each dict in ``data`` must hold a ``reviews`` iterable of dicts and a
    ``rating`` value. Every review is emitted as a new dict made of the review
    fields, the business rating under ``business_rating``, and all the other
    business fields (which win over review fields of the same name).

    The input is never modified: neither the business dicts nor their review
    dicts are mutated, so the function can be called again on the same data
    (e.g. when the notebook cell is re-run) and returns the same rows.

    Args:
        data: iterable of business records; non-dict entries are reported on
            stdout and skipped.

    Returns:
        A list of dicts, one per review, in input order.

    Raises:
        KeyError: if a business dict has no ``reviews`` or ``rating`` key.
    """
    rows = []
    for item in data:
        if not isinstance(item, dict):
            print(f'Cannot treat item {item}')
            continue

        reviews = item['reviews']
        business_rating = item['rating']
        # Everything except the two keys handled above is copied onto each row.
        business_fields = {
            key: value
            for key, value in item.items()
            if key not in ('reviews', 'rating')
        }
        for review in reviews:
            # Same precedence as dict.update(): business values override
            # review values when the keys collide.
            rows.append(
                {**review, 'business_rating': business_rating, **business_fields}
            )
    return rows

# One row per review, each carrying the fields of its parent business.
data = flatten(initial_data)

In [10]:
# Tabular view of the flattened review rows.
df = pandas.DataFrame(data=data)

In [13]:
# Keep one row per business: the flattened frame repeats the business fields
# on every review row.
# NOTE(review): de-duplicating on 'name' alone also collapses distinct
# branches that share the same name — confirm this is intended.
# Alternative selection for a review-level export:
# final_df = df[['name', 'text', 'rating']]
final_df = (
    df[['name', 'address', 'telephone', 'website']]
    .drop_duplicates(subset=['name'])
    # Explicit copy: columns are assigned onto this frame in later cells, which
    # would otherwise trigger SettingWithCopyWarning on a slice of ``df``.
    .copy()
)

## Address completion

In [26]:
# Matches the "<zip code> <city>" tail that follows a comma in an address,
# e.g. "22 Pl. des Gascons, 64100 Bayonne" -> zip_code "64100", city "Bayonne".
ZIP_CODE_REGEX = r'(?<=\,)\s?(?P<zip_code>\d{4,5})\s?(?P<city>.*)$'


def get_address_meta_data(name):
    """Build an extractor for one named part of an address.

    Args:
        name: name of the ``ZIP_CODE_REGEX`` group to extract
            (``'zip_code'`` or ``'city'``).

    Returns:
        A one-argument function suitable for ``Series.map``. It returns the
        requested group as a string, or ``None`` when the value is not a
        string, when the address does not match, or when ``name`` is not a
        group of the pattern.
    """
    pattern = re.compile(ZIP_CODE_REGEX)

    def extract(value):
        # Missing addresses reach us as None or as a float NaN (pandas);
        # re.search would raise TypeError on anything that is not a string.
        if not isinstance(value, str):
            return None
        match = pattern.search(value)
        if match is None:
            return None
        return match.groupdict().get(name)

    return extract

# Derive the zip code and the city from the raw address, one column per
# regex group, then prepare an empty column for the API lookup results.
for column in ('zip_code', 'city'):
    final_df[column] = final_df['address'].map(get_address_meta_data(column))
final_df['city_details'] = None


async def address_requester(value, timeout=10):
    """Look up an address on api-adresse.data.gouv.fr and return its context.

    Args:
        value: the address to search. Anything that is not a non-blank string
            (None, NaN, '') is skipped without issuing a request.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        The ``context`` property of the first feature returned by the API, or
        ``None`` when there is nothing to search, the request fails, the
        response status is not OK, or the payload has no usable first feature.
    """
    if not isinstance(value, str) or not value.strip():
        return None

    query = urlencode({'q': value})
    url = f'https://api-adresse.data.gouv.fr/search/?{query}'

    def fetch():
        return requests.get(
            url,
            headers={'content-type': 'application/json'},
            timeout=timeout
        )

    try:
        # requests is blocking: run it in the default executor so the
        # coroutine does not stall the event loop while waiting.
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(None, fetch)
    except requests.RequestException as error:
        # A single network failure should not abort the whole batch.
        print(f'Request failed for: {value}: {error}')
        return None

    if not response.ok:
        return None

    try:
        city_details = response.json()['features'][0]['properties']['context']
    except (ValueError, LookupError, TypeError):
        # Invalid JSON, no feature returned, or an unexpected payload shape.
        return None

    print(f'Response completed for: {value}: {city_details}')
    return city_details


async def completed_addresses(delay=1):
    """Fill ``final_df['city_details']`` with one API lookup per address.

    Rows are processed sequentially. Rows whose address is not a string are
    skipped without a request; rows whose lookup returns ``None`` keep their
    current ``city_details`` value.

    Args:
        delay: seconds to wait after each request. The pause applies to failed
            lookups too, so errors or rate-limit responses do not turn the
            loop into a burst of back-to-back requests.
    """
    for item in final_df.itertuples(name='Company'):
        address = item.address
        if not isinstance(address, str):
            # Nothing to look up, hence nothing to throttle.
            continue

        city_details = await address_requester(address)
        if city_details is not None:
            final_df.loc[item.Index, 'city_details'] = city_details

        await asyncio.sleep(delay)

# await completed_addresses()

Response completed for: 22 Pl. des Gascons, 64100 Bayonne: 64, Pyrénées-Atlantiques, Nouvelle-Aquitaine
Response completed for: 762 Av. de Tournamy, 06250 Mougins: 06, Alpes-Maritimes, Provence-Alpes-Côte d'Azur
Response completed for: 17 Av. du Port, 34540 Balaruc-les-Bains: 34, Hérault, Occitanie
Response completed for: 10 Quai de la Georgette, 17000 La Rochelle: 17, Charente-Maritime, Nouvelle-Aquitaine
Response completed for: 86 Av. des Sables, 85440 Talmont-Saint-Hilaire: 85, Vendée, Pays de la Loire
Response completed for: Rue de l'Ouest Centre Commercial des Portes de, 78200 Buchelay: 78, Yvelines, Île-de-France
Response completed for: Rue de la Branche, 71230 Saint-Vallier: 71, Saône-et-Loire, Bourgogne-Franche-Comté
Response completed for: 10 Rue de Clermont, 42300 Roanne: 42, Loire, Auvergne-Rhône-Alpes
Response completed for: 105 Av. Felix Faure, 75015 Paris: 75, Paris, Île-de-France
Response completed for: 7 Rue des Petits Carreaux, 75002 Paris: 75, Paris, Île-de-France
Res

In [27]:
# Distinct company names, lower-cased for case-insensitive comparison.
# The vectorized .str accessor leaves a missing name as NaN instead of raising
# the way calling .lower() on it would.
companies_array = final_df['name'].str.lower().unique()

In [28]:
# Lower-case BEFORE de-duplicating so that names differing only by case end up
# as a single entry; de-duplicating first could leave repeated values in a
# column that is meant to hold unique companies.
companies_df = pandas.DataFrame(
    {'unique_companies': final_df['name'].str.lower().unique()}
)
companies_df.head()

Unnamed: 0,unique_companies
0,aprium pharmacie des gascons
1,aprium pharmacie de tournamy
2,aprium pharmacie des thermes (gros)
3,aprium pharmacie du gabut
4,aprium pharmacie de talmont


In [29]:
# Reference list of company names, loaded from JSON; the single column
# (labelled 0) is renamed to 'companies' and lower-cased so it can be
# compared with the lower-cased scraped names.
expected_companies = (
    pandas.read_json(MEDIA_PATH / 'aprium_expected_companies.json')
    .rename(columns={0: 'companies'})
)
lowered_names = expected_companies['companies'].apply(lambda name: name.lower())
expected_companies['companies'] = lowered_names
expected_companies.describe()

Unnamed: 0,companies
count,459
unique,433
top,aprium pharmacie
freq,5


In [30]:
# Scraped companies that do not appear in the expected reference list.
is_expected = companies_df['unique_companies'].isin(expected_companies['companies'])
companies_df[~is_expected]

Unnamed: 0,unique_companies
3,aprium pharmacie du gabut
27,pharmacie de saint-georges-de-reneins 💊 totum
29,💊pharmacie des pyrénées muret | aprium
40,"aprium pharmacie debas sophie, arlette & jean-..."
116,pharmacie du triangle - aprium
118,aprium grande pharmacie du centre
160,aprium pharmacie antiboise
177,aprium pharmacie moncade
186,aprium pharmacie kayal
222,aprium pharmacie des eaux claires


In [31]:
# Summary of the final frame: non-null counts, distinct values and most
# frequent value per column.
final_df.describe()

Unnamed: 0,name,address,telephone,website,zip_code,city,city_details
count,433,433,430,399,432,432,430
unique,433,432,429,398,348,326,79
top,Aprium Pharmacie des Gascons,"2 All. du Dr Lejzer Ludwik Zamenhof, 31100 Tou...",05 61 31 87 22,https://iena.aprium-pharmacie.fr/,59000,Paris,"75, Paris, Île-de-France"
freq,1,2,2,2,7,43,43


In [32]:
# Export the final frame as UTF-8 CSV, without the index column; the relative
# path is resolved against the current working directory.
final_df.to_csv('final_file_vero.csv', index=False, encoding='utf-8')