In [24]:
import pandas
import json
import requests
import re
import asyncio
from urllib.parse import urlencode
import pathlib
from google_comments import MEDIA_PATH

In [25]:
# to_delete = pathlib.Path(MEDIA_PATH / 'aprium/files_to_flatten').glob('**/*_comments.json')
# for file in to_delete:
#     if file.is_file() and file.exists():
#         file.unlink()

In [26]:
# files = list(pathlib.Path(MEDIA_PATH / 'aprium').glob('**/*.json'))

In [27]:
# def filter_func(path):
#     result = re.search(r'^[a-zA-Z0-9]+\_[\d+\-]+\_\d+\.json$', path.name)
#     if result:
#         return True
#     return False


# filtered_files = list(filter(filter_func, files))

## List files

In [28]:
files = list(pathlib.Path(MEDIA_PATH / 'aprium/files_to_flatten').glob('**/*.json'))

In [29]:
print(f'Found {len(files)} files')

Found 470 files


In [30]:
def read_files(items):
    files_data = []
    for file in items:
        with open(file, mode='r', encoding='utf-8') as f:
            values = json.load(f)
            if not isinstance(values, list):
                print(f'{file} is not a list')
                values = [values]
            files_data.extend(values)
    return files_data


initial_data = read_files(files)

In [31]:
print(f'Contains {len(initial_data)} items')

Contains 470 items


In [32]:
def flatten(data):
    container = []
    for item in data:
        if isinstance(item, dict):
            reviews = item.pop('reviews')
            business_rating = item.pop('rating')
            for review in reviews:
                review.update(business_rating=business_rating, **item)
                container.append(review)
        else:
            print(f'Cannot treat item {item}')
    return container

data = flatten(initial_data)

In [33]:
df = pandas.DataFrame(data=data)

In [34]:
final_df = df[['name', 'text', 'rating', 'period', 'address']]
# final_df = df[['name', 'address', 'telephone', 'website']]
# final_df = final_df.drop_duplicates(subset=['name'])

## Address completion

In [35]:
ZIP_CODE_REGEX = r'(?<=\,)\s?(?P<zip_code>\d{4,5})\s?(?P<city>.*)$'

def get_address_meta_data(name):
    def get_zip_code(value):
        if value is None:
            return None
        result = re.search(ZIP_CODE_REGEX, value)
        if result:
            return result.groupdict().get(name, None)
        return None
    return get_zip_code

final_df['zip_code'] = final_df['address'].map(get_address_meta_data('zip_code'))
final_df['city'] = final_df['address'].map(get_address_meta_data('city'))
final_df['city_details'] = None


async def address_requester(value):
    if value is None:
        return None
    query = urlencode({'q': value})
    url = f'https://api-adresse.data.gouv.fr/search/?{query}'
    response = requests.get(url, headers={'content-type': 'application/json'})
    
    if response.ok:
        data = response.json()
        try:
            city_details = data['features'][0]['properties']['context']
        except:
            return None
        else:
            print(f'Response completed for: {value}: {city_details}')
            return city_details
    return None


async def completed_addresses():
    for item in final_df.itertuples(name='Company'):
        city_details = await address_requester(item.address)
        if city_details is None:
            continue

        final_df.loc[item.Index, 'city_details'] = city_details
        await asyncio.sleep(1)

# await completed_addresses()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['zip_code'] = final_df['address'].map(get_address_meta_data('zip_code'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['city'] = final_df['address'].map(get_address_meta_data('city'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['city_details'] = None


In [36]:
companies_array = final_df['name'].apply(lambda x: x.lower()).unique()

In [37]:
companies_df = final_df[['name']]
companies_df = pandas.DataFrame(
    {'unique_companies': companies_df['name'].unique()}
)
companies_df['unique_companies'] = companies_df['unique_companies'].apply(
    lambda x: x.lower())
companies_df.head()

Unnamed: 0,unique_companies
0,aprium pharmacie des gascons
1,aprium pharmacie de tournamy
2,aprium pharmacie des thermes (gros)
3,aprium pharmacie du gabut
4,aprium pharmacie de talmont


In [38]:
expected_companies = pandas.read_json(MEDIA_PATH / 'aprium_expected_companies.json')
expected_companies = expected_companies.rename(columns={0: 'companies'})
expected_companies['companies'] = expected_companies['companies'].apply(
    lambda x: x.lower()
)
expected_companies.describe()

Unnamed: 0,companies
count,459
unique,433
top,aprium pharmacie
freq,5


In [39]:
companies_df[~companies_df['unique_companies'].isin(expected_companies['companies'])]

Unnamed: 0,unique_companies
3,aprium pharmacie du gabut
27,pharmacie de saint-georges-de-reneins 💊 totum
29,💊pharmacie des pyrénées muret | aprium
40,"aprium pharmacie debas sophie, arlette & jean-..."
116,pharmacie du triangle - aprium
118,aprium grande pharmacie du centre
160,aprium pharmacie antiboise
177,aprium pharmacie moncade
186,aprium pharmacie kayal
222,aprium pharmacie des eaux claires


In [40]:
final_df.describe()

Unnamed: 0,name,text,rating,period,address,zip_code,city,city_details
count,37110,23751,37110,37110,37110,37075,37075,0.0
unique,433,22473,19,80,461,358,338,0.0
top,Pharmacie O Vert,Tres bien,5 etoiles,il y a un an,"15 Av. Jacques Cartier, 77600 Bussy-Saint-Georges",75015,Paris,
freq,490,87,24940,11098,490,869,4333,


In [41]:
final_df.to_csv('final_file2.csv', index=False, encoding='utf-8')