## Google Comments Flattener

This module takes JSON comment files containing reviews, groups them together to create a single Google comments csv file

In [1]:
import pandas
import json
import requests
import re
import asyncio
from urllib.parse import urlencode
import pathlib
from google_comments import MEDIA_PATH

In [2]:
# to_delete = pathlib.Path(MEDIA_PATH / 'aprium/files_to_flatten').glob('**/*_comments.json')
# for file in to_delete:
#     if file.is_file() and file.exists():
#         file.unlink()

In [3]:
# def filter_func(path):
#     result = re.search(r'^[a-zA-Z0-9]+\_[\d+\-]+\_\d+\.json$', path.name)
#     if result:
#         return True
#     return False


# filtered_files = list(filter(filter_func, files))

List the the files that we will be using for regrouping the comments

In [4]:
# files = list(pathlib.Path(MEDIA_PATH / 'aprium/files_to_flatten').glob('**/*.json'))
# files = [MEDIA_PATH.joinpath('business_1f578c6c4b_2024-01-16-00_00.json')]

In [5]:
files = list(pathlib.Path(
    MEDIA_PATH / 'polygone').glob('**/business_*.json'))

In [6]:
print(f'Found {len(files)} files')

Found 1 files


In [7]:
def read_files(items):
    files_data = []
    for file in items:
        with open(file, mode='r', encoding='utf-8') as f:
            values = json.load(f)
            if not isinstance(values, list):
                print(f'{file} is not a list')
                values = [values]
            files_data.extend(values)
    return files_data


initial_data = read_files(files)

In [8]:
print(f'Contains {len(initial_data)} items')

Contains 1 items


In [9]:
def flatten(data):
    container = []
    for item in data:
        if isinstance(item, dict):
            reviews = item.pop('reviews')
            business_rating = item.pop('rating')
            for review in reviews:
                review.update(business_rating=business_rating, **item)
                container.append(review)
        else:
            print(f'Cannot treat item {item}')
    return container

data = flatten(initial_data)

In [10]:
df = pandas.DataFrame(data=data)

In [11]:
final_df = df[['name', 'text', 'rating', 'period', 'address']]
# final_df = df[['name', 'address', 'telephone', 'website']]
# final_df = final_df.drop_duplicates(subset=['name'])

In [12]:
final_df.head()

Unnamed: 0,name,text,rating,period,address
0,Polygone Riviera,,4 etoiles,il y a 15 heures Nouveau,"119 Av. des Alpes, 06800 Cagnes-sur-Mer"
1,Polygone Riviera,Je n'aime pas les centres commerciaux.... Mais...,5 etoiles,il y a 16 heures Nouveau,"119 Av. des Alpes, 06800 Cagnes-sur-Mer"
2,Polygone Riviera,Je me suis rendu a polygone pour voir le nouve...,5 etoiles,il y a 16 heures Nouveau,"119 Av. des Alpes, 06800 Cagnes-sur-Mer"
3,Polygone Riviera,Il est de bonne education de repondre aux avis...,2 etoiles,il y a 18 heures Nouveau,"119 Av. des Alpes, 06800 Cagnes-sur-Mer"
4,Polygone Riviera,,5 etoiles,il y a 20 heures Nouveau,"119 Av. des Alpes, 06800 Cagnes-sur-Mer"


## Address completion

This section will try and get the precise address for each business in the file

In [13]:
unique_addresses = df[~df['address'].duplicated()]['address']
unique_addresses.count()

1

In [14]:
ZIP_CODE_REGEX = r'(?<=\,)\s?(?P<zip_code>\d{4,5})\s?(?P<city>.*)$'

def get_address_meta_data(name):
    def get_zip_code(value):
        if value is None:
            return None
        result = re.search(ZIP_CODE_REGEX, value)
        if result:
            return result.groupdict().get(name, None)
        return None
    return get_zip_code

final_df['zip_code'] = final_df['address'].map(get_address_meta_data('zip_code'))
final_df['city'] = final_df['address'].map(get_address_meta_data('city'))
final_df['city_details'] = None


async def address_requester(value):
    if value is None:
        return None
    query = urlencode({'q': value})
    url = f'https://api-adresse.data.gouv.fr/search/?{query}'
    response = requests.get(url, headers={'content-type': 'application/json'})
    
    if response.ok:
        data = response.json()
        try:
            city_details = data['features'][0]['properties']['context']
        except:
            return None
        else:
            print(f'Response completed for: {value}: {city_details}')
            return city_details
    return None


async def completed_addresses():
    for item in final_df.itertuples(name='Company'):
        city_details = await address_requester(item.address)
        if city_details is None:
            continue

        final_df.loc[item.Index, 'city_details'] = city_details
        await asyncio.sleep(1)

# await completed_addresses()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['zip_code'] = final_df['address'].map(get_address_meta_data('zip_code'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['city'] = final_df['address'].map(get_address_meta_data('city'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['city_details'] = None


In [15]:
# companies_array = final_df['name'].apply(lambda x: x.lower()).unique()

In [16]:
# companies_df = final_df[['name']]
# companies_df = pandas.DataFrame(
#     {'unique_companies': companies_df['name'].unique()}
# )
# companies_df['unique_companies'] = companies_df['unique_companies'].apply(
#     lambda x: x.lower())
# companies_df.head()

In [17]:
# expected_companies = pandas.read_json(MEDIA_PATH / 'aprium_expected_companies.json')
# expected_companies = expected_companies.rename(columns={0: 'companies'})
# expected_companies['companies'] = expected_companies['companies'].apply(
#     lambda x: x.lower()
# )
# expected_companies.describe()

In [18]:
# companies_df[~companies_df['unique_companies'].isin(expected_companies['companies'])]

In [19]:
final_df.describe()

Unnamed: 0,name,text,rating,period,address,zip_code,city,city_details
count,3000,1508,3000,3000,3000,3000,3000,0.0
unique,1,1424,5,27,1,1,1,0.0
top,Polygone Riviera,...,5 etoiles,il y a un an,"119 Av. des Alpes, 06800 Cagnes-sur-Mer",6800,Cagnes-sur-Mer,
freq,3000,11,1820,1543,3000,3000,3000,


In [20]:
# By security remove all the semicolons that we eventually missed since this could 
# the data an Excel file
def remove_semicolon(dataframe):
    def refixer(value):
        if value is None:
            return None
        return value.replace(';', ' ')

    dataframe['text'] = dataframe['text'].map(refixer)
    return dataframe

final_df = final_df.pipe(remove_semicolon)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['text'] = dataframe['text'].map(refixer)


In [21]:
final_df.to_csv('cap_3000.csv', index=False, encoding='utf-8')