In [1]:
import re
import pandas
import pathlib
from google_comments.utilities.calculation import convert_coordinates
from google_comments import MEDIA_PATH
from matplotlib import pyplot

48° 4.998 -1° 40.81392


In [2]:
# Lower-case name fragments; a business whose lower-cased name contains one of
# them is flagged in the `is_restaurant` column.
# NOTE(review): '5àsec' does not look like a restaurant name — confirm it is
# listed here on purpose.
RESTAURANTS = [
    'le fournil des champs',
    'le kimberley',
    '5àsec',
    'burger king',
    # The comma after this entry was missing, so Python concatenated it with the
    # next literal into 'mcdonaldbrioche dorée' and neither chain was matched.
    'mcdonald',
    'brioche dorée',
    'la déjeunerie',
    'la trattoria',
]

# Lower-case name fragments; a business whose lower-cased name contains one of
# them is flagged in the `is_bank` column.
# NOTE(review): several fragments (pharmacy, optician, reception desk, ...) do
# not look like banks — the list appears to double as a general exclusion list;
# confirm the intent and the spelling of 'exepxion'.
BANKS = [
    'crédit agricole', 'crédit mutuel',
    'pharmacie',
    'carrefour banque', 'carrefour voyage',
    'carrefour location', 'carrefour spectacles',
    'exepxion',
    'accueil',
    'feu vert',
    'atol', 'opticien',
    'norauto',
    'lcl',
]

In [3]:
# Read every JSON file found (recursively) under
# MEDIA_PATH/magasins_centre_commerciaux into its own DataFrame.
# The paths are sorted because Path.glob yields them in no guaranteed order,
# which would otherwise let the concatenated row order vary between runs.
json_files = sorted(
    pathlib.Path(MEDIA_PATH, 'magasins_centre_commerciaux').glob('**/*.json')
)
dfs = [pandas.read_json(json_file) for json_file in json_files]

In [4]:
# Empty placeholder frame; the following cell rebinds `df` to the concatenated data.
df = pandas.DataFrame()

In [5]:
# Stack all per-file frames into a single frame; ignore_index=True discards each
# file's own row labels and renumbers the rows from 0.
df = pandas.concat(dfs, ignore_index=True)

In [6]:
# Display the available columns before choosing which ones to keep.
df.columns

Index(['name', 'url', 'feed_url', 'address', 'rating', 'latitude', 'longitude',
       'number_of_reviews', 'date', 'additional_information', 'telephone',
       'website', 'reviews'],
      dtype='object')

In [7]:
# Narrow the frame to the six columns used by the rest of the notebook.
df = df.loc[:, [
    'name', 'address', 'rating',
    'latitude', 'longitude', 'number_of_reviews',
]]

In [8]:
# Sort the rows by the `name` column (ascending, the sort_values default).
df = df.sort_values('name')

In [9]:
# Summarise the name column: row count, distinct names and the most frequent name.
df['name'].describe()

count               557
unique              370
top       Histoire d'Or
freq                  8
Name: name, dtype: object

In [10]:
def parse_rating(value):
    """Extract a decimal rating from a raw rating value.

    Searches ``value`` for the first ``<digits><separator><digits>`` number,
    where the separator is a comma or a dot, and returns it as a string using
    a dot as the decimal separator (``'4,5 étoiles'`` -> ``'4.5'``).

    A dot is accepted as well as a comma so that re-running the cell on
    already-converted values leaves them unchanged; a comma-only pattern would
    turn every previously parsed rating into ``None`` on a second run.

    Args:
        value: Raw rating, normally a string. ``None``, NaN and other
            non-string values are tolerated.

    Returns:
        The rating as a ``str`` such as ``'4.5'``, or ``None`` when the value
        is missing or contains no decimal number.
    """
    if value is None:
        return None

    # re.search raises TypeError on non-strings (e.g. a float NaN standing in
    # for a missing entry), so search the text form instead; 'nan' simply does
    # not match and yields None.
    text = value if isinstance(value, str) else str(value)

    result = re.search(r'(\d+[.,]\d+)', text)
    if result is None:
        return None
    return result.group(1).replace(',', '.')

# Replace each raw rating with the value returned by parse_rating
# (a dot-separated decimal string, or None when nothing could be parsed).
df['rating'] = df['rating'].map(parse_rating)

In [11]:
# Treat a missing review count as zero reviews.
df['number_of_reviews'] = df['number_of_reviews'].fillna(0)

In [12]:
# Store the review counts as 64-bit integers.
df = df.astype({'number_of_reviews': 'int64'})

In [13]:
# One row per distinct address: the first occurrence of each address is kept.
unique_addresses = df.drop_duplicates(subset='address')

In [14]:
# Create the coordinates column up front; it is filled in by the next cell.
df['coordinates'] = None

In [15]:
# Build the formatted coordinate for every row from its latitude/longitude.
# The values are read straight from the row tuples and the column is assigned
# in one go; the previous version looked each row up again through df.loc and
# wrote the result back one scalar at a time, which is needlessly slow.
# The list follows the frame's row order, so it lines up with the index.
df['coordinates'] = [
    convert_coordinates(business.latitude, business.longitude, as_dmm=False)
    for business in df.itertuples(name='Business')
]

In [16]:
# Preview the first rows to check the new coordinates column.
df.head()

Unnamed: 0,name,address,rating,latitude,longitude,number_of_reviews,coordinates
506,5àsec,"5 Rue du Bosphore, 35000 Rennes",3.6,48.0833,-1.680232,154,"48°4'59"" -1°40'48"""
353,Accueil Grand Quartier,"Centre Commercial Grand Quartier, Ille-et-Vila...",3.0,48.136931,-1.695569,5,"48°8'12"" -1°41'44"""
93,Action Rennes Pacé,"7 Bd de la Giraudais, 35740 Rennes",4.2,48.138383,-1.767137,541,"48°8'18"" -1°46'1"""
257,Adopt Parfum,"Alma, Centre Commercial Carrefour, 5 Rue du Bo...",4.8,48.083689,-1.677599,245,"48°5'1"" -1°40'39"""
314,Adopt Parfums,"Centre Commercial Carrefour Rennes, 35510 Cess...",4.8,48.112934,-1.592359,162,"48°6'46"" -1°35'32"""


In [17]:
# df.to_csv('gonz_all.csv', index=False)

In [18]:
# Export the fully de-duplicated rows, sorted by name, with every name
# title-cased. Despite its name, 'duplicates.csv' receives the rows that remain
# after duplicates are dropped.
w = (
    df.drop_duplicates()
    .sort_values(by=['name'])
    .assign(name=lambda frame: frame['name'].map(lambda x: x.lower().title()))
)
w.to_csv('duplicates.csv', index=False)

In [19]:
# Drop rows that are identical in every column and write the result to
# 'simple.csv' without the index column.
no_duplicates_df = df.drop_duplicates()
no_duplicates_df.to_csv('simple.csv', index=False)

In [20]:
# Create the two flag columns; both are overwritten with booleans at the end of this cell.
df['is_restaurant'] = None
df['is_bank'] = None

def business_state(values_to_check):
    """Build a predicate telling whether a name contains a known fragment.

    Args:
        values_to_check: Iterable of name fragments (strings). They are
            compared case-insensitively against the candidate name.

    Returns:
        A one-argument function suitable for ``Series.map``. It returns
        ``True`` when any fragment is a substring of the given name, and
        ``False`` otherwise — including for ``None``, NaN or any other
        non-string value.
    """
    # Normalise the fragments once instead of on every call; materialising them
    # also keeps a one-shot iterable from being exhausted after the first call.
    fragments = tuple(name.lower() for name in values_to_check)

    def wrapper(value):
        # Missing names may arrive as None or as a float NaN; neither has .lower().
        if not isinstance(value, str):
            return False

        lowered = value.lower()
        # any() over a generator stops at the first matching fragment.
        return any(fragment in lowered for fragment in fragments)
    return wrapper

# Flag each business by matching its name against the RESTAURANTS and BANKS fragments.
df['is_restaurant'] = df['name'].map(business_state(RESTAURANTS))
df['is_bank'] = df['name'].map(business_state(BANKS))


In [21]:
# Rows whose name matched one of the RESTAURANTS fragments.
restaurants = df[df['is_restaurant'] == True]

In [22]:
# Rows whose name matched one of the BANKS fragments.
banks = df[df['is_bank'] == True]

In [23]:
# Keep only the rows flagged as neither a bank nor a restaurant.
final_df = df[~(df['is_bank'] | df['is_restaurant'])]

In [24]:
# Non-null count per column of the filtered frame (shows which columns still have gaps).
final_df.count()

name                 499
address              498
rating               496
latitude             499
longitude            499
number_of_reviews    499
coordinates          499
is_restaurant        499
is_bank              499
dtype: int64

In [25]:
# Keep only the export columns, leaving out the is_restaurant / is_bank flags.
# Plain indexing replaces DataFrame.get here: get() returns None when a
# requested column is missing, so the mistake would only surface later as an
# AttributeError on `final_df.to_csv`; indexing raises a KeyError naming the
# missing column right away.
final_df = final_df[['name', 'address', 'rating', 'latitude',
                     'longitude', 'number_of_reviews', 'coordinates']]

In [26]:
# s = df.groupby('address')['address'].count()
# axes, fig = pyplot.subplots()
# pyplot.bar(s.index[:5], s.values[:5], color='g')
# pyplot.xticks(rotation=45, ha='right')
# pyplot.show()

In [27]:
# Write the filtered businesses to 'gonz.csv' in the working directory, without the index column.
final_df.to_csv('gonz.csv', index=False)