### Este Notebook estara enfocado en el desanidado y normalizacion de los datos de la columna `category`. para filtrar el dataset original

In [2]:
import pandas as pd

In [None]:
# Leemos la tabla g_sitios que tenemos hasta el momento.
g_sitios = pd.read_parquet(r'C:\Users\mauri\OneDrive\Escritorio\proyectogrupal\data\google_sitios.parquet', engine='pyarrow')

In [None]:
# Creamos una nueva tabla de categorias y desempaquetamos su información.
categorias = g_sitios[['gmap_id', 'category']].explode('category')

# Borramos las columnas `MISC`, `relative_results` y `category`
g_sitios = g_sitios.drop(columns=['MISC', 'relative_results', 'category'])

In [None]:
# Guaradamos la tabla desanidada que se conecta a travez de gmap_id con la tabla principal.
categorias.to_parquet('datasets/google/sitios/categories_sitios_google_todavia_nombres.parquet')

In [3]:
categorias = pd.read_parquet(r'C:\Users\mauri\OneDrive\Escritorio\proyectogrupal\data\dataset_gastronomico_google.parquet', engine='pyarrow')
# Obtener lista única de categorías en 'category'
categorias_unicas = categorias['category'].unique()
# Mostrar la lista
print(categorias_unicas)

['Food court' 'Liquor store' 'Cocktail bar' 'Banquet hall' 'Caterer'
 'Fast food restaurant' 'Family restaurant' 'Seafood restaurant' 'Deli'
 'Dominican restaurant' 'Taco restaurant' 'Mexican restaurant'
 'Restaurant or cafe' 'Bar & grill' 'Food service' 'Dessert shop'
 'Ice cream shop' 'Breakfast restaurant' 'Beer store'
 'Fried chicken takeaway' 'Asian restaurant' 'Hot pot restaurant'
 'Chinese restaurant' 'Mediterranean restaurant' 'Bagel shop' 'Donut shop'
 'Takeout Restaurant' 'Pizza Takeout' 'Wine bar' 'Ramen restaurant'
 'Bubble tea store' 'Cafe' 'American restaurant'
 'Modern French restaurant' 'Flower delivery' 'Pizza restaurant'
 'Seafood market' 'Food products supplier' 'Beer distributor' 'Juice shop'
 'Health food store' 'Vegetarian restaurant' 'Sushi restaurant'
 'Japanese restaurant' 'Creole restaurant' 'Caribbean restaurant'
 'Barbecue restaurant' 'Chocolate shop' 'Coffee store' 'Snack bar'
 'Soft drinks shop' 'Pretzel store' 'Hamburger restaurant' 'Sandwich shop'
 'Soul

In [4]:
# Filtrar las categorias por una categoria general para un id_unico
categorias.loc[:, 'category'] = categorias['category'].str.lower()
categorias = categorias[~categorias['category'].str.contains('night club|buffet|school|food products supplier|court|banquet hall|supplier|thermal baths|spa|class', case=False, na=False)]

categorias.loc[:, 'category_general'] = None

categorias.loc[categorias['category'].str.contains('bar|shochu|liquor|beer|pub|cider|drink|wine|beverage|bodega|tavern|vineyard|sake', case=False, na=False), 'category_general'] = 'drinks'
categorias.loc[categorias['category'].str.contains('cafe|coffee|te|cafeteria|tea|patisserie|confectionery|lounge', case=False, na=False), 'category_general'] = 'coffee and te'
categorias.loc[categorias['category'].str.contains('restaurant|carvery|brasserie|bistro|restaurante|grill|food|bbq|pizza|soup|crab|crêperie', case=False, na=False), 'category_general'] = 'restaurant'
categorias.loc[categorias['category'].str.contains('shop|deli|frituur|greengrocer|fruit|hawker stall|wholesaler|store|supermarket|bakery|takeaway|delivery|service|stand|churreria|fast food|takeout|sweets|market', case=False, na=False), 'category_general'] = 'shops and takeaway'

In [5]:
# Crear un diccionario de categorías generales y sus IDs únicos
category_ids = {
    'drinks': 1,
    'coffee and te': 2,
    'restaurant': 3,
    'shops and takeaway': 4
}

# Asignar el ID único correspondiente a cada categoría general
categorias['category_general_id'] = categorias['category_general'].map(category_ids)
# Eliminar duplicados basados en la columna 'gmap_id' en el DataFrame 'categorias'
categorias_unicas = categorias.drop_duplicates(subset='gmap_id', keep='first')
categorias_unicas

Unnamed: 0,gmap_id,category,category_general,category_general_id
125,0x880e5523024703c1:0xb93a8ccb6918d616,liquor store,shops and takeaway,4
126,0x80c8be4e73e8263f:0x3edb275a351e6266,cocktail bar,drinks,1
142,0x88f3fb847f6a4833:0x91a0e1ef28d778ed,caterer,coffee and te,2
144,0x87528bb7ab938cb3:0x5ee996e34cc50715,fast food restaurant,shops and takeaway,4
198,0x54906bbaf4811877:0x8a101329509628b,liquor store,shops and takeaway,4
...,...,...,...,...
3024981,0x89c6c4072624d2a9:0x4437b2d55e9a5811,bagel shop,shops and takeaway,4
3024991,0x54957330545fc915:0xd14f2cb33733fb2b,deli,shops and takeaway,4
3024992,0x89c6b993431d9a19:0xea378de3c317996e,bar & grill,restaurant,3
3025005,0x89da6b8177ee49d5:0xbc9dfc53722fe1,bar & grill,restaurant,3


In [None]:
# Unimos los id con la tabla principal.
g_sitios= pd.merge(g_sitios, categorias_unicas[['gmap_id', 'category_general_id']], on='gmap_id', how='left')

In [9]:
# Eliminar filas donde 'category_id' sea None o NaN
g_sitios = g_sitios[g_sitios['category_general_id'].notna()]

# Convertir la columna 'category_id' a tipo int
g_sitios['category_general_id'] = g_sitios['category_general_id'].astype(int)

In [10]:
g_sitios.head()

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,avg_rating,num_of_reviews,price,state,url,monday,tuesday,wednesday,thursday,friday,saturday,sunday,category_general_id
125,ROYAL LIQUOR,"ROYAL LIQUOR, 26W211 Geneva Rd, Wheaton, IL 60187",0x880e5523024703c1:0xb93a8ccb6918d616,,41.887341,-88.136456,3.6,8,,Open ⋅ Closes 10PM,https://www.google.com/maps/place//data=!4m2!3...,10AM–10PM,10AM–10PM,10AM–10PM,10AM–10PM,10AM–10:30PM,10AM–10:30PM,10AM–9PM,4
126,Onyx Bar,"Onyx Bar, 11011 W Charleston Blvd, Las Vegas, ...",0x80c8be4e73e8263f:0x3edb275a351e6266,,36.156205,-115.332636,4.7,8,,Closed ⋅ Opens 4PM,https://www.google.com/maps/place//data=!4m2!3...,4–11PM,4–11PM,4–11PM,4–11PM,4PM–1AM,4PM–1AM,4–11PM,1
142,Macon Banquet,"Macon Banquet, 3720 Bloomfield Village Dr, Mac...",0x88f3fb847f6a4833:0x91a0e1ef28d778ed,,32.814681,-83.700424,4.8,18,,Open ⋅ Closes 5PM,https://www.google.com/maps/place//data=!4m2!3...,10AM–5PM,10AM–5PM,10AM–5PM,10AM–5PM,10AM–5PM,12–5PM,12–5PM,2
144,Maverik Adventure's First Stop,"Maverik Adventure's First Stop, 3330 S 500 W, ...",0x87528bb7ab938cb3:0x5ee996e34cc50715,,40.698585,-111.906406,3.4,5,,Open 24 hours,https://www.google.com/maps/place//data=!4m2!3...,Open 24 hours,Open 24 hours,Open 24 hours,Open 24 hours,Open 24 hours,Open 24 hours,Open 24 hours,4
198,Barrels Experience Wine,"Barrels Experience Wine, 7593 SE 27th St #103,...",0x54906bbaf4811877:0x8a101329509628b,,47.5866,-122.236932,5.0,1,,Closed ⋅ Opens 3PM,https://www.google.com/maps/place//data=!4m2!3...,Closed,3–9PM,3–9PM,3–9PM,3–9PM,2–7PM,Closed,4


In [20]:
g_sitios.to_parquet('v0_g_sitios.parquet')

In [6]:
categorias

Unnamed: 0,gmap_id,category,category_general,category_general_id
125,0x880e5523024703c1:0xb93a8ccb6918d616,liquor store,shops and takeaway,4
126,0x80c8be4e73e8263f:0x3edb275a351e6266,cocktail bar,drinks,1
142,0x88f3fb847f6a4833:0x91a0e1ef28d778ed,caterer,coffee and te,2
144,0x87528bb7ab938cb3:0x5ee996e34cc50715,fast food restaurant,shops and takeaway,4
198,0x54906bbaf4811877:0x8a101329509628b,liquor store,shops and takeaway,4
...,...,...,...,...
3024992,0x89c6b993431d9a19:0xea378de3c317996e,bar & grill,restaurant,3
3025005,0x89da6b8177ee49d5:0xbc9dfc53722fe1,bar & grill,restaurant,3
3025006,0x89c6c74f43a49b55:0x6be6995921c58b12,steak house,coffee and te,2
3025006,0x89c6c74f43a49b55:0x6be6995921c58b12,fine dining restaurant,restaurant,3


In [7]:
categorias.to_parquet('v0_categorias.parquet')