# ETL archivos Yelp
El objetivo es poder entender la naturaleza, formato y estructura que conteinen los archivos de Yelp para luego realizar el proceso de ETL.

* business.pkl
* review.json
* user.parquet
* checkin.json
* tip.json


In [1]:
import pandas as pd
import numpy as np
import requests
import glob
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
import pyarrow.parquet as pq
import json
import pickle

## checkin

In [2]:
def abrir_Archivo_json(archivo):
    merged_data = []  # Lista para almacenar los objetos JSON combinados

    with open(archivo) as file:
        for line in file:
            try:
                obj = json.loads(line)
                merged_data.append(obj)
            except json.JSONDecodeError as e:
                print(f"Error al decodificar JSON en el archivo {archivo}: {str(e)}")

    df = pd.DataFrame(merged_data)  # Crear DataFrame a partir de los objetos JSON
    return df


In [3]:
archivo = 'data/Yelp/checkin.json' 
df_checkin_yelp = abrir_Archivo_json(archivo)

In [4]:
#Revisamos si tiene nulos
df_checkin_yelp.isnull().sum()

business_id    0
date           0
dtype: int64

In [5]:
#Revisamos si hay duplicados
df_checkin_yelp.duplicated().sum()

0

Desanidado de Fecha y Hora

In [6]:
fecha = []
for _, row in df_checkin_yelp.iterrows():
    business_id = row['business_id']
    dates = row['date'].split(', ')
    for date in dates:
        time, date = date.split(' ')
        fecha.append([business_id, time, date])

df_desanidado = pd.DataFrame(fecha,columns =['business_id','hour','date'])

In [7]:
df_desanidado

Unnamed: 0,business_id,hour,date
0,---kPU91CF4Lq2-WlRu9Lw,2020-03-13,21:10:56
1,---kPU91CF4Lq2-WlRu9Lw,2020-06-02,22:18:06
2,---kPU91CF4Lq2-WlRu9Lw,2020-07-24,22:42:27
3,---kPU91CF4Lq2-WlRu9Lw,2020-10-24,21:36:13
4,---kPU91CF4Lq2-WlRu9Lw,2020-12-09,21:23:33
...,...,...,...
13356870,zzu6_r3DxBJuXcjnOYVdTw,2013-12-11,00:52:49
13356871,zzu6_r3DxBJuXcjnOYVdTw,2013-12-13,00:58:14
13356872,zzw66H6hVjXQEt0Js3Mo4A,2016-12-03,23:33:26
13356873,zzw66H6hVjXQEt0Js3Mo4A,2018-12-02,19:08:45


In [8]:
#Se almacena en un csv
df_desanidado.to_csv('data/Yelp/checkin.csv',index = False)

## user.parquet

## business.pkl

In [9]:
#se lee el archivo
df_business= pd.read_pickle('data/Yelp/business.pkl')

In [10]:
df_business

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,state.1,postal_code.1,latitude.1,longitude.1,stars.1,review_count.1,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,,93101,34.426679,-119.711197,5.0,7,...,,,,,,,,,,
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,,63123,38.551126,-90.335695,3.0,15,...,,,,,,,,,,
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,,85711,32.223236,-110.880452,3.5,22,...,,,,,,,,,,
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,...,,,,,,,,,,
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,MO,18054,40.338183,-75.471659,4.5,13,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,Binh's Nails,3388 Gateway Blvd,Edmonton,IN,T6J 5H2,53.468419,-113.492054,3.0,13,...,,,,,,,,,,
150342,c8GjPIOTGVmIemT7j5_SyQ,Wild Birds Unlimited,2813 Bransford Ave,Nashville,DE,37204,36.115118,-86.766925,4.0,5,...,,,,,,,,,,
150343,_QAMST-NrQobXduilWEqSw,Claire's Boutique,"6020 E 82nd St, Ste 46",Indianapolis,AB,46250,39.908707,-86.065088,3.5,8,...,,,,,,,,,,
150344,mtGm22y5c2UHNXDFAjaPNw,Cyclery & Fitness Center,2472 Troy Rd,Edwardsville,AB,62025,38.782351,-89.950558,4.0,24,...,,,,,,,,,,


In [11]:
#Se agregan nuevas columnas en el Dataframe Vacias
df_business['NAME']=None
df_business['REVIEW_COUNT']=None
df_business['POSTAL_CODE']=None
df_business['CITY']=None
df_business['STATE']=None
df_business['BUSINESS_ID']=None
df_business['ADDRESS']=None
df_business['LATITUDE']=None
df_business['LONGITUDE']=None
df_business['STARS']=None
df_business['IS_OPEN']=None
df_business['ATTRIBUTES']=None
df_business['CATEGORIES']=None
df_business['HOURS']=None

In [12]:
#Se actualiza la columna NAME del Dataframe donde se fusionan las cadenas de texto, eliminando los caracteres no Alfabeticos
for index,i in enumerate(df_business.name.values):
    arr=[]
    for e in i:
        if isinstance(e,str):
         arr.append(e)
    df_business['NAME'][index]=''.join(arr)


In [13]:

# Configurar pandas para mostrar todas las columnas
pd.set_option('display.max_columns', None)


In [14]:
#Se actualiza la columna CITY del Dataframe donde se fusionan las cadenas de texto, eliminando los caracteres no Alfabeticos
for index,i in enumerate(df_business.city.values):
    arr=[]
    for e in i:
        if isinstance(e,str):
         arr.append(e)
    df_business['CITY'][index]=''.join(arr)

In [15]:
#Se actualiza la columna STATE del Dataframe donde se fusionan las cadenas de texto, eliminando los caracteres no Alfabeticos
for index,i in enumerate(df_business.state.values):
    arr=[]
    for e in i:
        if isinstance(e,str):
         arr.append(e)
    df_business['STATE'][index]=''.join(arr)


In [16]:
#Se actualiza la columna REVIEW_COUNT del Dataframe donde se fusionan las cadenas de texto, eliminando los caracteres no Alfabeticos
for index,i in enumerate(df_business.review_count.values):
    arr=[]
    for e in i:
        if isinstance(e,str):
         arr.append(e)
    df_business['REVIEW_COUNT'][index]=''.join(arr)


In [17]:
#Se actualiza la columna BUSINESS_ID del Dataframe donde se fusionan las cadenas de texto, eliminando los caracteres no Alfabeticos
for index,i in enumerate(df_business.business_id.values):
    arr=[]
    for e in i:
        if isinstance(e,str):
         arr.append(e)
    df_business['BUSINESS_ID'][index]=''.join(arr)


In [18]:
#Se actualiza la columna ADDRESS del Dataframe donde se fusionan las cadenas de texto, eliminando los caracteres no Alfabeticos
for index,i in enumerate(df_business.address.values):
    arr=[]
    for e in i:
        if isinstance(e,str):
         arr.append(e)
    df_business['ADDRESS'][index]=''.join(arr)


In [19]:
#Se actualiza la columna POSTAL_CODE del Dataframe donde se fusionan las cadenas de texto, eliminando los caracteres no Alfabeticos
for index,i in enumerate(df_business.postal_code.values):
    arr=[]
    for e in i:
        if isinstance(e,str):
         arr.append(e)
    df_business['POSTAL_CODE'][index]=''.join(arr)

In [20]:
#Se filtran los elementos numéricos mayores que 1 en cada valor de la columna 'latitude' del DataFrame 'df_business'.
# A continuación, se asigna el primer elemento filtrado a la columna 'LATITUDE' en el DataFrame 'df_business' en la fila
#correspondiente al índice actual.
for index,i in enumerate(df_business.latitude.values):
    arr=[]
    for e in i:
       if e>1:
         arr.append(e)
    df_business['LATITUDE'][index]=arr[0]


In [21]:
#Se filtran los elementos numéricos menores que -1 en cada valor de la columna 'longitude' del DataFrame 'df_business'.
#Luego, se asigna el primer elemento filtrado a la columna 'LONGITUDE' en el DataFrame 'df_business' en la fila correspondiente al
# índice actual.
for index,i in enumerate(df_business.longitude.values):
    arr=[]
    for e in i:
        if e<-1:
            arr.append(e)
    df_business['LONGITUDE'][index]=arr[0]

In [22]:
#Se filtran los elementos numéricos mayores que 0.1 en cada valor de la columna 'stars' del DataFrame 'df_business'.
# Después, se asigna el primer elemento filtrado a la columna 'STARS' en el DataFrame 'df_business' en la fila correspondiente 
#al índice actual.
for index,i in enumerate(df_business.stars.values):
    arr=[]
    for e in i:
       if e>0.1:
         arr.append(e)
    df_business['STARS'][index]=arr[0]


In [23]:
#Se filtran los elementos numéricos mayores o iguales a 0 en cada valor de la columna 'is_open' del DataFrame 'df_business'. 
#A continuación, se asigna el primer elemento filtrado a la columna 'IS_OPEN' en el DataFrame 'df_business' en la fila correspondiente al índice actual

for index,i in enumerate(df_business.is_open.values):
    arr=[]
    for e in i:
       if e >=0:
         arr.append(e)
    df_business['IS_OPEN'][index]=arr[0]

In [24]:
#Este código busca y guarda el primer diccionario encontrado en la columna 'attributes' del dataframe 'df_business' en la columna 'ATTRIBUTES'.
for index,i in enumerate(df_business.attributes.values):
    arr=[]
    for e in i:
        if isinstance(e,dict):
         arr.append(e)
    if len(arr)>0:
     df_business['ATTRIBUTES'][index]=arr[0]


In [25]:
#Este código filtra los elementos de tipo cadena en la columna 'categories' del dataframe 'df_business'. 
#Luego, fusiona todas las cadenas filtradas en una sola cadena y la asigna a la columna 'CATEGORIES' en el dataframe 'df_business'.
for index,i in enumerate(df_business.categories.values):
    arr=[]
    for e in i:
        if isinstance(e,str):
         arr.append(e)
    df_business['CATEGORIES'][index]=''.join(arr)

In [26]:
#En este código, se recorren los valores de la columna 'hours' en el dataframe 'df_business' y se filtran los elementos que son diccionarios. 
#Luego, se asigna el primer diccionario encontrado a la columna 'HOURS' en el dataframe 'df_business' en la fila correspondiente.
#En resumen, el código extrae y guarda el primer diccionario encontrado en la columna 'hours' del dataframe 'df_business' en la columna 'HOURS'.
for index,i in enumerate(df_business.hours.values):
    arr=[]
    for e in i:
        if isinstance(e,dict):
         arr.append(e)
    if len(arr)>0:     
     df_business['HOURS'][index]=arr[0]


In [27]:
df_business.sample(10)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,business_id.1,name.1,address.1,city.1,state.1,postal_code.1,latitude.1,longitude.1,stars.1,review_count.1,is_open.1,attributes.1,categories.1,hours.1,NAME,REVIEW_COUNT,POSTAL_CODE,CITY,STATE,BUSINESS_ID,ADDRESS,LATITUDE,LONGITUDE,STARS,IS_OPEN,ATTRIBUTES,CATEGORIES,HOURS
141388,2EmAukTYm2mOBEIKEYBRNg,Nick's Bar and Grille,16 S 2nd St,Philadelphia,PA,19106,39.949557,-75.14398,4.0,181,1,"{'RestaurantsPriceRange2': '2', 'BusinessAccep...","American (Traditional), Sandwiches, Restaurant...","{'Monday': '11:0-2:0', 'Tuesday': '11:0-2:0', ...",,,,,,,,,,,,,,,Nick's Bar and Grille,,19106,Philadelphia,PA,2EmAukTYm2mOBEIKEYBRNg,16 S 2nd St,39.949557,-75.14398,4.0,1,"{'RestaurantsPriceRange2': '2', 'BusinessAccep...","American (Traditional), Sandwiches, Restaurant...","{'Monday': '11:0-2:0', 'Tuesday': '11:0-2:0', ..."
57806,0z6wOy39zBALHmj-aFxB7g,Andale Andale,3632 Powelton Ave,Philadelphia,PA,19104,39.959557,-75.195097,4.5,7,0,"{'RestaurantsTakeOut': 'None', 'RestaurantsDel...","Restaurants, Mexican",,,,,,,,,,,,,,,,Andale Andale,,19104,Philadelphia,PA,0z6wOy39zBALHmj-aFxB7g,3632 Powelton Ave,39.959557,-75.195097,4.5,0,"{'RestaurantsTakeOut': 'None', 'RestaurantsDel...","Restaurants, Mexican",
5789,7xKGm9OxjRJA8b9n55gZ8A,Red Apple,230 N Main St,Columbia,CA,62236,38.444484,-90.201492,2.0,5,0,"{'HasTV': 'False', 'RestaurantsGoodForGroups':...","Restaurants, Breakfast & Brunch",,,,,,,,,,,,,,,,Red Apple,,62236,Columbia,CA,7xKGm9OxjRJA8b9n55gZ8A,230 N Main St,38.444484,-90.201492,2.0,0,"{'HasTV': 'False', 'RestaurantsGoodForGroups':...","Restaurants, Breakfast & Brunch",
119225,Ki2BtVpfsuY07u7j-qtXbQ,Sticks & Stones,1909 E Passyunk Ave,Philadelphia,MO,19148,39.926358,-75.167504,3.0,28,0,"{'GoodForKids': 'False', 'OutdoorSeating': 'Tr...","American (New), Restaurants, Nightlife, Bars","{'Monday': '16:0-2:0', 'Tuesday': '16:0-2:0', ...",,,,,,,,,,,,,,,Sticks & Stones,,19148,Philadelphia,MO,Ki2BtVpfsuY07u7j-qtXbQ,1909 E Passyunk Ave,39.926358,-75.167504,3.0,0,"{'GoodForKids': 'False', 'OutdoorSeating': 'Tr...","American (New), Restaurants, Nightlife, Bars","{'Monday': '16:0-2:0', 'Tuesday': '16:0-2:0', ..."
4627,-6KL5N_AdaZnw3W1HHtEtA,Unique Thriftique,7332 E Broadway Blvd,Tucson,PA,85710,32.220718,-110.835479,2.5,8,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Thrift Stores, Shopping",,,,,,,,,,,,,,,,Unique Thriftique,,85710,Tucson,PA,-6KL5N_AdaZnw3W1HHtEtA,7332 E Broadway Blvd,32.220718,-110.835479,2.5,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Thrift Stores, Shopping",
17137,ExauHeoFNEB3EpUb3xYfSw,Native Grill & Wings,11107 N Oracle Rd,Oro Valley,FL,85737,32.407764,-110.945588,3.0,77,0,"{'OutdoorSeating': 'True', 'RestaurantsGoodFor...","Restaurants, Chicken Wings, American (Traditio...","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",,,,,,,,,,,,,,,Native Grill & Wings,,85737,Oro Valley,FL,ExauHeoFNEB3EpUb3xYfSw,11107 N Oracle Rd,32.407764,-110.945588,3.0,0,"{'OutdoorSeating': 'True', 'RestaurantsGoodFor...","Restaurants, Chicken Wings, American (Traditio...","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'..."
31121,poezOrkxS1pY3kj6ll7Jxg,All Stars Haircut,433 W Cheltenham Ave,Elkins Park,TN,19027,40.055566,-75.125828,5.0,11,1,"{'BikeParking': 'True', 'GoodForKids': 'True',...","Men's Hair Salons, Barbers, Beauty & Spas, Hai...","{'Monday': '9:30-18:30', 'Tuesday': '9:30-18:3...",,,,,,,,,,,,,,,All Stars Haircut,,19027,Elkins Park,TN,poezOrkxS1pY3kj6ll7Jxg,433 W Cheltenham Ave,40.055566,-75.125828,5.0,1,"{'BikeParking': 'True', 'GoodForKids': 'True',...","Men's Hair Salons, Barbers, Beauty & Spas, Hai...","{'Monday': '9:30-18:30', 'Tuesday': '9:30-18:3..."
89747,h2S_6llTHGwD_OEoxTZXjA,Westshore Pizza,3327 Lithia Pinecrest Rd,Valrico,PA,33596,27.894738,-82.245265,2.5,19,1,"{'Ambience': '{'romantic': False, 'intimate': ...","Food, Restaurants, Pizza, Italian","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",,,,,,,,,,,,,,,Westshore Pizza,,33596,Valrico,PA,h2S_6llTHGwD_OEoxTZXjA,3327 Lithia Pinecrest Rd,27.894738,-82.245265,2.5,1,"{'Ambience': '{'romantic': False, 'intimate': ...","Food, Restaurants, Pizza, Italian","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'..."
51079,tmGDBz8vnE5It5EPhWmEDg,Branmar Liquors,"1812 Marsh Rd, Ste 303",Wilmington,FL,19810,39.804528,-75.506933,4.0,8,1,"{'BikeParking': 'True', 'RestaurantsPriceRange...","Food, Beer, Wine & Spirits","{'Monday': '9:0-22:0', 'Tuesday': '9:0-22:0', ...",,,,,,,,,,,,,,,Branmar Liquors,,19810,Wilmington,FL,tmGDBz8vnE5It5EPhWmEDg,"1812 Marsh Rd, Ste 303",39.804528,-75.506933,4.0,1,"{'BikeParking': 'True', 'RestaurantsPriceRange...","Food, Beer, Wine & Spirits","{'Monday': '9:0-22:0', 'Tuesday': '9:0-22:0', ..."
139865,JU4QWMtJW0yiyS9W7dAFpQ,Extended Stay America - St. Louis - Westport -...,2030 Craig Rd.,Saint Louis,AZ,63146,38.696299,-90.44153,1.5,14,1,"{'RestaurantsPriceRange2': '2', 'BusinessAccep...","Hotels, Event Planning & Services, Hotels & Tr...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",,,,,,,,,,,,,,,Extended Stay America - St. Louis - Westport -...,,63146,Saint Louis,AZ,JU4QWMtJW0yiyS9W7dAFpQ,2030 Craig Rd.,38.696299,-90.44153,1.5,1,"{'RestaurantsPriceRange2': '2', 'BusinessAccep...","Hotels, Event Planning & Services, Hotels & Tr...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W..."


In [28]:
df_business.to_csv('data/Yelp/business.csv',index = False)

In [29]:
dtypes = {'col14': str, 'col15': str, 'col16': str, 'col17': str, 'col18': str, 'col19': str, 'col25': str, 'col26': str, 'col27': str}
df_business = pd.read_csv('data/Yelp/business.csv', dtype=dtypes)

  df_business = pd.read_csv('data/Yelp/business.csv', dtype=dtypes)


In [30]:
#Filtramos el dataframe solo a las columnas nuevas generadas con los bucles, con los nombres iniciales


In [31]:
# Obtén la lista de todas las columnas
all_columns = df_business.columns

# Itera sobre todas las columnas y aplica el método .loc
for col in all_columns:
    df_business.loc[:, col] = df_business[col]


In [32]:
df_business = df_business[['BUSINESS_ID', 'NAME', 'REVIEW_COUNT', 'CITY', 'STATE', 'ADDRESS', 'POSTAL_CODE', 'LATITUDE', 'LONGITUDE', 'STARS', 'IS_OPEN', 'ATTRIBUTES', 'CATEGORIES', 'HOURS']]

df_business.loc[:, 'business_id'] = df_business['BUSINESS_ID']
df_business.loc[:, 'name'] = df_business['NAME']
df_business.loc[:, 'address'] = df_business['ADDRESS']
df_business.loc[:, 'city'] = df_business['CITY']
df_business.loc[:, 'state'] = df_business['STATE']
df_business.loc[:, 'postal_code'] = df_business['POSTAL_CODE']
df_business.loc[:, 'latitude'] = df_business['LATITUDE']
df_business.loc[:, 'longitude'] = df_business['LONGITUDE']
df_business.loc[:, 'stars'] = df_business['STARS']
df_business.loc[:, 'review_count'] = df_business['REVIEW_COUNT']
df_business.loc[:, 'is_open'] = df_business['IS_OPEN']
df_business.loc[:, 'attributes'] = df_business['ATTRIBUTES']
df_business.loc[:, 'categories'] = df_business['CATEGORIES']
df_business.loc[:, 'hours'] = df_business['HOURS']

df_business = df_business[['business_id', 'name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'attributes', 'categories', 'hours']]


In [33]:
df_business

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,,93101,34.426679,-119.711197,5.0,,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,,63123,38.551126,-90.335695,3.0,,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,,85711,32.223236,-110.880452,3.5,,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,MO,18054,40.338183,-75.471659,4.5,,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,Binh's Nails,3388 Gateway Blvd,Edmonton,IN,T6J 5H2,53.468419,-113.492054,3.0,,1,"{'ByAppointmentOnly': 'False', 'RestaurantsPri...","Nail Salons, Beauty & Spas","{'Monday': '10:0-19:30', 'Tuesday': '10:0-19:3..."
150342,c8GjPIOTGVmIemT7j5_SyQ,Wild Birds Unlimited,2813 Bransford Ave,Nashville,DE,37204,36.115118,-86.766925,4.0,,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Pets, Nurseries & Gardening, Pet Stores, Hobby...","{'Monday': '9:30-17:30', 'Tuesday': '9:30-17:3..."
150343,_QAMST-NrQobXduilWEqSw,Claire's Boutique,"6020 E 82nd St, Ste 46",Indianapolis,AB,46250,39.908707,-86.065088,3.5,,1,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","Shopping, Jewelry, Piercing, Toy Stores, Beaut...",
150344,mtGm22y5c2UHNXDFAjaPNw,Cyclery & Fitness Center,2472 Troy Rd,Edwardsville,AB,62025,38.782351,-89.950558,4.0,,1,"{'BusinessParking': ""{'garage': False, 'street...","Fitness/Exercise Equipment, Eyewear & Optician...","{'Monday': '9:0-20:0', 'Tuesday': '9:0-20:0', ..."


In [34]:
filtro = ['Restaurant']

# Filtrar el DataFrame solo para las filas donde 'categories' no es NaN
df_filtered = df_business.dropna(subset=['categories'])

# Luego, aplicar el filtro
restaurante = df_filtered[df_filtered['categories'].str.contains('|'.join(filtro), regex=True)]


In [35]:
restaurante

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
5,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,AZ,37015,36.269593,-87.058943,2.0,,1,"{'BusinessParking': 'None', 'BusinessAcceptsCr...","Burgers, Fast Food, Sandwiches, Food, Ice Crea...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-22:0', '..."
8,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,8025 Mackenzie Rd,Affton,TN,63123,38.565165,-90.321087,3.0,,0,"{'Caters': 'True', 'Alcohol': ""u'full_bar'"", '...","Pubs, Restaurants, Italian, Bars, American (Tr...",
9,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,Nashville,MO,37207,36.208102,-86.768170,1.5,,1,"{'RestaurantsAttire': ""'casual'"", 'Restaurants...","Ice Cream & Frozen Yogurt, Fast Food, Burgers,...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-21:0', '..."
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,MO,33602,27.955269,-82.456320,4.0,,1,"{'Alcohol': ""'none'"", 'OutdoorSeating': 'None'...","Vietnamese, Food, Restaurants, Food Trucks","{'Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150325,l9eLGG9ZKpLJzboZq-9LRQ,Wawa,19 N Bishop Ave,Clifton Heights,TN,19018,39.925656,-75.310344,3.0,,1,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Restaurants, Sandwiches, Convenience Stores, C...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W..."
150327,cM6V90ExQD6KMSU3rRB5ZA,Dutch Bros Coffee,1181 N Milwaukee St,Boise,ID,83704,43.615401,-116.284689,4.0,,1,"{'WiFi': ""'free'"", 'RestaurantsGoodForGroups':...","Cafes, Juice Bars & Smoothies, Coffee & Tea, R...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-17:0', '..."
150336,WnT9NIzQgLlILjPT0kEcsQ,Adelita Taqueria & Restaurant,1108 S 9th St,Philadelphia,MO,19147,39.935982,-75.158665,4.5,,1,"{'WheelchairAccessible': 'False', 'Restaurants...","Restaurants, Mexican","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'..."
150339,2O2K6SXPWv56amqxCECd4w,The Plum Pit,4405 Pennell Rd,Aston,PA,19014,39.856185,-75.427725,4.5,,1,"{'RestaurantsDelivery': 'False', 'BusinessAcce...","Restaurants, Comfort Food, Food, Food Trucks, ...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W..."


In [36]:
restaurante.to_csv('data/Yelp/restaurante.csv')

In [37]:
df_business.to_csv('data/Yelp/business.csv',index = False)

## tip.json

In [38]:
json_objects=[]

with open('data/Yelp/tip.json', 'r',encoding='utf-8') as f:
    for line in f:
        json_objects.append(json.loads(line))
df_tip = pd.DataFrame(json_objects)

In [39]:
df_tip

Unnamed: 0,user_id,business_id,text,date,compliment_count
0,AGNUgVwnZUey3gcPCJ76iw,3uLgwr0qeCNMjKenHJwPGQ,Avengers time with the ladies.,2012-05-18 02:17:21,0
1,NBN4MgHP9D3cw--SnauTkA,QoezRbYQncpRqyrLH6Iqjg,They have lots of good deserts and tasty cuban...,2013-02-05 18:35:10,0
2,-copOvldyKh1qr-vzkDEvw,MYoRNLb5chwjQe3c_k37Gg,It's open even when you think it isn't,2013-08-18 00:56:08,0
3,FjMQVZjSqY8syIO-53KFKw,hV-bABTK-glh5wj31ps_Jw,Very decent fried chicken,2017-06-27 23:05:38,0
4,ld0AperBXk1h6UbqmM80zw,_uN0OudeJ3Zl_tf6nxg5ww,Appetizers.. platter special for lunch,2012-10-06 19:43:09,0
...,...,...,...,...,...
908910,eYodOTF8pkqKPzHkcxZs-Q,3lHTewuKFt5IImbXJoFeDQ,Disappointed in one of your managers.,2021-09-11 19:18:57,0
908911,1uxtQAuJ2T5Xwa_wp7kUnA,OaGf0Dp56ARhQwIDT90w_g,Great food and service.,2021-10-30 11:54:36,0
908912,v48Spe6WEpqehsF2xQADpg,hYnMeAO77RGyTtIzUSKYzQ,Love their Cubans!!,2021-11-05 13:18:56,0
908913,ckqKGM2hl7I9Chp5IpAhkw,s2eyoTuJrcP7I_XyjdhUHQ,Great pizza great price,2021-11-20 16:11:44,0


## review.json

In [40]:
def abrir_Archivo_json(archivo):
    merged_data = []  # Lista para almacenar los objetos JSON combinados

    with open(archivo,encoding = 'utf-8') as file:
        for line in file:
            try:
                obj = json.loads(line)
                merged_data.append(obj)
            except json.JSONDecodeError as e:
                print(f"Error al decodificar JSON en el archivo {archivo}: {str(e)}")

    df = pd.DataFrame(merged_data)  # Crear DataFrame a partir de los objetos JSON
    return df

In [41]:
archivo = ('data/Yelp/review.json')
review=abrir_Archivo_json(archivo)

In [42]:
review

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15
...,...,...,...,...,...,...,...,...,...
6990275,H0RIamZu0B0Ei0P4aeh3sQ,qskILQ3k0I_qcCMI-k6_QQ,jals67o91gcrD4DC81Vk6w,5.0,1,2,1,Latest addition to services from ICCU is Apple...,2014-12-17 21:45:20
6990276,shTPgbgdwTHSuU67mGCmZQ,Zo0th2m8Ez4gLSbHftiQvg,2vLksaMmSEcGbjI5gywpZA,5.0,2,1,2,"This spot offers a great, affordable east week...",2021-03-31 16:55:10
6990277,YNfNhgZlaaCO5Q_YJR4rEw,mm6E4FbCMwJmb7kPDZ5v2Q,R1khUUxidqfaJmcpmGd4aw,4.0,1,0,0,This Home Depot won me over when I needed to g...,2019-12-30 03:56:30
6990278,i-I4ZOhoX70Nw5H0FwrQUA,YwAMC-jvZ1fvEUum6QkEkw,Rr9kKArrMhSLVE9a53q-aA,5.0,1,0,0,For when I'm feeling like ignoring my calorie-...,2022-01-19 18:59:27


In [44]:
restaurante_review = pd.merge(restaurante,review, on = 'business_id',how='inner')

In [46]:
restaurante_review.to_csv('data/Yelp/restaurante_review.csv',index = False)