### Este Notebook esta enfocado en como conectar los estados con las reviews y Yelp

La columna `address` de la tabla principal `g_sitios` contiene informacion importante sobre como conectar las reviews de los estados y los negocios.

si se normaliza esta columna en una tabla aparte que contenga: la ciudad, la direccion, y codigo postal, mas la clave unica gmap_id, podemos agrupar por estados y crear un identificador unico.

con un eda sobre las categorias de los negocios en cada estado, podremos decidir que estados usar y cuales no, basandonos en datos y no en suposiciones.

In [1]:
import pandas as pd
import os

# Leemos la tabla g_sitios que tenemos hasta el momento.
g_sitios = pd.read_parquet(r'C:\Users\mauri\OneDrive\Escritorio\proyectogrupal\data\v0_g_sitios.parquet', engine='pyarrow')
g_sitios.info()

<class 'pandas.core.frame.DataFrame'>
Index: 277432 entries, 125 to 2998423
Data columns (total 19 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   name                 277432 non-null  object 
 1   address              274928 non-null  object 
 2   gmap_id              277432 non-null  object 
 3   description          89796 non-null   object 
 4   latitude             277432 non-null  float64
 5   longitude            277432 non-null  float64
 6   avg_rating           277432 non-null  float64
 7   num_of_reviews       277432 non-null  int64  
 8   price                119255 non-null  object 
 9   state                253839 non-null  object 
 10  url                  277432 non-null  object 
 11  monday               277432 non-null  object 
 12  tuesday              277432 non-null  object 
 13  wednesday            277432 non-null  object 
 14  thursday             277432 non-null  object 
 15  friday             

In [2]:
# Creamos la nueva tabla address que se conectara mediante gmap_id con la tabla principal.
address = g_sitios[['gmap_id','address', 'name']]
address.head()

Unnamed: 0,gmap_id,address,name
125,0x880e5523024703c1:0xb93a8ccb6918d616,"ROYAL LIQUOR, 26W211 Geneva Rd, Wheaton, IL 60187",ROYAL LIQUOR
126,0x80c8be4e73e8263f:0x3edb275a351e6266,"Onyx Bar, 11011 W Charleston Blvd, Las Vegas, ...",Onyx Bar
142,0x88f3fb847f6a4833:0x91a0e1ef28d778ed,"Macon Banquet, 3720 Bloomfield Village Dr, Mac...",Macon Banquet
144,0x87528bb7ab938cb3:0x5ee996e34cc50715,"Maverik Adventure's First Stop, 3330 S 500 W, ...",Maverik Adventure's First Stop
198,0x54906bbaf4811877:0x8a101329509628b,"Barrels Experience Wine, 7593 SE 27th St #103,...",Barrels Experience Wine


In [3]:
# Separar la columna 'address' en nuevas columnas
address_split = address['address'].str.split(',', n=3, expand=True)

# Cambiar el nombre de las columnas
address_split.columns = ['nombre', 'direccion', 'ciudad', 'cod.postal']

In [4]:
# Agregar la identificacion unica a las nuevas columnas separadas.
address = address[['gmap_id','address']].join(address_split)
address.head()

Unnamed: 0,gmap_id,address,nombre,direccion,ciudad,cod.postal
125,0x880e5523024703c1:0xb93a8ccb6918d616,"ROYAL LIQUOR, 26W211 Geneva Rd, Wheaton, IL 60187",ROYAL LIQUOR,26W211 Geneva Rd,Wheaton,IL 60187
126,0x80c8be4e73e8263f:0x3edb275a351e6266,"Onyx Bar, 11011 W Charleston Blvd, Las Vegas, ...",Onyx Bar,11011 W Charleston Blvd,Las Vegas,NV 89135
142,0x88f3fb847f6a4833:0x91a0e1ef28d778ed,"Macon Banquet, 3720 Bloomfield Village Dr, Mac...",Macon Banquet,3720 Bloomfield Village Dr,Macon,GA 31206
144,0x87528bb7ab938cb3:0x5ee996e34cc50715,"Maverik Adventure's First Stop, 3330 S 500 W, ...",Maverik Adventure's First Stop,3330 S 500 W,South Salt Lake,UT 84115
198,0x54906bbaf4811877:0x8a101329509628b,"Barrels Experience Wine, 7593 SE 27th St #103,...",Barrels Experience Wine,7593 SE 27th St #103,Mercer Island,WA 98040


In [5]:
# Agregar una columna nueva llamada estados.
address['estado'] = None

# Listas de los 51 códigos postales y nombres de estados.
codigos_postales = [
    'CA', 'NY', 'IA', 'GA', 'FL', 'TX', 'LA', 'OR', 'WV', 'VA', 
    'AR', 'PA', 'NM', 'NC', 'TN', 'WI', 'NJ', 'IN', 'IL', 'DC', 
    'MD', 'ME', 'NE', 'WA', 'MI', 'OH', 'OK', 'MO', 'KS', 'UT', 
    'HI', 'NV', 'AZ', 'AL', 'CO', 'MA', 'ID', 'SC', 'RI', 'KY', 
    'AK', 'MT', 'MN', 'CT', 'MS', 'SD', 'WY', 'NH', 'DE', 'VT', 
    'ND'
]

nombres_estados = [
    'California', 'New York', 'Iowa', 'Georgia', 'Florida', 'Texas', 
    'Louisiana', 'Oregon', 'West Virginia', 'Virginia', 'Arkansas', 
    'Pennsylvania', 'New Mexico', 'North Carolina', 'Tennessee', 'Wisconsin', 
    'New Jersey', 'Indiana', 'Illinois', 'District of Columbia', 'Maryland', 
    'Maine', 'Nebraska', 'Washington', 'Michigan', 'Ohio', 'Oklahoma', 
    'Missouri', 'Kansas', 'Utah', 'Hawaii', 'Nevada', 'Arizona', 
    'Alabama', 'Colorado', 'Massachusetts', 'Idaho', 'South Carolina', 
    'Rhode Island', 'Kentucky', 'Alaska', 'Montana', 'Minnesota', 
    'Connecticut', 'Mississippi', 'South Dakota', 'Wyoming', 
    'New Hampshire', 'Delaware', 'Vermont', 'North Dakota'
]

In [6]:
# cod.postal contiene las letras del estado, podemos filtrar los estados rapidamente, a travez de un bucle entre codigo y nombres.
for codigo, estado in zip(codigos_postales, nombres_estados):
    address.loc[address['cod.postal'].str.contains(codigo, na=False), 'estado'] = estado

In [7]:
# El cod.postal puede ser nulo por falta de informacion ya que se corre la informacion hacia la izquierda.
# Para solucionar esto filtramos los cod.postales nulos que se corrieron hacia la izquierda
for codigo, estado in zip(codigos_postales, nombres_estados):
    address.loc[(address['cod.postal'].isnull()) & (address['ciudad'].str.contains(codigo, na=False)), 'estado'] = estado

In [8]:
# Ahora que estan corregidos las mayorias de los cod.postal algunos datos se cargaron al reves, conteniendo el cod en nombre y el nombre en cod.postal.
for estado in nombres_estados:
    address.loc[(address['estado'].isnull()) & (address['nombre'].str.contains(estado, na=False)), 'estado'] = estado

In [9]:
# Eliminar filas donde al menos 5 columnas son nulas
columnas = ['gmap_id', 'nombre', 'direccion', 'ciudad', 'cod.postal', 'estado']
address = address.dropna(thresh=len(columnas) - 1)

In [11]:
# Ultimas condicionales: el cod.postal no es nulo, pero su codigo aun se encuentra en la ciudad.
for codigo, estado in zip(codigos_postales, nombres_estados):
    address.loc[(address['cod.postal'].notnull()) & (address['ciudad'].str.contains(codigo, na=False)), 'estado'] = estado

# El cod.postal no es nulo, pero esta escrito el estado en la ciudad.  
for codigo, estado in zip(codigos_postales, nombres_estados):
    address.loc[(address['cod.postal'].notnull()) & (address['ciudad'].str.contains(estado, na=False)), 'estado'] = estado
    
# El cod.postal contiene escrito el estado y no un codigo.  
for codigo, estado in zip(codigos_postales, nombres_estados):
    address.loc[(address['cod.postal'].notnull()) & (address['cod.postal'].str.contains(estado, na=False)), 'estado'] = estado

In [12]:
# Terminada la normalizacion solo quedaron muy pocas filas donde no se puede saber su estado. Se eliminaran.
address = address.dropna(subset=['estado'])
address = address.drop(columns=['nombre'])

# Agregamos un identificador unico a cada estado para que se conecte con la tabla principal g_sitios.
address['id_estado'] = address['estado'].factorize()[0] + 1
address['id_estado'] = address['id_estado'].astype('int32')

In [13]:
# Mezclamos la informacion con la tabla principal.
g_sitios = g_sitios.merge(address[['gmap_id', 'id_estado']], on='gmap_id', how='left')
g_sitios = g_sitios.drop(columns=['address'])
g_sitios = g_sitios.drop(columns=['state']) # Informacion estatica. No sirve

# Borrar filas con id_estado nulos, ya que no van a tener informacion para conectarse con reviews sin un estado.
g_sitios = g_sitios.dropna(subset=['id_estado'])
g_sitios['id_estado'] = g_sitios['id_estado'].astype('int32') # tipo de dato entero.

In [14]:
# Optimizando tipo de datos.
g_sitios['latitude'] = g_sitios['latitude'].astype('float32')
g_sitios['longitude'] = g_sitios['longitude'].astype('float32')
#  Extraemos las columnas latitud y longitud de g_sitios.
lat_long = g_sitios[['gmap_id', 'latitude', 'longitude']]

In [15]:
# Merge de lat_long a address usando id_negocio como clave.
address = address.merge(lat_long[['latitude','longitude','gmap_id']], on='gmap_id', how='left')
# Eliminar las columnas latitud y longitud de g_sitios
g_sitios = g_sitios.drop(columns=['latitude', 'longitude'])

In [20]:
# Contamos la cantidad de restaurantes y hoteles que contienen los estados, para elegir un Top 5 de mayor cantidad de negocios 
conteo_negocios_por_estado = g_sitios.groupby('id_estado').size().reset_index(name='cantidad_negocios')
conteo_negocios_por_estado.sort_values(by='cantidad_negocios', ascending=False).head(5)

Unnamed: 0,id_estado,cantidad_negocios
8,9,35389
23,24,24462
17,18,24239
6,7,17266
12,13,10885


In [24]:
# Filtrar las filas donde 'id_estado' sea 24 o 9
filtered_address = address[address['id_estado'].isin([9,24,18,7,13])]

# Mostrar solo las columnas 'estado' e 'id_estado'
filtered_address = filtered_address[['estado', 'id_estado']]
filtered_address.value_counts()

estado        id_estado
California    9            35389
New York      24           24462
Texas         18           24239
Florida       7            17266
Pennsylvania  13           10885
Name: count, dtype: int64

In [None]:
google_estados = [9,24,18,7,13]
g_sitios = g_sitios[g_sitios['id_estado'].isin(google_estados)]
g_sitios = g_sitios.drop(columns=['price','description'])
g_sitios.reset_index()
g_sitios.info()

<class 'pandas.core.frame.DataFrame'>
Index: 112241 entries, 6 to 277431
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   name                 112241 non-null  object 
 1   gmap_id              112241 non-null  object 
 2   avg_rating           112241 non-null  float64
 3   num_of_reviews       112241 non-null  int64  
 4   url                  112241 non-null  object 
 5   monday               112241 non-null  object 
 6   tuesday              112241 non-null  object 
 7   wednesday            112241 non-null  object 
 8   thursday             112241 non-null  object 
 9   friday               112241 non-null  object 
 10  saturday             112241 non-null  object 
 11  sunday               112241 non-null  object 
 12  category_general_id  112241 non-null  int64  
 13  id_estado            112241 non-null  int32  
dtypes: float64(1), int32(1), int64(2), object(10)
memory usage: 12.4+ MB


In [47]:
address = pd.read_parquet(r'C:\Users\mauri\OneDrive\Escritorio\proyectogrupal\data\v0_address.parquet', engine='pyarrow')

In [56]:
address['id_estado'] = address['id_estado'].astype(int)
# Terminado la informacion. guardamos el dataframe en parquet.
os.makedirs('../../../data', exist_ok=True)
address.to_parquet('../../../data/v0_address.parquet', index=False)

# Muestra parcial de la tabla:
address.head()

Unnamed: 0,gmap_id,address,direccion,ciudad,cod.postal,estado,latitude,longitude,id_estado
0,0x88dae191ee505917:0x6ba3e25388d3fad4,"Oneyda's Bakery, 600 Goodlette-Frank Rd #101, ...",600 Goodlette-Frank Rd #101,Naples,FL 34102,Florida,26.154755,-81.790527,7
1,0x808f879f35b5088b:0xe3541cec7a95bd88,"TACOS LA CABANA, 2015 22nd Ave, Oakland, CA 94606",2015 22nd Ave,Oakland,CA 94606,California,37.789074,-122.233887,9
2,0x88d9b99475d9fd7b:0xea6083d207b2471a,"Tropical Park Liquors, 7971 SW 40th St Suite #...",7971 SW 40th St Suite #22,Miami,FL 33155,Florida,25.733915,-80.325218,7
3,0x89c88de475520cc7:0xeff46469445b5212,"The Nutrition Group, 5 Interchange Pl, York, P...",5 Interchange Pl,York,PA 17406,Pennsylvania,40.018829,-76.739456,13
4,0x864e9891e381f3df:0x4cefe6219bc9199c,"Top Cat Seafood Restaurant, 3117 Martin Luther...",3117 Martin Luther King Jr Blvd,Dallas,TX 75215,Texas,32.773129,-96.764481,18


In [None]:
# Muestra parcial de la tabla:
address.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112241 entries, 0 to 112240
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   gmap_id     112241 non-null  object 
 1   address     112241 non-null  object 
 2   direccion   112241 non-null  object 
 3   ciudad      112239 non-null  object 
 4   cod.postal  110972 non-null  object 
 5   estado      112241 non-null  object 
 6   latitude    112241 non-null  float32
 7   longitude   112241 non-null  float32
 8   id_estado   112241 non-null  int64  
dtypes: float32(2), int64(1), object(6)
memory usage: 6.9+ MB


In [45]:
# Terminado la informacion. guardamos el dataframe principal actualizado en parquet.
os.makedirs('../../../data', exist_ok=True)
g_sitios.to_parquet('../../../data/v1.g_sitios.parquet', index=False)

# Muestra parcial de la tabla:
g_sitios.head()

Unnamed: 0,name,gmap_id,avg_rating,num_of_reviews,url,monday,tuesday,wednesday,thursday,friday,saturday,sunday,category_general_id,id_estado
6,Oneyda's Bakery,0x88dae191ee505917:0x6ba3e25388d3fad4,4.6,19,https://www.google.com/maps/place//data=!4m2!3...,8AM–6PM,8AM–6PM,8AM–6PM,8AM–6PM,8AM–6PM,8AM–6PM,Closed,4,7
8,TACOS LA CABANA,0x808f879f35b5088b:0xe3541cec7a95bd88,5.0,2,https://www.google.com/maps/place//data=!4m2!3...,5–11PM,Closed,Closed,Closed,5–11PM,5–11PM,5–11PM,3,9
9,Tropical Park Liquors,0x88d9b99475d9fd7b:0xea6083d207b2471a,4.7,8,https://www.google.com/maps/place//data=!4m2!3...,10AM–10PM,10AM–10PM,10AM–12AM,10AM–12AM,10AM–12AM,10AM–12AM,10AM–10PM,4,7
14,The Nutrition Group,0x89c88de475520cc7:0xeff46469445b5212,3.2,17,https://www.google.com/maps/place//data=!4m2!3...,9AM–8PM,9AM–8PM,9AM–8PM,9AM–8PM,9AM–8PM,9AM–5PM,Closed,4,13
21,Top Cat Seafood Restaurant,0x864e9891e381f3df:0x4cefe6219bc9199c,3.9,8,https://www.google.com/maps/place//data=!4m2!3...,12–8PM,12–8PM,12–8PM,12–8PM,12–8PM,12–8PM,12–8PM,3,18


In [46]:
# Muestra parcial de la tabla:
g_sitios.info()

<class 'pandas.core.frame.DataFrame'>
Index: 112241 entries, 6 to 277431
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   name                 112241 non-null  object 
 1   gmap_id              112241 non-null  object 
 2   avg_rating           112241 non-null  float64
 3   num_of_reviews       112241 non-null  int64  
 4   url                  112241 non-null  object 
 5   monday               112241 non-null  object 
 6   tuesday              112241 non-null  object 
 7   wednesday            112241 non-null  object 
 8   thursday             112241 non-null  object 
 9   friday               112241 non-null  object 
 10  saturday             112241 non-null  object 
 11  sunday               112241 non-null  object 
 12  category_general_id  112241 non-null  int64  
 13  id_estado            112241 non-null  int32  
dtypes: float64(1), int32(1), int64(2), object(10)
memory usage: 12.4+ MB


Importare los dataset para cambiar el nombre de las columnas llamadas "gmap_id" por "id_negocio".

In [3]:
import pandas as pd
g_sitios = pd.read_parquet(r"C:\Users\mauri\OneDrive\Escritorio\proyectogrupal\data\v1.g_sitios.parquet", engine='pyarrow')
g_sitios

Unnamed: 0,name,gmap_id,avg_rating,num_of_reviews,url,monday,tuesday,wednesday,thursday,friday,saturday,sunday,category_general_id,id_estado
0,Oneyda's Bakery,0x88dae191ee505917:0x6ba3e25388d3fad4,4.6,19,https://www.google.com/maps/place//data=!4m2!3...,8AM–6PM,8AM–6PM,8AM–6PM,8AM–6PM,8AM–6PM,8AM–6PM,Closed,4,7
1,TACOS LA CABANA,0x808f879f35b5088b:0xe3541cec7a95bd88,5.0,2,https://www.google.com/maps/place//data=!4m2!3...,5–11PM,Closed,Closed,Closed,5–11PM,5–11PM,5–11PM,3,9
2,Tropical Park Liquors,0x88d9b99475d9fd7b:0xea6083d207b2471a,4.7,8,https://www.google.com/maps/place//data=!4m2!3...,10AM–10PM,10AM–10PM,10AM–12AM,10AM–12AM,10AM–12AM,10AM–12AM,10AM–10PM,4,7
3,The Nutrition Group,0x89c88de475520cc7:0xeff46469445b5212,3.2,17,https://www.google.com/maps/place//data=!4m2!3...,9AM–8PM,9AM–8PM,9AM–8PM,9AM–8PM,9AM–8PM,9AM–5PM,Closed,4,13
4,Top Cat Seafood Restaurant,0x864e9891e381f3df:0x4cefe6219bc9199c,3.9,8,https://www.google.com/maps/place//data=!4m2!3...,12–8PM,12–8PM,12–8PM,12–8PM,12–8PM,12–8PM,12–8PM,3,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112236,Subpreme Grill,0x88d9dd6e78851851:0x849dcf273582261c,4.3,393,https://www.google.com/maps/place//data=!4m2!3...,10AM–8PM,10AM–8PM,10AM–8PM,10AM–8PM,10AM–8PM,10AM–8PM,10AM–7PM,4,7
112237,Dunkin',0x89c6c4072624d2a9:0x4437b2d55e9a5811,4.0,348,https://www.google.com/maps/place//data=!4m2!3...,5:30AM–7:30PM,5:30AM–7:30PM,5:30AM–7:30PM,5:30AM–7:30PM,5:30AM–7:30PM,5:30AM–7:30PM,5:30AM–7PM,4,13
112238,Reuben's Marc,0x89c6b993431d9a19:0xea378de3c317996e,4.0,223,https://www.google.com/maps/place//data=!4m2!3...,4PM–12AM,4PM–12AM,4PM–12AM,4PM–2AM,4PM–2AM,4PM–2AM,Closed,3,13
112239,Palm Gardens,0x89da6b8177ee49d5:0xbc9dfc53722fe1,4.0,128,https://www.google.com/maps/place//data=!4m2!3...,8AM–2AM,8AM–2AM,8AM–2AM,8AM–2AM,8AM–2AM,8AM–2AM,8AM–2AM,3,24


In [None]:
address = pd.read_parquet(r"C:\Users\mauri\OneDrive\Escritorio\proyectogrupal\data\v0_address.parquet", engine='pyarrow')
address

In [6]:
address = address.rename(columns={'gmap_id': 'id_negocio'})
g_sitios = g_sitios.rename(columns={'gmap_id': 'id_negocio'})

In [7]:
address.to_parquet("v0_address.parquet")
g_sitios.to_parquet("v1_g_sitios.parquet")

In [9]:
address = address.rename(columns={'cod.postal': 'cod_postal'})
address.to_parquet("v0_address.parquet")


In [3]:
import pandas as pd
address = pd.read_parquet(r"C:\Users\mauri\OneDrive\Escritorio\proyectogrupal\data\v0_address.parquet", engine='pyarrow')
address.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112241 entries, 0 to 112240
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id_negocio  112241 non-null  object 
 1   address     112241 non-null  object 
 2   direccion   112241 non-null  object 
 3   ciudad      112239 non-null  object 
 4   cod_postal  110972 non-null  object 
 5   estado      112241 non-null  object 
 6   latitude    112241 non-null  float32
 7   longitude   112241 non-null  float32
 8   id_estado   112241 non-null  int64  
dtypes: float32(2), int64(1), object(6)
memory usage: 6.9+ MB


In [10]:
# Supongamos que tienes un DataFrame llamado df
# Supongamos que tu DataFrame se llama df

# Localiza las filas donde el 'Address' sea igual a 'Bubba\'s Texas Cookin\', Montana 59912'
address.loc[address['address'] == "Bubba's Texas Cookin', Montana 59912", 'ciudad'] = 'Columbia Falls'
address.loc[address['address'] == "Bubba's Texas Cookin', Montana 59912", 'cod_postal'] = 'MT 59912'

address.loc[address['address'] == "New York, 美国明记小炒邮政编码: 10002", 'address'] = '237 Grand St'
address.loc[address['id_negocio'] == "0x89c259e473970ed3:0xa76c925ac50da6c6", 'ciudad'] = 'New York'
address.loc[address['id_negocio'] == "0x89c259e473970ed3:0xa76c925ac50da6c6", 'cod_postal'] = 'NY 10002'


address['cod_postal'] = address['cod_postal'].fillna(address['ciudad'])


In [15]:
address.to_parquet("v1_address.parquet")
