# 🔎 Análisis Exploratorio de Datos (Yelp):

## Importación de librerias

In [26]:
import datetime
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from data_utils import data_type_check
import json

### Importación de los datasets

In [46]:
df_business = pd.read_parquet('../0_Dataset/Data_Limpia/Yelp/business.parquet',engine='pyarrow')
df_checkin = pd.read_parquet('../0_Dataset/Data_Limpia/Yelp/checkin_reducido.parquet',engine='pyarrow')
df_review = pd.read_parquet('../0_Dataset/Data_Limpia/Yelp/review_FL_reducido.parquet',engine='pyarrow')
df_tip = pd.read_parquet('../0_Dataset/Data_Limpia/Yelp/tip.parquet',engine='pyarrow')
df_user = pd.read_parquet('../0_Dataset/Data_Limpia/Yelp/user_reducido.parquet',engine='pyarrow')

df_business

df_checkin

df_review

df_tip

df_user

### Funciones importantes

In [14]:
# Resumen de datos
def data_summary(df):
    print("----- Shape of the data -----")
    print(df.shape)
    print("\n----- Data Types -----")
    print(df.dtypes)
    print("\n----- Missing Values -----")
    print(df.isnull().sum())
    print("\n----- Summary Statistics -----")
    print(df.describe())

In [15]:
# Distribución de variables numéricas
def plot_numeric_distributions(df):
    numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
    df[numeric_columns].hist(bins=15, figsize=(15, 6), layout=(2, len(numeric_columns)//2 + 1))
    plt.tight_layout()
    plt.show()

In [16]:
# Distribución de variables categóricas
def plot_categorical_distributions(df):
    categorical_columns = df.select_dtypes(include=['object']).columns
    for column in categorical_columns:
        plt.figure(figsize=(10, 4))
        sns.countplot(y=column, data=df)
        plt.title(f'Distribution of {column}')
        plt.show()

In [17]:
# Matriz de correlación
def plot_correlation_matrix(df):
    plt.figure(figsize=(10, 8))
    correlation_matrix = df.corr()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Matrix')
    plt.show()

In [18]:
# Diagramas de dispersión para variables numéricas
def plot_scatter_matrix(df):
    numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
    sns.pairplot(df[numeric_columns])
    plt.show()

In [19]:
# Diagramas de caja para identificar valores atípicos
def plot_boxplots(df):
    numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
    for column in numeric_columns:
        plt.figure(figsize=(10, 4))
        sns.boxplot(x=column, data=df)
        plt.title(f'Boxplot of {column}')
        plt.show()

In [20]:
# FUNCION PARA ENCONTRAR EL RANGO MINIMO Y MAXIMO DEL IQR
def calculate_iqr_outliers(series):
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return float(lower_bound), float(upper_bound)

## EDA

Verificación de los datos

In [12]:
print(data_type_check(df_business))
print(data_type_check(df_checkin))
print(data_type_check(df_review))
print(data_type_check(df_tip))
print(data_type_check(df_user))


 Resumen del dataframe:

Dimensiones:  (668575, 15)
         columna  %_no_nulos  %_nulos  total_nulos tipo_dato
0    business_id       100.0      0.0            0    object
1           name       100.0      0.0            0    object
2        address       100.0      0.0            0    object
3           city       100.0      0.0            0    object
4          state       100.0      0.0            0    object
5    postal_code       100.0      0.0            0    object
6       latitude       100.0      0.0            0   float64
7      longitude       100.0      0.0            0   float64
8          stars       100.0      0.0            0   float64
9   review_count       100.0      0.0            0     int64
10       is_open       100.0      0.0            0     int64
11    attributes       100.0      0.0            0    object
12    categories       100.0      0.0            0    object
13         hours       100.0      0.0            0    object
14  parsed_hours       100.0    

In [30]:
# Tu JSON de ejemplo
data_json = '''
{
    "business_id": "tnhfDv5Il8EaGSXZGiuQGg",
    "name": "Garaje",
    "address": "475 3rd St",
    "city": "San Francisco",
    "state": "CA",
    "postal code": "94107",
    "latitude": 37.7817529521,
    "longitude": -122.39612197,
    "stars": 4.5,
    "review_count": 1198,
    "is_open": 1,
    "attributes": {
        "RestaurantsTakeOut": true,
        "BusinessParking": {
            "garage": false,
            "street": true,
            "validated": false,
            "lot": false,
            "valet": false
        }
    },
    "categories": [
        "Mexican",
        "Burgers",
        "Gastropubs"
    ],
    "hours": {
        "Monday": "10:00-21:00",
        "Tuesday": "10:00-21:00",
        "Friday": "10:00-21:00",
        "Wednesday": "10:00-21:00",
        "Thursday": "10:00-21:00",
        "Sunday": "11:00-18:00",
        "Saturday": "10:00-21:00"
    }
}
'''

# Cargar JSON
data = json.loads(data_json)

# Convertir a DataFrame
df = pd.json_normalize(df_business)

# Mostrar el DataFrame
print(df)


Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]


### Business

In [67]:
df_business

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,parsed_hours
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,1,"{""AcceptsInsurance"": null, ""AgesAllowed"": null...",Restaurants,"{""Friday"": ""7:0-21:0"", ""Monday"": ""7:0-20:0"", ""...","{'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '..."
1,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,1,"{""AcceptsInsurance"": null, ""AgesAllowed"": null...",Food,"{""Friday"": ""7:0-21:0"", ""Monday"": ""7:0-20:0"", ""...","{'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '..."
2,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,1,"{""AcceptsInsurance"": null, ""AgesAllowed"": null...",Bubble Tea,"{""Friday"": ""7:0-21:0"", ""Monday"": ""7:0-20:0"", ""...","{'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,1,"{""AcceptsInsurance"": null, ""AgesAllowed"": null...",Coffee & Tea,"{""Friday"": ""7:0-21:0"", ""Monday"": ""7:0-20:0"", ""...","{'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '..."
4,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,1,"{""AcceptsInsurance"": null, ""AgesAllowed"": null...",Bakeries,"{""Friday"": ""7:0-21:0"", ""Monday"": ""7:0-20:0"", ""...","{'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
668570,mtGm22y5c2UHNXDFAjaPNw,Cyclery & Fitness Center,2472 Troy Rd,Edwardsville,AB,62025,38.782351,-89.950558,4.0,24,1,"{""AcceptsInsurance"": null, ""AgesAllowed"": null...",Bikes,"{""Friday"": ""9:0-20:0"", ""Monday"": ""9:0-20:0"", ""...","{'Friday': '9:0-20:0', 'Monday': '9:0-20:0', '..."
668571,jV_XOycEzSlTx-65W906pg,Sic Ink,238 Apollo Beach Blvd,Apollo beach,TN,33572,27.771002,-82.394910,4.5,9,1,"{""AcceptsInsurance"": null, ""AgesAllowed"": null...",Beauty & Spas,"{""Friday"": ""12:0-19:0"", ""Monday"": null, ""Satur...","{'Friday': '12:0-19:0', 'Monday': None, 'Satur..."
668572,jV_XOycEzSlTx-65W906pg,Sic Ink,238 Apollo Beach Blvd,Apollo beach,TN,33572,27.771002,-82.394910,4.5,9,1,"{""AcceptsInsurance"": null, ""AgesAllowed"": null...",Permanent Makeup,"{""Friday"": ""12:0-19:0"", ""Monday"": null, ""Satur...","{'Friday': '12:0-19:0', 'Monday': None, 'Satur..."
668573,jV_XOycEzSlTx-65W906pg,Sic Ink,238 Apollo Beach Blvd,Apollo beach,TN,33572,27.771002,-82.394910,4.5,9,1,"{""AcceptsInsurance"": null, ""AgesAllowed"": null...",Piercing,"{""Friday"": ""12:0-19:0"", ""Monday"": null, ""Satur...","{'Friday': '12:0-19:0', 'Monday': None, 'Satur..."


In [47]:
data_summary(df_business)

----- Shape of the data -----
(668575, 15)

----- Data Types -----
business_id      object
name             object
address          object
city             object
state            object
postal_code      object
latitude        float64
longitude       float64
stars           float64
review_count      int64
is_open           int64
attributes       object
categories       object
hours            object
parsed_hours     object
dtype: object

----- Missing Values -----
business_id     0
name            0
address         0
city            0
state           0
postal_code     0
latitude        0
longitude       0
stars           0
review_count    0
is_open         0
attributes      0
categories      0
hours           0
parsed_hours    0
dtype: int64

----- Summary Statistics -----
            latitude      longitude          stars   review_count  \
count  668575.000000  668575.000000  668575.000000  668575.000000   
mean       36.573029     -89.386457       3.643918      52.572717   
std      

In [44]:
df_business.head(5)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories,parsed_hours
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,1,Restaurants,"{'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '..."
1,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,1,Food,"{'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '..."
2,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,1,Bubble Tea,"{'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,1,Coffee & Tea,"{'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '..."
4,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,1,Bakeries,"{'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '..."


In [53]:
print(type(df_business.attributes[0]))
print(df_business.attributes[0])

<class 'str'>
{"AcceptsInsurance": null, "AgesAllowed": null, "Alcohol": "u'none'", "Ambience": null, "BYOB": null, "BYOBCorkage": null, "BestNights": null, "BikeParking": "True", "BusinessAcceptsBitcoin": null, "BusinessAcceptsCreditCards": "False", "BusinessParking": "{'garage': False, 'street': True, 'validated': False, 'lot': False, 'valet': False}", "ByAppointmentOnly": "False", "Caters": "True", "CoatCheck": null, "Corkage": null, "DietaryRestrictions": null, "DogsAllowed": null, "DriveThru": null, "GoodForDancing": null, "GoodForKids": null, "GoodForMeal": null, "HairSpecializesIn": null, "HappyHour": null, "HasTV": null, "Music": null, "NoiseLevel": null, "Open24Hours": null, "OutdoorSeating": "False", "RestaurantsAttire": null, "RestaurantsCounterService": null, "RestaurantsDelivery": "False", "RestaurantsGoodForGroups": null, "RestaurantsPriceRange2": "1", "RestaurantsReservations": null, "RestaurantsTableService": null, "RestaurantsTakeOut": "True", "Smoking": null, "Wheelch

In [57]:
# Verificar y limpiar los datos antes de aplicar json.loads
def safe_json_loads(s):
    try:
        return json.loads(s)
    except json.JSONDecodeError:
        return None

In [58]:
atributos = df_business['attributes'].apply(safe_json_loads)

In [63]:
horas = df_business['hours'].apply(safe_json_loads)

In [64]:
horas

0         {'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '...
1         {'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '...
2         {'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '...
3         {'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '...
4         {'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '...
                                ...                        
668570    {'Friday': '9:0-20:0', 'Monday': '9:0-20:0', '...
668571    {'Friday': '12:0-19:0', 'Monday': None, 'Satur...
668572    {'Friday': '12:0-19:0', 'Monday': None, 'Satur...
668573    {'Friday': '12:0-19:0', 'Monday': None, 'Satur...
668574    {'Friday': '12:0-19:0', 'Monday': None, 'Satur...
Name: hours, Length: 668575, dtype: object

In [65]:
horas_df = pd.json_normalize(horas)
horas_df.columns = ['hour_' + col for col in horas_df.columns]

In [66]:
horas_df

Unnamed: 0,hour_Friday,hour_Monday,hour_Saturday,hour_Sunday,hour_Thursday,hour_Tuesday,hour_Wednesday
0,7:0-21:0,7:0-20:0,7:0-21:0,7:0-21:0,7:0-20:0,7:0-20:0,7:0-20:0
1,7:0-21:0,7:0-20:0,7:0-21:0,7:0-21:0,7:0-20:0,7:0-20:0,7:0-20:0
2,7:0-21:0,7:0-20:0,7:0-21:0,7:0-21:0,7:0-20:0,7:0-20:0,7:0-20:0
3,7:0-21:0,7:0-20:0,7:0-21:0,7:0-21:0,7:0-20:0,7:0-20:0,7:0-20:0
4,7:0-21:0,7:0-20:0,7:0-21:0,7:0-21:0,7:0-20:0,7:0-20:0,7:0-20:0
...,...,...,...,...,...,...,...
668570,9:0-20:0,9:0-20:0,9:0-17:0,10:0-17:0,9:0-20:0,9:0-20:0,9:0-20:0
668571,12:0-19:0,,12:0-19:0,,12:0-19:0,12:0-19:0,12:0-19:0
668572,12:0-19:0,,12:0-19:0,,12:0-19:0,12:0-19:0,12:0-19:0
668573,12:0-19:0,,12:0-19:0,,12:0-19:0,12:0-19:0,12:0-19:0


In [68]:
data_type_check(horas_df)


 Resumen del dataframe:

Dimensiones:  (668575, 7)
          columna  %_no_nulos  %_nulos  total_nulos tipo_dato
0     hour_Friday       87.45    12.55        83929    object
1     hour_Monday       80.15    19.85       132710    object
2   hour_Saturday       77.75    22.25       148755    object
3     hour_Sunday       57.43    42.57       284635    object
4   hour_Thursday       87.57    12.43        83090    object
5    hour_Tuesday       84.12    15.88       106190    object
6  hour_Wednesday       86.46    13.54        90549    object


In [61]:
atributos
attributes_df = pd.json_normalize(atributos)
attributes_df.columns = ['attributes_' + col for col in attributes_df.columns]

In [70]:
attributes_df.attributes_WiFi.value_counts()

attributes_WiFi
u'free'    141061
u'no'       67433
'free'      35674
'no'        27233
u'paid'      2347
'paid'        565
None          263
Name: count, dtype: int64

In [48]:
# Normalizar la columna 'attributes'
attributes_df = pd.json_normalize(df_business['attributes'])
# Renombrar columnas para que no haya conflictos
attributes_df.columns = ['attributes_' + col for col in attributes_df.columns]
# Concatenar el DataFrame original con el DataFrame normalizado
# df_business = pd.concat([df_business.drop(columns=['attributes']), attributes_df], axis=1)

# Normalizar la columna 'hours'
# hours_df = pd.json_normalize(df_business['hours'])
# Renombrar columnas para que no haya conflictos
# hours_df.columns = ['hours_' + col for col in hours_df.columns]
# Concatenar el DataFrame original con el DataFrame normalizado
# df_business = pd.concat([df_business.drop(columns=['hours']), hours_df], axis=1)

In [49]:
attributes_df

0
1
2
3
4
...
668570
668571
668572
668573
668574


Verificación de los datasets

In [7]:
df_checkin

Unnamed: 0,business_id,date
40660,IjAjccegjWKQPu41iuMpZQ,"2015-08-19 22:01:18, 2015-08-26 20:35:30, 2015..."
43010,JrYWF3BWf1GYk8hnF2Cz4w,"2015-08-13 22:59:33, 2015-12-31 19:23:01, 2016..."
95361,iKjXWDjdv9N7fT6kgGKM6g,"2015-06-11 17:42:50, 2015-09-25 22:00:32, 2015..."
94163,hkuIjzyewr7RyIbKvjUQ5A,"2019-06-25 23:10:05, 2019-10-16 18:21:41, 2019..."
84056,crgS4ogZEt2rYGwYVfeK2A,"2016-09-02 22:34:41, 2016-09-05 21:20:35, 2016..."
...,...,...
17489,7Opw4n6-j_j3BF1gURUi2A,"2014-11-16 17:01:31, 2015-04-01 19:12:57, 2016..."
34682,FkMO7IzhoeY5SinJwaXtcg,"2014-07-09 08:23:15, 2014-07-27 00:49:27, 2014..."
32815,EoFqLW7OKcsZAJh_jJP4mA,"2011-03-06 03:25:31, 2011-10-08 03:22:42, 2011..."
25048,B0SPP0RsKkJxf6zK6mG9kw,"2012-06-25 16:34:42, 2013-01-20 00:11:55, 2013..."
