In [1]:
import pandas as pd

# Leer CSV ignorando líneas problemáticas
df = pd.read_csv(
    r"C:\Users\beggy\OneDrive\Desktop\Nueva carpeta\picon_granada.csv",
    on_bad_lines='skip',   # salta las líneas que dan error
    dtype=str               # leemos todo como texto primero
)

print("Dimensiones después de saltar líneas problemáticas:", df.shape)
print(df.head())


Dimensiones después de saltar líneas problemáticas: (4976, 19)
                     createdOn changeOperation       market     city  \
0  2022-10-27 23:07:19.0000000          Create  Houston, TX  Houston   
1  2022-10-27 23:07:26.0000000          Delete  Houston, TX  Houston   
2  2022-10-27 23:07:26.0000000          Delete  Houston, TX  Houston   
3  2022-10-27 23:07:33.0000000          Delete  Houston, TX  Houston   
4  2022-10-27 23:07:33.0000000          Delete  Houston, TX  Houston   

                  menuItemName  \
0   1% Low Fat Milk (110 Cals)   
1  10-Piece Boneless Wing Meal   
2   8-Piece Boneless Wing Meal   
3                   LUNCH DUET   
4                 FILET MIGNON   

                                 menuItemDescription menuItemCurrentPrice  \
0  As delicious as our sandwiches are, they are e...                $2.19   
1                     Served with fries and a drink.               $14.19   
2                     Served with fries and a drink.               $

In [2]:
# Ver los valores que no se pueden convertir a float
mask = pd.to_numeric(df['menuItemAverageRating'], errors='coerce').isna()
valores_no_numericos = df.loc[mask, 'menuItemAverageRating']
print(valores_no_numericos)


3884    New American Pizzeria
Name: menuItemAverageRating, dtype: object


In [3]:
# Limpiar y convertir precios a float
df['menuItemCurrentPrice'] = df['menuItemCurrentPrice'].replace('[\$,]', '', regex=True).astype(float)
df['menuItemPreviousPrice'] = df['menuItemPreviousPrice'].replace('[\$,]', '', regex=True).astype(float)


In [4]:
df[df['menuItemAverageRating'].str.contains('[A-Za-z]', na=False)]


Unnamed: 0,createdOn,changeOperation,market,city,menuItemName,menuItemDescription,menuItemCurrentPrice,menuItemPreviousPrice,menuItemImageUrl,menuItemCategory,menuItemAverageRating,menuItemRatingCount,restaurantName,restaurantDescription,restaurantAddress,restaurantImageUrl,restaurantPriceRange,restaurantLatitude,restaurantLongitude
3884,2022-11-07 21:15:11.0000000,Create,"Houston, TX",Houston,Cheesus Christ,"mozzarella, taleggio, parmigiano reggiano, bla...",22.0,,https://img.cdn4dd.com/cdn-cgi/image/fit=conta...,"Pizzas 12\"",4.73529386520386,34,Roberta’s""",New American Pizzeria,"401 Franklin St, Houston, TX 77201, USA",https://img.cdn4dd.com/cdn-cgi/image/fit=conta...,$$,29.7662149,-95.3648839,,,


In [5]:
# Convertir precios a float eliminando símbolos $
df['menuItemCurrentPrice'] = df['menuItemCurrentPrice'].replace('[\$,]', '', regex=True).astype(float)
df['menuItemPreviousPrice'] = df['menuItemPreviousPrice'].replace('[\$,]', '', regex=True)

# Forzar PreviousPrice a float y rellenar NaN con CurrentPrice
df['menuItemPreviousPrice'] = pd.to_numeric(df['menuItemPreviousPrice'], errors='coerce')
df['menuItemPreviousPrice'].fillna(df['menuItemCurrentPrice'], inplace=True)

# Limpiar ratings y conteos
df['menuItemAverageRating'] = pd.to_numeric(df['menuItemAverageRating'], errors='coerce')
df['menuItemRatingCount'] = pd.to_numeric(df['menuItemRatingCount'], errors='coerce')

# Eliminar filas donde AverageRating sea NaN (eran filas corruptas)
df = df.dropna(subset=['menuItemAverageRating'])


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['menuItemPreviousPrice'].fillna(df['menuItemCurrentPrice'], inplace=True)


In [6]:
df[numeric_cols].describe()


NameError: name 'numeric_cols' is not defined

In [None]:
# Columnas numéricas a analizar
numeric_cols = ['menuItemCurrentPrice', 'menuItemPreviousPrice', 'menuItemAverageRating', 'menuItemRatingCount']

# Crear un diccionario para guardar los outliers
outliers = {}

for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Filtrar outliers
    outliers[col] = df[(df[col] < lower_bound) | (df[col] > upper_bound)][col]
    
    print(f"Columna: {col}, Outliers: {len(outliers[col])}")


Columna: menuItemCurrentPrice, Outliers: 210
Columna: menuItemPreviousPrice, Outliers: 228
Columna: menuItemAverageRating, Outliers: 244
Columna: menuItemRatingCount, Outliers: 209


In [None]:
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)


In [None]:
for col in numeric_cols:
    print(col, df[col].unique()[:10])


menuItemCurrentPrice [ 2.19 14.19 12.99 24.1  24.4  22.   19.   18.4  21.2  21.3 ]
menuItemPreviousPrice [ 2.19 14.19 12.99 24.1  24.4  22.   19.   18.4  21.2  21.3 ]
menuItemAverageRating [4.57420492 4.42056799 4.35029411 4.66587877 4.68148088 4.65513897
 4.3526969  4.83101177 4.31238794 4.72851801]
menuItemRatingCount [ 566. 6194. 2378. 8857.  405. 1041.  482. 1651.  557.  803.]
restaurantLatitude ['29.7466861' '29.7000121' '29.7560868' '29.7451531' '29.7701293'
 '29.7518483' '29.7426555' '29.7819194' '29.7382287' '29.7709432']
restaurantLongitude ['-95.359915' '-95.3600172' '-95.3697436' '-95.3776337' '-95.3817043'
 '-95.3769374' '-95.4034414' '-95.3902331' '-95.408481' '-95.3720701']


In [None]:
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')


In [None]:
for col in numeric_cols:
    df[col].fillna(df[col].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


In [None]:
# Reemplazar valores perdidos
df['menuItemDescription'].fillna("Sin descripción", inplace=True)
df['menuItemImageUrl'].fillna("https://via.placeholder.com/150", inplace=True)
df['restaurantImageUrl'].fillna("https://via.placeholder.com/150", inplace=True)
df['restaurantPriceRange'].fillna("Desconocido", inplace=True)

# Para columnas numéricas, si hubiera algún NaN residual
numeric_cols = ['menuItemCurrentPrice', 'menuItemPreviousPrice', 
                'menuItemAverageRating', 'menuItemRatingCount',
                'restaurantLatitude', 'restaurantLongitude']

for col in numeric_cols:
    df[col].fillna(df[col].mean(), inplace=True)

# Confirmar que no hay valores perdidos
print(df.isnull().mean() * 100)


createdOn                0.0
changeOperation          0.0
market                   0.0
city                     0.0
menuItemName             0.0
menuItemDescription      0.0
menuItemCurrentPrice     0.0
menuItemPreviousPrice    0.0
menuItemImageUrl         0.0
menuItemCategory         0.0
menuItemAverageRating    0.0
menuItemRatingCount      0.0
restaurantName           0.0
restaurantDescription    0.0
restaurantAddress        0.0
restaurantImageUrl       0.0
restaurantPriceRange     0.0
restaurantLatitude       0.0
restaurantLongitude      0.0
dtype: float64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['menuItemDescription'].fillna("Sin descripción", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['menuItemImageUrl'].fillna("https://via.placeholder.com/150", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the

In [None]:
# Porcentaje de valores perdidos por columna
missing_percent = df.isna().mean() * 100
print(missing_percent.sort_values(ascending=False))


createdOn                0.0
changeOperation          0.0
market                   0.0
city                     0.0
menuItemName             0.0
menuItemDescription      0.0
menuItemCurrentPrice     0.0
menuItemPreviousPrice    0.0
menuItemImageUrl         0.0
menuItemCategory         0.0
menuItemAverageRating    0.0
menuItemRatingCount      0.0
restaurantName           0.0
restaurantDescription    0.0
restaurantAddress        0.0
restaurantImageUrl       0.0
restaurantPriceRange     0.0
restaurantLatitude       0.0
restaurantLongitude      0.0
dtype: float64


In [None]:
df.shape

(4975, 19)