# `MenuItem.csv` and `Dish.csv` __Data Cleaning__

In [1]:
import pandas as pd

In [2]:
# Load the raw menu-item table from the working directory.
df = pd.read_csv(filepath_or_buffer="./MenuItem.csv")

## __Step 1__

- Missing Value Correction

In [3]:
# Build the cleaned frame WITHOUT touching the raw `df`.
# A plain `df_cleaned = df` would only alias the raw frame, and an in-place
# `replace` would then silently rewrite `df` as well. `replace` without
# `inplace` returns a new DataFrame, so the raw data stays intact and this
# cell gives the same result every time it is re-run.
df_cleaned = (
    df
    # Treat empty strings as missing values
    .replace("", pd.NA)
    # Remove the rows with missing values in the "price" column
    .dropna(subset=["price"])
)

In [4]:
# Persist the Step 1 result (rows without a price removed); the row index is
# not written to the file.
df_cleaned.to_csv(path_or_buf="./CleanedMenuItemStep1.csv", index=False)

## __Step 2__

- Deduplication


In [5]:
# Reload the Step 1 output and read the dish table used for deduplication.
df = pd.read_csv(filepath_or_buffer="./CleanedMenuItemStep1.csv")
dish_df = pd.read_csv(filepath_or_buffer="./Dish.csv")

In [6]:
def standardize_name(name):
    """Return `name` with normalized whitespace and per-word capitalization.

    The value is split on runs of whitespace, each word is passed through
    `str.capitalize` (first character upper-cased, the rest lower-cased), and
    the words are re-joined with single spaces. Leading and trailing
    whitespace is therefore dropped.

    Missing values (anything `pd.isna` reports as missing, e.g. NaN or None)
    are returned unchanged.
    """
    if not pd.isna(name):
        words = name.split()
        capitalized = [word.capitalize() for word in words]
        return ' '.join(capitalized)
    return name

In [7]:
# Normalize every dish name element-wise so that names differing only in
# letter case or spacing compare equal in the duplicate search.
dish_df['name'] = dish_df['name'].map(standardize_name)

In [8]:
# Select every dish whose name occurs more than once; keep=False flags all
# rows of a repeated name, not only the second and later occurrences.
duplicates = dish_df.loc[dish_df.duplicated(subset=['name'], keep=False)]

In [9]:
# For each repeated name, collect the `id` values of the rows that share it
# into one list; the result has one row per name with columns `name` and `id`.
duplicate_groups = (
    duplicates
    .groupby('name')['id']
    .apply(list)
    .reset_index()
)

In [10]:
# Dictionary to map old IDs to new IDs.
# Each entry of `duplicate_groups['id']` is the list of IDs sharing one name:
# the first ID in the list is kept, and every later ID is redirected to it.
# Iterating the column directly avoids building a full row object per group
# (as `iterrows()` does) and keeps loop variables out of the notebook namespace.
id_mapping = {
    duplicate_id: ids[0]
    for ids in duplicate_groups['id']
    for duplicate_id in ids[1:]
}

In [11]:
# Redirect menu items that reference a duplicate dish ID to the ID kept for
# that dish; values absent from `id_mapping` are left as they are.
df = df.assign(dish_id=df['dish_id'].replace(to_replace=id_mapping))

In [12]:
# Persist the Step 2 result (dish IDs remapped); the row index is not written.
df.to_csv(path_or_buf="./CleanedMenuItemStep2.csv", index=False)