# `menu.csv` __Data Cleaning__

In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import numpy as np

In [4]:
df = pd.read_csv("./Menu.csv")

## __Step 1__

- Missing Value Correction
- Date Standardization, Date Outlier

### __Rationale:__

One might assume that missing values can be filled with derived data from foreign keys.

For example, if a date is missing for row X in the `Menu.csv` table but a `created_at`
or `updated_at` value exists for X.name -> menu_id -> menu_page_id, then the original
missing date can transitively become the existing `created_at` or `updated_at`. The same
example applies to the other two columns, `currency` and `location`.

The problem with the example above is that it presupposes the authenticity of the data
across multiple tables, which is dangerous without proper data validation methods.
Although a missing entry can have a derivable value based on data from a corresponding
data point tied to the missing entry's primary key, additional validation must occur
because the instance of missing data negatively impacts the trust in the data between
multiple tables for the same flawed primary key.

By inspection, there is a drastic reduction in rows after removing missing values;
however, deriving data across the tables cannot be validated for the scope of this
analysis. Regardless, there is ample data remaining to accomplish the target use case
set out in the initial proposal for an algorithmic pricing model based on price and
location over time.

In [5]:
df_cleaned = df

# Replace empty strings with NaN
df_cleaned.replace("", pd.NA, inplace=True)

# Remove the rows with missing values in the "date", "currency", and "location" columns
df_cleaned = df_cleaned.dropna(subset=["date"])
df_cleaned = df_cleaned.dropna(subset=["currency"])
df_cleaned = df_cleaned.dropna(subset=["location"])
display(df_cleaned)

Unnamed: 0,id,name,sponsor,event,venue,place,physical_description,occasion,notes,call_number,keywords,language,date,location,location_type,currency,currency_symbol,status,page_count,dish_count
6,12469,,HOTEL NETHERLAND,SUPPER,COMMERCIAL,"NEW YORK, [NY];",CARD; ILLUS; COL; 6.0X8.75;,,HOTEL CREST IN BLUE; PRICED MENU;,1900-2838,,,1900-04-16,Hotel Netherland,,Dollars,$,complete,2,144
11,12474,,ALPHA OF ZETA PSI,ANNUAL BANQUET,COMMERCIAL,"DELMONICO'S, [NEW YORK, NY];",BOOKLET; ILLUS; COL; 5.5X7.0;,,VELLUM COVER; CREST OF ZETA PSI; TIED WITH BLU...,1900-2844,,,1900-04-17,Alpha Of Zeta Psi,,Dollars,$,complete,12,38
12,12475,,MANHATTAN HOTEL,DINNER,COMMERCIAL,"NEW YORK, NY",CARD; ILLUS; 6X9.5;,,A LA CARTE DU JOUR; HOTEL SEAL AT TOP OF MENU;,1900-2847,,,1900-04-18,Manhattan Hotel,,Dollars,$,complete,2,176
20,12483,,MANHATTAN HOTEL,CARTE DU JOUR,COMMERCIAL,"NEW YORK, NY",CARD; ILLUS; 6X9.5;,,A LA CARTE MENU; HOTEL SEAL AT TOP OF MENU;,1900-2859,,,1900-04-18,Manhattan Hotel,,Dollars,$,complete,2,129
34,12498,,CAFE BOULEVARD,SUNDAY DINNER,COMMERCIAL,156 SECOND AVENUE (NY?),CARD; COL; ILLUS; 5 X 7;,,PRIX FIXE DINNER; FOOD APPEARS TO BE HUNGARIAN...,1900-2903,,,1900-04-22,Cafe Boulevard,,Cents,c,complete,2,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17537,35512,Whites,Whites,,,,,,4 images,1913-0743_wotm,,,1913-09-24,Whites,,Dollars,$,complete,4,297
17538,35513,Riggs',Riggs',,,,,,4 images,1913-0744_wotm,,,1913-09-24,Riggs',,Dollars,$,complete,4,321
17541,35516,Dennett's,Dennett's,,,,,,1 image,1913-0747_wotm,,,1913-09-24,Dennett's,,Dollars,$,complete,1,125
17542,35517,The Cortlandt,The Cortlandt,,,,,,1 image,1913-0748_wotm,,,1913-09-24,The Cortlandt,,Dollars,$,complete,1,101


In [6]:
# Standardize the entire date column to the ISO format

# Helper function to check and convert a date to ISO format
def to_iso_format(date_str):
    try:
        # Try to parse the date in various common formats
        date = pd.to_datetime(date_str, errors="raise")

        # Return the date in ISO format
        return date.strftime("%Y-%m-%d")
    except Exception as e:
        print(f"Error parsing date: {date_str} -> {e}")
        return None

# Apply the date function to the date column
df_cleaned["date"] = df_cleaned["date"].apply(to_iso_format)

# Drop rows where the date could not be parsed
df_cleaned = df_cleaned.dropna(subset=["date"])

display(df_cleaned)

Error parsing date: 2928-03-26 -> Out of bounds nanosecond timestamp: 2928-03-26, at position 0
Error parsing date: 0001-01-01 -> Out of bounds nanosecond timestamp: 0001-01-01, at position 0


Unnamed: 0,id,name,sponsor,event,venue,place,physical_description,occasion,notes,call_number,keywords,language,date,location,location_type,currency,currency_symbol,status,page_count,dish_count
6,12469,,HOTEL NETHERLAND,SUPPER,COMMERCIAL,"NEW YORK, [NY];",CARD; ILLUS; COL; 6.0X8.75;,,HOTEL CREST IN BLUE; PRICED MENU;,1900-2838,,,1900-04-16,Hotel Netherland,,Dollars,$,complete,2,144
11,12474,,ALPHA OF ZETA PSI,ANNUAL BANQUET,COMMERCIAL,"DELMONICO'S, [NEW YORK, NY];",BOOKLET; ILLUS; COL; 5.5X7.0;,,VELLUM COVER; CREST OF ZETA PSI; TIED WITH BLU...,1900-2844,,,1900-04-17,Alpha Of Zeta Psi,,Dollars,$,complete,12,38
12,12475,,MANHATTAN HOTEL,DINNER,COMMERCIAL,"NEW YORK, NY",CARD; ILLUS; 6X9.5;,,A LA CARTE DU JOUR; HOTEL SEAL AT TOP OF MENU;,1900-2847,,,1900-04-18,Manhattan Hotel,,Dollars,$,complete,2,176
20,12483,,MANHATTAN HOTEL,CARTE DU JOUR,COMMERCIAL,"NEW YORK, NY",CARD; ILLUS; 6X9.5;,,A LA CARTE MENU; HOTEL SEAL AT TOP OF MENU;,1900-2859,,,1900-04-18,Manhattan Hotel,,Dollars,$,complete,2,129
34,12498,,CAFE BOULEVARD,SUNDAY DINNER,COMMERCIAL,156 SECOND AVENUE (NY?),CARD; COL; ILLUS; 5 X 7;,,PRIX FIXE DINNER; FOOD APPEARS TO BE HUNGARIAN...,1900-2903,,,1900-04-22,Cafe Boulevard,,Cents,c,complete,2,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17537,35512,Whites,Whites,,,,,,4 images,1913-0743_wotm,,,1913-09-24,Whites,,Dollars,$,complete,4,297
17538,35513,Riggs',Riggs',,,,,,4 images,1913-0744_wotm,,,1913-09-24,Riggs',,Dollars,$,complete,4,321
17541,35516,Dennett's,Dennett's,,,,,,1 image,1913-0747_wotm,,,1913-09-24,Dennett's,,Dollars,$,complete,1,125
17542,35517,The Cortlandt,The Cortlandt,,,,,,1 image,1913-0748_wotm,,,1913-09-24,The Cortlandt,,Dollars,$,complete,1,101


In [7]:
# Inspect dates further to verify if more outliers exist

# Ensure the date column is in datetime format
df_cleaned_sorted_by_dates = df_cleaned
df_cleaned_sorted_by_dates["date"] = pd.to_datetime(df["date"], errors="coerce")

# Sort the DataFrame by the date column
df_cleaned_sorted_by_dates = df_cleaned_sorted_by_dates.sort_values(by="date")

__Note:__ the dates range from 1851 to 2015 which is reasonable for this dataset, meaning no further cleaning is necessary for the dates

In [8]:
# Display the earliest dates
display(df_cleaned_sorted_by_dates.head(10))

# Display the latest dates
display(df_cleaned_sorted_by_dates.tail(10))

Unnamed: 0,id,name,sponsor,event,venue,place,physical_description,occasion,notes,call_number,keywords,language,date,location,location_type,currency,currency_symbol,status,page_count,dish_count
4690,21075,,AMERICAN HOUSE,DAILY MENU,HOTEL,,"FOLDER, BUFF, ILLUSTRATION [HOTEL?]",,INCLUDES WINE LIST,1851-001,,,1851-01-01,American House,,Dollars,$,complete,4,90
5611,22229,,REVERE HOUSE,DAILY MENU,,"BOSTON, MA",SINGLE SHEET,,ILLUSTRATED [HOTEL?] WINE LIST,1851-003,,,1851-05-18,Revere House,,Dollars,$,complete,2,141
6903,23826,,COZZENS' WEST POINT HOTEL,DAILY MENU,,,SINGLE SHEET PRINTED ON BOTH SIDES,,NO ILLUSTRATIONS,1852-001,,,1852-09-03,Cozzens' West Point Hotel,,Dollars,$,complete,2,93
8706,26091,,IRVING HOUSE,DAILY MENU,,NEW YORK CITY,SINGLE SHEET,,"DOOLITTLE, CINCINNATI, BURROUGHS, PHILADELPHIA",1852-002,,,1852-10-12,Irving House,,Dollars,$,complete,2,190
8440,25769,,LAKE HOUSE,DINNER MENU,,"CHICAGO,ILL.",BROADSIDE,,INCLUDES WINE LIST,1854-006,,,1854-06-11,Lake House,,Dollars,$,complete,2,109
7609,24726,,AMERICAN HOTEL,TABLE D'HOTE,,"[BUFFALO, NY",ILLUS.;BROADSIDE,,INCLUDES WINE LIST,1855-001,,,1855-10-05,American Hotel,,Dollars,$,complete,2,72
5976,22685,,MART ACKERMAN'S SALOON,DAILY MENU,,"TORONTO, CANADA",FOLDER,,INCLUDES WINE LIST,1856-004,,,1856-01-01,Mart Ackerman's Saloon,,Canadian Dollars,C$,complete,4,156
5129,21635,,AMERICAN HOUSE,DAILY MENU,,"BOSTON,MA.",ILLUS.BROADSIDE,,INCLUDES WINE LIST,1856-002,,,1856-04-10,American House,,Dollars,$,under review,2,109
6590,23446,,CONGRESS HALL,DAILY MENU,,"SARATOGA SPRINGS,NY",BROADSIDE,,INCLUDES WINE LIST,1856-005,,,1856-09-08,Congress Hall,,Dollars,$,complete,2,119
6300,23086,,TROY HOUSE,DAILY MENU,,"[TROY, NY]",FOLDER,,INCLUDES WINE LIST,1856-001,,,1856-10-10,Troy House,,Dollars,$,complete,4,101


Unnamed: 0,id,name,sponsor,event,venue,place,physical_description,occasion,notes,call_number,keywords,language,date,location,location_type,currency,currency_symbol,status,page_count,dish_count
4597,20966,The Modern,The Modern,Dinner,RESTAURANT,"9 W 53rd St, New York, NY 10019","4"" by 14"" broadside",,Restaurant at MoMa run by Danny Meyer's Union ...,2008-0000,,,2008-03-03,The Modern,,Dollars,$,complete,6,29
9303,26756,,TANTE MARGUERITE,,"FOREIGN,RESTAURANT","Assemblee Nationale, 5, rue de Bourgogne",,,Les restaurant Bernard Loiseau; Prix-Fixe Menu...,Zander 674,,,2008-05-09,Tante Marguerite,,Euros,€,complete,4,25
9246,26698,,BENOIT,Dinner and Wine and Birthday,RESTAURANT,60 West 55th Street,3 broadsides,,Includes dinner and wine menu and birthday men...,Zander 676,,,2008-05-20,Benoit,,Dollars,$,complete,3,55
9258,26711,,Fiorello's,,RESTAURANT,Fiorello's,Folder; plastic sleeves with paper inserts; 8....,,Dieter Zander Collection; cover artwork entitl...,Zander 427 undated,,,2012-04-26,Fiorello's,,Dollars,$,complete,4,124
9310,26763,,Christer's,,,Christer's,Tri-fold; plastic sleeves with paper inserts; ...,,Dieter Zander Collection; wine list only.,Zander 527 undated,,,2012-04-26,Christer's,,Dollars,$,complete,4,114
9315,26768,,La Maree,Daily and wine,"FOREIGN,RESTAURANT",La Maree,Booklet; 12.75 x 19.75 inches,,"Dieter Zander Collection; illustrated, eps. ba...",Zander 614,,,2012-04-26,La Maree,,Francs,FF,complete,3,487
9200,26652,,Unknown,,,Unknown,Booklet; 8.75 x 8 inches; tied with a black cord,,Dieter Zander Collection; tasting menu; cover ...,Zander 555 undated,,,2012-04-26,Palio,,Dollars,$,complete,14,144
9313,26766,,La Coserie des Lilas,Daily,"FOREIGN,RESTAURANT",La Coserie des Lilas,Booklet; 11.5 x 16 inches; laminated cover,,Dieter Zander Collection; illustrated cover; s...,Zander 609,,,2012-04-27,La Closerie des Lilas,,Francs,FF,complete,4,207
9488,26945,,Wilder Mann Ruckersdorf,,"HOTEL,RESTAURANT",Wilder Mann Ruckersdorf,Tri-fold; 6.25 x 12.5 inches,,Dieter Zander Collection.,Zander 325 undated,,,2012-12-04,Wilder Mann Ruckersdorf,,Deutsche Marks,DM,complete,6,186
9009,26448,,Krogs Fiskerestaurant,,RESTAURANT,Krogs Fiskerestaurant,Folder; 8.25 x 11 inches,,Dieter Zander Collection; back is a map showin...,Zander 114 undated,,,2015-04-20,Krogs Fiskerestaurant,,Danish kroner,kr.,complete,4,104


In [9]:
# Save the intermediately cleaned dataset
df_cleaned.to_csv("./CleanedMenuStep1.csv", index=False)

## __Step 2__

- Currency Standardization, Price Outlier 


In [10]:
df = pd.read_csv("./CleanedMenuStep1.csv")

In [11]:
df = df[~((df['currency'].isna() | (df['currency'].str.strip() == '')) &
          (df['currency_symbol'].isna() | (df['currency_symbol'].str.strip() == '')))]

In [12]:
unique_currencies = df['currency'].unique()

In [13]:
df = df[~df['currency'].isin(['Cents', 'Pence'])]
unique_currencies_2 = df['currency'].unique()

In [14]:
unique_combinations = df[['currency', 'currency_symbol']].drop_duplicates()

In [15]:
currency_to_symbol = {
    'Dollars': 'USD',  # Generally USD for US Dollars, but can vary (e.g., CAD for Canadian Dollars)
    'Francs': 'FRF',  # ISO code for French Francs; Belgian Francs also used FRF, but ₣ is the historical symbol
    'Belgian Francs': 'BEF',
    'Shillings': 'SHP',  # ISO code for Saint Helena Pound (historically shillings)
    'Deutsche Marks': 'DEM',
    'UK Pounds': 'GBP',
    'Canadian Dollars': 'CAD',
    'Austro-Hungarian Kronen': 'HUF',  # ISO code for Hungarian Forint; Kronen does not have a modern ISO code
    'Swiss Francs': 'CHF',
    'Pesetas': 'ESP',  # ISO code for Spanish Peseta
    'Danish kroner': 'DKK',
    'Swedish kronor (SEK/kr)': 'SEK',
    'Yen': 'JPY',
    'Italian Lire': 'ITL',
    'Quetzales': 'GTQ',
    'Israeli lirot (1948-1980)': 'ILS',
    'Dutch Guilders': 'NLG',  # ISO code for Dutch Guilder
    'Austrian Schillings': 'ATS',  # ISO code for Austrian Schilling
    'Escudos': 'PTE',  # ISO code for Portuguese Escudo
    'Euros': 'EUR',
    'Bermudian dollars': 'BMD',
    'Hungarian forint': 'HUF',
    'Mexican pesos': 'MXN',
    'Drachmas': 'GRD',
    'New Taiwan Dollar': 'TWD',
    'Icelandic Krónur': 'ISK',
    'Australian Dollars': 'AUD',
    'Argentine peso': 'ARS',
    'Sol': 'PEN',
    'Uruguayan pesos': 'UYU',
    'Brazilian Cruzeiros': 'BRB',  # ISO code for Brazilian Cruzeiro
    'Złoty': 'PLN',
    'Norwegian kroner': 'NOK',
    'Cuban pesos': 'CUP',
    'Finnish markka': 'FIM',
    'Lats': 'LVL',  # ISO code for Latvian Lats
    'Straits dollar (1904-1939)': 'SGD'  # ISO code for Singapore Dollar (used as reference for historical dollar)
}

In [16]:
# Replace currency_symbol based on currency using the mapping dictionary
df['currency_symbol'] = df['currency'].map(currency_to_symbol).fillna(df['currency_symbol'])

In [17]:
unique_combinations_after_update = df[['currency', 'currency_symbol']].drop_duplicates()

In [18]:
df.to_csv('CleanedMenuStep2.csv', index=False)

## __Step 3__

- Event Standardization, Event Outlier


In [19]:
df = pd.read_csv("CleanedMenuStep2.csv")

In [20]:
# Preprocess the data: remove problematic characters and strip whitespace
df["occasion_cleaned"] = df["occasion"].str.replace(r'[^\w\s]', '', regex=True).str.strip().str.upper().fillna("")

In [21]:
# Load pre-trained sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [22]:
# Generate embeddings for the occasions
print("Generating embeddings...")
embeddings = model.encode(df["occasion_cleaned"].tolist(), show_progress_bar=True)

Generating embeddings...


Batches:   0%|          | 0/186 [00:00<?, ?it/s]

In [23]:
# Determine the number of clusters (categories) - arbitrarily defined but could do spaghetti analysis in the future
num_clusters = 20

In [24]:
# Apply KMeans clustering
print("Clustering embeddings...")
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
kmeans.fit(embeddings)

Clustering embeddings...


  super()._check_params_vs_input(X, default_n_init=10)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [25]:
# Add cluster labels to the DataFrame
df["category"] = kmeans.labels_

In [26]:
# Print the categories to manually label them
for i in range(num_clusters):
    print(f"Cluster {i}:")
    print(df[df["category"] == i]["occasion_cleaned"].tolist())
    print("\n")

Cluster 0:
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 

In [27]:
# Manually determined mapping from cluster labels to category names
cluster_to_category = {
    0: "Anniversary",
    1: "Daily",
    2: "Complimentary",
    3: "Annual",
    4: "Farewell",
    5: "Tour",
    6: "Holiday",
    7: "Patriotic",
    8: "Rite",
    9: "Dinner",
    10: "Breakfast",
    11: "Social",
    12: "Meeting",
    13: "Religious Holiday",
    14: "Political",
    15: "Festival",
    16: "Reunion",
    17: "Reception",
    18: "Lunch",
    19: "Graduation",
}

In [28]:
# Map cluster labels to category names
df["category_name"] = df["category"].map(cluster_to_category)

In [29]:
# Remove empties
df.loc[df["occasion_cleaned"] == "", "category_name"] = ""

In [30]:
# Save the categorized DataFrame
df.to_csv("CleanedMenuStep3.csv", index=False)