<a href="https://colab.research.google.com/github/brendanpshea/data_clean_nypl/blob/main/New_York_Public_Library_Menus_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# New York Public Library-Menus-EDA

Brendan's Notes:

The original data files are available here:

https://uofi.app.box.com/s/zh2hxfkq0cc6vyftw91nqa4smdpq7ybk

I downloaded all the files as a single zip archive and put them on Dropbox (see below). From there, I ran the standard Pandas commands to get an overview of the data.


In [None]:
# Download the zipped CSVs from Brendan's Dropbox (quietly; skipped when the zip is already present)
!wget --quiet --no-clobber --output-document=NYPL-menus.zip "https://www.dropbox.com/scl/fi/l8b5np5xoes57nr1hqhae/NYPL-menus.zip?rlkey=pak0ox3wae0x0yd09d23eoqma&st=mfq9dka9&dl=1"

In [None]:
# Unzip the archive; the CSVs land in the "NYPL-menus" directory.
# -n never overwrites existing files, so re-running this cell does not stop at
# unzip's interactive "replace?" prompt once the files have been extracted.
!unzip -n "NYPL-menus.zip"
!ls

Archive:  NYPL-menus.zip
  inflating: NYPL-menus/Menu.csv     
  inflating: NYPL-menus/MenuItem.csv  
  inflating: NYPL-menus/Dish.csv     
  inflating: NYPL-menus/MenuPage.csv  
NYPL-menus  NYPL-menus.zip  sample_data


## ERD Diagram
Here's an initial ERD (it will need to be double-checked for accuracy).

In [1]:
import base64
from IPython.display import Image, display, HTML

def mm(graph):
    """Render a Mermaid diagram inline using the mermaid.ink image service.

    Args:
        graph: Mermaid diagram source as a string.

    The source is UTF-8 encoded, base64-encoded, appended to the
    ``https://mermaid.ink/img/`` URL, and displayed as an IPython ``Image``.
    Nothing is returned.
    """
    graphbytes = graph.encode("utf8")
    # Use the URL-safe alphabet: standard base64 can emit "+" and "/", and a
    # "/" inside the payload would be read as a path separator in the URL.
    base64_bytes = base64.urlsafe_b64encode(graphbytes)
    base64_string = base64_bytes.decode("ascii")
    display(Image(url="https://mermaid.ink/img/" + base64_string))


# Entity-relationship diagram of the four CSV tables.
# - Attribute types follow the dtypes pandas infers on load; quoted notes flag
#   columns whose float dtype is only an artifact of missing values.
# - PK/FK markers follow the column naming (id, *_id); key uniqueness has not
#   been verified yet -- TODO confirm.
# - Each relationship is declared once. DISH is optional on the MENUITEM side
#   because MenuItem.dish_id contains nulls.
mm("""
erDiagram
    DISH {
        int id PK
        string name
        float description "all null"
        int menus_appeared
        int times_appeared
        int first_appeared
        int last_appeared
        float lowest_price
        float highest_price
    }

    MENU {
        int id PK
        string name
        string sponsor
        string event
        string venue
        string place
        string physical_description
        string occasion
        string notes
        string call_number
        float keywords "all null"
        float language "all null"
        string date
        string location
        float location_type "all null"
        string currency
        string currency_symbol
        string status
        int page_count
        int dish_count
    }

    MENUITEM {
        int id PK
        int menu_page_id FK
        float price
        float high_price
        int dish_id FK "nullable in the CSV"
        string created_at
        string updated_at
        float xpos
        float ypos
    }

    MENUPAGE {
        int id PK
        int menu_id FK
        string page_number
        string image_id
        string full_height
        string full_width
        string uuid
        string created_at
        string updated_at
    }

    MENU ||--o{ MENUPAGE : "contains"
    MENUPAGE ||--o{ MENUITEM : "includes"
    DISH |o--o{ MENUITEM : "is described by"
""")

### Explore Data Using Pandas

In [None]:
import pandas as pd
import numpy as np

# Read the four NYPL menu tables (Dish, Menu, MenuItem, MenuPage), in that
# order, into one DataFrame each.
dish_df, menu_df, menuitem_df, menupage_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'NYPL-menus/Dish.csv',
        'NYPL-menus/Menu.csv',
        'NYPL-menus/MenuItem.csv',
        'NYPL-menus/MenuPage.csv',
    )
)

In [None]:
# Preview the first rows of the Dish table to see its columns and typical values.
dish_df.head()

Unnamed: 0,id,name,description,menus_appeared,times_appeared,first_appeared,last_appeared,lowest_price,highest_price
0,1,Consomme printaniere royal,,8,8,1897,1927,0.2,0.4
1,2,Chicken gumbo,,111,117,1895,1960,0.1,0.8
2,3,Tomato aux croutons,,13,13,1893,1917,0.25,0.4
3,4,Onion au gratin,,41,41,1900,1971,0.25,1.0
4,5,St. Emilion,,66,68,1881,1981,0.0,18.0


In [None]:
# Size of the Dish table as a (rows, columns) tuple.
dish_df.shape

(423397, 9)

In [None]:
# Summary statistics for the numeric Dish columns, rounded to 2 decimals.
# Check the min/max rows for implausible values (negative appearance counts,
# years of 0 or in the future) -- these are candidates for cleaning.
dish_df.describe().round(2)

Unnamed: 0,id,description,menus_appeared,times_appeared,first_appeared,last_appeared,lowest_price,highest_price
count,423397.0,0.0,423397.0,423397.0,423397.0,423397.0,394297.0,394297.0
mean,264456.59,,3.06,3.15,1675.51,1679.3,0.97,1.6
std,150489.07,,27.82,29.96,651.32,651.93,6.71,12.7
min,1.0,,0.0,-6.0,0.0,0.0,0.0,0.0
25%,132374.0,,1.0,1.0,1900.0,1900.0,0.0,0.0
50%,269636.0,,1.0,1.0,1914.0,1917.0,0.0,0.0
75%,397135.0,,1.0,1.0,1949.0,1955.0,0.4,0.6
max,515677.0,,7740.0,8484.0,2928.0,2928.0,1035.0,3050.0


In [None]:
# Column dtypes and non-null counts for Dish; the non-null counts show which
# columns are empty or partly missing.
dish_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423397 entries, 0 to 423396
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              423397 non-null  int64  
 1   name            423397 non-null  object 
 2   description     0 non-null       float64
 3   menus_appeared  423397 non-null  int64  
 4   times_appeared  423397 non-null  int64  
 5   first_appeared  423397 non-null  int64  
 6   last_appeared   423397 non-null  int64  
 7   lowest_price    394297 non-null  float64
 8   highest_price   394297 non-null  float64
dtypes: float64(3), int64(5), object(1)
memory usage: 29.1+ MB


In [None]:
# Preview the first rows of the Menu table to see its columns and typical values.
menu_df.head()

Unnamed: 0,id,name,sponsor,event,venue,place,physical_description,occasion,notes,call_number,keywords,language,date,location,location_type,currency,currency_symbol,status,page_count,dish_count
0,12463,,HOTEL EASTMAN,BREAKFAST,COMMERCIAL,"HOT SPRINGS, AR",CARD; 4.75X7.5;,EASTER;,,1900-2822,,,1900-04-15,Hotel Eastman,,,,complete,2,67
1,12464,,REPUBLICAN HOUSE,[DINNER],COMMERCIAL,"MILWAUKEE, [WI];",CARD; ILLUS; COL; 7.0X9.0;,EASTER;,WEDGEWOOD BLUE CARD; WHITE EMBOSSED GREEK KEY ...,1900-2825,,,1900-04-15,Republican House,,,,under review,2,34
2,12465,,NORDDEUTSCHER LLOYD BREMEN,FRUHSTUCK/BREAKFAST;,COMMERCIAL,DAMPFER KAISER WILHELM DER GROSSE;,CARD; ILLU; COL; 5.5X8.0;,,"MENU IN GERMAN AND ENGLISH; ILLUS, STEAMSHIP A...",1900-2827,,,1900-04-16,Norddeutscher Lloyd Bremen,,,,complete,2,84
3,12466,,NORDDEUTSCHER LLOYD BREMEN,LUNCH;,COMMERCIAL,DAMPFER KAISER WILHELM DER GROSSE;,CARD; ILLU; COL; 5.5X8.0;,,"MENU IN GERMAN AND ENGLISH; ILLUS, HARBOR SCEN...",1900-2828,,,1900-04-16,Norddeutscher Lloyd Bremen,,,,complete,2,63
4,12467,,NORDDEUTSCHER LLOYD BREMEN,DINNER;,COMMERCIAL,DAMPFER KAISER WILHELM DER GROSSE;,FOLDER; ILLU; COL; 5.5X7.5;,,"MENU IN GERMAN AND ENGLISH; ILLUS, HARBOR SCEN...",1900-2829,,,1900-04-16,Norddeutscher Lloyd Bremen,,,,complete,4,33


In [None]:
# Size of the Menu table as a (rows, columns) tuple.
menu_df.shape

(17545, 20)

In [None]:
# Summary statistics for the numeric Menu columns, rounded to 2 decimals.
# Columns with a count of 0 contain no values at all; also check the extremes
# of page_count and dish_count for outliers.
menu_df.describe().round(2)

Unnamed: 0,id,keywords,language,location_type,page_count,dish_count
count,17545.0,0.0,0.0,0.0,17545.0,17545.0
mean,25325.95,,,,3.48,75.62
std,6431.55,,,,3.3,98.44
min,12463.0,,,,1.0,0.0
25%,20742.0,,,,2.0,20.0
50%,26165.0,,,,2.0,35.0
75%,30707.0,,,,4.0,93.0
max,35526.0,,,,74.0,4053.0


In [None]:
# Column dtypes and non-null counts for Menu; the non-null counts show which
# columns are empty or partly missing.
menu_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17545 entries, 0 to 17544
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    17545 non-null  int64  
 1   name                  3197 non-null   object 
 2   sponsor               15984 non-null  object 
 3   event                 8154 non-null   object 
 4   venue                 8119 non-null   object 
 5   place                 8123 non-null   object 
 6   physical_description  14763 non-null  object 
 7   occasion              3791 non-null   object 
 8   notes                 10613 non-null  object 
 9   call_number           15983 non-null  object 
 10  keywords              0 non-null      float64
 11  language              0 non-null      float64
 12  date                  16959 non-null  object 
 13  location              17545 non-null  object 
 14  location_type         0 non-null      float64
 15  currency           

In [None]:
# Preview the first rows of the MenuItem table to see its columns and typical values.
menuitem_df.head()

Unnamed: 0,id,menu_page_id,price,high_price,dish_id,created_at,updated_at,xpos,ypos
0,1,1389,0.4,,1.0,2011-03-28 15:00:44 UTC,2011-04-19 04:33:15 UTC,0.111429,0.254735
1,2,1389,0.6,,2.0,2011-03-28 15:01:13 UTC,2011-04-19 15:00:54 UTC,0.438571,0.254735
2,3,1389,0.4,,3.0,2011-03-28 15:01:40 UTC,2011-04-19 19:10:05 UTC,0.14,0.261922
3,4,1389,0.5,,4.0,2011-03-28 15:01:51 UTC,2011-04-19 19:07:01 UTC,0.377143,0.26272
4,5,3079,0.5,1.0,5.0,2011-03-28 15:21:26 UTC,2011-04-13 15:25:27 UTC,0.105714,0.313178


In [None]:
# Size of the MenuItem table as a (rows, columns) tuple.
menuitem_df.shape

(1332726, 9)

In [None]:
# Summary statistics for the numeric MenuItem columns, rounded to 2 decimals.
# Compare the count row across columns to see how often price, high_price and
# dish_id are missing, and check the price maxima for outliers.
menuitem_df.describe().round(2)

Unnamed: 0,id,menu_page_id,price,high_price,dish_id,xpos,ypos
count,1332726.0,1332726.0,886810.0,91905.0,1332485.0,1332726.0,1332726.0
mean,697898.38,47594.87,12.84,8.11,158011.04,0.39,0.55
std,399980.67,22039.21,499.55,90.1,167762.04,0.22,0.22
min,1.0,130.0,0.0,0.0,1.0,0.0,0.0
25%,350251.25,32049.0,0.25,0.5,5089.0,0.18,0.37
50%,702410.5,53371.0,0.4,1.25,80700.0,0.38,0.57
75%,1045548.75,66823.0,1.0,3.0,332524.0,0.57,0.74
max,1385906.0,77425.0,180000.0,7800.0,515677.0,0.99,1.0


In [None]:
# Column dtypes and non-null counts for MenuItem; the non-null counts show
# which columns are partly missing.
menuitem_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1332726 entries, 0 to 1332725
Data columns (total 9 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   id            1332726 non-null  int64  
 1   menu_page_id  1332726 non-null  int64  
 2   price         886810 non-null   float64
 3   high_price    91905 non-null    float64
 4   dish_id       1332485 non-null  float64
 5   created_at    1332726 non-null  object 
 6   updated_at    1332726 non-null  object 
 7   xpos          1332726 non-null  float64
 8   ypos          1332726 non-null  float64
dtypes: float64(5), int64(2), object(2)
memory usage: 91.5+ MB
