# Imports

In [11]:
import os
import time
from typing import List, Tuple, Dict, Optional

import pandas as pd
import numpy as np
import sqlite3

CREATE TABLE Menu(
    id int,
    name text,
    place text,
    physical_description text,
    occasion text,
    notes text,
    call_number text,
    date date,
    location text,
    currency text,
    currency_symbol varchar(3),
    status varchar(12),
    page_count int,
    dish_count int,
    PRIMARY KEY(id)
);

CREATE TABLE MenuPage(
    id int,
    menu_id int,
    page_number int,
    image_id int,
    full_height int,
    full_width int,
    uuid varchar(37),
    PRIMARY KEY(id)
    FOREIGN KEY(menu_id) REFERENCES Menu(id)
);

CREATE TABLE Dish(
    id int,
    name text,
    menus_appeared int,
    times_appeared int,
    first_appeared int,
    last_appeared int,
    lowest_price float,
    highest_price float,
    PRIMARY KEY(id)
);

CREATE TABLE MenuItems(
    id int,
    menu_page_id int,
    price float,
    high_price float,
    dish_id int,
    created_at datetime,
    updated_at datetime,
    x_pos float,
    y_pos float,
    PRIMARY KEY(id)
    FOREIGN KEY(menu_page_id) REFERENCES MenuPage(id)
    FOREIGN KEY(dish_id) REFERENCES Dish(id)
);


# Load Data

In [12]:
_DISH_CSV_FILE = 'Dish.csv'
_MENU_CSV_FILE = 'Menu.csv'
_MENU_ITEM_CSV_FILE = 'MenuItem.csv'
_MENU_PAGE_CSV_FILE = 'MenuPage.csv'
path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'NYPL-menus', _DISH_CSV_FILE)
dish_df = pd.read_csv(path)
path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'NYPL-menus', _MENU_CSV_FILE)
menu_df = pd.read_csv(path)
path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'NYPL-menus', _MENU_ITEM_CSV_FILE)
mi_df = pd.read_csv(path)
path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'NYPL-menus', _MENU_PAGE_CSV_FILE)
mp_df = pd.read_csv(path)

In [13]:
dish_df

Unnamed: 0,id,name,description,menus_appeared,times_appeared,first_appeared,last_appeared,lowest_price,highest_price
0,1,Consomme printaniere royal,,8,8,1897,1927,0.20,0.4
1,2,Chicken gumbo,,111,117,1895,1960,0.10,0.8
2,3,Tomato aux croutons,,13,13,1893,1917,0.25,0.4
3,4,Onion au gratin,,41,41,1900,1971,0.25,1.0
4,5,St. Emilion,,66,68,1881,1981,0.00,18.0
...,...,...,...,...,...,...,...,...,...
423392,515673,Boiled: Corned beef & cabbage,,1,1,0,0,0.00,0.0
423393,515674,Boiled: Knuckle of Veal & Bacon,,1,1,0,0,0.00,0.0
423394,515675,Roast: Turkey & Cranberry Sauce,,1,1,0,0,0.00,0.0
423395,515676,"Claret: Chateau Larose, Cruse et Fils Freres",,1,1,0,0,0.00,0.0


In [14]:
menu_df

Unnamed: 0,id,name,sponsor,event,venue,place,physical_description,occasion,notes,call_number,keywords,language,date,location,location_type,currency,currency_symbol,status,page_count,dish_count
0,12463,,HOTEL EASTMAN,BREAKFAST,COMMERCIAL,"HOT SPRINGS, AR",CARD; 4.75X7.5;,EASTER;,,1900-2822,,,1900-04-15,Hotel Eastman,,,,complete,2,67
1,12464,,REPUBLICAN HOUSE,[DINNER],COMMERCIAL,"MILWAUKEE, [WI];",CARD; ILLUS; COL; 7.0X9.0;,EASTER;,WEDGEWOOD BLUE CARD; WHITE EMBOSSED GREEK KEY ...,1900-2825,,,1900-04-15,Republican House,,,,under review,2,34
2,12465,,NORDDEUTSCHER LLOYD BREMEN,FRUHSTUCK/BREAKFAST;,COMMERCIAL,DAMPFER KAISER WILHELM DER GROSSE;,CARD; ILLU; COL; 5.5X8.0;,,"MENU IN GERMAN AND ENGLISH; ILLUS, STEAMSHIP A...",1900-2827,,,1900-04-16,Norddeutscher Lloyd Bremen,,,,complete,2,84
3,12466,,NORDDEUTSCHER LLOYD BREMEN,LUNCH;,COMMERCIAL,DAMPFER KAISER WILHELM DER GROSSE;,CARD; ILLU; COL; 5.5X8.0;,,"MENU IN GERMAN AND ENGLISH; ILLUS, HARBOR SCEN...",1900-2828,,,1900-04-16,Norddeutscher Lloyd Bremen,,,,complete,2,63
4,12467,,NORDDEUTSCHER LLOYD BREMEN,DINNER;,COMMERCIAL,DAMPFER KAISER WILHELM DER GROSSE;,FOLDER; ILLU; COL; 5.5X7.5;,,"MENU IN GERMAN AND ENGLISH; ILLUS, HARBOR SCEN...",1900-2829,,,1900-04-16,Norddeutscher Lloyd Bremen,,,,complete,4,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17540,35515,Hotel LaSalle,Hotel LaSalle,,,,,,1 image,1913-0746_wotm,,,1913-09-24,Hotel LaSalle,,,,complete,1,22
17541,35516,Dennett's,Dennett's,,,,,,1 image,1913-0747_wotm,,,1913-09-24,Dennett's,,Dollars,$,complete,1,125
17542,35517,The Cortlandt,The Cortlandt,,,,,,1 image,1913-0748_wotm,,,1913-09-24,The Cortlandt,,Dollars,$,complete,1,101
17543,35518,Hotel Schynige Platte und Hotel Bellevue,Hotel Schynige Platte und Hotel Bellevue,,,,,,4 images,1913-0749_wotm,,,1913-09-24,Hotel Schynige Platte und Hotel Bellevue,,Swiss Francs,Fr,complete,4,161


In [15]:
mi_df

Unnamed: 0,id,menu_page_id,price,high_price,dish_id,created_at,updated_at,xpos,ypos
0,1,1389,0.4,,1.0,2011-03-28 15:00:44 UTC,2011-04-19 04:33:15 UTC,0.111429,0.254735
1,2,1389,0.6,,2.0,2011-03-28 15:01:13 UTC,2011-04-19 15:00:54 UTC,0.438571,0.254735
2,3,1389,0.4,,3.0,2011-03-28 15:01:40 UTC,2011-04-19 19:10:05 UTC,0.140000,0.261922
3,4,1389,0.5,,4.0,2011-03-28 15:01:51 UTC,2011-04-19 19:07:01 UTC,0.377143,0.262720
4,5,3079,0.5,1.0,5.0,2011-03-28 15:21:26 UTC,2011-04-13 15:25:27 UTC,0.105714,0.313178
...,...,...,...,...,...,...,...,...,...
1332721,1385902,52093,,,837.0,2017-06-26 21:35:08 UTC,2017-06-26 21:35:08 UTC,0.200000,0.753724
1332722,1385903,52093,,,1158.0,2017-06-26 21:35:20 UTC,2017-06-26 21:35:20 UTC,0.237333,0.753724
1332723,1385904,52093,,,375011.0,2017-06-26 21:35:34 UTC,2017-06-26 21:35:34 UTC,0.284000,0.759706
1332724,1385905,52093,,,373447.0,2017-06-26 21:35:46 UTC,2017-06-26 21:35:46 UTC,0.350667,0.759706


In [16]:
#mp_df['page_number'] = mp_df['page_number'].astype('int')
print(f"null pages: {mp_df['page_number'].isnull().sum()}")
mp_df

null pages: 1202


Unnamed: 0,id,menu_id,page_number,image_id,full_height,full_width,uuid
0,119,12460,1.0,1603595,7230.0,5428.0,510d47e4-2955-a3d9-e040-e00a18064a99
1,120,12460,2.0,1603596,5428.0,7230.0,510d47e4-2956-a3d9-e040-e00a18064a99
2,121,12460,3.0,1603597,7230.0,5428.0,510d47e4-2957-a3d9-e040-e00a18064a99
3,122,12460,4.0,1603598,7230.0,5428.0,510d47e4-2958-a3d9-e040-e00a18064a99
4,123,12461,1.0,1603591,7230.0,5428.0,510d47e4-2959-a3d9-e040-e00a18064a99
...,...,...,...,...,...,...,...
66932,77427,35526,6.0,5194216,2772.0,2341.0,cc70ae50-6212-0132-8b23-58d385a7bbd0
66933,77428,35526,7.0,5194217,2772.0,2301.0,cc844a40-6212-0132-6ead-58d385a7bbd0
66934,77429,35526,8.0,5194218,2763.0,2530.0,cc985890-6212-0132-9cd3-58d385a7bbd0
66935,77430,35526,1.0,5194219,5440.0,5237.0,e51c88c0-6212-0132-ef5e-58d385a7bbd0


# Joining

In [17]:
#all_items = pd.concat([mp_df.set_index('id'), mi_df.set_index('menu_page_id')], axis=1, join='inner')
all_pages = mp_df.merge(mi_df, left_on='id', right_on='menu_page_id', how='inner').reset_index(drop=True)
all_pages = all_pages.drop(labels=['id_x', 'id_y'], axis='columns')
all_pages

Unnamed: 0,menu_id,page_number,image_id,full_height,full_width,uuid,menu_page_id,price,high_price,dish_id,created_at,updated_at,xpos,ypos
0,12463,1.0,466928,3049.0,2004.0,510D47DB-491F-A3D9-E040-E00A18064A99,130,,,31.0,2011-03-31 20:24:46 UTC,2011-03-31 20:24:46 UTC,0.170000,0.285441
1,12463,1.0,466928,3049.0,2004.0,510D47DB-491F-A3D9-E040-E00A18064A99,130,,,32.0,2011-03-31 20:26:14 UTC,2011-03-31 20:26:14 UTC,0.687143,0.311731
2,12463,1.0,466928,3049.0,2004.0,510D47DB-491F-A3D9-E040-E00A18064A99,130,,,33.0,2011-03-31 20:26:32 UTC,2011-03-31 20:26:32 UTC,0.558571,0.341778
3,12463,1.0,466928,3049.0,2004.0,510D47DB-491F-A3D9-E040-E00A18064A99,130,,,41.0,2011-04-01 23:35:50 UTC,2011-04-01 23:35:50 UTC,0.402857,0.266662
4,12463,1.0,466928,3049.0,2004.0,510D47DB-491F-A3D9-E040-E00A18064A99,130,,,42.0,2011-04-01 23:36:02 UTC,2011-04-01 23:36:02 UTC,0.658571,0.261028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1332721,35526,4.0,5194214,2747.0,2338.0,cc4aa730-6212-0132-2091-58d385a7bbd0,77425,,,108.0,2015-06-10 01:27:09 UTC,2015-06-10 01:27:09 UTC,0.493333,0.658192
1332722,35526,4.0,5194214,2747.0,2338.0,cc4aa730-6212-0132-2091-58d385a7bbd0,77425,,,115.0,2015-06-10 01:27:16 UTC,2015-06-10 01:27:16 UTC,0.306667,0.658192
1332723,35526,4.0,5194214,2747.0,2338.0,cc4aa730-6212-0132-2091-58d385a7bbd0,77425,,,7857.0,2015-06-10 01:27:25 UTC,2015-06-10 01:27:25 UTC,0.369333,0.676349
1332724,35526,4.0,5194214,2747.0,2338.0,cc4aa730-6212-0132-2091-58d385a7bbd0,77425,,,9089.0,2015-06-10 01:27:32 UTC,2015-06-10 01:27:32 UTC,0.613333,0.676349


In [18]:
all_menus = all_pages.merge(menu_df, left_on='menu_id', right_on='id', how='inner').reset_index(drop=True)
all_menus = all_menus.drop(labels=['id'], axis='columns')
all_menus

Unnamed: 0,menu_id,page_number,image_id,full_height,full_width,uuid,menu_page_id,price,high_price,dish_id,...,keywords,language,date,location,location_type,currency,currency_symbol,status,page_count,dish_count
0,12463,1.0,466928,3049.0,2004.0,510D47DB-491F-A3D9-E040-E00A18064A99,130,,,31.0,...,,,1900-04-15,Hotel Eastman,,,,complete,2,67
1,12463,1.0,466928,3049.0,2004.0,510D47DB-491F-A3D9-E040-E00A18064A99,130,,,32.0,...,,,1900-04-15,Hotel Eastman,,,,complete,2,67
2,12463,1.0,466928,3049.0,2004.0,510D47DB-491F-A3D9-E040-E00A18064A99,130,,,33.0,...,,,1900-04-15,Hotel Eastman,,,,complete,2,67
3,12463,1.0,466928,3049.0,2004.0,510D47DB-491F-A3D9-E040-E00A18064A99,130,,,41.0,...,,,1900-04-15,Hotel Eastman,,,,complete,2,67
4,12463,1.0,466928,3049.0,2004.0,510D47DB-491F-A3D9-E040-E00A18064A99,130,,,42.0,...,,,1900-04-15,Hotel Eastman,,,,complete,2,67
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1327348,35526,4.0,5194214,2747.0,2338.0,cc4aa730-6212-0132-2091-58d385a7bbd0,77425,,,108.0,...,,,1920-12-25,"Christmas Dinner, Troop F 19: Fort Huachuca, A...",,,,complete,10,32
1327349,35526,4.0,5194214,2747.0,2338.0,cc4aa730-6212-0132-2091-58d385a7bbd0,77425,,,115.0,...,,,1920-12-25,"Christmas Dinner, Troop F 19: Fort Huachuca, A...",,,,complete,10,32
1327350,35526,4.0,5194214,2747.0,2338.0,cc4aa730-6212-0132-2091-58d385a7bbd0,77425,,,7857.0,...,,,1920-12-25,"Christmas Dinner, Troop F 19: Fort Huachuca, A...",,,,complete,10,32
1327351,35526,4.0,5194214,2747.0,2338.0,cc4aa730-6212-0132-2091-58d385a7bbd0,77425,,,9089.0,...,,,1920-12-25,"Christmas Dinner, Troop F 19: Fort Huachuca, A...",,,,complete,10,32


In [19]:
all_items = all_menus.merge(dish_df, left_on='dish_id', right_on='id', how='inner')
all_items = all_items.drop(labels=['id'], axis='columns')
print(all_items.columns)
all_items

Index(['menu_id', 'page_number', 'image_id', 'full_height', 'full_width',
       'uuid', 'menu_page_id', 'price', 'high_price', 'dish_id', 'created_at',
       'updated_at', 'xpos', 'ypos', 'name_x', 'sponsor', 'event', 'venue',
       'place', 'physical_description', 'occasion', 'notes', 'call_number',
       'keywords', 'language', 'date', 'location', 'location_type', 'currency',
       'currency_symbol', 'status', 'page_count', 'dish_count', 'name_y',
       'description', 'menus_appeared', 'times_appeared', 'first_appeared',
       'last_appeared', 'lowest_price', 'highest_price'],
      dtype='object')


Unnamed: 0,menu_id,page_number,image_id,full_height,full_width,uuid,menu_page_id,price,high_price,dish_id,...,page_count,dish_count,name_y,description,menus_appeared,times_appeared,first_appeared,last_appeared,lowest_price,highest_price
0,12463,1.0,466928,3049.0,2004.0,510D47DB-491F-A3D9-E040-E00A18064A99,130,,,31.0,...,2,67,Sliced Bananas,,220,238,1900,1987,0.0,15.0
1,12534,1.0,466586,3215.0,2244.0,510d47db-4604-a3d9-e040-e00a18064a99,364,0.15,,31.0,...,2,52,Sliced Bananas,,220,238,1900,1987,0.0,15.0
2,12545,3.0,4000008942,4331.0,1768.0,510d47db-47b7-a3d9-e040-e00a18064a99,404,0.15,,31.0,...,4,113,Sliced Bananas,,220,238,1900,1987,0.0,15.0
3,12583,1.0,466928,3049.0,2004.0,510d47db-491f-a3d9-e040-e00a18064a99,517,,,31.0,...,2,67,Sliced Bananas,,220,238,1900,1987,0.0,15.0
4,12654,1.0,466586,3215.0,2244.0,510d47db-4604-a3d9-e040-e00a18064a99,751,0.15,,31.0,...,2,50,Sliced Bananas,,220,238,1900,1987,0.0,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1327104,35518,3.0,5189498,4685.0,2702.0,e01ca880-517f-0132-137c-58d385a7b928,77373,1.00,,498473.0,...,4,161,Compôte gemischt,,1,1,1913,1913,1.0,1.0
1327105,35526,4.0,5194214,2747.0,2338.0,cc4aa730-6212-0132-2091-58d385a7bbd0,77425,,,512191.0,...,10,32,Potato Custard,,1,1,1920,1920,,
1327106,35526,4.0,5194214,2747.0,2338.0,cc4aa730-6212-0132-2091-58d385a7bbd0,77425,,,512192.0,...,10,32,Applie Pie,,1,1,1920,1920,,
1327107,35526,4.0,5194214,2747.0,2338.0,cc4aa730-6212-0132-2091-58d385a7bbd0,77425,,,512193.0,...,10,32,Minced Fruit with Cream,,1,1,1920,1920,,


# Load Data SQL

In [21]:
create_menu_path = os.path.join(os.path.dirname(os.getcwd()), 'data', "nypl_menu_step_2.sql")
create_menu_page_path = os.path.join(os.path.dirname(os.getcwd()), 'data', "menuPage.sql")
create_menu_item_path = os.path.join(os.path.dirname(os.getcwd()), 'data', "menuItem_cleaned.sql")
create_dish_path = os.path.join(os.path.dirname(os.getcwd()), 'data', "filtered_dish.sql")
menu_db_path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'NYPL-menus', "menudb.db")
con = sqlite3.connect(menu_db_path)
cur = con.cursor()

# Load our exported OpenRefine SQL script.
with open(create_menu_path) as f:
    db_menu_create = f.read()
    
with open(create_menu_page_path) as f:
    db_menu_page_create = f.read()
    
with open(create_menu_item_path) as f:
    db_menu_item_create = f.read()
    
with open(create_dish_path) as f:
    db_dish_create = f.read()

# Create the database using our creation script.
res = cur.executescript(db_menu_create)
con.commit()

# Create the database using our creation script.
res = cur.executescript(db_menu_page_create)
con.commit()

# Create the database using our creation script.
res = cur.executescript(db_menu_item_create)
con.commit()

# Create the database using our creation script.
res = cur.executescript(db_dish_create)
con.commit()


def MaybeDropColumnsMenu(con, cur):
    cols_to_drop = ['name','notes', 'occasion', 'place', 'keywords', 'page_count', 'dish_count',]
    template = lambda col: f'ALTER TABLE nypl_menu_step_2_sql_export_demo DROP COLUMN {col};'
    templates = [template(col) for col in cols_to_drop]
    for t in templates:
        res = cur.executescript(t)
    con.commit()
    
def MaybeDropColumnsMenuItem(con, cur):
    cols_to_drop = ['created_at', 'updated_at', 'xpos', 'ypos']
    template = lambda col: f'ALTER TABLE menuItem DROP COLUMN {col};'
    templates = [template(col) for col in cols_to_drop]
    for t in templates:
        res = cur.executescript(t)
    con.commit()

def MaybeDropColumnsMenuPage(con, cur):
    cols_to_drop = ['page_number', 'image_id', 'full_height', 'full_width','uuid']
    template = lambda col: f'ALTER TABLE menuPage DROP COLUMN {col};'
    templates = [template(col) for col in cols_to_drop]
    for t in templates:
        res = cur.executescript(t)
    con.commit()

def MaybeDropColumnsDish(con, cur):
    cols_to_drop = ['menus_appeared', 'times_appeared', 'first_appeared', 'last_appeared']
    template = lambda col: f'ALTER TABLE Dish DROP COLUMN {col};'
    templates = [template(col) for col in cols_to_drop]
    for t in templates:
        res = cur.executescript(t)
    con.commit()

[f(con, cur) for f in [MaybeDropColumnsMenu, MaybeDropColumnsMenuItem, MaybeDropColumnsMenuPage, MaybeDropColumnsDish]]

[None, None, None, None]

In [22]:
# We'll have to join all the tables later, let's do this once in a temp table.
def CreateJoinedTable(con, cur):
    q = """
        DROP TABLE IF EXISTS MenuJoined;
        CREATE TEMP TABLE MenuJoined AS
        SELECT
            Menu.id AS MenuId,
            menuPage.id AS MenuPageId,
            menuItem.id AS MenuItemId,
            Dish.id AS DishId,
            *
        FROM
          nypl_menu_step_2_sql_export_demo AS Menu INNER JOIN
          menuPage ON Menu.id = menuPage.id INNER JOIN
          menuItem ON menuPage.id = menuItem.menu_page_id INNER JOIN
          Dish ON Dish.id = menuItem.dish_id
        ;
    """
    res = cur.executescript(q)
    con.commit()
    
CreateJoinedTable(con, cur)

## List Schema and Examples

In [23]:


def get_table_schema(table_name: str):
    """Return the schema of a table
    
   Columns in the result set include:
       "name" (its name);
       "type" (data type if given, else '');
       "notnull" (whether or not the column can be NULL);
       "dflt_value" (the default value for the column); and
       "pk" (either zero for columns that are not part of the primary key, or the 1-based index of the column within the primary key).
   """
    res = cur.execute(f"PRAGMA table_info({table_name});")
    return res.fetchall()
    
def print_table_schema(table_name: str, schema: Tuple[str]):
    """Print the schema of a table
      
      Columns in the result set include:
          "name" (its name);
          "type" (data type if given, else '');
          "notnull" (whether or not the column can be NULL);
          "dflt_value" (the default value for the column); and
          "pk" (either zero for columns that are not part of the primary key, or the 1-based index of the column within the primary key).
    """
    print(f'Schema for `{table_name}`')
    for col in schema:
        print(f'{col[1]:<14}, type:{col[2]:<12} notnull:{col[3]}, dflt:{col[4]}, pk:{col[5]}')
    print('\n')
    
    
def print_query(query: str, limit: int = 10):
    """Run a query and print the first 10 results."""
    res = cur.execute(query)
    res = res.fetchall()
    for i, r in enumerate(res[:min(limit,len(res))]):
        print(f'{i}: {r}')
        

# Fetch some sample results.
print_table_schema('menuItem', get_table_schema('menuItem'))
print_query("SELECT * FROM menuItem LIMIT 10")

# Fetch some sample results.
print_table_schema('nypl_menu_step_2_sql_export_demo', get_table_schema('nypl_menu_step_2_sql_export_demo'))
print_query("SELECT * FROM nypl_menu_step_2_sql_export_demo LIMIT 10")
    
# Fetch some sample results.
print_table_schema('menuPage', get_table_schema('menuPage'))
print_query("SELECT * FROM menuPage LIMIT 10")

# Fetch some sample results.
print_table_schema('Dish', get_table_schema('Dish'))
print_query("SELECT * FROM Dish LIMIT 10")


print_table_schema('MenuJoined', get_table_schema('MenuJoined'))
print_query("SELECT * FROM MenuJoined LIMIT 10")
    
# Example join.
res = cur.execute("SELECT * FROM menuPage INNER JOIN menuItem on menuPage.id = menuItem.menu_page_id LIMIT 10")
for i, r in enumerate(res.fetchall()[:10]):
    print(f'{i}: {r}')
    

Schema for `menuItem`
id            , type:INT          notnull:1, dflt:None, pk:0
menu_page_id  , type:INT          notnull:0, dflt:None, pk:0
price         , type:NUMERIC      notnull:0, dflt:None, pk:0
high_price    , type:NUMERIC      notnull:0, dflt:None, pk:0
dish_id       , type:INT          notnull:0, dflt:None, pk:0


0: (1, 1389, 0.4, None, 1)
1: (2, 1389, 0.6, None, 2)
2: (3, 1389, 0.4, None, 3)
3: (4, 1389, 0.5, None, 4)
4: (5, 3079, 0.5, 1, 5)
5: (6, 1389, 0.1, None, 7)
6: (8, 1389, 0.25, None, 9)
7: (9, 1389, 0.75, None, 10)
8: (10, 1389, 0.75, None, 11)
9: (11, 1389, 0.6, None, 8)
Schema for `nypl_menu_step_2_sql_export_demo`
id            , type:INT          notnull:1, dflt:None, pk:0
sponsor       , type:VARCHAR(255) notnull:0, dflt:None, pk:0
event         , type:VARCHAR(255) notnull:0, dflt:None, pk:0
venue         , type:VARCHAR(255) notnull:0, dflt:None, pk:0
location      , type:VARCHAR(255) notnull:0, dflt:None, pk:0


0: (12463, 'hotel', 'breakfast', 'commercial

# Use Case Queries

In [24]:
target_foods = ['eggs', 'coffee', 'steak', 'apple pie']

all_sponsors = [
    'club',  # Generally private social/dinner clubs.
    'hotel',
    'rail',
    'restaurant',
    'ship',
    '',  # blank
]
all_events = [
    'anniversary',  # Generally private event
    'annual event',  # Generally private event
    'banquet',  # Generally private event
    'breakfast',
    'brunch',
    'daily menu',
    'diner',
    'dinner',
    'lunch',
    'private',
    'supper',
    'wine',
    '',  # (blank)
]
all_venues = [
    'airline',
    'club',  # Generally private social/dinner clubs.
    'commercial',
    'edu',  # Generally private event
    'government',  # Generally private event
    'hotel',
    'military',  # Generally private event
    'null',
    'patriotic',  # Generally private event
    'private',
    'professional',  # Generally private event
    'railroad',
    'religious',  # Generally private event
    'restaurant',
    'royal',  # Generally private event
    'ship',
    'social',  # Often a banquet, ball, anniversary, annual meeting, or prof event.
    '',  # (blank)
]
    
public_sponsors = ['hotel', 'rail', 'restaurant', 'ship', '']
public_events = ['breakfast', 'brunch', 'daily menu', 'diner', 'dinner', 'lunch', 'supper', 'wine', '']
public_venues = [ 'airline', 'commercial', 'hotel', 'railroad', 'restaurant', 'ship', '']
quote = lambda x: '\'' + x + '\''
public_sponsors_as_sql_in = ''.join(['(', ','.join([quote(x) for x in public_sponsors]), ')'])
public_events_as_sql_in =  ''.join(['(', ','.join([quote(x) for x in public_events]), ')'])
public_venues_as_sql_in =  ''.join(['(', ','.join([quote(x) for x in public_venues]), ')'])

In [28]:
# u1_query = """
# WITH AllVenues AS (
#     SELECT *
#     FROM
#       nypl_menu_step_2_sql_export_demo AS Menu INNER JOIN
#       menuPage ON Menu.id = menuPage.id INNER JOIN
#       menuItem ON menuPage.id = menuItem.menu_page_id INNER JOIN
#       Dish ON Dish.id = menuItem.dish_id
#     WHERE
#       Dish.name IS NOT NULL AND
#       Dish.name LIKE '%{0}%'
#     LIMIT 200
# ),
# AllPublicVenus AS (
#     SELECT *
#     FROM
#         AllVenues
#     WHERE
#         AllVenues.sponsor IN {1} AND
#         AllVenues.event IN {2} AND
#         AllVenues.venue IN {3}
# )

# ;
# """
u1_query = """
WITH AllVenues AS (
    SELECT *
    FROM
      MenuJoined
    WHERE
      name IS NOT NULL AND
      name LIKE '%{0}%'
    LIMIT 1000
),
AllPublicVenues AS (
    SELECT *
    FROM
        AllVenues
    WHERE
        AllVenues.sponsor IN {1} OR
        AllVenues.event IN {2} OR
        AllVenues.venue IN {3}
),
-- Now we focus on price data. Condense the 
PriceData AS (
    SELECT
        IIF(price IS NOT NULL, price, 0) AS price, 
        IIF(high_price IS NOT NULL, high_price, 0) AS high_price,
        IIF(lowest_price IS NOT NULL, lowest_price, 0) AS lowest_price, 
        IIF(highest_price IS NOT NULL, highest_price, 0) AS highest_price
    FROM
        AllPublicVenues
),
AveragePriceData AS (
    SELECT
        ROUND(AVG(price), 2) AS avg_price,
        ROUND(AVG(high_price), 2) AS avg_high,
        ROUND(AVG(lowest_price), 2) AS avg_lowest,
        ROUND(AVG(highest_price), 2) AS avg_highest
    FROM
        PriceData
),
Results AS (
    SELECT
        *
    FROM
        AllPublicVenues CROSS JOIN
        AveragePriceData
)
SELECT
    name, sponsor, event, venue, avg_price, avg_high, avg_lowest, avg_highest
FROM
    Results
;
"""
u1_query_fmt = u1_query.format(target_foods[0], public_sponsors_as_sql_in, public_events_as_sql_in, public_venues_as_sql_in)

res = cur.execute(u1_query_fmt)
for i, r in enumerate(res.fetchall()[:10]):
    print(f'{i}: {r}')

0: ('eggs poached on toast', None, 'dinner', 'commercial', 0.34, 0.0, 0.11, 8.79)
1: ('eggs a la russienne', None, 'dinner', 'commercial', 0.34, 0.0, 0.11, 8.79)
2: ('scrambled eggs with sardelles', None, 'dinner', 'commercial', 0.34, 0.0, 0.11, 8.79)
3: ('boiled eggs', None, 'dinner', 'commercial', 0.34, 0.0, 0.11, 8.79)
4: ('eggs en cocotte', None, 'dinner', 'commercial', 0.34, 0.0, 0.11, 8.79)
5: ('scrambled eggs', None, 'dinner', 'commercial', 0.34, 0.0, 0.11, 8.79)
6: ('eggs poached on toast', None, 'dinner', 'social', 0.34, 0.0, 0.11, 8.79)
7: ('light boiled eggs', None, 'dinner', 'social', 0.34, 0.0, 0.11, 8.79)
8: ('hard boiled eggs', None, 'dinner', 'social', 0.34, 0.0, 0.11, 8.79)
9: ('fried eggs', None, 'dinner', 'social', 0.34, 0.0, 0.11, 8.79)


## Validation Queries

Please put Step \#5 here.

# Filtering

In [26]:
# The following sections enable individual filtering steps.
FILTER_RM_TITLES_WITHOUT_YEARS = True
MUTATION_ADD_YEAR_COL = True

# All filters/mutations will be applied to the fdf (filtered data frame). The original will be preserved.
fdf = df.copy()

NameError: name 'df' is not defined

In [None]:
titles = fdf['title']
titles[:100]

In [None]:
# Find titles with the year in them
total_titles = fdf['title'].size
null_titles = fdf.title.isnull().sum()
title_with_year = fdf['title'].str.contains('[012]\d{2,3}')
num_titles_with_year = title_with_year.sum()
pct_with_year = num_titles_with_year / (total_titles - null_titles)
print(f'total: {total_titles}, nulls: {null_titles},  '
      f'num_with_year: {num_titles_with_year}, % w/yr: {pct_with_year:0.4}%')

In [None]:
if FILTER_RM_TITLES_WITHOUT_YEARS:
    fdf = fdf[fdf['title'].notna()]
    title_with_year = fdf['title'].str.contains('[012]\d{2,3}')
    fdf = fdf[title_with_year]
    
if MUTATION_ADD_YEAR_COL:
    if FILTER_RM_TITLES_WITHOUT_YEARS:
        fdf['year'] = fdf.title.str.extract(pat='([012]\d{2,3})', expand=False)
        #fdf['year'] = pd.to_datetime(fdf['year'], format='%y', errors='raise')
        fdf['year'] = fdf['year'].astype('int32')