# Imports

In [None]:
import os
import time
from typing import List, Tuple, Dict, Optional

import pandas as pd
import numpy as np
import sqlite3

CREATE TABLE Menu(
    id int,
    name text,
    place text,
    physical_description text,
    occasion text,
    notes text,
    call_number text,
    date date,
    location text,
    currency text,
    currency_symbol varchar(3),
    status varchar(12),
    page_count int,
    dish_count int,
    PRIMARY KEY(id)
);

CREATE TABLE MenuPage(
    id int,
    menu_id int,
    page_number int,
    image_id int,
    full_height int,
    full_width int,
    uuid varchar(37),
    PRIMARY KEY(id)
    FOREIGN KEY(menu_id) REFERENCES Menu(id)
);

CREATE TABLE Dish(
    id int,
    name text,
    menus_appeared int,
    times_appeared int,
    first_appeared int,
    last_appeared int,
    lowest_price float,
    highest_price float,
    PRIMARY KEY(id)
);

CREATE TABLE MenuItems(
    id int,
    menu_page_id int,
    price float,
    high_price float,
    dish_id int,
    created_at datetime,
    updated_at datetime,
    x_pos float,
    y_pos float,
    PRIMARY KEY(id)
    FOREIGN KEY(menu_page_id) REFERENCES MenuPage(id)
    FOREIGN KEY(dish_id) REFERENCES Dish(id)
);


# Load Data

In [None]:
_DISH_CSV_FILE = 'Dish.csv'
_MENU_CSV_FILE = 'Menu.csv'
_MENU_ITEM_CSV_FILE = 'MenuItem.csv'
_MENU_PAGE_CSV_FILE = 'MenuPage.csv'
path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'NYPL-menus', _DISH_CSV_FILE)
dish_df = pd.read_csv(path)
path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'NYPL-menus', _MENU_CSV_FILE)
menu_df = pd.read_csv(path)
path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'NYPL-menus', _MENU_ITEM_CSV_FILE)
mi_df = pd.read_csv(path)
path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'NYPL-menus', _MENU_PAGE_CSV_FILE)
mp_df = pd.read_csv(path)

In [None]:
dish_df

In [None]:
menu_df

In [None]:
mi_df

In [None]:
#mp_df['page_number'] = mp_df['page_number'].astype('int')
print(f"null pages: {mp_df['page_number'].isnull().sum()}")
mp_df

# Joining

In [3]:
#all_items = pd.concat([mp_df.set_index('id'), mi_df.set_index('menu_page_id')], axis=1, join='inner')
all_pages = mp_df.merge(mi_df, left_on='id', right_on='menu_page_id', how='inner').reset_index(drop=True)
all_pages = all_pages.drop(labels=['id_x', 'id_y'], axis='columns')
# all_pages

In [4]:
all_menus = all_pages.merge(menu_df, left_on='menu_id', right_on='id', how='inner').reset_index(drop=True)
all_menus = all_menus.drop(labels=['id'], axis='columns')
# all_menus

In [5]:
all_items = all_menus.merge(dish_df, left_on='dish_id', right_on='id', how='inner')
all_items = all_items.drop(labels=['id'], axis='columns')
# print(all_items.columns)
# all_items

# Load Data SQL

## Cleaned Data

In [10]:
create_menu_path = os.path.join(os.path.dirname(os.getcwd()), 'data', "nypl_menu_step_2.sql")
create_menu_page_path = os.path.join(os.path.dirname(os.getcwd()), 'data', "menuPage.sql")
create_menu_item_path = os.path.join(os.path.dirname(os.getcwd()), 'data', "menuItem_cleaned.sql")
# create_dish_path = os.path.join(os.path.dirname(os.getcwd()), 'data', "filtered_dish.sql")
create_dish_path = os.path.join(os.path.dirname(os.getcwd()), 'data', "filtered_dish_2.sql")
menu_db_path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'NYPL-menus', "menudb.db")
con = sqlite3.connect(menu_db_path)
cur = con.cursor()

# Load our exported OpenRefine SQL script.
with open(create_menu_path) as f:
    db_menu_create = f.read()
    
with open(create_menu_page_path) as f:
    db_menu_page_create = f.read()
    
with open(create_menu_item_path) as f:
    db_menu_item_create = f.read()
    
with open(create_dish_path) as f:
    db_dish_create = f.read()

# Create the database using our creation script.
res = cur.executescript(db_menu_create)
con.commit()

# Create the database using our creation script.
res = cur.executescript(db_menu_page_create)
con.commit()

# Create the database using our creation script.
res = cur.executescript(db_menu_item_create)
con.commit()

# Create the database using our creation script.
res = cur.executescript(db_dish_create)
con.commit()


def MaybeDropColumnsMenu(con, cur, tbl_name = 'nypl_menu_step_2_sql_export_demo'):
    cols_to_drop = ['name','notes', 'occasion', 'place', 'keywords', 'page_count', 'dish_count',]
    template = lambda col: f'ALTER TABLE {tbl_name} DROP COLUMN {col};'
    templates = [template(col) for col in cols_to_drop]
    for t in templates:
        res = cur.executescript(t)
    con.commit()
    
def MaybeDropColumnsMenuItem(con, cur, tbl_name = 'menuItem'):
    cols_to_drop = ['created_at', 'updated_at', 'xpos', 'ypos']
    template = lambda col: f'ALTER TABLE {tbl_name} DROP COLUMN {col};'
    templates = [template(col) for col in cols_to_drop]
    for t in templates:
        res = cur.executescript(t)
    con.commit()

def MaybeDropColumnsMenuPage(con, cur, tbl_name = 'menuPage'):
    cols_to_drop = ['page_number', 'image_id', 'full_height', 'full_width','uuid']
    template = lambda col: f'ALTER TABLE {tbl_name} DROP COLUMN {col};'
    templates = [template(col) for col in cols_to_drop]
    for t in templates:
        res = cur.executescript(t)
    con.commit()

def MaybeDropColumnsDish(con, cur, tbl_name = 'Dish'):
    return
    # cols_to_drop = []
    # template = lambda col: f'ALTER TABLE {tbl_name} DROP COLUMN {col};'
    # templates = [template(col) for col in cols_to_drop]
    # for t in templates:
    #     res = cur.executescript(t)
    # con.commit()

[f(con, cur) for f in [MaybeDropColumnsMenu, MaybeDropColumnsMenuItem, MaybeDropColumnsMenuPage, MaybeDropColumnsDish]]

[None, None, None, None]

In [None]:
# Row null/blank deletions will go here.

def MaybeDropRowMenu(con, cur, tbl_name: str = 'nypl_menu_step_2_sql_export_demo'):
    script = '''
        DELETE FROM {0}
        WHERE
            sponsor = '' AND
            event = '' AND
            venue = '' AND
            location = ''
    '''
    res = cur.executescript(string.format(script, tbl_name))
    con.commit()
    

# Not needed. Done in csv and updated sql
def MaybeDropRowDish(con, cur, tbl_name: str = 'dish'):
    # First appeared, last_appeared.
    # Menu appeared, times_appeared.
    # Ahmad need to add a SORT BY to final results that ranks the result by the Menu Appeared, and by the First appeared.
    pass

In [15]:
# We'll have to join all the tables later, let's do this once in a temp table.
def CreateJoinedTable(con, cur):
    q = """
        DROP TABLE IF EXISTS MenuJoined;
        CREATE TEMP TABLE MenuJoined AS
        SELECT
            Menu.id AS MenuId,
            menuPage.id AS MenuPageId,
            menuItem.id AS MenuItemId,
            Dish.id AS DishId,
            *
        FROM
          nypl_menu_step_2_sql_export_demo AS Menu INNER JOIN
          menuPage ON Menu.id = menuPage.id INNER JOIN
          menuItem ON menuPage.id = menuItem.menu_page_id INNER JOIN
          Dish ON Dish.id = menuItem.dish_id
        ;
    """
    res = cur.executescript(q)
    con.commit()
    
CreateJoinedTable(con, cur)

## Uncleaned Data

In [14]:
create_uncleaned_menu_path = os.path.join(os.path.dirname(os.getcwd()), 'data', "menu_uncleaned.sql")
create_uncleaned_menu_page_path = os.path.join(os.path.dirname(os.getcwd()), 'data', "menuPage_uncleaned.sql")
create_uncleaned_menu_item_path = os.path.join(os.path.dirname(os.getcwd()), 'data', "menuItem_uncleaned.sql")
create_uncleaned_dish_path = os.path.join(os.path.dirname(os.getcwd()), 'data', "dish_uncleaned.sql")
uncleaned_menu_db_path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'NYPL-menus', "uncleaned_menudb.db")
ucon = sqlite3.connect(uncleaned_menu_db_path)
ucur = ucon.cursor()

# Load our exported OpenRefine SQL script.
with open(create_uncleaned_menu_path) as f:
    db_menu_create = f.read()
    
with open(create_uncleaned_menu_page_path) as f:
    db_menu_page_create = f.read()
    
with open(create_uncleaned_menu_item_path) as f:
    db_menu_item_create = f.read()
    
with open(create_uncleaned_dish_path) as f:
    db_dish_create = f.read()

# Create the database using our creation script.
ures = ucur.executescript(db_menu_create)
ucon.commit()

# Create the database using our creation script.
ures = ucur.executescript(db_menu_page_create)
ucon.commit()

# Create the database using our creation script.
ures = ucur.executescript(db_menu_item_create)
ucon.commit()

# Create the database using our creation script.
ures = ucur.executescript(db_dish_create)
ucon.commit()


In [16]:
zipped = zip([MaybeDropColumnsMenu, MaybeDropColumnsMenuItem, MaybeDropColumnsMenuPage, MaybeDropColumnsDish],
             ['menu_uncleaned', 'menu_item_uncleaned', 'menu_page_uncleaned', 'dish_uncleaned'])
[f(ucon, ucur, tbl) for f, tbl in zipped]

[None, None, None, None]

In [17]:
# We'll have to join all the tables later, let's do this once in a temp table.
def CreateJoinedTableUnclean(con, cur):
    q = """
        DROP TABLE IF EXISTS MenuJoined;
        CREATE TEMP TABLE MenuJoined AS
        SELECT
            Menu.id AS MenuId,
            menuPage.id AS MenuPageId,
            menuItem.id AS MenuItemId,
            Dish.id AS DishId,
            *
        FROM
          menu_uncleaned AS Menu INNER JOIN
          menu_page_uncleaned AS menuPage ON Menu.id = menuPage.id INNER JOIN
          menu_item_uncleaned AS menuItem ON menuPage.id = menuItem.menu_page_id INNER JOIN
          dish_uncleaned AS Dish ON Dish.id = menuItem.dish_id
        ;
    """
    res = cur.executescript(q)
    con.commit()
    
CreateJoinedTableUnclean(ucon, ucur)

## List Clean Schema and Examples

In [18]:
def get_table_schema(cur, table_name: str):
    """Return the schema of a table
    
   Columns in the result set include:
       "name" (its name);
       "type" (data type if given, else '');
       "notnull" (whether or not the column can be NULL);
       "dflt_value" (the default value for the column); and
       "pk" (either zero for columns that are not part of the primary key, or the 1-based index of the column within the primary key).
   """
    res = cur.execute(f"PRAGMA table_info({table_name});")
    return res.fetchall()
    
def print_table_schema(cur, table_name: str, schema: Tuple[str]):
    """Print the schema of a table
      
      Columns in the result set include:
          "name" (its name);
          "type" (data type if given, else '');
          "notnull" (whether or not the column can be NULL);
          "dflt_value" (the default value for the column); and
          "pk" (either zero for columns that are not part of the primary key, or the 1-based index of the column within the primary key).
    """
    print(f'Schema for `{table_name}`')
    for col in schema:
        print(f'{col[1]:<14}, type:{col[2]:<12} notnull:{col[3]}, dflt:{col[4]}, pk:{col[5]}')
    print('\n')
    
def query_as_list(cur, query: str, limit: int = 10):
    """Run a query and return the first limit results."""
    res = cur.execute(query)
    res = res.fetchall()
    return res[:min(limit,len(res))]
    
def print_query(cur, query: str, limit: int = 10):
    """Run a query and print the first 10 results."""
    for i, r in enumerate(query_as_list(cur, query, limit)):
        print(f'{i}: {r}')
        

# Fetch some sample results.
print_table_schema(cur, 'menuItem', get_table_schema(cur, 'menuItem'))
print_query(cur, "SELECT * FROM menuItem LIMIT 10")

# Fetch some sample results.
print_table_schema(cur, 'nypl_menu_step_2_sql_export_demo', get_table_schema(cur, 'nypl_menu_step_2_sql_export_demo'))
print_query(cur, "SELECT * FROM nypl_menu_step_2_sql_export_demo LIMIT 10")
    
# Fetch some sample results.
print_table_schema(cur, 'menuPage', get_table_schema(cur, 'menuPage'))
print_query(cur, "SELECT * FROM menuPage LIMIT 10")

# Fetch some sample results.
print_table_schema(cur, 'Dish', get_table_schema(cur, 'Dish'))
print_query(cur, "SELECT * FROM Dish LIMIT 10")


print_table_schema(cur, 'MenuJoined', get_table_schema(cur, 'MenuJoined'))
print_query(cur, "SELECT * FROM MenuJoined LIMIT 10")
    
# Example join.
res = cur.execute("SELECT * FROM menuPage INNER JOIN menuItem on menuPage.id = menuItem.menu_page_id LIMIT 10")
for i, r in enumerate(res.fetchall()[:10]):
    print(f'{i}: {r}')
    

Schema for `menuItem`
id            , type:INT          notnull:1, dflt:None, pk:0
menu_page_id  , type:INT          notnull:0, dflt:None, pk:0
price         , type:NUMERIC      notnull:0, dflt:None, pk:0
high_price    , type:NUMERIC      notnull:0, dflt:None, pk:0
dish_id       , type:INT          notnull:0, dflt:None, pk:0


0: (1, 1389, 0.4, None, 1)
1: (2, 1389, 0.6, None, 2)
2: (3, 1389, 0.4, None, 3)
3: (4, 1389, 0.5, None, 4)
4: (5, 3079, 0.5, 1, 5)
5: (6, 1389, 0.1, None, 7)
6: (8, 1389, 0.25, None, 9)
7: (9, 1389, 0.75, None, 10)
8: (10, 1389, 0.75, None, 11)
9: (11, 1389, 0.6, None, 8)
Schema for `nypl_menu_step_2_sql_export_demo`
id            , type:INT          notnull:1, dflt:None, pk:0
sponsor       , type:VARCHAR(255) notnull:0, dflt:None, pk:0
event         , type:VARCHAR(255) notnull:0, dflt:None, pk:0
venue         , type:VARCHAR(255) notnull:0, dflt:None, pk:0
location      , type:VARCHAR(255) notnull:0, dflt:None, pk:0


0: (12463, 'hotel', 'breakfast', 'commercial

## List Unclean Schema and Examples

In [19]:
# Fetch some sample results.
print_table_schema(ucur, 'menu_item_uncleaned', get_table_schema(ucur, 'menu_item_uncleaned'))
print_query(ucur, "SELECT * FROM menu_item_uncleaned LIMIT 10")

# Fetch some sample results.
print_table_schema(ucur, 'menu_uncleaned', get_table_schema(ucur, 'menu_uncleaned'))
print_query(ucur, "SELECT * FROM menu_uncleaned LIMIT 10")
print_query(ucur, "SELECT * FROM menu_uncleaned WHERE sponsor = 'ARCHITECTURAL LEAGUE OF NEW YORK' LIMIT 10")
    
# Fetch some sample results.
print_table_schema(ucur, 'menu_page_uncleaned', get_table_schema(ucur, 'menu_page_uncleaned'))
print_query(ucur, "SELECT * FROM menu_page_uncleaned LIMIT 10")

# Fetch some sample results.
print_table_schema(ucur, 'dish_uncleaned', get_table_schema(ucur, 'dish_uncleaned'))
print_query(ucur, "SELECT * FROM dish_uncleaned LIMIT 10")


print_table_schema(ucur, 'MenuJoined', get_table_schema(ucur, 'MenuJoined'))
print_query(ucur, "SELECT * FROM MenuJoined LIMIT 10")
print("\nArch League")
print_query(ucur, "SELECT * FROM MenuJoined WHERE sponsor = 'ARCHITECTURAL LEAGUE OF NEW YORK' LIMIT 10")
print("\nBy Menu ID")
print_query(ucur, "SELECT * FROM menu_page_uncleaned WHERE menu_id IN (22770, 25288, 21689, 21744) GROUP BY menu_id LIMIT 10")
print("\nBy MenuPage ID")
print_query(ucur, "SELECT * FROM MenuJoined WHERE MenuPageId = 26866")

Schema for `menu_item_uncleaned`
id            , type:VARCHAR(255) notnull:0, dflt:None, pk:0
menu_page_id  , type:INT          notnull:0, dflt:None, pk:0
price         , type:NUMERIC      notnull:0, dflt:None, pk:0
high_price    , type:NUMERIC      notnull:0, dflt:None, pk:0
dish_id       , type:INT          notnull:0, dflt:None, pk:0


0: ('1', 1389, 0.4, None, 1)
1: ('2', 1389, 0.6, None, 2)
2: ('3', 1389, 0.4, None, 3)
3: ('4', 1389, 0.5, None, 4)
4: ('5', 3079, 0.5, 1, 5)
5: ('6', 1389, 0.1, None, 7)
6: ('8', 1389, 0.25, None, 9)
7: ('9', 1389, 0.75, None, 10)
8: ('10', 1389, 0.75, None, 11)
9: ('11', 1389, 0.6, None, 8)
Schema for `menu_uncleaned`
id            , type:INT          notnull:0, dflt:None, pk:0
sponsor       , type:VARCHAR(255) notnull:0, dflt:None, pk:0
event         , type:VARCHAR(255) notnull:0, dflt:None, pk:0
venue         , type:VARCHAR(255) notnull:0, dflt:None, pk:0
physical_description, type:VARCHAR(255) notnull:0, dflt:None, pk:0
call_number   , type:VARCHA

# Use Case Queries

In [20]:
target_foods = ['eggs', 'coffee', 'steak', 'apple pie']

all_sponsors = [
    'club',  # Generally private social/dinner clubs.
    'hotel',
    'rail',
    'restaurant',
    'ship',
    '',  # blank
]
all_events = [
    'anniversary',  # Generally private event
    'annual event',  # Generally private event
    'banquet',  # Generally private event
    'breakfast',
    'brunch',
    'daily menu',
    'diner',
    'dinner',
    'lunch',
    'private',
    'supper',
    'wine',
    '',  # (blank)
]
all_venues = [
    'airline',
    'club',  # Generally private social/dinner clubs.
    'commercial',
    'edu',  # Generally private event
    'government',  # Generally private event
    'hotel',
    'military',  # Generally private event
    'null',
    'patriotic',  # Generally private event
    'private',
    'professional',  # Generally private event
    'railroad',
    'religious',  # Generally private event
    'restaurant',
    'royal',  # Generally private event
    'ship',
    'social',  # Often a banquet, ball, anniversary, annual meeting, or prof event.
    '',  # (blank)
]
    
public_sponsors = ['hotel', 'rail', 'restaurant', 'ship', '']
public_events = ['breakfast', 'brunch', 'daily menu', 'diner', 'dinner', 'lunch', 'supper', 'wine', '']
public_venues = [ 'airline', 'commercial', 'hotel', 'railroad', 'restaurant', 'ship', '']
quote = lambda x: '\'' + x + '\''
public_sponsors_as_sql_in = ''.join(['(', ','.join([quote(x) for x in public_sponsors]), ')'])
public_events_as_sql_in =  ''.join(['(', ','.join([quote(x) for x in public_events]), ')'])
public_venues_as_sql_in =  ''.join(['(', ','.join([quote(x) for x in public_venues]), ')'])

In [27]:
# u1_query = """
# WITH AllVenues AS (
#     SELECT *
#     FROM
#       nypl_menu_step_2_sql_export_demo AS Menu INNER JOIN
#       menuPage ON Menu.id = menuPage.id INNER JOIN
#       menuItem ON menuPage.id = menuItem.menu_page_id INNER JOIN
#       Dish ON Dish.id = menuItem.dish_id
#     WHERE
#       Dish.name IS NOT NULL AND
#       Dish.name LIKE '%{0}%'
#     LIMIT 200
# ),
# AllPublicVenus AS (
#     SELECT *
#     FROM
#         AllVenues
#     WHERE
#         AllVenues.sponsor IN {1} AND
#         AllVenues.event IN {2} AND
#         AllVenues.venue IN {3}
# )

# ;
# """
u1_query = """
WITH AllVenues AS (
    SELECT *
    FROM
      MenuJoined
    WHERE
      name IS NOT NULL AND
      name LIKE '%{0}%'
    LIMIT 1000
),
AllPublicVenues AS (
    SELECT *
    FROM
        AllVenues
    WHERE
        AllVenues.sponsor IN {1} OR
        AllVenues.event IN {2} OR
        AllVenues.venue IN {3}
),
-- Now we focus on price data. Condense the 
PriceData AS (
    SELECT
        IIF(price IS NOT NULL, price, 0) AS price, 
        IIF(high_price IS NOT NULL, high_price, 0) AS high_price,
        IIF(lowest_price IS NOT NULL, lowest_price, 0) AS lowest_price, 
        IIF(highest_price IS NOT NULL, highest_price, 0) AS highest_price
    FROM
        AllPublicVenues
),
AveragePriceData AS (
    SELECT
        ROUND(AVG(price), 2) AS avg_price,
        ROUND(AVG(high_price), 2) AS avg_high,
        ROUND(AVG(lowest_price), 2) AS avg_lowest,
        ROUND(AVG(highest_price), 2) AS avg_highest
    FROM
        PriceData
),
Results AS (
    SELECT
        *
    FROM
        AllPublicVenues CROSS JOIN
        AveragePriceData
)
SELECT
    name, sponsor, event, venue, avg_price, avg_high, avg_lowest, avg_highest, last_appeared
FROM
    Results
ORDER BY
    last_appeared
DESC
;
"""
u1_query_fmt = u1_query.format(target_foods[0], public_sponsors_as_sql_in, public_events_as_sql_in, public_venues_as_sql_in)

res = cur.execute(u1_query_fmt)
print("name, sponsor, event, venue, avg_price, avg_high, avg_lowest, avg_highest, last_appeared")
for i, r in enumerate(res.fetchall()[:10]):
    print(f'{i}: {r}')

name, sponsor, event, venue, avg_price, avg_high, avg_lowest, avg_highest, last_appeared
0: ('boiled eggs', None, 'dinner', 'commercial', 0.34, 0.0, 0.11, 8.79, '1987-01-01T00:00:00Z')
1: ('scrambled eggs', None, 'dinner', 'commercial', 0.34, 0.0, 0.11, 8.79, '1987-01-01T00:00:00Z')
2: ('fried eggs', None, 'dinner', 'social', 0.34, 0.0, 0.11, 8.79, '1987-01-01T00:00:00Z')
3: ('boiled eggs', 'hotel', 'daily menu', 'commercial', 0.34, 0.0, 0.11, 8.79, '1987-01-01T00:00:00Z')
4: ('boiled eggs', None, 'lunch', 'commercial', 0.34, 0.0, 0.11, 8.79, '1987-01-01T00:00:00Z')
5: ('fried eggs', None, 'lunch', 'commercial', 0.34, 0.0, 0.11, 8.79, '1987-01-01T00:00:00Z')
6: ('scrambled eggs', None, 'lunch', 'commercial', 0.34, 0.0, 0.11, 8.79, '1987-01-01T00:00:00Z')
7: ('poached eggs', None, 'lunch', 'commercial', 0.34, 0.0, 0.11, 8.79, '1987-01-01T00:00:00Z')
8: ('fried eggs', 'ship', 'supper', 'commercial', 0.34, 0.0, 0.11, 8.79, '1987-01-01T00:00:00Z')
9: ('boiled eggs', None, 'dinner', 'edu', 

In [None]:
# Uncleaned same query 
# u1_unclean_query_fmt = u1_query.format(target_foods[0], public_sponsors_as_sql_in, public_events_as_sql_in, public_venues_as_sql_in)
# 
# res = ucur.execute(u1_unclean_query_fmt)
# print("name, sponsor, event, venue, avg_price, avg_high, avg_lowest, avg_highest, last_appeared")
# for i, r in enumerate(res.fetchall()[:10]):
#     print(f'{i}: {r}')

## Validation Queries

Please put Step \#5 here.

In [None]:
# Menu
# Here we show that the number of rows returned for a query with
#   1. a sponsor type containing the word "club"
#   2. a event or sponsor field containing the regex [Dd]inn for `dinner`
# is lower for cleaned data than uncleaned data.
# In the Menu dataset a sponsor type containing the word "club" is almost
# always a private event. E.g. a dinner for a local badminton club. In the cleaned
# dataset we filter over 50% of the misleading club results.
# The remaining clubs in the cleaned data appear to refer to nightclubs or dinner clubs
# that are more frequently open to the public.
menu_valiation_query_1 = """
    WITH ClubPublicVenues AS (
        SELECT *
        FROM
            MenuJoined
        WHERE
            sponsor LIKE '%club%' AND 
            (sponsor LIKE '%dinn%' OR
             sponsor LIKE '%Dinn%' OR
             event LIKE '%dinn%' OR
             event LIKE '%Dinn%')
    )
    SELECT COUNT(*)
    FROM 
        ClubPublicVenues
"""

# Result on cleaned dataset.
menu_valiation_query_fmt_1 = menu_valiation_query_1.format()
print_query(cur, menu_valiation_query_fmt_1)
    
# Result on uncleaned dataset.
menu_valiation_query_fmt_1 = menu_valiation_query_1.format()
print_query(ucur, menu_valiation_query_fmt_1)

In [None]:
# Menu Validation
# Here we show that the number of rows with sponsor + event + venue
# all NULL has increased in the cleaned data. This a a result of clutering
# by the sponsor + event + event fields into a smaller subset of types/enums/categories
# the clustering and then thresholding/removal of the long tail of types that
# are referenced by a handful of row (e.g. < 10) makes the place data more accurate.
# 
# More accurate place data helps our use case U1 by only returning relevant public
# venues when the user enters a dish-name query.
menu_valiation_query_2 = """
    WITH NullPlaceInformation AS (
        SELECT *
        FROM
            MenuJoined
        WHERE
            (sponsor = '' OR sponsor IS NULL) AND
            (event = '' OR event IS NULL) AND
            (venue = '' OR venue IS NULL)
    )
    SELECT COUNT(*)
    FROM 
        NullPlaceInformation
"""
# Result on cleaned dataset.
menu_valiation_query_fmt_2 = menu_valiation_query_2.format()
print_query(cur, menu_valiation_query_fmt_2)
    
# Result on uncleaned dataset.
menu_valiation_query_fmt_2 = menu_valiation_query_2.format()
print_query(ucur, menu_valiation_query_fmt_2)

In [None]:
# Menu Validation

# Here we show that a set of handpicked private events that all contain
# the regex "[Dd]inner" in the `event` column do not appear in the cleaned
# dataset because they do not refer to private events.
# The cleaned data correctly clusters these rows as probably not private.
menu_valiation_query_3 = """
    WITH PrivateDinnerEvents AS (
        SELECT *
        FROM
            MenuJoined
        WHERE
            --(event LIKE '%dinne%' OR event LIKE '%Dinne%' OR event LIKE '%DINNER%') AND
            MenuId IN (22770, 25288, 21689, 21744)
    )
    SELECT *
    FROM 
         PrivateDinnerEvents
    GROUP BY
        MenuId
"""
# Result on cleaned dataset.
print("Cleaned")
menu_valiation_query_fmt_3 = menu_valiation_query_3.format()
print_query(cur, menu_valiation_query_fmt_3, limit=10)
print('\n\n')
    
# Result on uncleaned dataset.
print("Uncleaned")
menu_valiation_query_fmt_3 = menu_valiation_query_3.format()
print_query(ucur, menu_valiation_query_fmt_3, limit=10)

# Looking at the results below we can see that both queries return the specified
# rows because in both cases the event field contains DINNER, Dinner, or dinner.
#
# Comparing the sponsor, and venues field though
# public_sponsor = ['hotel', 'rail', 'restaurant', 'ship', '']
# public_venues = [ 'airline', 'commercial', 'hotel', 'railroad', 'restaurant', 'ship', '']
#
# The cleaned rows have `sponsor` of type military, political, and social, which are not
# included in the list of public sponsor types used in U1.

In [None]:
# Menu Validation

# TBD uppercase vs. lowercase, underscores, punctuation differences in search. dinner vs DINNER.

# Filtering

In [None]:
# The following sections enable individual filtering steps.
FILTER_RM_TITLES_WITHOUT_YEARS = True
MUTATION_ADD_YEAR_COL = True

# All filters/mutations will be applied to the fdf (filtered data frame). The original will be preserved.
fdf = df.copy()

In [None]:
titles = fdf['title']
titles[:100]

In [None]:
# Find titles with the year in them
total_titles = fdf['title'].size
null_titles = fdf.title.isnull().sum()
title_with_year = fdf['title'].str.contains('[012]\d{2,3}')
num_titles_with_year = title_with_year.sum()
pct_with_year = num_titles_with_year / (total_titles - null_titles)
print(f'total: {total_titles}, nulls: {null_titles},  '
      f'num_with_year: {num_titles_with_year}, % w/yr: {pct_with_year:0.4}%')

In [None]:
if FILTER_RM_TITLES_WITHOUT_YEARS:
    fdf = fdf[fdf['title'].notna()]
    title_with_year = fdf['title'].str.contains('[012]\d{2,3}')
    fdf = fdf[title_with_year]
    
if MUTATION_ADD_YEAR_COL:
    if FILTER_RM_TITLES_WITHOUT_YEARS:
        fdf['year'] = fdf.title.str.extract(pat='([012]\d{2,3})', expand=False)
        #fdf['year'] = pd.to_datetime(fdf['year'], format='%y', errors='raise')
        fdf['year'] = fdf['year'].astype('int32')