# Imports

In [None]:
import os
import time
from typing import List, Tuple, Dict, Optional

import pandas as pd
import numpy as np

# Load Data

In [None]:
# File names of the four CSV tables read below.
_DISH_CSV_FILE = 'Dish.csv'
_MENU_CSV_FILE = 'Menu.csv'
_MENU_ITEM_CSV_FILE = 'MenuItem.csv'
_MENU_PAGE_CSV_FILE = 'MenuPage.csv'

# All tables live in <parent of the working directory>/data/NYPL-menus.
_DATA_DIR = os.path.join(os.path.dirname(os.getcwd()), 'data', 'NYPL-menus')

# Dishes.
path = os.path.join(_DATA_DIR, _DISH_CSV_FILE)
dish_df = pd.read_csv(path)

# Menus.
path = os.path.join(_DATA_DIR, _MENU_CSV_FILE)
menu_df = pd.read_csv(path)

# Menu items.
path = os.path.join(_DATA_DIR, _MENU_ITEM_CSV_FILE)
mi_df = pd.read_csv(path)

# Menu pages.
path = os.path.join(_DATA_DIR, _MENU_PAGE_CSV_FILE)
mp_df = pd.read_csv(path)

In [None]:
# Preview the table loaded from Dish.csv.
dish_df

In [None]:
# Preview the table loaded from Menu.csv.
menu_df

In [None]:
# Preview the table loaded from MenuItem.csv.
mi_df

In [None]:
# Report how many menu pages have no page number, then preview the table.
# The integer cast below is left disabled (presumably because of the null
# page numbers counted here -- confirm before re-enabling).
#mp_df['page_number'] = mp_df['page_number'].astype('int')
null_page_count = mp_df['page_number'].isnull().sum()
print(f"null pages: {null_page_count}")
mp_df

# Joining

In [None]:
# Pair every menu item with the page it sits on: MenuItem.menu_page_id -> MenuPage.id.
# Inner join, so rows without a partner on the other side are dropped.
all_pages = pd.merge(
    mp_df,
    mi_df,
    how='inner',
    left_on='id',
    right_on='menu_page_id',
)
all_pages = all_pages.reset_index(drop=True)
# The two `id` columns come out of the merge suffixed as id_x / id_y; discard both.
all_pages = all_pages.drop(columns=['id_x', 'id_y'])
all_pages

In [None]:
# Add the menu-level columns to every page/item row (menu_id -> Menu.id, inner join).
all_menus = pd.merge(
    all_pages,
    menu_df,
    how='inner',
    left_on='menu_id',
    right_on='id',
)
# Renumber the rows, then drop the menu's `id` key column.
all_menus = all_menus.reset_index(drop=True).drop(columns=['id'])
all_menus

In [None]:
# Add the dish-level columns to every row (dish_id -> Dish.id, inner join).
all_items = pd.merge(
    all_menus,
    dish_df,
    how='inner',
    left_on='dish_id',
    right_on='id',
)
# Drop the dish's `id` key column.
all_items = all_items.drop(columns=['id'])
# List the columns of the fully joined frame, then preview it.
print(all_items.columns)
all_items

# Filtering

In [None]:
# Switches that enable the individual filtering/mutation steps further down.
FILTER_RM_TITLES_WITHOUT_YEARS = True  # drop rows whose title has no year-like number
MUTATION_ADD_YEAR_COL = True           # add a `year` column extracted from the title

# All filters/mutations are applied to fdf (the filtered data frame); the frame
# it is copied from is preserved.
# NOTE(review): this cell used to copy `df`, a name that is never assigned in
# this notebook, so it raised NameError on a clean top-to-bottom run. The joined
# frame `all_items` is used instead -- confirm it carries the `title` column the
# following cells rely on.
fdf = all_items.copy()

In [None]:
# Peek at the first 100 titles (positional slice).
titles = fdf['title']
titles.iloc[:100]

In [None]:
# Count the titles that contain a year-like number: a 0, 1 or 2 followed by
# two or three more digits.
total_titles = fdf['title'].size
null_titles = fdf['title'].isnull().sum()
# Raw string: in a plain literal '\d' is an invalid escape sequence, which
# recent Python versions warn about.
title_with_year = fdf['title'].str.contains(r'[012]\d{2,3}')
num_titles_with_year = title_with_year.sum()
non_null_titles = total_titles - null_titles
# Fraction (0..1) of the non-null titles that contain a year; 0.0 when there is
# nothing to divide by.
pct_with_year = num_titles_with_year / non_null_titles if non_null_titles else 0.0
# The `%` presentation type scales the fraction by 100; the previous format
# printed the raw fraction followed by a literal '%' (12.3% showed as 0.123%).
print(f'total: {total_titles}, nulls: {null_titles},  '
      f'num_with_year: {num_titles_with_year}, % w/yr: {pct_with_year:.2%}')

In [None]:
# Year-like token: a 0, 1 or 2 followed by two or three more digits.
# Raw string, because '\d' in a plain literal is an invalid escape sequence.
_YEAR_PATTERN = r'[012]\d{2,3}'

if FILTER_RM_TITLES_WITHOUT_YEARS:
    # Remove null titles first so the match mask below contains no NaN.
    fdf = fdf[fdf['title'].notna()]
    title_with_year = fdf['title'].str.contains(_YEAR_PATTERN)
    # Explicit copy so the column assignment below writes to an independent
    # frame instead of triggering SettingWithCopyWarning.
    fdf = fdf[title_with_year].copy()

# The year column is only added when the filter above ran: without it some
# titles have no match and the int32 cast would fail on the resulting NaN.
if MUTATION_ADD_YEAR_COL and FILTER_RM_TITLES_WITHOUT_YEARS:
    # Take the first year-like token of each title and store it as an integer.
    year_text = fdf['title'].str.extract(pat=f'({_YEAR_PATTERN})', expand=False)
    fdf['year'] = year_text.astype('int32')

In [None]:
# Preview the filtered data frame.
fdf