In [1]:
import pickle
import sys
import numpy as np
import pandas as pd

## PATHS

In [4]:
# Pickle files read by this notebook.
ing_in_input, usda_input, recipes_input = (
    'data/join/ing_in_table.pkl',
    'data/join/usda.pkl',
    'data/raw/full_recipes.pkl',
)
# Pickle files written by this notebook.
ing_in_output, recipes_output, usda_output = (
    'data/tables/ing_in.pkl',
    'data/tables/recipes.pkl',
    'data/tables/usda.pkl',
)

## Load Inputs

In [23]:
def load(path):
    """Unpickle and return the object stored in the file at ``path``.

    The file is opened in binary mode inside a ``with`` block, so the handle
    is closed as soon as unpickling finishes (or raises) instead of being left
    for the garbage collector.

    Parameters
    ----------
    path : str or path-like
        Location of the pickle file to read.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    NOTE(review): ``pickle.load`` can execute arbitrary code embedded in the
    file. Only call this on pickles produced by a trusted pipeline.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)

# Load each input pickle, in order, and bind the results to ing_in, usda and recipes.
ing_in, usda, recipes = map(load, (ing_in_input, usda_input, recipes_input))

Remove every USDA entry that is not referenced by at least one row of `ing_in` (via its `usda_id` column). This drops about 3k ingredients.

In [24]:
# usda_id values referenced by the ingredient table
feat_ings = ing_in['usda_id']
# boolean row mask: True where the USDA index label is one of those ids
feated = usda.index.isin(feat_ings.values)
# keep only the referenced rows (all columns are retained)
usda = usda[feated]

Drop unnecessary nutrition columns

In [25]:
# Columns removed from the USDA table. The labels must match the table's headers
# exactly (drop raises KeyError on an unknown label), so the irregular spellings
# such as 'Copper_mg)' and 'Choline_Tot_ (mg)' are intentional.
unused_nutrition_cols = [
    'Shrt_Desc',
    'Water_(g)', 'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)',
    'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)', 'Phosphorus_(mg)',
    'Potassium_(mg)', 'Zinc_(mg)', 'Copper_mg)', 'Manganese_(mg)',
    'Selenium_(µg)',
    'Vit_C_(mg)', 'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)',
    'Panto_Acid_mg)', 'Vit_B6_(mg)',
    'Folate_Tot_(µg)', 'Folic_Acid_(µg)', 'Food_Folate_(µg)', 'Folate_DFE_(µg)',
    'Choline_Tot_ (mg)', 'Vit_B12_(µg)',
    'Vit_A_IU', 'Vit_A_RAE', 'Retinol_(µg)',
    'Alpha_Carot_(µg)', 'Beta_Carot_(µg)', 'Beta_Crypt_(µg)',
    'Lycopene_(µg)', 'Lut+Zea_ (µg)',
    'Vit_E_(mg)', 'Vit_D_µg', 'Vit_D_IU', 'Vit_K_(µg)',
    'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)',
    'Refuse_Pct',
]
# Rebind usda to the narrowed frame.
usda = usda.drop(columns=unused_nutrition_cols)

In [26]:
# Preview the first rows of usda to check which columns remain.
usda.head()

Unnamed: 0_level_0,Energ_Kcal,Protein_(g),Sugar_Tot_(g),Sodium_(mg),Cholestrl_(mg),GmWt_1,GmWt_Desc1,GmWt_2,GmWt_Desc2,desc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1001,717.0,0.85,0.06,643.0,215.0,5.0,"1 pat, (1"" sq, 1/3"" high)",14.2,1 tbsp,butter with salt
1002,718.0,0.49,0.06,583.0,225.0,3.8,"1 pat, (1"" sq, 1/3"" high)",9.4,1 tbsp,butter whipped with salt
1003,876.0,0.28,0.0,2.0,256.0,12.8,1 tbsp,205.0,1 cup,butter oil anhydrous
1004,353.0,21.4,0.5,1146.0,75.0,28.35,1 oz,17.0,1 cubic inch,blue cheese
1005,371.0,23.24,0.51,560.0,94.0,132.0,"1 cup, diced",113.0,"1 cup, shredded",brick cheese


Rename columns in usda

In [33]:
# Original USDA header -> short column name used from here on.
usda_short_names = {
    'Energ_Kcal': 'cal',
    'Protein_(g)': 'protein',
    'Sugar_Tot_(g)': 'sugar',
    'Sodium_(mg)': 'sodium',
    'Cholestrl_(mg)': 'cholesterol',
    'GmWt_1': 'hwt_1',
    'GmWt_Desc1': 'house_1',
    'GmWt_2': 'hwt_2',
    'GmWt_Desc2': 'house_2',
}
# Rebind usda to the renamed frame.
usda = usda.rename(columns=usda_short_names)

Make recipes into a dataframe

In [43]:
# Fields copied from every recipe into the dataframe, one column each.
colnames = ['title', 'instructions', 'picture_link', 'src']

# Recipe ids become the dataframe index, in the iteration order of `recipes`.
keys = list(recipes.keys())

# One list per column. A field that is absent or falsy (e.g. empty string,
# None) is stored as NaN. `np.nan` is used because the `np.NaN` alias was
# removed in NumPy 2.0 and raises AttributeError there.
cols = {
    col: [
        recipe[col] if col in recipe and recipe[col] else np.nan
        for recipe in recipes.values()
    ]
    for col in colnames
}

# init df
recipes_df = pd.DataFrame(index=keys, data=cols)

# Show the distinct titles as a quick sanity check of the result.
recipes_df['title'].unique()

array(['Slow Cooker Chicken and Dumplings',
       'Awesome Slow Cooker Pot Roast', 'Brown Sugar Meatloaf', ...,
       'Fresh Cheese with Spinach ',
       'Jalapeño and Lime–Marinated Skirt Steak Tacos ',
       'Semolina–Lemon Syrup Cakes '], dtype=object)

In [45]:
# List the columns of ing_in before it is saved.
ing_in.columns

Index(['comment', 'display', 'unit', 'name', 'input', 'qty', 'other',
       'usda_id'],
      dtype='object')

## Save outputs

In [46]:
def save(path, obj):
    """Pickle ``obj`` into the file at ``path``.

    The file is opened with mode ``'wb'``, so an existing file at ``path`` is
    overwritten. Writing happens inside a ``with`` block so the data is
    flushed and the handle closed before the function returns, rather than
    whenever the file object happens to be garbage-collected.

    Parameters
    ----------
    path : str or path-like
        Destination file.
    obj : object
        Any picklable object.

    Returns
    -------
    None
    """
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh)
    
# Write each finished table to its output path.
for out_path, table in (
    (recipes_output, recipes_df),
    (usda_output, usda),
    (ing_in_output, ing_in),
):
    save(out_path, table)