In [3]:
import pandas as pd
import numpy as np 
from sklearn.preprocessing import OneHotEncoder

In [62]:
# Load the train and test recipe sets from the JSON files under data/.
train = pd.read_json("data/train.json")
test = pd.read_json("data/test.json")

In [60]:
# Explode the per-recipe ingredient lists so that every (recipe, ingredient)
# pair gets its own row; id and cuisine are repeated on each of those rows.
train_expanded = train.explode('ingredients')
train_expanded.head()

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,romaine lettuce
0,10259,greek,black olives
0,10259,greek,grape tomatoes
0,10259,greek,garlic
0,10259,greek,pepper


In [61]:
# One explode already yields one row per (recipe, ingredient). A second explode
# on the now-scalar column returns the frame unchanged, so it is dropped.
train.head().explode("ingredients").sample(10)

Unnamed: 0,id,cuisine,ingredients
4,13162,indian,black pepper
4,13162,indian,onions
2,20130,filipino,chicken livers
4,13162,indian,butter
2,20130,filipino,eggs
0,10259,greek,romaine lettuce
1,25693,southern_us,tomatoes
0,10259,greek,garbanzo beans
4,13162,indian,boneless chicken skinless thigh
2,20130,filipino,yellow onion


In [91]:
# Fit a throwaway encoder on the exploded ingredients of 10 randomly sampled recipes.
# NOTE(review): sample() is unseeded, so the fitted categories differ between runs.
df = train.sample(10).explode("ingredients")
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(df[['ingredients']])

OneHotEncoder(handle_unknown='ignore')

In [109]:
# Fit the encoder on a training sample, then pass that same encoder to the test
# call so both frames use the ingredient columns learned from the training sample.
# NOTE(review): encode_recipes is defined in a later cell (In [108]); on
# Restart & Run All this cell raises NameError unless that cell is moved above it.
# NOTE(review): both sample(10) calls are unseeded, so results change between runs.
train_encoded, encoder1 = encode_recipes(train.sample(10))
test_encoded, encoder2  = encode_recipes(test.sample(10), enc=encoder1)

In [110]:
# Peek at the encoded training sample: one row per recipe id, a 0/1 column per
# ingredient, and the cuisine label as the last column.
train_encoded.head()

Unnamed: 0,id,acini di pepe,arborio rice,asparagus,baguette,bay leaf,beef tenderloin steaks,bell pepper,black beans,black pepper,...,vegetable oil spray,water,white beans,"whole kernel corn, drain",whole milk,whole wheat tortillas,yellow bell pepper,yellow mustard seeds,zucchini,cuisine
0,2610,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,italian
1,9369,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,japanese
2,13110,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,italian
3,27777,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,italian
4,34961,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,italian


In [111]:
# Peek at the encoded test sample; it was encoded with encoder1, so its
# ingredient columns are the categories fitted on the training sample.
test_encoded.head()

Unnamed: 0,id,acini di pepe,arborio rice,asparagus,baguette,bay leaf,beef tenderloin steaks,bell pepper,black beans,black pepper,...,unsalted butter,vegetable oil spray,water,white beans,"whole kernel corn, drain",whole milk,whole wheat tortillas,yellow bell pepper,yellow mustard seeds,zucchini
0,7519,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,19875,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,21753,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,25173,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,25948,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [112]:
# Sanity check: encoder1 was passed as enc to the test call, and encode_recipes
# returns the enc it used, so encoder2 should be that same encoder.
encoder1 == encoder2

True

In [108]:
def encode_recipes(recipe_df, enc=None):
    """One-hot encode each recipe's ingredients into a single 0/1 row per recipe id.

    Parameters
    ----------
    recipe_df : pandas.DataFrame
        Needs an 'id' column and an 'ingredients' column holding a list of
        ingredient names per recipe. When a 'cuisine' column is present it is
        merged back onto the encoded rows as the last column.
    enc : fitted encoder, optional
        An already fitted OneHotEncoder to reuse (e.g. the one fitted on the
        training data, so test data gets identical columns). When omitted, a
        new ``OneHotEncoder(handle_unknown='ignore')`` is fitted on
        ``recipe_df``'s own ingredients.

    Returns
    -------
    (encoded_recipes, enc)
        ``encoded_recipes`` has an 'id' column, one integer 0/1 column per
        category in ``enc.categories_[0]`` and, if available, 'cuisine'.
        Rows are ordered by ascending id. ``enc`` is the encoder that was used.
    """
    ingredients_df = recipe_df.explode("ingredients")
    if enc is None:
        enc = OneHotEncoder(handle_unknown='ignore')
        enc.fit(ingredients_df[['ingredients']])

    # feature_names_in_ is only set when the encoder was fitted on a DataFrame;
    # fall back to the column this function explodes.
    fitted_names = getattr(enc, 'feature_names_in_', None)
    col_name = 'ingredients' if fitted_names is None else fitted_names[0]
    categories = enc.categories_[0]

    # Aggregate straight from the non-zero coordinates instead of densifying the
    # (exploded rows x categories) matrix, which is far larger than the result.
    ingredient_matrix = enc.transform(ingredients_df[[col_name]])
    hit_rows, hit_cols = ingredient_matrix.nonzero()

    # Position of every exploded row's recipe among the sorted unique ids;
    # rows with a missing id get code -1 and are left out.
    recipe_codes, recipe_ids = pd.factorize(ingredients_df['id'], sort=True)
    hit_recipes = recipe_codes[hit_rows]
    has_id = hit_recipes >= 0

    presence = np.zeros((len(recipe_ids), len(categories)), dtype=int)
    presence[hit_recipes[has_id], hit_cols[has_id]] = 1

    encoded_recipes = pd.DataFrame(presence, columns=categories)
    encoded_recipes.insert(0, 'id', recipe_ids.to_numpy())

    if 'cuisine' in recipe_df.columns:
        encoded_recipes = encoded_recipes.merge(recipe_df[['id', 'cuisine']], on='id')
    return encoded_recipes, enc
        
        
def get_encoded_ingredients(recipe_df):
    """Take a dataframe with columns 'id' and 'ingredients' and one hot encode the ingredients.

    A new encoder is fitted on ``recipe_df``'s own ingredients.

    Returns
    -------
    (recipe_ohe, enc)
        ``recipe_ohe`` has an 'id' column plus one 0/1 column per ingredient,
        one row per recipe id; ``enc`` is the fitted OneHotEncoder.
    """
    # Reuse the shared implementation rather than keeping a second copy of the
    # fit/transform/aggregate steps. Only 'id' and 'ingredients' are passed on,
    # so no 'cuisine' column is merged into the result.
    return encode_recipes(recipe_df[['id', 'ingredients']])

def ohe_cuisine_recipes(recipe_with_cuisine):
    """One-hot encode the ingredients of labelled recipes and attach the cuisine.

    ``recipe_with_cuisine`` needs 'id', 'ingredients' and 'cuisine' columns.
    The fitted encoder is discarded; only the merged frame is returned.
    """
    encoded_ingredients = get_encoded_ingredients(recipe_with_cuisine)[0]
    cuisine_by_id = recipe_with_cuisine[['id', 'cuisine']]
    return encoded_ingredients.merge(cuisine_by_id, on='id')

    


In [66]:
# Encode the first five training recipes together with their cuisine labels
# and display the result.
train_sample_encoding = ohe_cuisine_recipes(train.head())
train_sample_encoding

Unnamed: 0,id,bay leaf,black olives,black pepper,boneless chicken skinless thigh,butter,cayenne pepper,chicken livers,chili powder,cooking oil,...,shallots,soy sauce,thyme,tomatoes,vegetable oil,water,wheat,yellow corn meal,yellow onion,cuisine
0,10259,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,greek
1,13162,1,0,1,1,1,1,0,1,0,...,1,0,0,0,0,1,0,0,0,indian
2,20130,0,0,0,0,1,0,1,0,1,...,0,1,0,0,0,0,0,0,1,filipino
3,22213,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,0,0,indian
4,25693,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,1,0,southern_us


In [79]:
# Look at the exploded single-column ingredient frame, i.e. the shape of input
# the encoder is fitted on in the following cell.
train.head()[['ingredients']].explode('ingredients').head()

Unnamed: 0,ingredients
0,romaine lettuce
0,black olives
0,grape tomatoes
0,garlic
0,pepper


In [75]:
# Fit an encoder directly on the exploded 'ingredients' column of the first five recipes.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(train.head()[['ingredients']].explode('ingredients'))

OneHotEncoder(handle_unknown='ignore')

In [78]:
# The fitted encoder records the column name it was fitted on; encode_recipes
# reads this attribute to pick the column to transform.
enc.feature_names_in_[0]

'ingredients'

In [63]:
# Encode the first five test recipes.
# NOTE(review): get_encoded_ingredients as defined in this notebook returns a
# (frame, encoder) tuple, while the saved output shows only a frame; the output
# looks like it came from an earlier version of the function. Re-run to refresh.
get_encoded_ingredients(test.head())

Unnamed: 0,id,all-purpose flour,andouille sausage,baking powder,bananas,boneless chicken skinless thigh,browning,corn starch,cornmeal,cream of tartar,...,smoked sausage,sugar,toasted pecans,vanilla extract,vanilla wafers,vegetable oil,water,white rice,white sugar,yellow onion
0,18009,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,28583,0,0,0,1,0,0,1,0,1,...,0,1,1,1,1,0,0,0,0,0
2,29752,1,1,0,0,1,1,0,0,0,...,1,0,0,0,0,1,1,1,0,1
3,35687,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,41580,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
# Compare the sizes of the two datasets (rows, columns).
train.shape, test.shape

((39774, 3), (9944, 2))

In [69]:
# Write the small encoded sample to CSV without the index column, to check how
# the encoded frame is stored on disk.
train_sample_encoding.to_csv("temp_train_encoding.csv",index=False)

In [70]:
# Read temp_train_encoding.csv back in to inspect the columns and values as stored.
pd.read_csv('temp_train_encoding.csv')

Unnamed: 0,id,bay leaf,black olives,black pepper,boneless chicken skinless thigh,butter,cayenne pepper,chicken livers,chili powder,cooking oil,...,shallots,soy sauce,thyme,tomatoes,vegetable oil,water,wheat,yellow corn meal,yellow onion,cuisine
0,10259,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,greek
1,13162,1,0,1,1,1,1,0,1,0,...,1,0,0,0,0,1,0,0,0,indian
2,20130,0,0,0,0,1,0,1,0,1,...,0,1,0,0,0,0,0,0,1,filipino
3,22213,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,0,0,indian
4,25693,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,1,0,southern_us


## Test Python Output

In [117]:
# Load the one-hot encoded training recipes from data/ohe_train_recipes_v1.csv.
train_enc = pd.read_csv('data/ohe_train_recipes_v1.csv')

In [145]:
# Show the raw training recipes ordered by id.
train.sort_values(by='id')

Unnamed: 0,id,cuisine,ingredients
31571,0,spanish,"[mussels, ground black pepper, garlic cloves, ..."
25290,1,mexican,"[tomatoes, diced red onions, paprika, salt, co..."
23447,2,french,"[chicken broth, truffles, pimentos, green pepp..."
19464,3,chinese,"[fresh ginger, sesame oil, frozen peas, cooked..."
19690,4,italian,"[orange peel, cookies, vanilla ice cream, gran..."
...,...,...,...
37475,49712,indian,"[garam masala, salt, ground cumin, finely chop..."
13604,49713,southern_us,"[black pepper, large eggs, creole seasoning, r..."
24295,49714,irish,"[chopped fresh thyme, chopped fresh sage, onio..."
20646,49716,indian,"[water, cinnamon, garlic, cardamom, onions, cl..."


In [155]:
# Confirm the encoded file contains exactly the same set of recipe ids as the raw training data.
set(train['id']) == set(train_enc['id'])

True

In [156]:
# Preview the first rows of the encoded training frame.
train_enc.head()

Unnamed: 0,id,( oz.) tomato sauce,( oz.) tomato paste,(10 oz.) frozen chopped spinach,"(10 oz.) frozen chopped spinach, thawed and squeezed dry",(14 oz.) sweetened condensed milk,(14.5 oz.) diced tomatoes,(15 oz.) refried beans,1% low-fat buttermilk,1% low-fat chocolate milk,...,yuzu,yuzu juice,za'atar,zest,zesty italian dressing,zinfandel,ziti,zucchini,zucchini blossoms,cuisine
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spanish
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,mexican
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,french
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,chinese
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,italian


In [157]:
# Per-ingredient totals over the first five encoded recipes (id and cuisine dropped).
train_enc.head().drop(columns=['id','cuisine']).sum()

(    oz.) tomato sauce                                      0
(   oz.) tomato paste                                       0
(10 oz.) frozen chopped spinach                             0
(10 oz.) frozen chopped spinach, thawed and squeezed dry    0
(14 oz.) sweetened condensed milk                           0
                                                           ..
zesty italian dressing                                      0
zinfandel                                                   0
ziti                                                        0
zucchini                                                    0
zucchini blossoms                                           0
Length: 6714, dtype: int64

In [125]:
# The ingredient indicator columns alone for the first five encoded recipes.
train_enc.head().drop(columns=['id','cuisine'])

Unnamed: 0,( oz.) tomato sauce,( oz.) tomato paste,(10 oz.) frozen chopped spinach,"(10 oz.) frozen chopped spinach, thawed and squeezed dry",(14 oz.) sweetened condensed milk,(14.5 oz.) diced tomatoes,(15 oz.) refried beans,1% low-fat buttermilk,1% low-fat chocolate milk,1% low-fat cottage cheese,...,yukon gold potatoes,yuzu,yuzu juice,za'atar,zest,zesty italian dressing,zinfandel,ziti,zucchini,zucchini blossoms
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [127]:
# Per-ingredient totals for the first five recipes, sorted ascending so the
# ingredients those recipes actually use end up at the bottom of the listing.
train_enc.head().drop(columns=['id','cuisine']).sum().sort_values()

(    oz.) tomato sauce          0
parsley                         0
parmigiano-reggiano cheese      0
parmigiano reggiano cheese      0
parmigiano                      0
                               ..
orange peel                     1
ground chipotle chile pepper    1
garlic                          2
salt                            2
bell pepper                     2
Length: 6714, dtype: int64

In [130]:
# Spot-check a single ingredient column for the first five recipes.
train_enc.head()['garlic']

0    0
1    1
2    0
3    1
4    0
Name: garlic, dtype: int64

In [137]:
# For every ingredient column count the cells greater than 1, then tally those
# counts; a single bucket at 0 means no indicator value exceeds 1.
(train_enc.drop(columns=['id','cuisine']) > 1).sum().value_counts()

0    6714
dtype: int64

In [142]:
# Split the ingredient columns by how many recipes have the indicator set to 1:
# True = set in fewer than 4 recipes, False = set in 4 or more.
((train_enc.drop(columns=['id','cuisine']) == 1).sum() <4 ).value_counts()

False    3675
True     3039
dtype: int64