# Branded food data frame analysis

In [61]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
from scipy.stats import ks_2samp

In [62]:
filepath = lambda x: os.path.join('data', x)

In [63]:
branded_food = pd.read_csv(filepath('branded_food.csv'), dtype={
    'brand_owner':str,
    'brand_name':str,
    'subbrand_name':str,
    'gtin_upc':str})
branded_food.head()

  branded_food = pd.read_csv(filepath('branded_food.csv'), dtype={


Unnamed: 0,fdc_id,brand_owner,brand_name,subbrand_name,gtin_upc,ingredients,not_a_significant_source_of,serving_size,serving_size_unit,household_serving_fulltext,branded_food_category,data_source,package_weight,modified_date,available_date,market_country,discontinued_date,preparation_state_code,trade_channel,short_description
0,1105904,Richardson Oilseed Products (US) Limited,,,27000612323,Vegetable Oil,,15.0,ml,,Oils Edible,GDSN,,2020-10-02,2020-11-13,United States,,,,
1,1105905,CAMPBELL SOUP COMPANY,,,51000198808,"INGREDIENTS: BEEF STOCK, CONTAINS LESS THAN 2%...",,240.0,ml,,Herbs/Spices/Extracts,GDSN,,2020-09-12,2020-11-13,United States,,,,
2,1105906,CAMPBELL SOUP COMPANY,,,51000213273,"INGREDIENTS: CLAM STOCK, POTATOES, CLAMS, CREA...",,440.0,g,,Prepared Soups,GDSN,,2020-09-01,2020-11-13,United States,,,,
3,1105907,CAMPBELL SOUP COMPANY,,,51000213303,"INGREDIENTS: WATER, CREAM, BROCCOLI, CELERY, V...",,440.0,g,,Prepared Soups,GDSN,,2020-09-01,2020-11-13,United States,,,,
4,1105908,CAMPBELL SOUP COMPANY,,,51000224637,"INGREDIENTS: CHICKEN STOCK, CONTAINS LESS THAN...",,240.0,ml,,Herbs/Spices/Extracts,GDSN,,2020-10-03,2020-11-13,United States,,,,


In [64]:
branded_food.fdc_id.is_unique

True

In [65]:
branded_food.memory_usage(deep=True)

Index                                128
fdc_id                          14762376
brand_owner                    142327530
brand_name                     105382109
subbrand_name                   62118864
gtin_upc                       127463305
ingredients                    623353099
not_a_significant_source_of     68469344
serving_size                    14762376
serving_size_unit              106885302
household_serving_fulltext      84118618
branded_food_category          143595828
data_source                    109068029
package_weight                  84714471
modified_date                  123634199
available_date                 123634899
market_country                 129168556
discontinued_date               14762376
preparation_state_code          60420254
trade_channel                   59838434
short_description               60965509
dtype: int64

In [66]:
branded_food.memory_usage(deep=True).sum()

2259445606

Going through the initial inspection of the dataset:

- Missingness/null values
- incorrect dtypes
- Reducing of columns/memory if possible]
- Validation of data (outlier searching and correction of incorrect values)

## Initial Missingness check

In [67]:
branded_food.isnull().mean()

fdc_id                         0.000000
brand_owner                    0.007813
brand_name                     0.296762
subbrand_name                  0.952827
gtin_upc                       0.000000
ingredients                    0.002923
not_a_significant_source_of    0.960171
serving_size                   0.005842
serving_size_unit              0.010312
household_serving_fulltext     0.585412
branded_food_category          0.005729
data_source                    0.000000
package_weight                 0.617388
modified_date                  0.000011
available_date                 0.000000
market_country                 0.000000
discontinued_date              1.000000
preparation_state_code         0.978783
trade_channel                  0.991557
short_description              0.978720
dtype: float64

We see that there are some columns with missing data, mainly towards the end, including discontinued_date, preparation_state_code, trade_channel, and short_description, which we will take a look through non-null examples and drop if necessary to preserve memory.

In [68]:
has_prep_code = branded_food[branded_food.preparation_state_code.notnull()]
has_prep_code.head()

Unnamed: 0,fdc_id,brand_owner,brand_name,subbrand_name,gtin_upc,ingredients,not_a_significant_source_of,serving_size,serving_size_unit,household_serving_fulltext,branded_food_category,data_source,package_weight,modified_date,available_date,market_country,discontinued_date,preparation_state_code,trade_channel,short_description
1549669,2219410,Cargill Incorporated/Honeysuckle White,HONEYSUCKLE WHITE,,642205546077,"Turkey, Natural Flavoring",,112.0,g,4 oz.,Meat/Poultry/Other Animals Unprepared/Unproce...,GDSN,1 LBR,2019-03-07,2022-02-10,United States,,UNPREPARED,,HSW Fh 93% Grd Tky Chub 12/1
1549670,2219411,Cargill Incorporated/Honeysuckle White,Honeysuckle White,,642205534517,"All Natural White Turkey, Natural Flavoring",,112.0,g,4 oz.,Meat/Poultry/Other Animals Unprepared/Unproce...,GDSN,1.25 LBR,2020-02-04,2022-02-10,United States,,UNPREPARED,,HSW Fh Gr WhtDry Ex Wt 6/1.25#
1549671,2219412,Cargill Incorporated/Honeysuckle White,HONEYSUCKLE WHITE,,642205534500,"All Natural Turkey, Natural Flavoing",,112.0,g,4 oz.,Meat/Poultry/Other Animals Unprepared/Unproce...,GDSN,1.25 LBR,2020-02-05,2022-02-10,United States,,UNPREPARED,,HSW Fh 85/15 Gr tky Ex Wt 6/1.25
1549672,2219413,Kellogg Company US,Kellogg's Pop-Tarts,,38000317101,"Enriched flour (wheat flour, niacin, reduced i...",,52.0,g,1 Pastry,Sweet Bakery Products,GDSN,14.7 ONZ,2019-04-09,2022-02-10,United States,,UNPREPARED,,Pop-Tarts
1549673,2219414,Kellogg Company US,Kellogg's Cheez It,,24100105236,"Enriched flour (wheat flour, niacin, reduced i...",,25.0,g,1 Pouch,Biscuits/Cookies,GDSN,12.6 ONZ,2019-04-30,2022-02-10,United States,,UNPREPARED,,Gripz Crackers


In [69]:
has_prep_code.preparation_state_code.value_counts()

preparation_state_code
UNPREPARED        26631
PREPARED           5676
READY_TO_EAT       2830
READY_TO_DRINK     2677
BAKE                603
HEAT_AND_SERVE      272
THAW                147
FREEZE               97
GRILL                95
CONVECTION           36
UNSPECIFIED          22
FRY                  15
STEAM                12
DEEP_FRY             12
ROAST                11
BOIL                  7
MICROWAVE             4
STIR_FRY              4
Name: count, dtype: int64

We see that many of these are with respect to how to prepare the food for consumption, many of which are simply labeled "unprepared" or prepared. We also see that many of these preparation codes are disambiguous, as unprepared does contain also foods that would be considered ready to eat, such as Pop-Tarts or Cheez-its. Because of this, it may be hard to categorize food into explicitly "prepared" and "unprepared" categories. Therefore, we will not touch the column any further.

In [70]:
has_prep_code[has_prep_code.preparation_state_code == 'UNPREPARED'].head(5)

Unnamed: 0,fdc_id,brand_owner,brand_name,subbrand_name,gtin_upc,ingredients,not_a_significant_source_of,serving_size,serving_size_unit,household_serving_fulltext,branded_food_category,data_source,package_weight,modified_date,available_date,market_country,discontinued_date,preparation_state_code,trade_channel,short_description
1549669,2219410,Cargill Incorporated/Honeysuckle White,HONEYSUCKLE WHITE,,642205546077,"Turkey, Natural Flavoring",,112.0,g,4 oz.,Meat/Poultry/Other Animals Unprepared/Unproce...,GDSN,1 LBR,2019-03-07,2022-02-10,United States,,UNPREPARED,,HSW Fh 93% Grd Tky Chub 12/1
1549670,2219411,Cargill Incorporated/Honeysuckle White,Honeysuckle White,,642205534517,"All Natural White Turkey, Natural Flavoring",,112.0,g,4 oz.,Meat/Poultry/Other Animals Unprepared/Unproce...,GDSN,1.25 LBR,2020-02-04,2022-02-10,United States,,UNPREPARED,,HSW Fh Gr WhtDry Ex Wt 6/1.25#
1549671,2219412,Cargill Incorporated/Honeysuckle White,HONEYSUCKLE WHITE,,642205534500,"All Natural Turkey, Natural Flavoing",,112.0,g,4 oz.,Meat/Poultry/Other Animals Unprepared/Unproce...,GDSN,1.25 LBR,2020-02-05,2022-02-10,United States,,UNPREPARED,,HSW Fh 85/15 Gr tky Ex Wt 6/1.25
1549672,2219413,Kellogg Company US,Kellogg's Pop-Tarts,,38000317101,"Enriched flour (wheat flour, niacin, reduced i...",,52.0,g,1 Pastry,Sweet Bakery Products,GDSN,14.7 ONZ,2019-04-09,2022-02-10,United States,,UNPREPARED,,Pop-Tarts
1549673,2219414,Kellogg Company US,Kellogg's Cheez It,,24100105236,"Enriched flour (wheat flour, niacin, reduced i...",,25.0,g,1 Pouch,Biscuits/Cookies,GDSN,12.6 ONZ,2019-04-30,2022-02-10,United States,,UNPREPARED,,Gripz Crackers


In [71]:
branded_food = branded_food.drop(columns=['discontinued_date', 'preparation_state_code', 'trade_channel', 'short_description'])

## GTIN UPC cleaning 

GTIN UPC is a common format/code used in many different PoS stores, which is extended to any item that has a barcode associated with it. On further looking at the data, we had to specify that the upc is stored as a string, as there are many examples where there are leading 0's as shown. Further looking shows that there are sometimes dashes, which we will remove to make the string more standardized.

In [72]:
branded_food.gtin_upc = branded_food.gtin_upc.str.replace('-', '')

## Serving size cleaning

In [73]:
branded_food.serving_size_unit.value_counts()

serving_size_unit
g      1522350
ml      243399
GRM      40597
MLT       8288
MG        7555
IU        3671
GM         346
MC          63
Name: count, dtype: int64

We see that there are several options. We will look up what these units exactly mean (unabbreviated), and possibly merge any containing the same amount. We can also visualize the distributions of respective foods.

- g (gram)
- ml (mililiter, most likely for fluids)
- grm - unknown, will compare to gram's distribution for differences
- mlt - Unknwon
- MG - possibly a milligram distribution
- IU - 
- GM - possible alias for gram as well

In [74]:
inspect_unit = lambda x: branded_food[branded_food.serving_size_unit == x]

In [75]:
grm_foods = inspect_unit('GRM')
grm_foods

Unnamed: 0,fdc_id,brand_owner,brand_name,subbrand_name,gtin_upc,ingredients,not_a_significant_source_of,serving_size,serving_size_unit,household_serving_fulltext,branded_food_category,data_source,package_weight,modified_date,available_date,market_country
1751470,2456687,SCHWAN'S FOOD SERVICE INC,TONY'S,,10072180726718,"INGREDIENTS: FRENCH BREAD (WATER, WHITE WHOLE ...",,156.0,GRM,1 Pizza (156g),Pies/Pastries/Pizzas/Quiches - Savoury (Frozen),GDSN,60 EA,2022-12-15,2023-01-26,United States
1751471,2456688,Bake Crafters Food Company,Bake Crafters,,00737410335001,"Enriched Wheat Flour [Wheat Flour, Malted Barl...",,28.0,GRM,1 oz (28g),Bread (Frozen),GDSN,6.25 LBR,2022-06-02,2023-01-26,United States
1751472,2456689,Bake Crafters Food Company,Bake Crafters,,00737410171708,"Whole Wheat Flour, Enriched Bleached Wheat Flo...",,78.0,GRM,"2.75 oz (78g), 4 pieces",Desserts (Frozen),GDSN,12.375 LBR,2022-06-02,2023-01-26,United States
1751473,2456690,Bake Crafters Food Company,Bake Crafters,,00737410158105,"Water, Whole Wheat Flour, Enriched Wheat Flour...",,40.0,GRM,"1.4 oz (40g), 2 Pancakes",Bread (Frozen),GDSN,14.175 LBR,2022-06-02,2023-01-26,United States
1751474,2456691,Brakebush Brothers,Brakebush,,10038034558706,UNCOOKED BONELESS CHICKEN BREAST TENDERS CONTA...,,71.0,GRM,1 Piece,Chicken - Prepared/Processed,GDSN,10 LBR,2022-04-26,2023-01-26,United States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1845290,2554908,Tanjoe Enterprises Inc.,LIDIA'S,,897712001049,"ITALIAN TOMATOES (CITRIC ACID), ARTICHOKES, CA...",,125.0,GRM,1/2 cup,Prepared Pasta & Pizza Sauces,LI,25 oz/708 g,2023-03-27,2023-05-25,United States
1845291,2554909,Oregon Growers & Shippers LLC,OREGON GROWERS,,898271000948,"BLACKBERRIES (MARIONBERRIES, BLACKBERRIES), CA...","Not a significant source of saturated fat, tra...",39.0,GRM,2 Tbsp,Syrups & Molasses,LI,8 fl oz/237 mL,2023-04-26,2023-05-25,United States
1845292,2554910,Mt. Garfield Winery Corp,LIFESTYLEFOODS,,898425002682,ARCADIAN HARVEST LETTUCE (BLEND OF LEAF LETTUC...,,163.0,GRM,,"Pickles, Olives, Peppers & Relishes",LI,5.75 oz./163 g,2023-03-10,2023-05-25,United States
1845295,2554913,"Ittella International, Inc.",TATTOOED CHEF,,899764001527,"CAULIFLOWER, CORN FLOUR, GRANA PADANO CHEESE (...",,71.0,GRM,1 pc,Frozen Patties and Burgers,LI,10 oz/283 g,2023-03-23,2023-05-25,United States


One thing to note is the serving size amount of grm corresponding to the household serving fulltext. We can see many examples where the gram amount corresponds to what is labeled in household serving fulltext:

In [76]:
(
    grm_foods[grm_foods.household_serving_fulltext.str.contains('g)', regex=False).fillna(False)]
    [['serving_size', 'serving_size_unit', 'household_serving_fulltext']]
)

Unnamed: 0,serving_size,serving_size_unit,household_serving_fulltext
1751470,156.0,GRM,1 Pizza (156g)
1751471,28.0,GRM,1 oz (28g)
1751472,78.0,GRM,"2.75 oz (78g), 4 pieces"
1751473,40.0,GRM,"1.4 oz (40g), 2 Pancakes"
1751477,34.0,GRM,"1.19 oz (34g), 1 Bread Stick"
...,...,...,...
1841730,4.0,GRM,2 Tbsp (4g)
1841731,79.0,GRM,"2.8 oz (79g), 1 Biscuit"
1842987,61.0,GRM,2/3 cup dry mix (61g) (1cup prepared)
1842989,61.0,GRM,1 link (g)


From this we can most likely consider **grm** as synonymous with the **g** abbreviation. 

In [77]:
g_foods = inspect_unit('g')
g_foods[g_foods.household_serving_fulltext.str.contains('g)', regex=False).fillna(False)][['serving_size', 'serving_size_unit', 'household_serving_fulltext']]

Unnamed: 0,serving_size,serving_size_unit,household_serving_fulltext
34110,17.0,g,1 Tbsp (17g)
34285,17.0,g,1 Tbsp(17g)
34382,62.0,g,1/4 cup (62g)
34383,62.0,g,1/4 cup (62g)
34384,62.0,g,1/4 cup (62g)
...,...,...,...
1757337,54.0,g,3 sheets (3x18g) dry
1759907,85.0,g,1 cup salad only/ 1 cup dressed salad with top...
1760115,85.0,g,1 cup salad only/1 cup dressed salad with topp...
1776928,85.0,g,"1 Cup Vegetables (85 g), 4 pieces Sausage (16 ..."


If we only look at these examples, we would consider them as conventionally the same. However, they may not be the same exactly. One way we can possibly further reinforce this idea is through a permutation test; if we assume that they're from the same distribution, then the two distributions should be similar to each other.

## Gram permutation test

Because the amount of foods there are between the two, we will instead choose to sample the distributions to about 10,000 for each distribution. We will also eliminate some of the outliers, as we noticed from the descriptive statistics (mainly the max) of g foods.

In [78]:
def remove_outlier(data, column):
    # Returns a dataset without outlier rows in given column. Outliers are determined via IQR range.
    data_stats = data[column].describe()
    data_iqr = data_stats['75%'] - data_stats['25%']
    data_lb = data_stats['25%'] - data_iqr * 1.5
    data_ub = data_stats['75%'] + data_iqr * 1.5
    
    new_data = data[(data[column] >= data_lb) & (data[column] <= data_ub)]
    return new_data

In [79]:
def pprint_ks(ks_test):
    print(f"""KS Test statistic: {ks_test.statistic}
P-Value: {ks_test.pvalue}
Statistic location: {ks_test.statistic_location}""")

We see that we achieve a p value that heavily exceeds our alpha value, and reject the hypothesis that foods labeled g or grm come from the same distribution.

## Testing similarities between serving_size_units

We can run a for loop to further analyze for any similar serving_size distributions.

In [80]:
branded_food.serving_size_unit.unique()[0]

'ml'

In [81]:
ss_units = branded_food.serving_size_unit.dropna().unique()
alpha = .05 

similar_dists = []
# Test every possible unit conversion (skipping tests we've already done)
for i in range(len(ss_units)):
    for j in range(len(ss_units)):
        if i >= j:
            # We will have already tested this hypothesis
            continue
        unit_1 = ss_units[i]
        unit_2 = ss_units[j]
        
        
        ks_test = ks_2samp(inspect_unit(unit_1)['serving_size'], inspect_unit(unit_2)['serving_size'])
        print(f"Testing {unit_1} vs {unit_2}: p-val {ks_test.pvalue}")
        if ks_test.pvalue > alpha:
            # Similar distribution found
            pprint_ks(ks_test)
            similar_dists.append((unit_1, unit_2))
similar_dists

Testing ml vs g: p-val 0.0
Testing ml vs GRM: p-val 0.0
Testing ml vs MLT: p-val 2.2253898880348562e-160
Testing ml vs MG: p-val 0.0
Testing ml vs IU: p-val 0.0
Testing ml vs GM: p-val 2.557616601803372e-96
Testing ml vs MC: p-val 1.3826597053583473e-09
Testing g vs GRM: p-val 1.4048946514233735e-71
Testing g vs MLT: p-val 0.0
Testing g vs MG: p-val 3.6425295794396665e-58
Testing g vs IU: p-val 4.0657030432436687e-23
Testing g vs GM: p-val 0.00015135764194947168
Testing g vs MC: p-val 3.577029135725556e-17
Testing GRM vs MLT: p-val 0.0
Testing GRM vs MG: p-val 3.9595694919398624e-101
Testing GRM vs IU: p-val 7.17355034230616e-17
Testing GRM vs GM: p-val 0.0015708871022459892
Testing GRM vs MC: p-val 7.306319845101319e-17
Testing MLT vs MG: p-val 0.0
Testing MLT vs IU: p-val 1.21e-321
Testing MLT vs GM: p-val 1.4011715032057184e-121
Testing MLT vs MC: p-val 2.4289547311209214e-13
Testing MG vs IU: p-val 8.176478393629537e-49
Testing MG vs GM: p-val 3.7126187901109946e-11
Testing MG vs M

[]

We see that none of the units are from the same distribution, and many have extremely low p values.

## Further memory reducing

If we want to be able to reduce the amount of memory required to hold the entire data frame. We will do this be attempting to reduce redundant information even further.

In [82]:
branded_food.memory_usage(deep=True)

Index                                128
fdc_id                          14762376
brand_owner                    142327530
brand_name                     105382109
subbrand_name                   62118864
gtin_upc                       127463091
ingredients                    623353099
not_a_significant_source_of     68469344
serving_size                    14762376
serving_size_unit              106885302
household_serving_fulltext      84118618
branded_food_category          143595828
data_source                    109068029
package_weight                  84714471
modified_date                  123634199
available_date                 123634899
market_country                 129168556
dtype: int64

In [83]:
branded_food.head()

Unnamed: 0,fdc_id,brand_owner,brand_name,subbrand_name,gtin_upc,ingredients,not_a_significant_source_of,serving_size,serving_size_unit,household_serving_fulltext,branded_food_category,data_source,package_weight,modified_date,available_date,market_country
0,1105904,Richardson Oilseed Products (US) Limited,,,27000612323,Vegetable Oil,,15.0,ml,,Oils Edible,GDSN,,2020-10-02,2020-11-13,United States
1,1105905,CAMPBELL SOUP COMPANY,,,51000198808,"INGREDIENTS: BEEF STOCK, CONTAINS LESS THAN 2%...",,240.0,ml,,Herbs/Spices/Extracts,GDSN,,2020-09-12,2020-11-13,United States
2,1105906,CAMPBELL SOUP COMPANY,,,51000213273,"INGREDIENTS: CLAM STOCK, POTATOES, CLAMS, CREA...",,440.0,g,,Prepared Soups,GDSN,,2020-09-01,2020-11-13,United States
3,1105907,CAMPBELL SOUP COMPANY,,,51000213303,"INGREDIENTS: WATER, CREAM, BROCCOLI, CELERY, V...",,440.0,g,,Prepared Soups,GDSN,,2020-09-01,2020-11-13,United States
4,1105908,CAMPBELL SOUP COMPANY,,,51000224637,"INGREDIENTS: CHICKEN STOCK, CONTAINS LESS THAN...",,240.0,ml,,Herbs/Spices/Extracts,GDSN,,2020-10-03,2020-11-13,United States


We can look at many string based columns to see if there is a better way to possibly reduce memory, as strings are usually unique, immutable, and take a lot of memory. 

In [84]:
branded_food.market_country.value_counts(dropna=False)

market_country
United States    1844180
New Zealand         1117
Name: count, dtype: int64

The market country column shows that many entries repeat United States, when a better approach may just be to one-hot encode whether the food is from the us or not. Since there are only two options, we can mark whether the food is for the american market with a 1, or for the new zealand market with a 0. This reduces memory usage from over 120 million bytes to simply 1 million bytes!

In [85]:
branded_food.market_country = (branded_food.market_country == 'United States').astype(np.int8)

In [86]:
branded_food.market_country.memory_usage(deep=True)

1845425

Other memory optimizations is the conversion of the modified_date and available_date columns to datetime, which converts the issue of strings to simply individual objects, which reduces the bytes used tremendously as well; the byte count shrinks about 10x for both columns.

In [87]:
branded_food.modified_date = pd.to_datetime(branded_food.modified_date)
branded_food.available_date = pd.to_datetime(branded_food.available_date)

In [88]:
branded_food.modified_date.memory_usage(deep=True) # Same for available date

14762504

In [89]:
branded_food.dtypes

fdc_id                                  int64
brand_owner                            object
brand_name                             object
subbrand_name                          object
gtin_upc                               object
ingredients                            object
not_a_significant_source_of            object
serving_size                          float64
serving_size_unit                      object
household_serving_fulltext             object
branded_food_category                  object
data_source                            object
package_weight                         object
modified_date                  datetime64[ns]
available_date                 datetime64[ns]
market_country                           int8
dtype: object

In [90]:
branded_food.data_source.memory_usage(deep=True)

109068157

In [91]:
branded_food.data_source.value_counts() # Easily one-hot encodeable into possibly three different categories

data_source
LI        1748661
GDSN        95519
NZGDSN       1117
Name: count, dtype: int64

In [92]:
pd.get_dummies(branded_food.data_source) # One hot encodes easily

Unnamed: 0,GDSN,LI,NZGDSN
0,True,False,False
1,True,False,False
2,True,False,False
3,True,False,False
4,True,False,False
...,...,...,...
1845292,False,True,False
1845293,False,True,False
1845294,False,True,False
1845295,False,True,False


In [93]:
branded_food = pd.concat([branded_food, pd.get_dummies(branded_food.data_source)], axis=1) # One hot encodes the data source column
branded_food.head()

Unnamed: 0,fdc_id,brand_owner,brand_name,subbrand_name,gtin_upc,ingredients,not_a_significant_source_of,serving_size,serving_size_unit,household_serving_fulltext,branded_food_category,data_source,package_weight,modified_date,available_date,market_country,GDSN,LI,NZGDSN
0,1105904,Richardson Oilseed Products (US) Limited,,,27000612323,Vegetable Oil,,15.0,ml,,Oils Edible,GDSN,,2020-10-02,2020-11-13,1,True,False,False
1,1105905,CAMPBELL SOUP COMPANY,,,51000198808,"INGREDIENTS: BEEF STOCK, CONTAINS LESS THAN 2%...",,240.0,ml,,Herbs/Spices/Extracts,GDSN,,2020-09-12,2020-11-13,1,True,False,False
2,1105906,CAMPBELL SOUP COMPANY,,,51000213273,"INGREDIENTS: CLAM STOCK, POTATOES, CLAMS, CREA...",,440.0,g,,Prepared Soups,GDSN,,2020-09-01,2020-11-13,1,True,False,False
3,1105907,CAMPBELL SOUP COMPANY,,,51000213303,"INGREDIENTS: WATER, CREAM, BROCCOLI, CELERY, V...",,440.0,g,,Prepared Soups,GDSN,,2020-09-01,2020-11-13,1,True,False,False
4,1105908,CAMPBELL SOUP COMPANY,,,51000224637,"INGREDIENTS: CHICKEN STOCK, CONTAINS LESS THAN...",,240.0,ml,,Herbs/Spices/Extracts,GDSN,,2020-10-03,2020-11-13,1,True,False,False


## Insignificant sources
There is a lot of redundancy in this column, as many repeat the same insignificant sources, as many of them are simply nutritional value-related.

In [94]:
# Get process insignificant sources (a lot more processing for a more standard convention)
insignificant_processed = (
    branded_food.not_a_significant_source_of
    .str.lower()
    # Cut off repetition (not a significant source)
    .str.replace(r'.*not( a)? significant source of:? ?', '', regex=True)
    # Fix comma spacing
    .str.replace(',? ?and,?', ',', regex=True)
    .str.replace('\s*,\s*', ',', regex=True)
    # Remove any periods
    .str.replace('.', '', regex=False)
    # Fix spelling errors/inconsistency
    .str.replace(r'form', 'from')
    .str.replace('potass?(ium)?', 'potassium', regex=True)
    .str.replace('cholest(erol)?', 'cholesterol', regex=True)
    .str.replace(r'sat(urated)?\s*fat', 'satured fat', regex=True)
    .str.replace(r'trans ?fat', 'trans fat', regex=True)
    .str.replace(r'dietary ?fiber', 'dietary fiber', regex=True)
    .str.replace(r'added ?sugars', 'dietary fiber', regex=True)
    .str.replace(r'\W+or\W+', ',', regex=True)
    .str.replace(r'total ?sugars', 'total sugars', regex=True)
    # Fix vitamin entries (sometimes have only the letter, abbreviated forms, missing spaces, etc)
    .str.replace(r'vit(amin)?\s?(\w)', r'vitamin \2', regex=True)
).str.split(',')

In [95]:
# Get more common insignificant sources
insignificant_sources_freq = insignificant_processed.explode().value_counts().head(20)
insignificant_sources_freq

not_a_significant_source_of
dietary fiber        64766
iron                 56796
calcium              55536
cholesterol          50578
trans fat            49151
satured fat          43524
vitamin d            41715
potassium            31519
vitamin a            19436
vitamin c            17629
total sugars         10602
calories from fat     8254
sugars                6162
fiber                 4684
other nutrients       1829
protein                867
fat cal                828
sugar                  505
c                      350
                       263
Name: count, dtype: int64

We see that while there are many with some errors (the c is alone sometimes due to writing conventions, ex. vitamin a, c, d), we have captured most insignificant sources from the foods. We will simply consider any insignificant nutrients that occur more than 1000 times as "important" in order to one hot encode these.

In [96]:
insignificant_sources = insignificant_sources_freq[insignificant_sources_freq > 1000].index
insignificant_sources

Index(['dietary fiber', 'iron', 'calcium', 'cholesterol', 'trans fat',
       'satured fat', 'vitamin d', 'potassium', 'vitamin a', 'vitamin c',
       'total sugars', 'calories from fat', 'sugars', 'fiber',
       'other nutrients'],
      dtype='object', name='not_a_significant_source_of')

In [97]:
# Populate dictionary for inserting into dataset
new_source_columns = dict()
for source in insignificant_sources:
    # Create series telling whether the food lists nutrient or not
    source_series = insignificant_processed.str.contains(source, regex=False).fillna(False).astype(np.int8)
    new_source_columns[source] = source_series
    
new_source_columns['dietary fiber'].sum()

53631

In [98]:
# Populate the branded_food dataframe
for source, series in new_source_columns.items():
    # Format source name to be underlined and not spaced
    source = 'insig_' + source.replace(' ', '_')
    # Populate dataset
    branded_food[source] = series

In [99]:
branded_food.head()

Unnamed: 0,fdc_id,brand_owner,brand_name,subbrand_name,gtin_upc,ingredients,not_a_significant_source_of,serving_size,serving_size_unit,household_serving_fulltext,...,insig_satured_fat,insig_vitamin_d,insig_potassium,insig_vitamin_a,insig_vitamin_c,insig_total_sugars,insig_calories_from_fat,insig_sugars,insig_fiber,insig_other_nutrients
0,1105904,Richardson Oilseed Products (US) Limited,,,27000612323,Vegetable Oil,,15.0,ml,,...,0,0,0,0,0,0,0,0,0,0
1,1105905,CAMPBELL SOUP COMPANY,,,51000198808,"INGREDIENTS: BEEF STOCK, CONTAINS LESS THAN 2%...",,240.0,ml,,...,0,0,0,0,0,0,0,0,0,0
2,1105906,CAMPBELL SOUP COMPANY,,,51000213273,"INGREDIENTS: CLAM STOCK, POTATOES, CLAMS, CREA...",,440.0,g,,...,0,0,0,0,0,0,0,0,0,0
3,1105907,CAMPBELL SOUP COMPANY,,,51000213303,"INGREDIENTS: WATER, CREAM, BROCCOLI, CELERY, V...",,440.0,g,,...,0,0,0,0,0,0,0,0,0,0
4,1105908,CAMPBELL SOUP COMPANY,,,51000224637,"INGREDIENTS: CHICKEN STOCK, CONTAINS LESS THAN...",,240.0,ml,,...,0,0,0,0,0,0,0,0,0,0


In [100]:
branded_food.memory_usage(deep=True)['insig_iron'] * len(new_source_columns) # New memory usage

27679455

In [101]:
branded_food.memory_usage(deep=True)['not_a_significant_source_of'] # Old memory usage

68469344

From this, we have seen that we have reduced the memory of the insignificant sources by half, and obtained new data that can be analyzed as well. 

In [102]:
branded_food.dtypes

fdc_id                                  int64
brand_owner                            object
brand_name                             object
subbrand_name                          object
gtin_upc                               object
ingredients                            object
not_a_significant_source_of            object
serving_size                          float64
serving_size_unit                      object
household_serving_fulltext             object
branded_food_category                  object
data_source                            object
package_weight                         object
modified_date                  datetime64[ns]
available_date                 datetime64[ns]
market_country                           int8
GDSN                                     bool
LI                                       bool
NZGDSN                                   bool
insig_dietary_fiber                      int8
insig_iron                               int8
insig_calcium                     

Many other objects, however, cannot exactly be further condensed.

In [103]:
def fraction_to_number(frac):
    """ Convert fraction to number. """
    assert '/' in frac
    
    num, denom = frac.split('/')
    
    num = '1' if len(num) == 0 else num
    denom = '1' if len(denom) == 0 else denom
    return float(num) / float(denom)

In [104]:
def to_decimal(num_str):
    """ Converts num_str to float. """
    
    # Return if null
    if pd.isna(num_str): return num_str
    
    # Attempt to return anything that is simply one number
    try:
        if '/' not in num_str: return float(num_str)
    except:
        # There are some entries that have the format "\d \d" that should be divided
        #print(num_str)
        pass
    
    num_str = num_str.strip()
    # Special case for if there is a number followed by a fraction 
    # (can also be two numbers, which is assumed to be missing a fraction)
    if ' ' in num_str:
        groupings = num_str.split(' ')
        #print(num_str)
        if '/' in num_str:
            total = float(groupings[0]) + fraction_to_number(groupings[1])
        else:
            total = float(groupings[0]) / float(groupings[1])
    else:
        total = fraction_to_number(num_str)
    
    return total

In [105]:
household_servings = branded_food.household_serving_fulltext.str.lower().str.extract(r'(\d* ?\d+\.?/?\d*?) ([\w*\s*]+)')
household_servings.columns = ('household_serving_amount', 'household_serving_unit')
household_servings['household_serving_converted'] = household_servings['household_serving_amount'].apply(to_decimal)
household_servings

Unnamed: 0,household_serving_amount,household_serving_unit,household_serving_converted
0,,,
1,,,
2,,,
3,,,
4,,,
...,...,...,...
1845292,,,
1845293,8,fl oz,8.0
1845294,1,cup,1.0
1845295,1,pc,1.0


In [106]:
household_servings.household_serving_unit.value_counts().head(10)

household_serving_unit
cup        161296
onz        102813
tbsp        84379
oza         39133
pieces      29061
tsp         24550
bar         20507
grm         15075
package     13446
slice       12182
Name: count, dtype: int64

One thing to note is that many of these are very unique and applicable to their respective food (eg. crackers), which is simply a representation of a piece. We have the choice of correcting some of these to simply a "piece" or "unit", but it is hard to do so with there being many different options, so we will simply leave the unit alone. In addition, we may be able to identify more specific foods with the serving unit.

In [107]:
branded_food[['household_serving_amount', 'household_serving_unit']] = household_servings[['household_serving_converted', 'household_serving_unit']]
branded_food[['household_serving_amount', 'household_serving_unit']].memory_usage(deep=True)

Index                            128
household_serving_amount    14762376
household_serving_unit      81177930
dtype: int64

In [108]:
branded_food.household_serving_fulltext.memory_usage(deep=True)

84118746

In [109]:
branded_food.dtypes

fdc_id                                  int64
brand_owner                            object
brand_name                             object
subbrand_name                          object
gtin_upc                               object
ingredients                            object
not_a_significant_source_of            object
serving_size                          float64
serving_size_unit                      object
household_serving_fulltext             object
branded_food_category                  object
data_source                            object
package_weight                         object
modified_date                  datetime64[ns]
available_date                 datetime64[ns]
market_country                           int8
GDSN                                     bool
LI                                       bool
NZGDSN                                   bool
insig_dietary_fiber                      int8
insig_iron                               int8
insig_calcium                     

While we have almost all columns processed, we can finally drop all columns we have converted, including:
 - not_a_significant_source_of
 - household_serving_fulltext
 - data_source
 
We are also now able to analyze these previously unanalyzable data sources as well.

In [110]:
branded_food = branded_food.drop(columns=['not_a_significant_source_of', 'household_serving_fulltext', 'data_source'])

In [111]:
branded_food.dtypes

fdc_id                               int64
brand_owner                         object
brand_name                          object
subbrand_name                       object
gtin_upc                            object
ingredients                         object
serving_size                       float64
serving_size_unit                   object
branded_food_category               object
package_weight                      object
modified_date               datetime64[ns]
available_date              datetime64[ns]
market_country                        int8
GDSN                                  bool
LI                                    bool
NZGDSN                                bool
insig_dietary_fiber                   int8
insig_iron                            int8
insig_calcium                         int8
insig_cholesterol                     int8
insig_trans_fat                       int8
insig_satured_fat                     int8
insig_vitamin_d                       int8
insig_potas

In [112]:
branded_food.memory_usage(deep=True).sum() # Data is almost reduced by about .75GB

1585890875

In [113]:
categories = branded_food.branded_food_category.value_counts()
categories.head()

branded_food_category
Popcorn, Peanuts, Seeds & Related Snacks    87691
Candy                                       83993
Cheese                                      76229
Ice Cream & Frozen Yogurt                   58400
Cookies & Biscuits                          50615
Name: count, dtype: int64

We will consider any categories with less than 100 categories as being null/insignificant, as the more rare categories are too specific at times.

In [114]:
categories[categories > 100]

branded_food_category
Popcorn, Peanuts, Seeds & Related Snacks                    87691
Candy                                                       83993
Cheese                                                      76229
Ice Cream & Frozen Yogurt                                   58400
Cookies & Biscuits                                          50615
                                                            ...  
Butter/Butter Substitutes                                     118
Chicken - Prepared/Processed                                  114
Meat/Poultry/Other Animals Sausages – Prepared/Processed      112
Baking                                                        106
Frozen Fish/Seafood                                           104
Name: count, Length: 183, dtype: int64

In [115]:
cat_df = pd.DataFrame(categories[categories>100].index)
cat_df = cat_df.rename(columns={'branded_food_category': 'category'})
cat_df['category_id'] = cat_df.index
cat_df

Unnamed: 0,category,category_id
0,"Popcorn, Peanuts, Seeds & Related Snacks",0
1,Candy,1
2,Cheese,2
3,Ice Cream & Frozen Yogurt,3
4,Cookies & Biscuits,4
...,...,...
178,Butter/Butter Substitutes,178
179,Chicken - Prepared/Processed,179
180,Meat/Poultry/Other Animals Sausages – Prepared...,180
181,Baking,181


In [119]:
branded_food = branded_food.merge(
    cat_df, 
    left_on='branded_food_category', 
    right_on='category').drop(columns=['branded_food_category', 'category'])
branded_food.head()

Unnamed: 0,fdc_id,brand_owner,brand_name,subbrand_name,gtin_upc,ingredients,serving_size,serving_size_unit,package_weight,modified_date,...,insig_vitamin_a,insig_vitamin_c,insig_total_sugars,insig_calories_from_fat,insig_sugars,insig_fiber,insig_other_nutrients,household_serving_amount,household_serving_unit,category_id
0,1105904,Richardson Oilseed Products (US) Limited,,,27000612323,Vegetable Oil,15.0,ml,,2020-10-02,...,0,0,0,0,0,0,0,,,164
1,605334,Conagra Brands,,,27000690260,Canola Oil,15.0,ml,,2019-03-25,...,0,0,0,0,0,0,0,1.0,tbsp,164
2,606302,Conagra Brands,,,64144555550,"Canola Oil*, Palm Oil*, Coconut Oil*, Lecithin...",0.25,g,,2019-03-17,...,0,0,0,0,0,0,0,0.25,second spray,164
3,606392,Conagra Brands,,,64144033164,"Canola Oil*, Coconut Oil*, Palm Oil*, Soy Leci...",0.25,g,,2019-03-25,...,0,0,0,0,0,0,0,0.25,second spray,164
4,607224,Conagra Brands,,,64144048502,"Extra Virgin Olive Oil*, Lecithin from Soybean...",0.25,g,,2018-10-21,...,0,0,0,0,0,0,0,0.25,second spray,164


In [120]:
branded_food.memory_usage(deep=True).sum()

1445372304

## Brand Owners normalization

We have many different brand_owners, so we can choose to offload this until we need it.

In [159]:
brand_owners = branded_food.brand_owner.dropna().unique()
brand_owners.shape[0]

36273

In [165]:
brand_owners = pd.DataFrame(data={'brand_owner':brand_owners, 'brand_owner_id':np.arange(brand_owners.shape[0])})
brand_owners

Unnamed: 0,brand_owner,brand_owner_id
0,Richardson Oilseed Products (US) Limited,0
1,Conagra Brands,1
2,Conagra Brands Inc,2
3,"Incobrasa Industries, Ltd.",3
4,CAMPBELL SOUP COMPANY,4
...,...,...
36268,Cape May Foods,36268
36269,Pacific Coral Seafood,36269
36270,Tampa Bay Fisheries Inc,36270
36271,SEALORD GROUP LIMITED,36271


In [166]:
brand_name = pd.DataFrame(data={'brand_name': branded_food.brand_name.dropna().unique()})
brand_name['brand_name_id'] = np.arange(brand_name.shape[0])
brand_name

Unnamed: 0,brand_name,brand_name_id
0,Wesson,0
1,PAM,1
2,Orville Redenbacher's,2
3,Long Life,3
4,SWANSON,4
...,...,...
34226,Figo,34226
34227,Angel Gold,34227
34228,Kho Muc,34228
34229,Deep Cove,34229


In [136]:
branded_food.brand_owner.value_counts()

brand_owner
Wal-Mart Stores, Inc.                 45861
Target Stores                         35542
Topco Associates, Inc.                33654
Safeway, Inc.                         27796
Meijer, Inc.                          26405
                                      ...  
STAR AGRO MARINE EXPORTS PVT. LTD.        1
MRS. FRIDAY'S                             1
ICYBAY                                    1
SCALLOP ST. JACQUES                       1
LEADER PRODUCTS LIMITED                   1
Name: count, Length: 36273, dtype: int64

In [137]:
branded_food.brand_name.value_counts()

brand_name
GREAT VALUE             20505
MEIJER                  18512
WEGMANS                 16943
SIGNATURE SELECT        12378
FOOD CLUB               12213
                        ...  
WHOLESOME FARMS             1
KENNY'S OWN                 1
NUT MEG SPICE               1
CARDAMOM                    1
THE CHOCOLATE CARTEL        1
Name: count, Length: 34231, dtype: int64

In [170]:
# Replace brands with brand id for memory preservation
branded_food_minimal = (
    branded_food
    .merge(brand_owners, left_on='brand_owner', right_on='brand_owner', how='left')
    .merge(brand_name, left_on='brand_name', right_on='brand_name', how='left')
    .drop(columns=['brand_owner', 'brand_name', 'subbrand_name'])
)
branded_food_minimal.head()

Unnamed: 0,fdc_id,gtin_upc,ingredients,serving_size,serving_size_unit,package_weight,modified_date,available_date,market_country,GDSN,...,insig_total_sugars,insig_calories_from_fat,insig_sugars,insig_fiber,insig_other_nutrients,household_serving_amount,household_serving_unit,category_id,brand_owner_id,brand_name_id
0,1105904,27000612323,Vegetable Oil,15.0,ml,,2020-10-02,2020-11-13,1,True,...,0,0,0,0,0,,,164,0.0,
1,605334,27000690260,Canola Oil,15.0,ml,,2019-03-25,2019-12-06,1,True,...,0,0,0,0,0,1.0,tbsp,164,1.0,
2,606302,64144555550,"Canola Oil*, Palm Oil*, Coconut Oil*, Lecithin...",0.25,g,,2019-03-17,2019-12-06,1,True,...,0,0,0,0,0,0.25,second spray,164,1.0,
3,606392,64144033164,"Canola Oil*, Coconut Oil*, Palm Oil*, Soy Leci...",0.25,g,,2019-03-25,2019-12-06,1,True,...,0,0,0,0,0,0.25,second spray,164,1.0,
4,607224,64144048502,"Extra Virgin Olive Oil*, Lecithin from Soybean...",0.25,g,,2018-10-21,2019-12-06,1,True,...,0,0,0,0,0,0.25,second spray,164,1.0,


In [171]:
branded_food_minimal.brand_name_id.isna().mean()

0.29290567364810033

In [172]:
branded_food.brand_name.memory_usage(deep=True)

104911443

In [173]:
branded_food.subbrand_name.isna().mean() # Due to large missingness, we will most likely drop this

0.9526953896762737

In [175]:
branded_food_minimal.isna().mean()

fdc_id                      0.000000
gtin_upc                    0.000000
ingredients                 0.002941
serving_size                0.005887
serving_size_unit           0.010390
package_weight              0.615807
modified_date               0.000004
available_date              0.000000
market_country              0.000000
GDSN                        0.000000
LI                          0.000000
NZGDSN                      0.000000
insig_dietary_fiber         0.000000
insig_iron                  0.000000
insig_calcium               0.000000
insig_cholesterol           0.000000
insig_trans_fat             0.000000
insig_satured_fat           0.000000
insig_vitamin_d             0.000000
insig_potassium             0.000000
insig_vitamin_a             0.000000
insig_vitamin_c             0.000000
insig_total_sugars          0.000000
insig_calories_from_fat     0.000000
insig_sugars                0.000000
insig_fiber                 0.000000
insig_other_nutrients       0.000000
h

In [176]:
# Also replace brand name, subbrand is most likely alright

In [177]:
branded_food_minimal.sort_values(['gtin_upc', 'available_date']).drop_duplicates(subset='gtin_upc', keep='last')

Unnamed: 0,fdc_id,gtin_upc,ingredients,serving_size,serving_size_unit,package_weight,modified_date,available_date,market_country,GDSN,...,insig_total_sugars,insig_calories_from_fat,insig_sugars,insig_fiber,insig_other_nutrients,household_serving_amount,household_serving_unit,category_id,brand_owner_id,brand_name_id
455459,1043843,0 77890 32930,"HUMMUS INGREDIENTS: CHICK PEAS, FILTERED WATER...",85.0,g,,2020-05-22,2020-06-26,1,False,...,0,0,0,0,0,85.0,grm,37,6573.0,
1009798,1049145,0 77890 44656,"INGREDIENTS: PURIFIED WATER, ORGANIC LEMON JUI...",429.0,ml,,2020-05-22,2020-06-26,1,False,...,0,0,0,0,0,14.5,oza,8,6573.0,
863427,1045979,0 77890 47590,"INGREDIENTS: WATER, ORGANIC MUSHROOMS, ORGANIC...",425.0,g,,2020-05-22,2020-06-26,1,False,...,0,0,0,0,0,1.0,can,50,20559.0,
455478,1046795,0 77890 48312,"BROWN RICE FLOUR, POTATO STARCH, SAFFLOWER OIL...",30.0,g,,2020-05-22,2020-06-26,1,False,...,0,0,0,0,0,15.0,crackers,37,6573.0,
455477,1046793,0 77890 48313,"BROWN RICE FLOUR, POTATO STARCH, SAFFLOWER OIL...",30.0,g,,2020-05-21,2020-06-26,1,False,...,0,0,0,0,0,14.0,crackers,37,6573.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1691547,1065873,BOOST,"WATER, GLUCOSE SYRUP, SUGAR, MILK PROTEIN CONC...",237.0,ml,,2020-06-16,2020-07-30,1,False,...,0,0,0,0,0,1.0,bottle,51,256.0,
906256,1081623,HAPPYKID ORGANICS,"CULTURED GRADE A ORGANIC MILK, WATER, ORGANIC ...",99.0,g,,2020-07-24,2020-08-27,1,False,...,0,0,0,0,0,1.0,pouch,13,256.0,
334297,1063921,JARLSBERG,"INGREDIENTS: PASTEURIZED PART-SKIM MILK, CULTU...",28.0,g,,2020-06-16,2020-07-30,1,False,...,0,0,0,0,0,1.0,onz,2,9234.0,
1732384,539755,NIELSENUK0002,"CARBONATED WATER, ACIDS (CITRIC ACID, MALIC AC...",100.0,ml,,2017-07-14,2019-04-01,1,False,...,0,0,0,0,0,100.0,mlt,15,15231.0,


In [189]:
def only_true(series):
    return series[series].index

In [194]:
# Many have duplicate entries, possibly either due to updates in ingredients or change in packaging?
branded_food[branded_food.gtin_upc.isin(only_true(branded_food_minimal.gtin_upc.value_counts() > 1))].sort_values('gtin_upc')

Unnamed: 0,fdc_id,brand_owner,brand_name,subbrand_name,gtin_upc,ingredients,serving_size,serving_size_unit,package_weight,modified_date,...,insig_vitamin_a,insig_vitamin_c,insig_total_sugars,insig_calories_from_fat,insig_sugars,insig_fiber,insig_other_nutrients,household_serving_amount,household_serving_unit,category_id
1710017,948606,WEGMANS,,,0 77890 48687,"INGREDIENTS: CARBONATED WATER, NATURAL FLAVOR.",355.0,ml,,2020-04-24,...,0,0,0,0,0,0,0,12.00,oza,22
1710548,1041279,WEGMANS,,,0 77890 48687,"INGREDIENTS: CARBONATED WATER, NATURAL FLAVOR.",355.0,ml,,2020-05-06,...,0,0,0,0,0,0,0,12.00,oza,22
1045716,947326,WEGMANS,,,0 77890 49092,INGREDIENTS: PASTA (ENRICHED DURUM FLOUR WHEAT...,141.0,g,,2020-04-24,...,0,0,0,0,0,0,0,15.00,ravioli,17
1046329,1040273,WEGMANS,,,0 77890 49092,INGREDIENTS: PASTA (ENRICHED DURUM FLOUR WHEAT...,141.0,g,,2020-05-06,...,0,0,0,0,0,0,0,15.00,ravioli,17
1548733,1181018,Edward Leeds & Company,SUNRIDGE,,000000016872,"ROASTED PEANUTS (PEANUTS, PEANUT OR CANOLA OIL...",30.0,g,,2020-06-02,...,0,0,0,0,0,0,0,,,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1147590,795669,WHOLE FOODS MARKET,,,999482001585,"TAPIOCA SYRUP, CANE SUGAR, CORN STARCH - MODIF...",30.0,g,,2020-01-28,...,0,0,0,0,0,0,0,6.00,pieces,1
60137,925804,CIRCLE K,,,999995377214,"SUGAR, BUTTER(MILK), WHEAT FLOUR(WHEAT FLOUR, ...",33.0,g,,2020-04-08,...,0,0,0,0,0,0,0,1.16,onz,4
48837,1129231,CIRCLE K,,,999995377214,"SUGAR, BUTTER(MILK), WHEAT FLOUR(WHEAT FLOUR, ...",33.0,g,,2020-04-08,...,0,0,0,0,0,0,0,,,4
1486665,1041929,NOT A BRANDED ITEM,,,Old Country Store,"CURED WITH WATER, SALT, SUGAR, SODIUM PHOSPHAT...",13.0,g,,2020-05-06,...,0,0,0,0,0,0,0,1.00,fried **,71


In [181]:
branded_food_minimal.gtin_upc.value_counts()

gtin_upc
00014100045526    22
10038034559208    22
00051000212245    22
00051000105462    22
10038034557709    22
                  ..
024682009960       1
024682060152       1
027800072433       1
8158000020383      1
9420064501823      1
Name: count, Length: 428069, dtype: int64

In [195]:
branded_food_minimal.sort_values(['gtin_upc', 'available_date']).drop_duplicates(subset='gtin_upc', keep='last')

Unnamed: 0,fdc_id,gtin_upc,ingredients,serving_size,serving_size_unit,package_weight,modified_date,available_date,market_country,GDSN,...,insig_total_sugars,insig_calories_from_fat,insig_sugars,insig_fiber,insig_other_nutrients,household_serving_amount,household_serving_unit,category_id,brand_owner_id,brand_name_id
455459,1043843,0 77890 32930,"HUMMUS INGREDIENTS: CHICK PEAS, FILTERED WATER...",85.0,g,,2020-05-22,2020-06-26,1,False,...,0,0,0,0,0,85.0,grm,37,6573.0,
1009798,1049145,0 77890 44656,"INGREDIENTS: PURIFIED WATER, ORGANIC LEMON JUI...",429.0,ml,,2020-05-22,2020-06-26,1,False,...,0,0,0,0,0,14.5,oza,8,6573.0,
863427,1045979,0 77890 47590,"INGREDIENTS: WATER, ORGANIC MUSHROOMS, ORGANIC...",425.0,g,,2020-05-22,2020-06-26,1,False,...,0,0,0,0,0,1.0,can,50,20559.0,
455478,1046795,0 77890 48312,"BROWN RICE FLOUR, POTATO STARCH, SAFFLOWER OIL...",30.0,g,,2020-05-22,2020-06-26,1,False,...,0,0,0,0,0,15.0,crackers,37,6573.0,
455477,1046793,0 77890 48313,"BROWN RICE FLOUR, POTATO STARCH, SAFFLOWER OIL...",30.0,g,,2020-05-21,2020-06-26,1,False,...,0,0,0,0,0,14.0,crackers,37,6573.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1691547,1065873,BOOST,"WATER, GLUCOSE SYRUP, SUGAR, MILK PROTEIN CONC...",237.0,ml,,2020-06-16,2020-07-30,1,False,...,0,0,0,0,0,1.0,bottle,51,256.0,
906256,1081623,HAPPYKID ORGANICS,"CULTURED GRADE A ORGANIC MILK, WATER, ORGANIC ...",99.0,g,,2020-07-24,2020-08-27,1,False,...,0,0,0,0,0,1.0,pouch,13,256.0,
334297,1063921,JARLSBERG,"INGREDIENTS: PASTEURIZED PART-SKIM MILK, CULTU...",28.0,g,,2020-06-16,2020-07-30,1,False,...,0,0,0,0,0,1.0,onz,2,9234.0,
1732384,539755,NIELSENUK0002,"CARBONATED WATER, ACIDS (CITRIC ACID, MALIC AC...",100.0,ml,,2017-07-14,2019-04-01,1,False,...,0,0,0,0,0,100.0,mlt,15,15231.0,


In [196]:
branded_food_minimal.memory_usage(deep=True).sum() # About 862 megabytes

1166926480

In [197]:
cat_df.memory_usage(deep=True).sum()

16700

In [222]:
ingredients = (
    branded_food.ingredients.str.lower()
    .str.replace('\*', '', regex=True)
    .str.replace('ingredients: ?', '', regex=True)
    .str.replace('\s*,\s*', ',', regex=True)
    .str.split(',')
    #.explode()
)

In [223]:
ingredients_df = pd.DataFrame(ingredients.explode().unique(), columns=['ingredient'])
ingredients_df['ingredientId'] = ingredients_df.index
# As we can see, there are some errors that are hard to fix, such as when there is a comma meant to repeat ingredient versions
ingredients_df

Unnamed: 0,ingredient,ingredientId
0,vegetable oil,0
1,canola oil,1
2,palm oil,2
3,coconut oil,3
4,lecithin from soybeans (non-stick agent),4
...,...,...
378672,bread improver (anti caking agent),378672
378673,enhancers (635,378673
378674,920),378674
378675,shrimp powder (crustacean)),378675


In [228]:
# Way to map stuff, should not be fully joined due to how large it is
food_to_ingredient_df = (
    ingredients_df.merge(
        ingredient.explode()
        .reset_index(),
        left_on='ingredient', 
        right_on='ingredients',
        how='left')
    .drop(columns=['ingredient', 'ingredients'])
)

Finally, we have done multiple normalizations and reductions in memory, and are now ready to write it to disk. 

In [None]:
branded_food.to_csv('data/branded_food_reduced.csv')

In [None]:
cat_df.to_csv('data/branded_food_categories.csv')

In [None]:
brand_owners_df.to_csv('data/brand_owners.csv')

In [None]:
food_to_ingredient_df.to_csv('data/food_to_ingredient.csv')

In [None]:
ingredients_df.to_csv('data/ingredients.csv')

## Webapp specific changes

In order to make the dataset as small as possible, we will drop additional columns that most likely cannot be fully made use of.

In [None]:
branded_food.drop(columns='ingredients')