# Branded food data frame analysis

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
from scipy.stats import ks_2samp
import nltk
import regex

In [2]:
filepath = lambda x: os.path.join('data', x)

In [3]:
branded_food = pd.read_csv(filepath('branded_food.csv'), dtype={
    'brand_owner':str,
    'brand_name':str,
    'subbrand_name':str,
    'gtin_upc':str})
branded_food.head()

  branded_food = pd.read_csv(filepath('branded_food.csv'), dtype={


Unnamed: 0,fdc_id,brand_owner,brand_name,subbrand_name,gtin_upc,ingredients,not_a_significant_source_of,serving_size,serving_size_unit,household_serving_fulltext,branded_food_category,data_source,package_weight,modified_date,available_date,market_country,discontinued_date,preparation_state_code,trade_channel,short_description
0,1105904,Richardson Oilseed Products (US) Limited,,,27000612323,Vegetable Oil,,15.0,ml,,Oils Edible,GDSN,,2020-10-02,2020-11-13,United States,,,,
1,1105905,CAMPBELL SOUP COMPANY,,,51000198808,"INGREDIENTS: BEEF STOCK, CONTAINS LESS THAN 2%...",,240.0,ml,,Herbs/Spices/Extracts,GDSN,,2020-09-12,2020-11-13,United States,,,,
2,1105906,CAMPBELL SOUP COMPANY,,,51000213273,"INGREDIENTS: CLAM STOCK, POTATOES, CLAMS, CREA...",,440.0,g,,Prepared Soups,GDSN,,2020-09-01,2020-11-13,United States,,,,
3,1105907,CAMPBELL SOUP COMPANY,,,51000213303,"INGREDIENTS: WATER, CREAM, BROCCOLI, CELERY, V...",,440.0,g,,Prepared Soups,GDSN,,2020-09-01,2020-11-13,United States,,,,
4,1105908,CAMPBELL SOUP COMPANY,,,51000224637,"INGREDIENTS: CHICKEN STOCK, CONTAINS LESS THAN...",,240.0,ml,,Herbs/Spices/Extracts,GDSN,,2020-10-03,2020-11-13,United States,,,,


Because fdc_id is unique, we will set that as our index:

In [4]:
branded_food.fdc_id = branded_food.fdc_id.astype(np.int32)
branded_food = branded_food.set_index('fdc_id')

In [5]:
branded_food.memory_usage(deep=True).sum() / 1e9 # Total gigabyte size

2.259462907

Going through the initial inspection of the dataset:

- Missingness/null values (done)
- incorrect dtypes (done)
- Reducing of columns/memory if possible (done)
- Validation of data (outlier searching and correction of incorrect values)

## Initial Missingness check

In [6]:
branded_food.isnull().mean()

brand_owner                    0.007813
brand_name                     0.296762
subbrand_name                  0.952827
gtin_upc                       0.000000
ingredients                    0.002923
not_a_significant_source_of    0.960171
serving_size                   0.005842
serving_size_unit              0.010312
household_serving_fulltext     0.585086
branded_food_category          0.005729
data_source                    0.000000
package_weight                 0.617388
modified_date                  0.000011
available_date                 0.000000
market_country                 0.000000
discontinued_date              1.000000
preparation_state_code         0.978783
trade_channel                  0.991557
short_description              0.978720
dtype: float64

We see that there are some columns with missing data, mainly towards the end, including discontinued_date, preparation_state_code, trade_channel, and short_description, which we will take a look through non-null examples and drop if necessary to preserve memory.

In [7]:
has_prep_code = branded_food[branded_food.preparation_state_code.notnull()]
has_prep_code.head()[['brand_owner', 'brand_name', 'preparation_state_code']]

Unnamed: 0_level_0,brand_owner,brand_name,preparation_state_code
fdc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2219410,Cargill Incorporated/Honeysuckle White,HONEYSUCKLE WHITE,UNPREPARED
2219411,Cargill Incorporated/Honeysuckle White,Honeysuckle White,UNPREPARED
2219412,Cargill Incorporated/Honeysuckle White,HONEYSUCKLE WHITE,UNPREPARED
2219413,Kellogg Company US,Kellogg's Pop-Tarts,UNPREPARED
2219414,Kellogg Company US,Kellogg's Cheez It,UNPREPARED


In [8]:
branded_food.preparation_state_code.value_counts(dropna=False)

NaN               1806146
UNPREPARED          26631
PREPARED             5676
READY_TO_EAT         2830
READY_TO_DRINK       2677
BAKE                  603
HEAT_AND_SERVE        272
THAW                  147
FREEZE                 97
GRILL                  95
CONVECTION             36
UNSPECIFIED            22
FRY                    15
STEAM                  12
DEEP_FRY               12
ROAST                  11
BOIL                    7
MICROWAVE               4
STIR_FRY                4
Name: preparation_state_code, dtype: int64

We see that many of these are with respect to how to prepare the food for consumption, many of which are simply labeled "unprepared" or prepared. We also see that many of these preparation codes are disambiguous, as unprepared does contain also foods that would be considered ready to eat, such as Pop-Tarts or Cheez-its. Because of this, it may be hard to categorize food into explicitly "prepared" and "unprepared" categories. We do see that this column is fairly well defined, as there are not many values.

In [9]:
branded_food[branded_food.short_description.notna()].short_description # Not too useful, does not have a standard nature

fdc_id
2219410          HSW Fh 93% Grd Tky Chub 12/1
2219411        HSW Fh Gr WhtDry Ex Wt 6/1.25#
2219412      HSW Fh 85/15 Gr tky Ex Wt 6/1.25
2219413                             Pop-Tarts
2219414                        Gripz Crackers
                          ...                
2551347    Strawberry, Sliced, Frozen 30 lbs.
2551348                 500LB MM VANILLA 1 CS
2551349     Biscuits, Btrmlk, Sl, 3", Premium
2551350     CKIES,ORG GINGER SNAP,MINI,GF,G&G
2551351                               Poultry
Name: short_description, Length: 39267, dtype: object

In [10]:
branded_food = branded_food.drop(columns=['discontinued_date', 'trade_channel', 'short_description'])

## GTIN UPC cleaning 

GTIN UPC is a common format/code used in many different PoS stores, which is extended to any item that has a barcode associated with it. On further looking at the data, we had to specify that the upc is stored as a string, as there are many examples where there are leading 0's as shown. Further looking shows that there are sometimes dashes or spaces, which we will remove to make the string more standardized.

In [11]:
branded_food.gtin_upc = branded_food.gtin_upc.str.replace('-', '').str.replace(' ', '')

In [12]:
branded_food[~branded_food.gtin_upc.str.isdigit()].gtin_upc

fdc_id
539755        NIELSENUK0002
573164         84279810254X
949336      OldCountryStore
950368       6005207001298>
1041929     OldCountryStore
1042843      6005207001298>
1044611        01541801360X
1048721       028000428501`
1063921           JARLSBERG
1065873               BOOST
1081623    HAPPYKIDORGANICS
1082317        0441154.3042
Name: gtin_upc, dtype: object

There are also some ids that simply do not correct gtin_upcs. Given how few there are that follow this irregularity, we may be able to simply drop these, although they probably do not need to be dropped

## Serving sizes

We see that there are several options. We will look up what these units exactly mean (unabbreviated), and possibly merge any containing the same amount. We can also visualize the distributions of respective foods.

- g (gram)
- ml (mililiter, most likely for fluids)
- grm - unknown, will compare to gram's distribution for differences
- mlt - most likely another mililiter, but these have different distributions
- MG - possibly a milligram distribution
- IU - international units, similar to mililiters
- GM - possible alias for gram as well
- MC - unknown currently

In [13]:
branded_food.serving_size_unit.value_counts()

g      1522350
ml      243399
GRM      40597
MLT       8288
MG        7555
IU        3671
GM         346
MC          63
Name: serving_size_unit, dtype: int64

If we only look at these examples, we would consider them as conventionally the same. However, they may not be the same exactly. One way we can possibly further reinforce this idea is through a permutation test; if we assume that they're from the same distribution, then the two distributions should be similar to each other. We can do this through the scipy ks_2samp test function, which determines whether the distributions are different enough:

## Gram permutation test

Because the amount of foods there are between the two, we will instead choose to sample the distributions to about 10,000 for each distribution. We will also eliminate some of the outliers, as we noticed from the descriptive statistics (mainly the max) of g foods.

In [14]:
# Perform permutation for all serving_size units based on serving size measured
test_stat = lambda x, y: ks_2samp(x, y, alternative='two-sided').pvalue # permutation test
get_serving_sizes = lambda x: branded_food[branded_food.serving_size_unit == x].serving_size
serving_units = branded_food.serving_size_unit.dropna().unique()

same_dist = []
for u_1 in serving_units:
    u_1_dist = get_serving_sizes(u_1)
    for u_2 in serving_units:
        if u_1 == u_2: 
            continue # no need
            
        u_2_dist = get_serving_sizes(u_2)
        # Perform ks_2samp test
        p_val = test_stat(
            u_1_dist,
            u_2_dist
        )
        
        if p_val > .05:
            print(u_1, u_2)
            same_dist.append((u1, u2))

In [15]:
same_dist

[]

We see that all units are completely different distributions from each other, including grm and g, as well as ml and mlt, which we may have expected to have the same distribution.

## Further memory reducing

If we want to be able to reduce the amount of memory required to hold the entire data frame. We will do this be attempting to reduce redundant information even further.

In [16]:
branded_food.memory_usage(deep=True)

Index                           14762376
brand_owner                    142327530
brand_name                     105382109
subbrand_name                   62118864
gtin_upc                       127462998
ingredients                    623353099
not_a_significant_source_of     68469344
serving_size                    14762376
serving_size_unit              106885302
household_serving_fulltext      84136047
branded_food_category          143595828
data_source                    109068029
package_weight                  84714471
modified_date                  123634199
available_date                 123634899
market_country                 129168556
preparation_state_code          60420254
dtype: int64

In [17]:
branded_food.head()

Unnamed: 0_level_0,brand_owner,brand_name,subbrand_name,gtin_upc,ingredients,not_a_significant_source_of,serving_size,serving_size_unit,household_serving_fulltext,branded_food_category,data_source,package_weight,modified_date,available_date,market_country,preparation_state_code
fdc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1105904,Richardson Oilseed Products (US) Limited,,,27000612323,Vegetable Oil,,15.0,ml,,Oils Edible,GDSN,,2020-10-02,2020-11-13,United States,
1105905,CAMPBELL SOUP COMPANY,,,51000198808,"INGREDIENTS: BEEF STOCK, CONTAINS LESS THAN 2%...",,240.0,ml,,Herbs/Spices/Extracts,GDSN,,2020-09-12,2020-11-13,United States,
1105906,CAMPBELL SOUP COMPANY,,,51000213273,"INGREDIENTS: CLAM STOCK, POTATOES, CLAMS, CREA...",,440.0,g,,Prepared Soups,GDSN,,2020-09-01,2020-11-13,United States,
1105907,CAMPBELL SOUP COMPANY,,,51000213303,"INGREDIENTS: WATER, CREAM, BROCCOLI, CELERY, V...",,440.0,g,,Prepared Soups,GDSN,,2020-09-01,2020-11-13,United States,
1105908,CAMPBELL SOUP COMPANY,,,51000224637,"INGREDIENTS: CHICKEN STOCK, CONTAINS LESS THAN...",,240.0,ml,,Herbs/Spices/Extracts,GDSN,,2020-10-03,2020-11-13,United States,


We can look at many string based columns to see if there is a better way to possibly reduce memory, as strings are usually unique, immutable, and take a lot of memory. 

In [18]:
branded_food.market_country.value_counts(dropna=False)

United States    1844180
New Zealand         1117
Name: market_country, dtype: int64

The market country column shows that many entries repeat United States, when a better approach may just be to one-hot encode whether the food is from the us or not. Since there are only two options, we can mark whether the food is for the american market with a 1, or for the new zealand market with a 0. This reduces memory usage from over 120 million bytes to simply 1 million bytes!

In [19]:
branded_food.market_country = (branded_food.market_country == 'United States').astype(np.int8)

In [20]:
branded_food.market_country.memory_usage(deep=True)

16607673

Other memory optimizations is the conversion of the modified_date and available_date columns to datetime, which converts the issue of strings to simply individual objects, which reduces the bytes used tremendously as well; the byte count shrinks about 10x for both columns.

In [21]:
branded_food.modified_date.memory_usage(deep=True) / 1e9 # Before datetime

0.138396575

In [22]:
branded_food.modified_date = pd.to_datetime(branded_food.modified_date)
branded_food.available_date = pd.to_datetime(branded_food.available_date)

In [23]:
branded_food.modified_date.memory_usage(deep=True) / 1e9 # After datetime

0.029524752

In [24]:
branded_food.dtypes

brand_owner                            object
brand_name                             object
subbrand_name                          object
gtin_upc                               object
ingredients                            object
not_a_significant_source_of            object
serving_size                          float64
serving_size_unit                      object
household_serving_fulltext             object
branded_food_category                  object
data_source                            object
package_weight                         object
modified_date                  datetime64[ns]
available_date                 datetime64[ns]
market_country                           int8
preparation_state_code                 object
dtype: object

In [25]:
branded_food.data_source.memory_usage(deep=True)

123830405

In [26]:
branded_food.data_source.value_counts() # Easily one-hot encodeable into possibly three different categories

LI        1748661
GDSN        95519
NZGDSN       1117
Name: data_source, dtype: int64

In [27]:
pd.get_dummies(branded_food.data_source) # One hot encodes easily

Unnamed: 0_level_0,GDSN,LI,NZGDSN
fdc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1105904,1,0,0
1105905,1,0,0
1105906,1,0,0
1105907,1,0,0
1105908,1,0,0
...,...,...,...
2554910,0,1,0
2554911,0,1,0
2554912,0,1,0
2554913,0,1,0


In [28]:
branded_food = pd.concat([branded_food, pd.get_dummies(branded_food.data_source)], axis=1) # One hot encodes the data source column
branded_food.head()

Unnamed: 0_level_0,brand_owner,brand_name,subbrand_name,gtin_upc,ingredients,not_a_significant_source_of,serving_size,serving_size_unit,household_serving_fulltext,branded_food_category,data_source,package_weight,modified_date,available_date,market_country,preparation_state_code,GDSN,LI,NZGDSN
fdc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1105904,Richardson Oilseed Products (US) Limited,,,27000612323,Vegetable Oil,,15.0,ml,,Oils Edible,GDSN,,2020-10-02,2020-11-13,1,,1,0,0
1105905,CAMPBELL SOUP COMPANY,,,51000198808,"INGREDIENTS: BEEF STOCK, CONTAINS LESS THAN 2%...",,240.0,ml,,Herbs/Spices/Extracts,GDSN,,2020-09-12,2020-11-13,1,,1,0,0
1105906,CAMPBELL SOUP COMPANY,,,51000213273,"INGREDIENTS: CLAM STOCK, POTATOES, CLAMS, CREA...",,440.0,g,,Prepared Soups,GDSN,,2020-09-01,2020-11-13,1,,1,0,0
1105907,CAMPBELL SOUP COMPANY,,,51000213303,"INGREDIENTS: WATER, CREAM, BROCCOLI, CELERY, V...",,440.0,g,,Prepared Soups,GDSN,,2020-09-01,2020-11-13,1,,1,0,0
1105908,CAMPBELL SOUP COMPANY,,,51000224637,"INGREDIENTS: CHICKEN STOCK, CONTAINS LESS THAN...",,240.0,ml,,Herbs/Spices/Extracts,GDSN,,2020-10-03,2020-11-13,1,,1,0,0


## Insignificant sources
There is a lot of redundancy in this column, as many repeat the same insignificant sources, as many of them are simply nutritional value-related.

In [29]:
branded_food.not_a_significant_source_of.notna().mean()

0.039828818883897825

In [30]:
# Get process insignificant sources (a lot more processing for a more standard convention)
insignificant_processed = (
    branded_food.not_a_significant_source_of
    .str.lower()
    # Cut off repetition (not a significant source)
    .str.replace(r'^not ?a? ?significant ?source ?of:? ?', '', regex=True)
    # Fix comma spacing
    .str.replace('&', 'and', regex=False)
    .str.replace(',? ?and,?', ',', regex=True)
    .str.replace('\s*,\s*', ',', regex=True)
    # Remove any periods
    .str.replace('.', '', regex=False)
    # Fix spelling errors/inconsistency
    .str.replace(r'form', 'from')
    .str.replace('potass?(ium)?', 'potassium', regex=True)
    .str.replace('cholest(erol)?', 'cholesterol', regex=True)
    .str.replace(r'sat(urated)?\s*fat', 'satured fat', regex=True)
    .str.replace(r'trans ?fat', 'trans fat', regex=True)
    .str.replace(r'dietary ?fiber', 'dietary fiber', regex=True)
    .str.replace(r'\W+or\W+', ',', regex=True)
    .str.replace(r'total ?sugars', 'total sugars', regex=True)
    # Fix vitamin entries (sometimes have only the letter, abbreviated forms, missing spaces, etc)
    .str.replace(r'vit(amin)?\s?(\w)', r'vitamin \2', regex=True)
)#.str.split(',')

In [31]:
# Get more common insignificant sources
insignificant_sources_freq = insignificant_processed.str.split(',').explode().value_counts()
insignificant_sources_freq.head(20)

iron                 57121
calcium              55694
cholesterol          50576
dietary fiber        49898
trans fat            49215
satured fat          43592
vitamin d            41752
potassium            31664
vitamin a            19474
vitamin c            17712
added sugars         13770
total sugars         10602
calories from fat     8198
sugars                6169
fiber                 4686
other nutrients       1820
addedsugars            964
protein                944
fat cal                830
sugar                  505
Name: not_a_significant_source_of, dtype: int64

We see that while there are many with some errors (the c is alone sometimes due to writing conventions, ex. vitamin a, c, d), we have captured most insignificant sources from the foods. We will simply consider any insignificant nutrients that occur more than 1000 times as "important" in order to one hot encode these.

In [32]:
insignificant_sources = insignificant_sources_freq[insignificant_sources_freq > 1000].index
insignificant_sources

Index(['iron', 'calcium', 'cholesterol', 'dietary fiber', 'trans fat',
       'satured fat', 'vitamin d', 'potassium', 'vitamin a', 'vitamin c',
       'added sugars', 'total sugars', 'calories from fat', 'sugars', 'fiber',
       'other nutrients'],
      dtype='object')

In [33]:
insignificant_processed[insignificant_processed.notna()].str.contains('dietary fiber')

fdc_id
1849687     True
1849735     True
1849762     True
1849763     True
1849769    False
           ...  
2554894     True
2554895     True
2554903    False
2554907    False
2554909    False
Name: not_a_significant_source_of, Length: 73496, dtype: bool

In [34]:
# Populate dictionary for inserting into dataset
new_source_columns = dict()
for source in insignificant_sources:
    # Create series telling whether the food lists nutrient or not
    source_series = insignificant_processed.str.contains(source, regex=False).fillna(False).astype(np.int8)
    new_source_columns[source] = source_series
    
new_source_columns['dietary fiber'].sum()

50688

In [35]:
# Populate the branded_food dataframe
for source, series in new_source_columns.items():
    # Format source name to be underlined and not spaced
    source = 'insig_' + source.replace(' ', '_')
    # Populate dataset
    branded_food[source] = series

In [36]:
branded_food.memory_usage(deep=True)['insig_iron'] * len(new_source_columns) # New memory usage

29524752

In [37]:
branded_food.memory_usage(deep=True)['not_a_significant_source_of'] # Old memory usage

68469344

From this, we have seen that we have reduced the memory of the insignificant sources by half, and obtained new data that can be looked into further.

In [38]:
branded_food.dtypes

brand_owner                            object
brand_name                             object
subbrand_name                          object
gtin_upc                               object
ingredients                            object
not_a_significant_source_of            object
serving_size                          float64
serving_size_unit                      object
household_serving_fulltext             object
branded_food_category                  object
data_source                            object
package_weight                         object
modified_date                  datetime64[ns]
available_date                 datetime64[ns]
market_country                           int8
preparation_state_code                 object
GDSN                                    uint8
LI                                      uint8
NZGDSN                                  uint8
insig_iron                               int8
insig_calcium                            int8
insig_cholesterol                 

Many other objects, however, cannot exactly be further condensed.

In [39]:
def fraction_to_number(frac):
    """ Convert fraction to number. """
    assert '/' in frac
    
    num, denom = frac.split('/')
    
    num = '1' if len(num) == 0 else num
    denom = '1' if len(denom) == 0 else denom
    return float(num) / float(denom)

In [40]:
def to_decimal(num_str):
    """ Converts num_str to float. """
    
    # Return if null
    if pd.isna(num_str): return num_str
    
    # Attempt to return anything that is simply one number
    try:
        if '/' not in num_str: return float(num_str)
    except:
        # There are some entries that have the format "\d \d" that should be divided
        #print(num_str)
        pass
    
    num_str = num_str.strip()
    # Special case for if there is a number followed by a fraction 
    # (can also be two numbers, which is assumed to be missing a fraction)
    if ' ' in num_str:
        groupings = num_str.split(' ')
        #print(num_str)
        if '/' in num_str:
            total = float(groupings[0]) + fraction_to_number(groupings[1])
        else:
            total = float(groupings[0]) / float(groupings[1])
    else:
        total = fraction_to_number(num_str)
    
    return total

In [41]:
household_servings = branded_food.household_serving_fulltext.str.lower().str.extract(r'(\d* ?\d+\.?/?\d*?) ([\w*\s*]+)')
household_servings.columns = ('household_serving_amount', 'household_serving_unit')
household_servings['household_serving_converted'] = household_servings['household_serving_amount'].apply(to_decimal)
household_servings

Unnamed: 0_level_0,household_serving_amount,household_serving_unit,household_serving_converted
fdc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1105904,,,
1105905,,,
1105906,,,
1105907,,,
1105908,,,
...,...,...,...
2554910,,,
2554911,8,fl oz,8.0
2554912,1,cup,1.0
2554913,1,pc,1.0


In [42]:
household_servings.household_serving_unit.value_counts().head(10)

cup        161296
onz        102813
tbsp        84379
oza         39133
pieces      29061
tsp         24550
bar         20507
grm         15075
package     13446
slice       12182
Name: household_serving_unit, dtype: int64

One thing to note is that many of these are very unique and applicable to their respective food (eg. crackers), which is simply a representation of a piece. We have the choice of correcting some of these to simply a "piece" or "unit", but it is hard to do so with there being many different options, so we will simply leave the unit alone. In addition, we may be able to identify more specific foods with the serving unit.

In [43]:
branded_food[['household_serving_amount', 'household_serving_unit']] = household_servings[['household_serving_converted', 'household_serving_unit']]
branded_food[['household_serving_amount', 'household_serving_unit']].memory_usage(deep=True)

Index                       14762376
household_serving_amount    14762376
household_serving_unit      81177930
dtype: int64

In [44]:
branded_food.household_serving_fulltext.memory_usage(deep=True)

98898423

While we have almost all columns processed, we can finally drop all columns we have converted, including:
 - not_a_significant_source_of
 - household_serving_fulltext
 - data_source
 
We are also now able to analyze these previously unanalyzable data sources as well.

In [45]:
branded_food.memory_usage(deep=True).sum() # Data is almost reduced by about .75GB

1909829625

In [46]:
categories = branded_food.branded_food_category.value_counts()
categories.head()

Popcorn, Peanuts, Seeds & Related Snacks    87691
Candy                                       83993
Cheese                                      76229
Ice Cream & Frozen Yogurt                   58400
Cookies & Biscuits                          50615
Name: branded_food_category, dtype: int64

We will consider any categories with less than 100 categories as being null/insignificant, as the more rare categories are too specific at times.

In [47]:
categories[categories > 100]

Popcorn, Peanuts, Seeds & Related Snacks                    87691
Candy                                                       83993
Cheese                                                      76229
Ice Cream & Frozen Yogurt                                   58400
Cookies & Biscuits                                          50615
                                                            ...  
Butter/Butter Substitutes                                     118
Chicken - Prepared/Processed                                  114
Meat/Poultry/Other Animals Sausages – Prepared/Processed      112
Baking                                                        106
Frozen Fish/Seafood                                           104
Name: branded_food_category, Length: 183, dtype: int64

In [48]:
cat_df = pd.DataFrame(categories[categories>100].index)
cat_df.columns = ['category']
cat_df['category_id'] = cat_df.index
cat_df

Unnamed: 0,category,category_id
0,"Popcorn, Peanuts, Seeds & Related Snacks",0
1,Candy,1
2,Cheese,2
3,Ice Cream & Frozen Yogurt,3
4,Cookies & Biscuits,4
...,...,...
178,Butter/Butter Substitutes,178
179,Chicken - Prepared/Processed,179
180,Meat/Poultry/Other Animals Sausages – Prepared...,180
181,Baking,181


In [49]:
branded_food = branded_food.merge(
    cat_df, 
    left_on='branded_food_category', 
    right_on='category').drop(columns=['branded_food_category', 'category'])
branded_food.head()

Unnamed: 0,brand_owner,brand_name,subbrand_name,gtin_upc,ingredients,not_a_significant_source_of,serving_size,serving_size_unit,household_serving_fulltext,data_source,...,insig_vitamin_c,insig_added_sugars,insig_total_sugars,insig_calories_from_fat,insig_sugars,insig_fiber,insig_other_nutrients,household_serving_amount,household_serving_unit,category_id
0,Richardson Oilseed Products (US) Limited,,,27000612323,Vegetable Oil,,15.0,ml,,GDSN,...,0,0,0,0,0,0,0,,,164
1,Conagra Brands,,,27000690260,Canola Oil,,15.0,ml,1 tbsp (15ml),GDSN,...,0,0,0,0,0,0,0,1.0,tbsp,164
2,Conagra Brands,,,64144555550,"Canola Oil*, Palm Oil*, Coconut Oil*, Lecithin...",,0.25,g,1/4 Second Spray (0.25g),GDSN,...,0,0,0,0,0,0,0,0.25,second spray,164
3,Conagra Brands,,,64144033164,"Canola Oil*, Coconut Oil*, Palm Oil*, Soy Leci...",,0.25,g,1/4 second spray (0.25g),GDSN,...,0,0,0,0,0,0,0,0.25,second spray,164
4,Conagra Brands,,,64144048502,"Extra Virgin Olive Oil*, Lecithin from Soybean...",,0.25,g,1/4 Second Spray (.25g),GDSN,...,0,0,0,0,0,0,0,0.25,second spray,164


In [50]:
branded_food.memory_usage(deep=True).sum() / 1e9

1.766589131

## Brand Owners and Names normalization

We have many different brand_owners, so we can choose to offload this until we need it.

In [111]:
def create_normalized_df(df, col):
    # Creates a new dataframe to be joined 
    # to the original dataframe(normalized)
    new_df = pd.DataFrame(
        data={col:(
            branded_food[col]
              .dropna().unique()
        )}
    )
    new_df[col + '_id'] = new_df.index
    return new_df
    
    
def write_df_to_csv(df, path):
    if os.path.exists(path):
        # Delete file
        os.remove(path)
    with open(path, 'w', encoding='utf-8') as to_write:
        to_write.write(df.to_csv())

In [112]:
brand_owners = create_normalized_df(branded_food, 'brand_owner')
brand_owners

Unnamed: 0,brand_owner,brand_owner_id
0,Richardson Oilseed Products (US) Limited,0
1,Conagra Brands,1
2,Conagra Brands Inc,2
3,"Incobrasa Industries, Ltd.",3
4,CAMPBELL SOUP COMPANY,4
...,...,...
36268,Cape May Foods,36268
36269,Pacific Coral Seafood,36269
36270,Tampa Bay Fisheries Inc,36270
36271,SEALORD GROUP LIMITED,36271


In [113]:
brand_names = create_normalized_df(branded_food, 'brand_name')
brand_names

Unnamed: 0,brand_name,brand_name_id
0,Wesson,0
1,PAM,1
2,Orville Redenbacher's,2
3,Long Life,3
4,SWANSON,4
...,...,...
34226,Figo,34226
34227,Angel Gold,34227
34228,Kho Muc,34228
34229,Deep Cove,34229


In [114]:
# Save brand name and owner for later
clean_dir = 'cleaned'
if not os.path.exists(clean_dir):
    os.mkdir(clean_dir)

write_df_to_csv(brand_owners, os.path.join('cleaned', 'brand_owners.csv'))
write_df_to_csv(brand_names, os.path.join('cleaned', 'brand_names.csv'))
    
del brand_owners
del brand_names

In [78]:
# Create ingredients list for every item
ingredients = (branded_food.ingredients.str.lower()
    .str.replace('[\.\*]', '', regex=True)
    .str.replace(' ?\([^\(\)]*\) ?', '', regex=True) #Removes parenthesis, removes more specific items
    .str.replace('ingredients: ?', '', regex=True)
    .str.replace('contains ?', '', regex=True) 
    .str.replace(' ?(less)? ?(than)?<? ?\d*.?\d*% ?(of)?:?', '', regex=True) # May be used to remove <2% ingredients
    .str.split(' ?, ?', regex=True)
)

In [115]:
# Create table with ingredients
ingredients_df = pd.DataFrame(
    data={
        'ingredient':ingredients.explode().unique()
    }
)
ingredients_df

Unnamed: 0,ingredient
0,vegetable oil
1,canola oil
2,palm oil
3,coconut oil
4,lecithin from soybeans
...,...
230659,herb)
230660,new zealand fish
230661,seasoning (soy
230662,enhancers


In [91]:
# Create a normalization table indicating what ingredients are in each food
food_ingredients = (
    ingredients
    .explode()
    .reset_index()
    .rename(columns={'index':'fdc_id', 'ingredients':'ingredient'})
    .merge(ingredients_df)
    .rename(columns={'ingredientId':'ingredient_id'})
    .drop(columns=['ingredient'])
)
food_ingredients

Unnamed: 0,fdc_id,ingredient_id
0,0,0
1,11,0
2,12,0
3,13,0
4,15,0
...,...,...
18684478,1830903,255184
18684479,1830905,255185
18684480,1830906,255188
18684481,1830908,255189


In [117]:
write_df_to_csv(food_ingredients, os.path.join('cleaned', 'food_to_id.csv'))
del food_ingredients

In [116]:
branded_food

Unnamed: 0,brand_owner,brand_name,subbrand_name,gtin_upc,ingredients,not_a_significant_source_of,serving_size,serving_size_unit,household_serving_fulltext,data_source,...,insig_vitamin_c,insig_added_sugars,insig_total_sugars,insig_calories_from_fat,insig_sugars,insig_fiber,insig_other_nutrients,household_serving_amount,household_serving_unit,category_id
0,Richardson Oilseed Products (US) Limited,,,00027000612323,Vegetable Oil,,15.00,ml,,GDSN,...,0,0,0,0,0,0,0,,,164
1,Conagra Brands,,,00027000690260,Canola Oil,,15.00,ml,1 tbsp (15ml),GDSN,...,0,0,0,0,0,0,0,1.00,tbsp,164
2,Conagra Brands,,,00064144555550,"Canola Oil*, Palm Oil*, Coconut Oil*, Lecithin...",,0.25,g,1/4 Second Spray (0.25g),GDSN,...,0,0,0,0,0,0,0,0.25,second spray,164
3,Conagra Brands,,,00064144033164,"Canola Oil*, Coconut Oil*, Palm Oil*, Soy Leci...",,0.25,g,1/4 second spray (0.25g),GDSN,...,0,0,0,0,0,0,0,0.25,second spray,164
4,Conagra Brands,,,00064144048502,"Extra Virgin Olive Oil*, Lecithin from Soybean...",,0.25,g,1/4 Second Spray (.25g),GDSN,...,0,0,0,0,0,0,0,0.25,second spray,164
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1831233,Brakebush Brothers,Brakebush,,10038034520604,INGREDIENTS: SKINLESS BONELESS CHICKEN BREAST ...,,113.00,GRM,4 Ounce,GDSN,...,0,0,0,0,0,0,0,4.00,ounce,179
1831234,Brakebush Brothers,Brakebush,,10038034556702,UNCOOKED Boneless Chicken Breast TENDER FRITTE...,,76.00,GRM,1 Piece,GDSN,...,0,0,0,0,0,0,0,1.00,piece,179
1831235,Brakebush Brothers,Brakebush,,10038034460108,INGREDIENTS: CHICKEN 1ST AND 2ND WING PORTIONS...,,117.00,GRM,3 Piece,GDSN,...,0,0,0,0,0,0,0,3.00,piece,179
1831236,Brakebush Brothers,"Brakebush Brothers, Inc.",,10038034522301,INGREDIENTS: SKINLESS BONELESS CHICKEN BREAST ...,,85.00,GRM,3 ounces,GDSN,...,0,0,0,0,0,0,0,3.00,ounces,179


In [59]:
# Replace brands with brand id for memory preservation
branded_food_minimal = (
    branded_food
    .merge(brand_owners, left_on='brand_owner', right_on='brand_owner', how='left')
    .merge(brand_name, left_on='brand_name', right_on='brand_name', how='left')
    .drop(columns=['brand_owner', 'brand_name', 'subbrand_name'])
)
branded_food_minimal.head()

Unnamed: 0,gtin_upc,ingredients,not_a_significant_source_of,serving_size,serving_size_unit,household_serving_fulltext,data_source,package_weight,modified_date,available_date,...,insig_total_sugars,insig_calories_from_fat,insig_sugars,insig_fiber,insig_other_nutrients,household_serving_amount,household_serving_unit,category_id,brand_owner_id,brand_name_id
0,27000612323,Vegetable Oil,,15.0,ml,,GDSN,,2020-10-02,2020-11-13,...,0,0,0,0,0,,,164,0.0,
1,27000690260,Canola Oil,,15.0,ml,1 tbsp (15ml),GDSN,,2019-03-25,2019-12-06,...,0,0,0,0,0,1.0,tbsp,164,1.0,
2,64144555550,"Canola Oil*, Palm Oil*, Coconut Oil*, Lecithin...",,0.25,g,1/4 Second Spray (0.25g),GDSN,,2019-03-17,2019-12-06,...,0,0,0,0,0,0.25,second spray,164,1.0,
3,64144033164,"Canola Oil*, Coconut Oil*, Palm Oil*, Soy Leci...",,0.25,g,1/4 second spray (0.25g),GDSN,,2019-03-25,2019-12-06,...,0,0,0,0,0,0.25,second spray,164,1.0,
4,64144048502,"Extra Virgin Olive Oil*, Lecithin from Soybean...",,0.25,g,1/4 Second Spray (.25g),GDSN,,2018-10-21,2019-12-06,...,0,0,0,0,0,0.25,second spray,164,1.0,


In [60]:
branded_food_minimal.brand_name_id.isna().mean()

0.5098430679136191

In [61]:
branded_food.brand_name.memory_usage(deep=True)

119483176

In [62]:
branded_food_minimal.isna().mean()

gtin_upc                       0.000000
ingredients                    0.002941
not_a_significant_source_of    0.959907
serving_size                   0.005887
serving_size_unit              0.010390
household_serving_fulltext     0.588292
data_source                    0.000000
package_weight                 0.615807
modified_date                  0.000004
available_date                 0.000000
market_country                 0.000000
preparation_state_code         0.979301
GDSN                           0.000000
LI                             0.000000
NZGDSN                         0.000000
insig_iron                     0.000000
insig_calcium                  0.000000
insig_cholesterol              0.000000
insig_dietary_fiber            0.000000
insig_trans_fat                0.000000
insig_satured_fat              0.000000
insig_vitamin_d                0.000000
insig_potassium                0.000000
insig_vitamin_a                0.000000
insig_vitamin_c                0.000000


In [63]:
# Also replace brand name, subbrand is most likely alright
branded_food_minimal.sort_values(['gtin_upc', 'available_date']).drop_duplicates(subset='gtin_upc', keep='last')

Unnamed: 0,gtin_upc,ingredients,not_a_significant_source_of,serving_size,serving_size_unit,household_serving_fulltext,data_source,package_weight,modified_date,available_date,...,insig_total_sugars,insig_calories_from_fat,insig_sugars,insig_fiber,insig_other_nutrients,household_serving_amount,household_serving_unit,category_id,brand_owner_id,brand_name_id
272486,000000000000,"ORGANIC BLUE CORN, ORGANIC SUNFLOWER OIL, SEA ...",,28.0,g,1 ONZ,LI,,2018-04-16,2019-04-01,...,0,0,0,0,0,1.0,onz,5,7538.0,
1561786,000000016872,"ROASTED PEANUTS (PEANUTS, PEANUT OR CANOLA OIL...",,30.0,g,,LI,25 lbs,2020-06-02,2021-10-28,...,0,0,0,0,0,,,16,4300.0,
175575,000000018265,"YOGURT RAISINS, TAMARI ROASTED ALMONDS, ORGANI...",,40.0,g,,LI,,2021-03-11,2021-10-28,...,0,0,0,0,0,,,0,4300.0,
844346,000000018319,"ORGANIC ROLLED OATS, ORGANIC EVAPORATED CANE J...",,55.0,g,,LI,,2021-03-11,2021-10-28,...,0,0,0,0,0,,,12,4300.0,
175577,000000018340,"DRY ROASTED ALMONDS, HATCH GREEN CHILE SEASONI...",,30.0,g,,LI,,2020-06-03,2021-10-28,...,0,0,0,0,0,,,0,4300.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1691547,BOOST,"WATER, GLUCOSE SYRUP, SUGAR, MILK PROTEIN CONC...",,237.0,ml,1 bottle,LI,,2020-06-16,2020-07-30,...,0,0,0,0,0,1.0,bottle,51,256.0,
906256,HAPPYKIDORGANICS,"CULTURED GRADE A ORGANIC MILK, WATER, ORGANIC ...",,99.0,g,1 pouch,LI,,2020-07-24,2020-08-27,...,0,0,0,0,0,1.0,pouch,13,256.0,
334297,JARLSBERG,"INGREDIENTS: PASTEURIZED PART-SKIM MILK, CULTU...",,28.0,g,1 ONZ,LI,,2020-06-16,2020-07-30,...,0,0,0,0,0,1.0,onz,2,9234.0,
1732384,NIELSENUK0002,"CARBONATED WATER, ACIDS (CITRIC ACID, MALIC AC...",,100.0,ml,100 MLT,LI,,2017-07-14,2019-04-01,...,0,0,0,0,0,100.0,mlt,15,15231.0,


In [65]:
# Many have duplicate entries, possibly either due to updates in ingredients or change in packaging?
branded_food[branded_food.gtin_upc.isin(only_true(branded_food_minimal.gtin_upc.value_counts() > 1))].sort_values('gtin_upc')

Unnamed: 0,brand_owner,brand_name,subbrand_name,gtin_upc,ingredients,not_a_significant_source_of,serving_size,serving_size_unit,household_serving_fulltext,data_source,...,insig_vitamin_c,insig_added_sugars,insig_total_sugars,insig_calories_from_fat,insig_sugars,insig_fiber,insig_other_nutrients,household_serving_amount,household_serving_unit,category_id
1561786,Edward Leeds & Company,SUNRIDGE,,000000016872,"ROASTED PEANUTS (PEANUTS, PEANUT OR CANOLA OIL...",,30.0,g,,LI,...,0,0,0,0,0,0,0,,,16
1553484,Edward Leeds & Company,SUNRIDGE,,000000016872,"ROASTED PEANUTS (PEANUTS, PEANUT OR CANOLA OIL...",,30.0,g,,LI,...,0,0,0,0,0,0,0,,,16
1547353,Edward Leeds & Company,,,000000016872,"ROASTED PEANUTS (PEANUTS, PEANUT OR CANOLA OIL...",,30.0,g,0.25 cup,LI,...,0,0,0,0,0,0,0,0.25,cup,16
1548733,Edward Leeds & Company,SUNRIDGE,,000000016872,"ROASTED PEANUTS (PEANUTS, PEANUT OR CANOLA OIL...",,30.0,g,,LI,...,0,0,0,0,0,0,0,,,16
150787,Edward Leeds & Company,SUNRIDGE,,000000018265,"YOGURT RAISINS, TAMARI ROASTED ALMONDS, ORGANI...",,40.0,g,,LI,...,0,0,0,0,0,0,0,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1151146,WHOLE FOODS MARKET,,,999482001585,"TAPIOCA SYRUP, CANE SUGAR, CORN STARCH - MODIF...",,30.0,g,6 PIECES,LI,...,0,0,0,0,0,0,0,6.00,pieces,1
48837,CIRCLE K,,,999995377214,"SUGAR, BUTTER(MILK), WHEAT FLOUR(WHEAT FLOUR, ...",,33.0,g,,LI,...,0,0,0,0,0,0,0,,,4
60137,CIRCLE K,,,999995377214,"SUGAR, BUTTER(MILK), WHEAT FLOUR(WHEAT FLOUR, ...",,33.0,g,1.16 ONZ,LI,...,0,0,0,0,0,0,0,1.16,onz,4
1486665,NOT A BRANDED ITEM,,,OldCountryStore,"CURED WITH WATER, SALT, SUGAR, SODIUM PHOSPHAT...",,13.0,g,1 Fried **,LI,...,0,0,0,0,0,0,0,1.00,fried **,71


In [77]:
#branded_food_minimal.sort_values(['gtin_upc', 'available_date']).drop_duplicates(subset='gtin_upc', keep='last')

In [68]:
branded_food_minimal.memory_usage(deep=True).sum() # About 862 megabytes

1488143307

In [69]:
cat_df.memory_usage(deep=True).sum()

16700

Finally, we have done multiple normalizations and reductions in memory, and are now ready to write it to disk. 

In [None]:
branded_food.to_csv('data/branded_food_reduced.csv')

In [None]:
cat_df.to_csv('data/branded_food_categories.csv')

In [None]:
brand_owners_df.to_csv('data/brand_owners.csv')

In [None]:
food_to_ingredient_df.to_csv('data/food_to_ingredient.csv')

In [None]:
ingredients_df.to_csv('data/ingredients.csv')

## Webapp specific changes

In order to make the dataset as small as possible, we will drop additional columns that most likely cannot be fully made use of.

In [None]:
branded_food.drop(columns='ingredients')