# Branded food data frame analysis

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
from scipy.stats import ks_2samp
import nltk
import regex

In [2]:
filepath = lambda x: os.path.join('data', x)

In [3]:
branded_food = pd.read_csv(filepath('branded_food.csv'), dtype={
    'brand_owner':str,
    'brand_name':str,
    'subbrand_name':str,
    'gtin_upc':str})
branded_food.head()

  branded_food = pd.read_csv(filepath('branded_food.csv'), dtype={


Unnamed: 0,fdc_id,brand_owner,brand_name,subbrand_name,gtin_upc,ingredients,not_a_significant_source_of,serving_size,serving_size_unit,household_serving_fulltext,branded_food_category,data_source,package_weight,modified_date,available_date,market_country,discontinued_date,preparation_state_code,trade_channel,short_description
0,1105904,Richardson Oilseed Products (US) Limited,,,27000612323,Vegetable Oil,,15.0,ml,,Oils Edible,GDSN,,2020-10-02,2020-11-13,United States,,,,
1,1105905,CAMPBELL SOUP COMPANY,,,51000198808,"INGREDIENTS: BEEF STOCK, CONTAINS LESS THAN 2%...",,240.0,ml,,Herbs/Spices/Extracts,GDSN,,2020-09-12,2020-11-13,United States,,,,
2,1105906,CAMPBELL SOUP COMPANY,,,51000213273,"INGREDIENTS: CLAM STOCK, POTATOES, CLAMS, CREA...",,440.0,g,,Prepared Soups,GDSN,,2020-09-01,2020-11-13,United States,,,,
3,1105907,CAMPBELL SOUP COMPANY,,,51000213303,"INGREDIENTS: WATER, CREAM, BROCCOLI, CELERY, V...",,440.0,g,,Prepared Soups,GDSN,,2020-09-01,2020-11-13,United States,,,,
4,1105908,CAMPBELL SOUP COMPANY,,,51000224637,"INGREDIENTS: CHICKEN STOCK, CONTAINS LESS THAN...",,240.0,ml,,Herbs/Spices/Extracts,GDSN,,2020-10-03,2020-11-13,United States,,,,


Because fdc_id is unique, we will set that as our index:

In [4]:
branded_food.fdc_id = branded_food.fdc_id.astype(np.int32)
branded_food = branded_food.set_index('fdc_id')

In [5]:
branded_food.memory_usage(deep=True).sum() / 1e9 # Total gigabyte size

2.25206429

Going through the initial inspection of the dataset:

- Missingness/null values (done)
- incorrect dtypes (done)
- Reducing of columns/memory if possible (done)
- Validation of data (outlier searching and correction of incorrect values)

## Initial Missingness check

In [6]:
branded_food.isnull().mean()

brand_owner                    0.007813
brand_name                     0.296762
subbrand_name                  0.952827
gtin_upc                       0.000000
ingredients                    0.002923
not_a_significant_source_of    0.960171
serving_size                   0.005842
serving_size_unit              0.010312
household_serving_fulltext     0.585412
branded_food_category          0.005729
data_source                    0.000000
package_weight                 0.617388
modified_date                  0.000011
available_date                 0.000000
market_country                 0.000000
discontinued_date              1.000000
preparation_state_code         0.978783
trade_channel                  0.991557
short_description              0.978720
dtype: float64

We see that there are some columns with missing data, mainly towards the end, including discontinued_date, preparation_state_code, trade_channel, and short_description, which we will take a look through non-null examples and drop if necessary to preserve memory.

# Preparation_state_code
Preparation state code seems to be a code stating the state of a food in terms of readiness to consumption. There are ready to eat/drink options, unprepared options, prepared options, as well as what is possibly needed to consume the item. There however seems to be some inconsistency in what qualifies as unprepared and prepared however; cereals are both prepared and unprepared.

In [7]:
has_prep_code = branded_food[branded_food.preparation_state_code.notnull()][['brand_owner', 'brand_name', 'subbrand_name', 'preparation_state_code', 'short_description']]
has_prep_code.head(10)

Unnamed: 0_level_0,brand_owner,brand_name,subbrand_name,preparation_state_code,short_description
fdc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2219410,Cargill Incorporated/Honeysuckle White,HONEYSUCKLE WHITE,,UNPREPARED,HSW Fh 93% Grd Tky Chub 12/1
2219411,Cargill Incorporated/Honeysuckle White,Honeysuckle White,,UNPREPARED,HSW Fh Gr WhtDry Ex Wt 6/1.25#
2219412,Cargill Incorporated/Honeysuckle White,HONEYSUCKLE WHITE,,UNPREPARED,HSW Fh 85/15 Gr tky Ex Wt 6/1.25
2219413,Kellogg Company US,Kellogg's Pop-Tarts,,UNPREPARED,Pop-Tarts
2219414,Kellogg Company US,Kellogg's Cheez It,,UNPREPARED,Gripz Crackers
2219415,Kellogg Company US,Kellogg's Cheez It,,UNPREPARED,Cheez-It Crackers
2219416,Kellogg Company US,Kellogg's,,PREPARED,Froot Loops Cereal
2219417,Kellogg Company US,Kellogg's Eggo,,UNPREPARED,Pancakes
2219418,Kellogg Company US,Kellogg's,,UNPREPARED,Kellogg Cracker Brand
2219419,Kellogg Company US,Kellogg's,,PREPARED,Cinnabon Cereal


In addition, we see that while subbrand name is empty, short description may have a better description of what exact product the food is.

In [8]:
cereals = has_prep_code[has_prep_code.short_description.str.lower().str.contains('cereal').fillna(False)]
cereals.head(10)

  cereals = has_prep_code[has_prep_code.short_description.str.lower().str.contains('cereal').fillna(False)]


Unnamed: 0_level_0,brand_owner,brand_name,subbrand_name,preparation_state_code,short_description
fdc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2219416,Kellogg Company US,Kellogg's,,PREPARED,Froot Loops Cereal
2219419,Kellogg Company US,Kellogg's,,PREPARED,Cinnabon Cereal
2219428,Kellogg Company US,Kellogg's,,UNPREPARED,All Bran Cereal
2219432,Kellogg Company US,Kellogg's,,PREPARED,Corn Pops Cereal
2219433,Kellogg Company US,Kellogg's,,PREPARED,Corn Pops Cereal
2219434,Kellogg Company US,Kellogg's,,PREPARED,Corn Pops Cereal
2219435,Kashi US,Bear Naked,,UNPREPARED,Cereal
2219444,Kellogg Company US,Kellogg's,,PREPARED,All Bran Cereal
2219445,Kellogg Company US,Kellogg's,,PREPARED,Apple Jacks Cereal
2219449,Kellogg Company US,Kellogg's,,PREPARED,Froot Loops Cereal


In [9]:
cereals.preparation_state_code.value_counts()

preparation_state_code
UNPREPARED      959
PREPARED        615
READY_TO_EAT      1
Name: count, dtype: int64

In [10]:
branded_food[branded_food.short_description.notna()][['brand_owner', 'brand_name', 'subbrand_name', 'short_description']]

Unnamed: 0_level_0,brand_owner,brand_name,subbrand_name,short_description
fdc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2219410,Cargill Incorporated/Honeysuckle White,HONEYSUCKLE WHITE,,HSW Fh 93% Grd Tky Chub 12/1
2219411,Cargill Incorporated/Honeysuckle White,Honeysuckle White,,HSW Fh Gr WhtDry Ex Wt 6/1.25#
2219412,Cargill Incorporated/Honeysuckle White,HONEYSUCKLE WHITE,,HSW Fh 85/15 Gr tky Ex Wt 6/1.25
2219413,Kellogg Company US,Kellogg's Pop-Tarts,,Pop-Tarts
2219414,Kellogg Company US,Kellogg's Cheez It,,Gripz Crackers
...,...,...,...,...
2551347,Del Mar Food Products Corp.,Del Mar Food Products Corp.,,"Strawberry, Sliced, Frozen 30 lbs."
2551348,Kraft Foods Inc.,KRAFT,,500LB MM VANILLA 1 CS
2551349,Bake Crafters Food Company,Bake Crafters,,"Biscuits, Btrmlk, Sl, 3"", Premium"
2551350,Homefree,Homefree,,"CKIES,ORG GINGER SNAP,MINI,GF,G&G"


In [11]:
branded_food.preparation_state_code.value_counts(dropna=False)

preparation_state_code
NaN               1806146
UNPREPARED          26631
PREPARED             5676
READY_TO_EAT         2830
READY_TO_DRINK       2677
BAKE                  603
HEAT_AND_SERVE        272
THAW                  147
FREEZE                 97
GRILL                  95
CONVECTION             36
UNSPECIFIED            22
FRY                    15
STEAM                  12
DEEP_FRY               12
ROAST                  11
BOIL                    7
MICROWAVE               4
STIR_FRY                4
Name: count, dtype: int64

We see that many of these are with respect to how to prepare the food for consumption, many of which are simply labeled "unprepared" or prepared. We also see that many of these preparation codes are disambiguous, as unprepared does contain also foods that would be considered ready to eat, such as Pop-Tarts or Cheez-its. Because of this, it may be hard to categorize food into explicitly "prepared" and "unprepared" categories. We do see that this column is fairly well defined, as there are not many unique values.

In [12]:
branded_food = branded_food.drop(columns=['discontinued_date', 'trade_channel'])

## Replacing short description with subbrand_name
Replaces it with short description only if the food is short description.

In [13]:
branded_food['subbrand_name'] = branded_food.subbrand_name.fillna(branded_food.short_description)

## GTIN UPC cleaning 

GTIN UPC is a common format/code used in many different PoS stores, which is extended to any item that has a barcode associated with it. On further looking at the data, we had to specify that the upc is stored as a string, as there are many examples where there are leading 0's as shown. Further looking shows that there are sometimes dashes or spaces, which we will remove to make the string more standardized.

In [14]:
branded_food.gtin_upc = branded_food.gtin_upc.str.replace('-', '').str.replace(' ', '')

In [15]:
branded_food[~branded_food.gtin_upc.str.isdigit()].gtin_upc

fdc_id
539755        NIELSENUK0002
573164         84279810254X
949336      OldCountryStore
950368       6005207001298>
1041929     OldCountryStore
1042843      6005207001298>
1044611        01541801360X
1048721       028000428501`
1063921           JARLSBERG
1065873               BOOST
1081623    HAPPYKIDORGANICS
1082317        0441154.3042
Name: gtin_upc, dtype: object

There are also some ids that simply do not correct gtin_upcs. Given how few there are that follow this irregularity, we may be able to simply drop these, although they probably do not need to be dropped.

## Serving sizes

We see that there are several options. We will look up what these units exactly mean (unabbreviated), and possibly merge any containing the same amount. We can also visualize the distributions of respective foods.

- g (gram)
- ml (mililiter, most likely for fluids)
- grm - unknown, will compare to gram's distribution for differences
- mlt - most likely another mililiter, but these have different distributions
- MG - possibly a milligram distribution
- IU - international units, similar to mililiters
- GM - possible alias for gram as well
- MC - unknown currently

In [16]:
branded_food.serving_size_unit.value_counts()

serving_size_unit
g      1522350
ml      243399
GRM      40597
MLT       8288
MG        7555
IU        3671
GM         346
MC          63
Name: count, dtype: int64

If we only look at these examples, we would consider them as conventionally the same. However, they may not be the same exactly. One way we can possibly further reinforce this idea is through a permutation test; if we assume that they're from the same distribution, then the two distributions should be similar to each other. We can do this through the scipy ks_2samp test function, which determines whether the distributions are different enough:

## Unit permutation tests

Because the amount of foods there are between the two, we will instead choose to sample the distributions to about 10,000 for each distribution. We will also eliminate some of the outliers, as we noticed from the descriptive statistics (mainly the max) of g foods.

In [17]:
# Perform permutation for all serving_size units based on serving size measured
test_stat = lambda x, y: ks_2samp(x, y, alternative='two-sided').pvalue # permutation test
get_serving_sizes = lambda x: branded_food[branded_food.serving_size_unit == x].serving_size
serving_units = branded_food.serving_size_unit.dropna().unique()

same_dist = []
for u_1 in serving_units:
    u_1_dist = get_serving_sizes(u_1)
    for u_2 in serving_units:
        if u_1 == u_2: 
            continue # no need
            
        u_2_dist = get_serving_sizes(u_2)
        # Perform ks_2samp test
        p_val = test_stat(
            u_1_dist,
            u_2_dist
        )
        
        if p_val > .05:
            print(u_1, u_2)
            same_dist.append((u1, u2))

In [18]:
same_dist

[]

We see that all units are completely different distributions from each other, including grm and g, as well as ml and mlt, which we may have expected to have the same distribution.

## Further memory reducing

If we want to be able to reduce the amount of memory required to hold the entire data frame. We will do this be attempting to reduce redundant information even further.

In [19]:
branded_food.memory_usage(deep=True)

Index                            7381188
brand_owner                    142327530
brand_name                     105382109
subbrand_name                   63845613
gtin_upc                       127462998
ingredients                    623353099
not_a_significant_source_of     68469344
serving_size                    14762376
serving_size_unit              106885302
household_serving_fulltext      84118618
branded_food_category          143595828
data_source                    109068029
package_weight                  84714471
modified_date                  123634199
available_date                 123634899
market_country                 129168556
preparation_state_code          60420254
short_description               60965509
dtype: int64

We can look at many string based columns to see if there is a better way to possibly reduce memory, as strings are usually unique, immutable, and take a lot of memory. 

In [20]:
branded_food.market_country.value_counts(dropna=False)

market_country
United States    1844180
New Zealand         1117
Name: count, dtype: int64

The market country column shows that many entries repeat United States, when a better approach may just be to one-hot encode whether the food is from the us or not. Since there are only two options, we can mark whether the food is for the american market with a 1, or for the new zealand market with a 0. This reduces memory usage from over 120 million bytes to simply 1 million bytes!

However, because we can simply ignore this column with little to no consequence, we will most likely drop it. We may look into it to see how different this subsection is compared to the dataset.

In [21]:
# Can be used, but will most likely drop.
# branded_food.market_country = (branded_food.market_country == 'United States').astype(np.int8)
#branded_food.market_country.memory_usage(deep=True)

Other memory optimizations is the conversion of the modified_date and available_date columns to datetime, which converts the issue of strings to simply individual objects, which reduces the bytes used tremendously as well; the byte count shrinks about 10x for both columns.

In [22]:
branded_food.modified_date.memory_usage(deep=True) / 1e9 # Before datetime

0.131015387

In [23]:
branded_food.modified_date = pd.to_datetime(branded_food.modified_date)
branded_food.available_date = pd.to_datetime(branded_food.available_date)

In [24]:
branded_food.modified_date.memory_usage(deep=True) / 1e9 # After datetime

0.022143564

In [25]:
branded_food.dtypes

brand_owner                            object
brand_name                             object
subbrand_name                          object
gtin_upc                               object
ingredients                            object
not_a_significant_source_of            object
serving_size                          float64
serving_size_unit                      object
household_serving_fulltext             object
branded_food_category                  object
data_source                            object
package_weight                         object
modified_date                  datetime64[ns]
available_date                 datetime64[ns]
market_country                         object
preparation_state_code                 object
short_description                      object
dtype: object

In [26]:
branded_food.data_source.memory_usage(deep=True)

116449217

In [27]:
branded_food.data_source.value_counts() # Easily one-hot encodeable into possibly three different categories

data_source
LI        1748661
GDSN        95519
NZGDSN       1117
Name: count, dtype: int64

In [28]:
pd.get_dummies(branded_food.data_source) # One hot encodes easily

Unnamed: 0_level_0,GDSN,LI,NZGDSN
fdc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1105904,True,False,False
1105905,True,False,False
1105906,True,False,False
1105907,True,False,False
1105908,True,False,False
...,...,...,...
2554910,False,True,False
2554911,False,True,False
2554912,False,True,False
2554913,False,True,False


If we wish to keep this data column, uncomment this

In [29]:
branded_food = pd.concat([branded_food, pd.get_dummies(branded_food.data_source)], axis=1) # One hot encodes the data source column
branded_food.head()

Unnamed: 0_level_0,brand_owner,brand_name,subbrand_name,gtin_upc,ingredients,not_a_significant_source_of,serving_size,serving_size_unit,household_serving_fulltext,branded_food_category,data_source,package_weight,modified_date,available_date,market_country,preparation_state_code,short_description,GDSN,LI,NZGDSN
fdc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1105904,Richardson Oilseed Products (US) Limited,,,27000612323,Vegetable Oil,,15.0,ml,,Oils Edible,GDSN,,2020-10-02,2020-11-13,United States,,,True,False,False
1105905,CAMPBELL SOUP COMPANY,,,51000198808,"INGREDIENTS: BEEF STOCK, CONTAINS LESS THAN 2%...",,240.0,ml,,Herbs/Spices/Extracts,GDSN,,2020-09-12,2020-11-13,United States,,,True,False,False
1105906,CAMPBELL SOUP COMPANY,,,51000213273,"INGREDIENTS: CLAM STOCK, POTATOES, CLAMS, CREA...",,440.0,g,,Prepared Soups,GDSN,,2020-09-01,2020-11-13,United States,,,True,False,False
1105907,CAMPBELL SOUP COMPANY,,,51000213303,"INGREDIENTS: WATER, CREAM, BROCCOLI, CELERY, V...",,440.0,g,,Prepared Soups,GDSN,,2020-09-01,2020-11-13,United States,,,True,False,False
1105908,CAMPBELL SOUP COMPANY,,,51000224637,"INGREDIENTS: CHICKEN STOCK, CONTAINS LESS THAN...",,240.0,ml,,Herbs/Spices/Extracts,GDSN,,2020-10-03,2020-11-13,United States,,,True,False,False


## Insignificant sources
There is a lot of redundancy in this column, as many repeat the same insignificant sources, as many of them are simply nutritional value-related.

In [30]:
branded_food.not_a_significant_source_of.notna().mean()

0.039828818883897825

In [31]:
# Get process insignificant sources (a lot more processing for a more standard convention)
insignificant_processed = (
    branded_food.not_a_significant_source_of
    .str.lower()
    # Cut off repetition (not a significant source)
    .str.replace(r'^not ?a? ?significant ?source ?of:? ?', '', regex=True)
    # Fix comma spacing
    .str.replace('&', 'and', regex=False)
    .str.replace(',? ?and,?', ',', regex=True)
    .str.replace('\s*,\s*', ',', regex=True)
    # Remove any periods
    .str.replace('.', '', regex=False)
    # Fix spelling errors/inconsistency
    .str.replace(r'form', 'from')
    .str.replace('potass?(ium)?', 'potassium', regex=True)
    .str.replace('cholest(erol)?', 'cholesterol', regex=True)
    .str.replace(r'sat(urated)?\s*fat', 'satured fat', regex=True)
    .str.replace(r'trans ?fat', 'trans fat', regex=True)
    .str.replace(r'dietary ?fiber', 'dietary fiber', regex=True)
    .str.replace(r'\W+or\W+', ',', regex=True)
    .str.replace(r'total ?sugars', 'total sugars', regex=True)
    # Fix vitamin entries (sometimes have only the letter, abbreviated forms, missing spaces, etc)
    .str.replace(r'vit(amin)?\s?(\w)', r'vitamin \2', regex=True)
)#.str.split(',')

In [32]:
# Get more common insignificant sources
insignificant_sources_freq = insignificant_processed.str.split(',').explode().value_counts()
insignificant_sources_freq.head(20)

not_a_significant_source_of
iron                 57121
calcium              55694
cholesterol          50576
dietary fiber        49898
trans fat            49215
satured fat          43592
vitamin d            41752
potassium            31664
vitamin a            19474
vitamin c            17712
added sugars         13770
total sugars         10602
calories from fat     8198
sugars                6169
fiber                 4686
other nutrients       1820
addedsugars            964
protein                944
fat cal                830
sugar                  505
Name: count, dtype: int64

We see that while there are many with some errors (the c is alone sometimes due to writing conventions, ex. vitamin a, c, d), we have captured most insignificant sources from the foods. We will simply consider any insignificant nutrients that occur more than 1000 times as "important" in order to one hot encode these.

In [33]:
insignificant_sources = insignificant_sources_freq[insignificant_sources_freq > 1000].index
insignificant_sources

Index(['iron', 'calcium', 'cholesterol', 'dietary fiber', 'trans fat',
       'satured fat', 'vitamin d', 'potassium', 'vitamin a', 'vitamin c',
       'added sugars', 'total sugars', 'calories from fat', 'sugars', 'fiber',
       'other nutrients'],
      dtype='object', name='not_a_significant_source_of')

In [34]:
insignificant_processed[insignificant_processed.notna()].str.contains('dietary fiber')

fdc_id
1849687     True
1849735     True
1849762     True
1849763     True
1849769    False
           ...  
2554894     True
2554895     True
2554903    False
2554907    False
2554909    False
Name: not_a_significant_source_of, Length: 73496, dtype: bool

In [35]:
# Populate dictionary for inserting into dataset
new_source_columns = dict()
for source in insignificant_sources:
    # Create series telling whether the food lists nutrient or not
    source_series = insignificant_processed.str.contains(source, regex=False).fillna(False).astype(np.int8)
    new_source_columns[source] = source_series
    
new_source_columns['dietary fiber'].sum()

  source_series = insignificant_processed.str.contains(source, regex=False).fillna(False).astype(np.int8)
  source_series = insignificant_processed.str.contains(source, regex=False).fillna(False).astype(np.int8)
  source_series = insignificant_processed.str.contains(source, regex=False).fillna(False).astype(np.int8)
  source_series = insignificant_processed.str.contains(source, regex=False).fillna(False).astype(np.int8)
  source_series = insignificant_processed.str.contains(source, regex=False).fillna(False).astype(np.int8)
  source_series = insignificant_processed.str.contains(source, regex=False).fillna(False).astype(np.int8)
  source_series = insignificant_processed.str.contains(source, regex=False).fillna(False).astype(np.int8)
  source_series = insignificant_processed.str.contains(source, regex=False).fillna(False).astype(np.int8)
  source_series = insignificant_processed.str.contains(source, regex=False).fillna(False).astype(np.int8)
  source_series = insignificant_processed.str.

50688

In [36]:
# Populate the branded_food dataframe
for source, series in new_source_columns.items():
    # Format source name to be underlined and not spaced
    source = 'insig_' + source.replace(' ', '_')
    # Populate dataset
    branded_food[source] = series

In [37]:
branded_food.memory_usage(deep=True)['insig_iron'] * len(new_source_columns) # New memory usage

29524752

In [38]:
branded_food.memory_usage(deep=True)['not_a_significant_source_of'] # Old memory usage

68469344

From this, we can analyze what insignificant sources are in most products at the cost of a larger memory cost.

In [39]:
branded_food.dtypes

brand_owner                            object
brand_name                             object
subbrand_name                          object
gtin_upc                               object
ingredients                            object
not_a_significant_source_of            object
serving_size                          float64
serving_size_unit                      object
household_serving_fulltext             object
branded_food_category                  object
data_source                            object
package_weight                         object
modified_date                  datetime64[ns]
available_date                 datetime64[ns]
market_country                         object
preparation_state_code                 object
short_description                      object
GDSN                                     bool
LI                                       bool
NZGDSN                                   bool
insig_iron                               int8
insig_calcium                     

Many other objects, however, cannot exactly be further condensed.

In [40]:
def fraction_to_number(frac):
    """ Convert fraction to number. """
    assert '/' in frac
    
    num, denom = frac.split('/')
    
    num = '1' if len(num) == 0 else num
    denom = '1' if len(denom) == 0 else denom
    return float(num) / float(denom)

In [41]:
def to_decimal(num_str):
    """ Converts num_str to float. """
    
    # Return if null
    if pd.isna(num_str): return num_str
    
    # Attempt to return anything that is simply one number
    try:
        if '/' not in num_str: return float(num_str)
    except:
        # There are some entries that have the format "\d \d" that should be divided
        #print(num_str)
        pass
    
    num_str = num_str.strip()
    # Special case for if there is a number followed by a fraction 
    # (can also be two numbers, which is assumed to be missing a fraction)
    if ' ' in num_str:
        groupings = num_str.split(' ')
        #print(num_str)
        if '/' in num_str:
            total = float(groupings[0]) + fraction_to_number(groupings[1])
        else:
            total = float(groupings[0]) / float(groupings[1])
    else:
        total = fraction_to_number(num_str)
    
    return total

In [42]:
household_servings = branded_food.household_serving_fulltext.str.lower().str.extract(r'(\d* ?\d+\.?/?\d*?) ([\w*\s*]+)')
household_servings.columns = ('household_serving_amount', 'household_serving_unit')
household_servings['household_serving_converted'] = household_servings['household_serving_amount'].apply(to_decimal)
household_servings

Unnamed: 0_level_0,household_serving_amount,household_serving_unit,household_serving_converted
fdc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1105904,,,
1105905,,,
1105906,,,
1105907,,,
1105908,,,
...,...,...,...
2554910,,,
2554911,8,fl oz,8.0
2554912,1,cup,1.0
2554913,1,pc,1.0


In [43]:
household_servings.household_serving_unit.value_counts().head(10)

household_serving_unit
cup        161296
onz        102813
tbsp        84379
oza         39133
pieces      29061
tsp         24550
bar         20507
grm         15075
package     13446
slice       12182
Name: count, dtype: int64

One thing to note is that many of these are very unique and applicable to their respective food (eg. crackers), which is simply a representation of a piece. We have the choice of correcting some of these to simply a "piece" or "unit", but it is hard to do so with there being many different options, so we will simply leave the unit alone. In addition, we may be able to identify more specific foods with the serving unit.

In [44]:
branded_food[['household_serving_amount', 'household_serving_unit']] = household_servings[['household_serving_converted', 'household_serving_unit']]
branded_food[['household_serving_amount', 'household_serving_unit']].memory_usage(deep=True)

Index                        7381188
household_serving_amount    14762376
household_serving_unit      81177930
dtype: int64

In [45]:
branded_food.household_serving_fulltext.memory_usage(deep=True)

91499806

While we have almost all columns processed, we can finally drop all columns we have converted, including:
 - not_a_significant_source_of
 - household_serving_fulltext
 - data_source
 
We are also now able to analyze these previously unanalyzable data sources as well.

In [46]:
branded_food.memory_usage(deep=True).sum() # Data is almost reduced by about .75GB

2092446525

In [47]:
categories = branded_food.branded_food_category.value_counts()
categories.head()

branded_food_category
Popcorn, Peanuts, Seeds & Related Snacks    87691
Candy                                       83993
Cheese                                      76229
Ice Cream & Frozen Yogurt                   58400
Cookies & Biscuits                          50615
Name: count, dtype: int64

We will consider any categories with less than 100 categories as being null/insignificant, as the more rare categories are too specific at times.

In [48]:
categories[categories > 100]

branded_food_category
Popcorn, Peanuts, Seeds & Related Snacks                    87691
Candy                                                       83993
Cheese                                                      76229
Ice Cream & Frozen Yogurt                                   58400
Cookies & Biscuits                                          50615
                                                            ...  
Butter/Butter Substitutes                                     118
Chicken - Prepared/Processed                                  114
Meat/Poultry/Other Animals Sausages – Prepared/Processed      112
Baking                                                        106
Frozen Fish/Seafood                                           104
Name: count, Length: 183, dtype: int64

In [49]:
cat_df = pd.DataFrame(categories[categories>100].index)
cat_df.columns = ['category']
cat_df['category_id'] = cat_df.index
cat_df

Unnamed: 0,category,category_id
0,"Popcorn, Peanuts, Seeds & Related Snacks",0
1,Candy,1
2,Cheese,2
3,Ice Cream & Frozen Yogurt,3
4,Cookies & Biscuits,4
...,...,...
178,Butter/Butter Substitutes,178
179,Chicken - Prepared/Processed,179
180,Meat/Poultry/Other Animals Sausages – Prepared...,180
181,Baking,181


In [50]:
branded_food

Unnamed: 0_level_0,brand_owner,brand_name,subbrand_name,gtin_upc,ingredients,not_a_significant_source_of,serving_size,serving_size_unit,household_serving_fulltext,branded_food_category,...,insig_vitamin_a,insig_vitamin_c,insig_added_sugars,insig_total_sugars,insig_calories_from_fat,insig_sugars,insig_fiber,insig_other_nutrients,household_serving_amount,household_serving_unit
fdc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1105904,Richardson Oilseed Products (US) Limited,,,00027000612323,Vegetable Oil,,15.0,ml,,Oils Edible,...,0,0,0,0,0,0,0,0,,
1105905,CAMPBELL SOUP COMPANY,,,00051000198808,"INGREDIENTS: BEEF STOCK, CONTAINS LESS THAN 2%...",,240.0,ml,,Herbs/Spices/Extracts,...,0,0,0,0,0,0,0,0,,
1105906,CAMPBELL SOUP COMPANY,,,00051000213273,"INGREDIENTS: CLAM STOCK, POTATOES, CLAMS, CREA...",,440.0,g,,Prepared Soups,...,0,0,0,0,0,0,0,0,,
1105907,CAMPBELL SOUP COMPANY,,,00051000213303,"INGREDIENTS: WATER, CREAM, BROCCOLI, CELERY, V...",,440.0,g,,Prepared Soups,...,0,0,0,0,0,0,0,0,,
1105908,CAMPBELL SOUP COMPANY,,,00051000224637,"INGREDIENTS: CHICKEN STOCK, CONTAINS LESS THAN...",,240.0,ml,,Herbs/Spices/Extracts,...,0,0,0,0,0,0,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2554910,Mt. Garfield Winery Corp,LIFESTYLEFOODS,,898425002682,ARCADIAN HARVEST LETTUCE (BLEND OF LEAF LETTUC...,,163.0,GRM,,"Pickles, Olives, Peppers & Relishes",...,0,0,0,0,0,0,0,0,,
2554911,All Market Inc.,VITA COCO,,898999012858,"COCONUT WATER FROM CONCENTRATE (WATER, COCONUT...",,240.0,MLT,8 fl oz,Plant Based Water,...,0,0,0,0,0,0,0,0,8.0,fl oz
2554912,Chaneys Dairy Barn,CHANEY'S,,899503001443,"WHOLE MILK, VITAMIN D3",,240.0,MLT,1 cup,Milk,...,0,0,0,0,0,0,0,0,1.0,cup
2554913,"Ittella International, Inc.",TATTOOED CHEF,,899764001527,"CAULIFLOWER, CORN FLOUR, GRANA PADANO CHEESE (...",,71.0,GRM,1 pc,Frozen Patties and Burgers,...,0,0,0,0,0,0,0,0,1.0,pc


In [51]:
break here

SyntaxError: invalid syntax (760050616.py, line 1)

In [53]:
branded_food = branded_food.reset_index()
branded_food = branded_food.merge(
    cat_df, 
    left_on='branded_food_category', 
    right_on='category').drop(columns=['branded_food_category', 'category'])
branded_food = branded_food.set_index('fdc_id')
branded_food.head()

Unnamed: 0_level_0,brand_owner,brand_name,subbrand_name,gtin_upc,ingredients,not_a_significant_source_of,serving_size,serving_size_unit,household_serving_fulltext,data_source,...,insig_vitamin_c,insig_added_sugars,insig_total_sugars,insig_calories_from_fat,insig_sugars,insig_fiber,insig_other_nutrients,household_serving_amount,household_serving_unit,category_id
fdc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1105904,Richardson Oilseed Products (US) Limited,,,27000612323,Vegetable Oil,,15.0,ml,,GDSN,...,0,0,0,0,0,0,0,,,164
1105905,CAMPBELL SOUP COMPANY,,,51000198808,"INGREDIENTS: BEEF STOCK, CONTAINS LESS THAN 2%...",,240.0,ml,,GDSN,...,0,0,0,0,0,0,0,,,142
1105906,CAMPBELL SOUP COMPANY,,,51000213273,"INGREDIENTS: CLAM STOCK, POTATOES, CLAMS, CREA...",,440.0,g,,GDSN,...,0,0,0,0,0,0,0,,,76
1105907,CAMPBELL SOUP COMPANY,,,51000213303,"INGREDIENTS: WATER, CREAM, BROCCOLI, CELERY, V...",,440.0,g,,GDSN,...,0,0,0,0,0,0,0,,,76
1105908,CAMPBELL SOUP COMPANY,,,51000224637,"INGREDIENTS: CHICKEN STOCK, CONTAINS LESS THAN...",,240.0,ml,,GDSN,...,0,0,0,0,0,0,0,,,142


## Brand Owners and Names normalization

We have many different brand_owners, so we can choose to offload this until we need it.

In [56]:
def create_normalized_df(df, col):
    # Creates a new dataframe to be joined 
    # to the original dataframe(normalized)
    new_df = pd.DataFrame(
        data={col:(
            branded_food[col]
              .dropna().unique()
        )}
    )
    new_df[col + '_id'] = new_df.index
    return new_df
    
    
def write_df_to_csv(df, path):
    if os.path.exists(path):
        # Delete file
        os.remove(path)
    with open(path, 'w', encoding='utf-8') as to_write:
        to_write.write(df.to_csv())

In [57]:
brand_owners = create_normalized_df(branded_food, 'brand_owner')
brand_owners

Unnamed: 0,brand_owner,brand_owner_id
0,Richardson Oilseed Products (US) Limited,0
1,CAMPBELL SOUP COMPANY,1
2,Bush Brothers And Company,2
3,PEPPERIDGE FARM,3
4,WELCH FOODS INC.,4
...,...,...
36268,"Mike and Jen's Cocoa Mixes, LLC",36268
36269,Natural Grains LLC,36269
36270,Utica Specialty Foods LLC,36270
36271,Tilvee,36271


In [58]:
brand_names = create_normalized_df(branded_food, 'brand_name')
brand_names

Unnamed: 0,brand_name,brand_name_id
0,Honeysuckle White,0
1,HONEYSUCKLE WHITE,1
2,HERSHEY'S,2
3,REESE'S,3
4,Wesson,4
...,...,...
34226,DEBBY'S,34226
34227,PRIDE OF INDIA,34227
34228,PICK,34228
34229,WILD GRILL FOODS,34229


In [59]:
# Save brand name and owner for later
clean_dir = 'cleaned'
if not os.path.exists(clean_dir):
    os.mkdir(clean_dir)

write_df_to_csv(brand_owners, os.path.join('cleaned', 'brand_owners.csv'))
write_df_to_csv(brand_names, os.path.join('cleaned', 'brand_names.csv'))


In [61]:
# Create ingredients list for every item
ingredients = (branded_food.ingredients.str.lower()
    .str.replace('[\.\*]', '', regex=True)
    .str.replace(' ?\([^\(\)]*\) ?', '', regex=True) #Removes items inside parenthesis, may not be best option
    .str.replace('ingredients: ?', '', regex=True)
    .str.replace('contains ?', '', regex=True) 
    .str.replace(' ?(less)? ?(than)?<? ?\d*.?\d*% ?(of)?:?', '', regex=True) # May be used to remove <2% ingredients with \s*
    .str.split(' ?, ?', regex=True)
)

In [62]:
# Create table with ingredients
ingredients_df = pd.DataFrame(
    data={
        'ingredient':ingredients.explode().unique()
    }
)
ingredients_df['ingredient_id'] = ingredients_df.index
ingredients_df

Unnamed: 0,ingredient,ingredient_id
0,vegetable oil,0
1,beef stock,1
2,mirepoix,2
3,salt,3
4,natural flavoring,4
...,...,...
230659,parmesan cheesetopping: mozzarella cheese,230659
230660,no nitrite or nitrate added except for natural...,230660
230661,arcadian harvest lettuce,230661
230662,ham with water and smoke flavor added,230662


In [63]:
# Create a normalization table indicating what ingredients are in each food
food_ingredients = (
    ingredients
    .explode()
    .reset_index()
    .rename(columns={'index':'fdc_id', 'ingredients':'ingredient'})
    .merge(ingredients_df)
    .rename(columns={'ingredientId':'ingredient_id'})
    .drop(columns=['ingredient'])
)
food_ingredients

Unnamed: 0,fdc_id,ingredient_id
0,1105904,0
1,1105905,1
2,1105905,2
3,1105905,3
4,1105905,4
...,...,...
19911387,2554913,18
19911388,2554914,41625
19911389,2554914,38439
19911390,2554914,55


In [64]:
write_df_to_csv(food_ingredients, os.path.join('cleaned', 'food_to_id.csv'))
del food_ingredients

With this we can now drop the ingredients column, thus saving more memory.

In [65]:
branded_food.head()

Unnamed: 0_level_0,brand_owner,brand_name,subbrand_name,gtin_upc,ingredients,not_a_significant_source_of,serving_size,serving_size_unit,household_serving_fulltext,data_source,...,insig_vitamin_c,insig_added_sugars,insig_total_sugars,insig_calories_from_fat,insig_sugars,insig_fiber,insig_other_nutrients,household_serving_amount,household_serving_unit,category_id
fdc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1105904,Richardson Oilseed Products (US) Limited,,,27000612323,Vegetable Oil,,15.0,ml,,GDSN,...,0,0,0,0,0,0,0,,,164
1105905,CAMPBELL SOUP COMPANY,,,51000198808,"INGREDIENTS: BEEF STOCK, CONTAINS LESS THAN 2%...",,240.0,ml,,GDSN,...,0,0,0,0,0,0,0,,,142
1105906,CAMPBELL SOUP COMPANY,,,51000213273,"INGREDIENTS: CLAM STOCK, POTATOES, CLAMS, CREA...",,440.0,g,,GDSN,...,0,0,0,0,0,0,0,,,76
1105907,CAMPBELL SOUP COMPANY,,,51000213303,"INGREDIENTS: WATER, CREAM, BROCCOLI, CELERY, V...",,440.0,g,,GDSN,...,0,0,0,0,0,0,0,,,76
1105908,CAMPBELL SOUP COMPANY,,,51000224637,"INGREDIENTS: CHICKEN STOCK, CONTAINS LESS THAN...",,240.0,ml,,GDSN,...,0,0,0,0,0,0,0,,,142


In [66]:
branded_food = branded_food.drop(columns=['ingredients', 'not_a_significant_source_of'])
branded_food.head()

Unnamed: 0_level_0,brand_owner,brand_name,subbrand_name,gtin_upc,serving_size,serving_size_unit,household_serving_fulltext,data_source,package_weight,modified_date,...,insig_vitamin_c,insig_added_sugars,insig_total_sugars,insig_calories_from_fat,insig_sugars,insig_fiber,insig_other_nutrients,household_serving_amount,household_serving_unit,category_id
fdc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1105904,Richardson Oilseed Products (US) Limited,,,27000612323,15.0,ml,,GDSN,,2020-10-02,...,0,0,0,0,0,0,0,,,164
1105905,CAMPBELL SOUP COMPANY,,,51000198808,240.0,ml,,GDSN,,2020-09-12,...,0,0,0,0,0,0,0,,,142
1105906,CAMPBELL SOUP COMPANY,,,51000213273,440.0,g,,GDSN,,2020-09-01,...,0,0,0,0,0,0,0,,,76
1105907,CAMPBELL SOUP COMPANY,,,51000213303,440.0,g,,GDSN,,2020-09-01,...,0,0,0,0,0,0,0,,,76
1105908,CAMPBELL SOUP COMPANY,,,51000224637,240.0,ml,,GDSN,,2020-10-03,...,0,0,0,0,0,0,0,,,142


In [67]:
# Replace brands with brand id for memory preservation
branded_food_minimal = (
    branded_food.reset_index()
    .merge(brand_owners, left_on='brand_owner', right_on='brand_owner', how='left')
    .merge(brand_names, left_on='brand_name', right_on='brand_name', how='left')
    .drop(columns=['brand_owner', 'brand_name', 'subbrand_name'])
    .drop(columns=['data_source', 'market_country', ])
)
branded_food_minimal = branded_food_minimal.set_index('fdc_id')
branded_food_minimal.head()

Unnamed: 0_level_0,gtin_upc,serving_size,serving_size_unit,household_serving_fulltext,package_weight,modified_date,available_date,preparation_state_code,short_description,GDSN,...,insig_total_sugars,insig_calories_from_fat,insig_sugars,insig_fiber,insig_other_nutrients,household_serving_amount,household_serving_unit,category_id,brand_owner_id,brand_name_id
fdc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1105904,27000612323,15.0,ml,,,2020-10-02,2020-11-13,,,True,...,0,0,0,0,0,,,164,0.0,
1105905,51000198808,240.0,ml,,,2020-09-12,2020-11-13,,,True,...,0,0,0,0,0,,,142,1.0,
1105906,51000213273,440.0,g,,,2020-09-01,2020-11-13,,,True,...,0,0,0,0,0,,,76,1.0,
1105907,51000213303,440.0,g,,,2020-09-01,2020-11-13,,,True,...,0,0,0,0,0,,,76,1.0,
1105908,51000224637,240.0,ml,,,2020-10-03,2020-11-13,,,True,...,0,0,0,0,0,,,142,1.0,


In [68]:
branded_food_minimal.brand_name_id.isna().mean()

0.29290567364810033

In [69]:
branded_food.brand_name.memory_usage(deep=True)

112236267

In [70]:
branded_food_minimal.isna().mean()

gtin_upc                      0.000000
serving_size                  0.005887
serving_size_unit             0.010390
household_serving_fulltext    0.588362
package_weight                0.615807
modified_date                 0.000004
available_date                0.000000
preparation_state_code        0.979301
short_description             0.979240
GDSN                          0.000000
LI                            0.000000
NZGDSN                        0.000000
insig_iron                    0.000000
insig_calcium                 0.000000
insig_cholesterol             0.000000
insig_dietary_fiber           0.000000
insig_trans_fat               0.000000
insig_satured_fat             0.000000
insig_vitamin_d               0.000000
insig_potassium               0.000000
insig_vitamin_a               0.000000
insig_vitamin_c               0.000000
insig_added_sugars            0.000000
insig_total_sugars            0.000000
insig_calories_from_fat       0.000000
insig_sugars             

In [71]:
# Also replace brand name, subbrand is most likely alright
#branded_food_minimal.sort_values(['gtin_upc', 'available_date']).drop_duplicates(subset='gtin_upc', keep='last')

In [72]:
#branded_food_minimal.sort_values(['gtin_upc', 'available_date']).drop_duplicates(subset='gtin_upc', keep='last')

In [73]:
branded_food_minimal

Unnamed: 0_level_0,gtin_upc,serving_size,serving_size_unit,household_serving_fulltext,package_weight,modified_date,available_date,preparation_state_code,short_description,GDSN,...,insig_total_sugars,insig_calories_from_fat,insig_sugars,insig_fiber,insig_other_nutrients,household_serving_amount,household_serving_unit,category_id,brand_owner_id,brand_name_id
fdc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1105904,00027000612323,15.0,ml,,,2020-10-02,2020-11-13,,,True,...,0,0,0,0,0,,,164,0.0,
1105905,00051000198808,240.0,ml,,,2020-09-12,2020-11-13,,,True,...,0,0,0,0,0,,,142,1.0,
1105906,00051000213273,440.0,g,,,2020-09-01,2020-11-13,,,True,...,0,0,0,0,0,,,76,1.0,
1105907,00051000213303,440.0,g,,,2020-09-01,2020-11-13,,,True,...,0,0,0,0,0,,,76,1.0,
1105908,00051000224637,240.0,ml,,,2020-10-03,2020-11-13,,,True,...,0,0,0,0,0,,,142,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2554910,898425002682,163.0,GRM,,5.75 oz./163 g,2023-03-10,2023-05-25,,,False,...,0,0,0,0,0,,,7,34867.0,34230.0
2554911,898999012858,240.0,MLT,8 fl oz,500 mL/16.9 fl oz,2023-04-20,2023-05-25,,,False,...,0,0,0,0,0,8.0,fl oz,115,6813.0,5840.0
2554912,899503001443,240.0,MLT,1 cup,0.5 g/1.89 L,2023-04-20,2023-05-25,,,False,...,0,0,0,0,0,1.0,cup,25,34394.0,29593.0
2554913,899764001527,71.0,GRM,1 pc,10 oz/283 g,2023-03-23,2023-05-25,,,False,...,0,0,0,0,0,1.0,pc,82,34003.0,19195.0


In [74]:
branded_food_minimal.memory_usage(deep=True).sum() # About 862 megabytes

745401184

In [None]:
cat_df.memory_usage(deep=True).sum()

In [None]:
branded_food_minimal.head()

In [None]:
if not os.path.exists('cleaned'):
    os.mkdir('cleaned')

Finally, we have done multiple normalizations and reductions in memory, and are now ready to write it to disk. 

In [None]:
branded_food_minimal.to_csv('cleaned/branded_food_reduced.csv')

In [None]:
cat_df.to_csv('cleaned/branded_food_categories.csv')

In [None]:
brand_owners.to_csv('cleaned/brand_owners.csv')

In [None]:
ingredients_df.to_csv('cleaned/ingredients.csv')