# Dedupe ingredients

This notebook will get all unique ingredients that were extracted from a previous notebook and attempt to deduplicate them. The end product will be a list of unique ingredients

## Get a unique list of ingredients (contain dupes and misspellings)

In [1]:
!pip install pandas-dedupe



In [2]:
pip install py_stringmatching

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import pandas_dedupe
import py_stringmatching as sm
import numpy as np

from ast import literal_eval
from itertools import chain
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
ingredients_extracted_csv_p = r"../../data/ingredients_extracted.csv.gz"

#### Pull a previously cleaned data set

Make a dataframe of the queried data

In [5]:
%%time
ingredient_df = pd.read_csv(ingredients_extracted_csv_p, converters={"ingredients_tokenized": literal_eval})
ingredient_df.head()

CPU times: user 57.6 s, sys: 4.35 s, total: 1min 1s
Wall time: 1min 2s


Unnamed: 0,fdc_id,ingredients_tokenized
0,356068,"{PALM, SOY LECITHIN, SUGAR, CORNSTARCH, VEGETA..."
1,356069,"{CRUSHED TOMATOES, SALT, ONIONS, DISTILLED VIN..."
2,356070,"{NATURAL FLAVORS, PALM OILS, SOY LECITHIN, SUG..."
3,356071,"{CRUSHED TOMATOES, SALT, ONIONS, DISTILLED VIN..."
4,356072,"{SEA SALT, WHEY, CORNSTARCH, NATURAL FLAVOR, B..."


#### Get a unique list of ingredients

In [6]:
ingredient_list = ingredient_df['ingredients_tokenized'].to_list()
ingredient_list = list(chain.from_iterable(ingredient_list))
ingredient_list = set(ingredient_list)
len(ingredient_list)

149192

In [7]:
qg2_tok = sm.QgramTokenizer(qval=3, padding = False)

In [8]:
ingredient_list_df = pd.DataFrame(ingredient_list, columns = ["Ingredients"])
ingredient_list_df

Unnamed: 0,Ingredients
0,TRUE LEMON
1,ORGANIC COLLARD GREENS ORGANIC KALE
2,CHOCOLATY PIECES
3,MALTODEXTRIN†
4,E153
...,...
149187,CONTAINS LESS THAN 1 % OF SUNFLOWER OIL
149188,FROM PERU
149189,HERBAL EXTRACT
149190,WHOLE WHITE FLOUR


In [9]:
%%time
#Turn the list of tokenized ingredients into a sparse matrix
#establish and fit the CountVectorizer
cv = CountVectorizer(lowercase = False, tokenizer=qg2_tok.tokenize)
cv.fit(ingredient_list_df["Ingredients"])
IngredientsSparseMatrix = cv.transform(ingredient_list_df["Ingredients"])
#transpose the sparse matrix df. This will allow for matrix multiplication
IngredientsSparseMatrixTranspose = IngredientsSparseMatrix.T
print(IngredientsSparseMatrix.shape)



(149192, 15438)
CPU times: user 3.37 s, sys: 72.5 ms, total: 3.45 s
Wall time: 3.45 s


In [None]:
%%time
#this will result in OrgNames values as row indexes and IncName values as column indexes
AdjacencyMatrix = IngredientsSparseMatrix.dot(IngredientsSparseMatrixTranspose)
print(AdjacencyMatrix.shape)

## Dedupe the ingredients

In [None]:
ingredient_df = pandas_dedupe.dedupe_dataframe(ingredient_df, ["ingredients_tokenized"])

Importing data ...


#### Export the data

In [12]:
del ingredient_df['ingredients']

In [14]:
ingredient_df.to_csv(ingredients_extracted_csv_p, 
                   index = False, compression = "gzip")