In [1]:
import pandas as pd
from fuzzywuzzy import fuzz
import time 



In [2]:
# All stores 

stores = ['zehrs', 'no_frills', 'valu_mart']

all_data = pd.DataFrame()


# search all stores 
for store in stores:
    regular_priced = pd.read_csv(f'clean_data/{store}/regular_prices.csv')
    flyer = pd.read_csv(f'clean_data/{store}/flyer_deals.csv')
    

In [3]:
# https://newscatcherapi.com/blog/ultimate-guide-to-text-similarity-with-python

def jaccard_similarity(x,y):
    intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
    union_cardinality = len(set.union(*[set(x), set(y)]))
    return intersection_cardinality/float(union_cardinality)

In [4]:
start_time = time.time()

store = 'zehrs'

regular_priced = pd.read_csv(f'clean_data/{store}/regular_prices.csv')
flyer = pd.read_csv(f'clean_data/{store}/flyer_deals.csv')


# important to put descriptive items on you list - 2% milk vs milk, cheddar cheese vs cheddar cheese slices 
# the more descriptive the more likely you are to get what you're looking for 
# even adding Old/Medium to Cheddar cheese makes it more descriptive 

# in the data, it is likely that the items are also ordered by relevance (ie: cheddar cheese block vs slices)
# could do a secondary ordering by index

grocery_list = ['2% milk', 'Cheddar Cheese', 'white sliced bread', 'ground beef']


final_selection = pd.DataFrame(columns = ['store', 'product_name', 'price'])

for item in grocery_list: 
    item_selection = pd.DataFrame(columns = ['store', 'product_name', 'price'])

    for index, row in regular_priced.iterrows():
        product_name = row['product']
        full_product = row['product_name']
        per_unit_price = row['per_unit_price2']
        price = row['price2']

        # find items  
        similarity = jaccard_similarity(item.lower().split(' '), product_name.lower().split(' '))

        if similarity >= 0.5: # can tweak threshold but this is a good one for now  
            data = { 'store':store, 'product_name':product_name, 'full_product':full_product, 'price':price, 'per_unit_price':per_unit_price, 'simmilarity':similarity }
            item_selection = item_selection.append(data, ignore_index=True)

    # find lowest price from top similarities
    cheapest_item = item_selection.sort_values(by=['per_unit_price'])

    final_selection = final_selection.append(dict(cheapest_item.iloc[0]), ignore_index=True)
    
    
print(time.time() - start_time, 'seconds')

final_selection

0.8205277919769287 seconds


Unnamed: 0,store,product_name,price,full_product,per_unit_price,simmilarity
0,zehrs,2% Milk,4.99,Neilson2% Milk2 l,0.0025,1.0
1,zehrs,Cheddar Cheese Thin Slices,4.79,Black DiamondCheddar Cheese Thin Slices410 g,0.0117,0.5
2,zehrs,White Bread,2.69,Old MillWhite Bread675 g,0.004,0.666667
3,zehrs,Lean Ground Beef,10.0,Al SafaLean Ground Beef908 g,0.011,0.666667


In [5]:
start_time = time.time()

store = 'zehrs'

regular_priced = pd.read_csv(f'clean_data/{store}/regular_prices.csv')
flyer = pd.read_csv(f'clean_data/{store}/flyer_deals.csv')

grocery_list = ['clementines', 'chicken breast', 'potatos', 'SHREDDED CHEESE']


final_selection = pd.DataFrame(columns = ['store', 'product_name', 'price'])

for item in grocery_list: 
    
    item_selection = pd.DataFrame(columns = ['store', 'product_name', 'price'])

    for index, row in flyer.iterrows():
        
        try: 
            product_name = row['product_name'].replace(',', '')
            price = row['price2']
            per_unit_price = row.per_unit_price2
    
            # find items  
            similarity = jaccard_similarity(item.lower().split(' '), product_name.lower().split(' '))
#             print(similarity, product_name)
            if similarity >= 0.3: # can tweak threshold but this is a good one for now  
                data = { 'store':store, 'product_name':product_name, 'price':price, 'per_unit_price': per_unit_price, 'simmilarity':similarity }
                item_selection = item_selection.append(data, ignore_index=True)
        except: continue

    try:
        # find lowest price from top similarities
        cheapest_item = item_selection.sort_values(by=['per_unit_price'])

        final_selection = final_selection.append(dict(cheapest_item.iloc[0]), ignore_index=True)
    except: continue
    
    
print(time.time() - start_time, 'seconds')

final_selection

0.09425687789916992 seconds


Unnamed: 0,store,product_name,price,per_unit_price,simmilarity
0,zehrs,LARGE CLEMENTINES,3.99,3.99,0.5
1,zehrs,ZERTO SHREDDED CHEESE 140 g,7.49,0.0535,0.4


# test out different text similarity methods

### jaccard

In [6]:
# Yellow Potato 10lb Bag,Farmer's MarketYellow Potato 10lb Bag10 lb bag
jaccard_similarity('potato'.lower().split(' '), 'Yellow Potato 10lb Bag'.lower().split(' '))


0.25

In [7]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()

product_name = "Yellow Potato 10lb Bag"
item = 'potato'

jaccard_similarity(ps.stem(item).lower().split(' '), ps.stem(product_name).lower().split(' '))

0.25

### spaCy

In [8]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [29]:
from math import sqrt

# sentences = ["Chicken Breast, Boneless Skinless", 'chicken breast']

sentences = ["Yellow Potato 10lb Bag", 'potatos']


# embeddings[0].similarity(embeddings[1])


def squared_sum(x):
    """ return 3 rounded square rooted value """
    return round(sqrt(sum([a*a for a in x])),3)


def cos_similarity(x,y):
    """ return cosine similarity between two lists """
    numerator = sum(a*b for a,b in zip(x,y))
    denominator = squared_sum(x)*squared_sum(y)
    return round(numerator/float(denominator),3)

start_time = time.time()
embeddings = [nlp(ps.stem(sentence.lower())).vector for sentence in sentences]
print(cos_similarity(embeddings[0], embeddings[1]))
print(time.time() - start_time, 'seconds')

0.365
0.01419687271118164 seconds


### sentence_similarity

In [28]:
# from sentence_similarity import sentence_similarity
sentence_a = "CHICKEN BREASTS BONE-IN SKIN-ON OR SPLIT CHICKEN WINGS".lower()
sentence_b = "chicken breast".lower()

# model=sentence_similarity(model_name='distilbert-base-uncased',embedding_type='sentence_embedding')

start_time = time.time()
print(model.get_score(sentence_a,sentence_b,metric="cosine"))
print(time.time() - start_time, 'seconds')

0.691
0.12379193305969238 seconds


### Pytorch and HuggingFace

In [13]:
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
regular_priced = pd.read_csv(f'clean_data/zehrs/regular_prices.csv')

documents = list(regular_priced['product'])

vectors = [
  # tokenize the document, return it as PyTorch tensors (vectors),
  # and pass it onto the model
  model(**tokenizer(document, return_tensors='pt'))[0].detach().squeeze()
  for document in documents
]

# [v.size() for v in vectors]

In [15]:
import torch

averaged_vectors = [torch.mean(vector, dim=0) for vector in vectors]

# [v.size() for v in averaged_vectors]

In [16]:
def encode(document):
    tokens = tokenizer(document, return_tensors='pt')
    vector = model(**tokens)[0].detach().squeeze()
    return torch.mean(vector, dim=0)

In [17]:
import faiss # conda install -c conda-forge faiss
import numpy as np

In [18]:
index = faiss.IndexIDMap(faiss.IndexFlatIP(768)) # the size of our vector space

# index all the documents, we need them as numpy arrays first

index.add_with_ids(
    np.array([t.numpy() for t in averaged_vectors]),
    # the IDs will be 0 to len(documents)
    np.array(range(0, len(documents)))
)

def search(query: str, k=1):
    encoded_query = encode(query).unsqueeze(dim=0).numpy()
    top_k = index.search(encoded_query, k)
    scores = top_k[0][0]
    results = [documents[_id] for _id in top_k[1][0]]
    return list(zip(results, scores))

In [22]:
search("chicken breast", k=10)

[('Yellow Onions', 55.3908),
 ('Red Onions', 55.19753),
 ('Red Onion', 55.006783),
 ('Yellow Onion', 54.83129),
 ('Pink Salmon', 54.356255),
 ('Sweet Potatoes', 53.97739),
 ('Green Beans', 53.90656),
 ('Pork Liver', 53.469444),
 ('Turkey Breast Roast', 53.378643),
 ('Turkey Breast Roast', 53.378643)]

## pulling it all together 

In [None]:
model=sentence_similarity(model_name='distilbert-base-uncased',embedding_type='sentence_embedding')

In [78]:
start_time = time.time()

# ps = PorterStemmer() # stemming for better results 

stores = ['zehrs', 'no_frills', 'valu_mart']

zehrs_results = pd.DataFrame()
no_frills_results = pd.DataFrame()
valu_mart_results = pd.DataFrame()

grocery_list = ['2% milk', 'Cheddar Cheese', 'white sliced bread', 'ground beef', 
                'clementines', 'chicken breast', 'potato', 'shredded cheese', 'ribs']

for store in stores:
    regular_priced = pd.read_csv(f'clean_data/{store}/regular_prices.csv')
    flyer = pd.read_csv(f'clean_data/{store}/flyer_deals.csv')
    
    final_selection = pd.DataFrame(columns = ['list_item', 'store', 'product_name', 'price', 'per_unit_price', 'source'])
    
    for item in grocery_list: 
        item_selection = pd.DataFrame(columns = ['list_item', 'store', 'product_name', 'price', 'per_unit_price', 'source'])
        
        ##### search regular price data #####
        for index, row in regular_priced.iterrows():
            product_name = row['product']
            per_unit_price = row['per_unit_price2']
            price = row['price2']

            # find items 
            
            # embed text  
#             embeddings = [nlp(ps.stem(sentence.lower())).vector for sentence in [item, product_name]]
#             similarity = cos_similarity(embeddings[0], embeddings[1])
            
            similarity = model.get_score(item.lower(),product_name.lower(),metric="cosine")
            
#             similarity = jaccard_similarity(ps.stem(item).lower().split(' '), ps.stem(product_name).lower().split(' '))

            if similarity >= 0.5: # can tweak threshold but this is a good one for now  
                data = { 'list_item':item, 'store':store, 'product_name':product_name, 'price':price, 'per_unit_price':per_unit_price, 'similarity':similarity, 'source': 'reg' }
                item_selection = item_selection.append(data, ignore_index=True)
    

        ##### search flyer data #####
        for index, row in flyer.iterrows():

            try: 
                product_name = row['product_name'].replace(',', '')
                price = row['price2']
                per_unit_price = row.per_unit_price2

                # find items  
                similarity = jaccard_similarity(ps.stem(item).lower().split(' '), pas.stem(product_name).lower().split(' '))
                
                if similarity >= 0.5: # can tweak threshold but this is a good one for now  
                    data = { 'list_item':item, 'store':store, 'product_name':product_name, 'price':price, 'per_unit_price': per_unit_price, 'similarity':similarity, 'source': 'flyer' }
                    item_selection = item_selection.append(data, ignore_index=True)
               
            except: continue


        try:
            # find lowest price from top similarities
            
            # ************** need to decide between per unit pricing and total price difference
            # ************** maybe some units need per unit some dont 
            cheapest_item = item_selection.sort_values(by=['per_unit_price', 'similarity'], ascending = [True, False])
#             cheapest_item = item_selection.sort_values(by=['price'])

            final_selection = final_selection.append(dict(cheapest_item.iloc[0]), ignore_index=True)
        except: continue

            
    if store == 'zehrs':
        zehrs_results = final_selection
    elif store == 'no_frills':
        no_frills_results = final_selection
    elif store == 'valu_mart':
        valu_mart_results = final_selection  
    
print(time.time() - start_time, 'seconds')

KeyboardInterrupt: 

In [97]:
# appears that loblaw sites show sale price regardless 

In [26]:
zehrs_results

Unnamed: 0,list_item,store,product_name,price,per_unit_price,source,similarity
0,2% milk,zehrs,2% Milk,4.99,0.0025,reg,1.0
1,Cheddar Cheese,zehrs,"Cheddar Cheese Style Light, 22 Slices",4.79,0.0117,reg,0.5
2,white sliced bread,zehrs,White Bread,2.69,0.004,reg,0.666667
3,ground beef,zehrs,Lean Ground Beef,10.0,0.011,reg,0.666667
4,clementines,zehrs,Organic Clementines,9.99,0.011,reg,0.5
5,chicken breast,zehrs,"Chicken Breast, Bone-in Skin On, Club Pack",9.36,0.00659,reg,0.5
6,potato,zehrs,Yellow Potato 10lb Bag,2.99,0.0007,reg,0.5
7,shredded cheese,zehrs,Triple Cheddar Shredded Cheese Blend,4.79,0.015,reg,0.5
8,ribs,zehrs,Pork Back Ribs,15.29,0.00659,reg,0.5


In [27]:
no_frills_results

Unnamed: 0,list_item,store,product_name,price,per_unit_price,source,similarity
0,2% milk,no_frills,2% Milk,4.99,0.0025,reg,1.0
1,Cheddar Cheese,no_frills,Processed Cheddar Cheese Slices Thick,3.79,0.0092,reg,0.5
2,white sliced bread,no_frills,White Bread,1.79,0.0027,reg,0.666667
3,ground beef,no_frills,Lean Ground Beef,10.0,0.011,reg,0.666667
4,clementines,no_frills,Clementines,3.99,0.0017,reg,1.0
5,chicken breast,no_frills,Chicken Breast Strips,10.99,0.0137,reg,0.666667
6,potato,no_frills,Naturally Imperfect Potatoes 15lb,6.0,0.0009,reg,0.5
7,shredded cheese,no_frills,Shredded Marble Farmer's Cheese,13.99,0.0155,reg,0.5
8,ribs,no_frills,Caribbean Sweet Potatoes,0.88,0.00284,reg,0.5


In [28]:
valu_mart_results

Unnamed: 0,list_item,store,product_name,price,per_unit_price,source,similarity
0,2% milk,valu_mart,2% Milk,4.99,0.0025,reg,1.0
1,Cheddar Cheese,valu_mart,Processed Cheddar Cheese Slices Thick,4.99,0.0122,reg,0.5
2,white sliced bread,valu_mart,White Bread,2.79,0.0041,reg,0.666667
3,ground beef,valu_mart,Lean Ground Beef,4.98,0.011,reg,0.666667
4,clementines,valu_mart,Clementines,6.99,0.003,reg,1.0
5,chicken breast,valu_mart,"Chicken Breast Skinless Bone In, Club Pack",17.56,0.01254,reg,0.5
6,potato,valu_mart,Russet Potatoes,3.99,0.0009,reg,0.5
7,shredded cheese,valu_mart,Shredded Marble Farmer's Cheese,14.99,0.0167,reg,0.5
8,ribs,valu_mart,"Pork Rib Roast, Boneless",6.96,0.00769,reg,0.5


In [30]:
zehrs_per_unit_subtotal = round(zehrs_results.per_unit_price.sum(),3)
no_frills_per_unit_subtotal = round(no_frills_results.per_unit_price.sum(),3)
valu_mart_per_unit_subtotal = round(valu_mart_results.per_unit_price.sum(),3)

zehrs_subtotal = round(zehrs_results.price.sum(),2)
no_frills_subtotal = round(no_frills_results.price.sum(),2)
valu_mart_subtotal = round(valu_mart_results.price.sum(),2)

lowest_price = min(zehrs_per_unit_subtotal, no_frills_per_unit_subtotal, valu_mart_per_unit_subtotal)

print(lowest_price)
print(f'Zehrs: {zehrs_per_unit_subtotal}, {zehrs_subtotal}')
print(f'No Frills: {no_frills_per_unit_subtotal}, {no_frills_subtotal}')
print(f'Valu Mart: {valu_mart_per_unit_subtotal}, {valu_mart_subtotal}')

0.06
Zehrs: 0.069, 64.89
No Frills: 0.06, 56.42
Valu Mart: 0.071, 68.24
