In [56]:
import pandas as pd
from nltk.stem import PorterStemmer
import time
import spacy
# from sentence_similarity import sentence_similarity
from math import sqrt

In [2]:
ps = PorterStemmer() # stemming for better results 

# Inverted indexes: word of the stemmed product name -> list of row labels whose name contains it.
zehrs_regular_index = {}
zehrs_flyer_index = {}

regular_priced = pd.read_csv('clean_data/zehrs/regular_prices.csv')
flyer = pd.read_csv('clean_data/zehrs/flyer_deals.csv')

# The two price tables only differ in the column that holds the product name.
# consider adding brand into index if we want customers to be brand specific
for frame, name_column, word_index in (
    (regular_priced, 'product', zehrs_regular_index),
    (flyer, 'product_name', zehrs_flyer_index),
):
    for index, product_name in frame[name_column].items():
        # Blank CSV cells are read by pandas as NaN (a float); such rows have no
        # name to index, so skip them explicitly instead of hiding every error
        # behind a bare `except`.
        if not isinstance(product_name, str):
            continue

        # NOTE(review): the stemmer is applied to the whole product name rather than
        # to each word; kept as-is because the search cells stem their queries the same way.
        product = ps.stem(product_name).lower()

        for word in product.split():
            word_index.setdefault(word, []).append(index)

In [3]:
def jaccard_similarity(x,y):
    """Return the Jaccard similarity |x ∩ y| / |x ∪ y| of two token collections.

    Both arguments are converted to sets, so duplicates and order are ignored.

    Parameters:
        x, y: iterables of hashable items (e.g. lists of words).

    Returns:
        float in [0.0, 1.0]. When both inputs are empty the union is empty and
        the ratio is undefined; 0.0 ("no evidence of a match") is returned
        instead of raising ZeroDivisionError.
    """
    first, second = set(x), set(y)
    union_cardinality = len(first | second)
    if union_cardinality == 0:
        return 0.0
    intersection_cardinality = len(first & second)
    return intersection_cardinality/float(union_cardinality)

In [9]:
grocery_list = ['2% milk', 'Cheddar Cheese', 'white sliced bread', 'ground beef', 
                'clementines', 'chicken breast', 'potatos']


zehrs_results = pd.DataFrame()

store = 'zehrs'

# Column order of the result tables.
result_columns = ['list_item', 'store', 'product_name', 'price', 'per_unit_price', 'source', 'similarity']

# One record per grocery item: the cheapest product that is similar enough.
# Records are collected in a list and turned into a DataFrame once, because
# DataFrame.append was removed in pandas 2.0 (and was quadratic in a loop).
selected_rows = []

for item in grocery_list:
    stem_item = ps.stem(item).lower()

    # Gather every row whose indexed name shares at least one word with the query.
    idxs = []
    for word in stem_item.split():
        # A word that no product contains simply contributes no candidates.
        idxs.extend(zehrs_regular_index.get(word, []))
    # A product sharing several words with the query is listed once per word;
    # keep the first occurrence only so it is scored once.
    idxs = list(dict.fromkeys(idxs))

    df = regular_priced.iloc[idxs]

    matches = []
    for index, row in df.iterrows():
        product_name = row['product']
        per_unit_price = row['per_unit_price2']
        price = row['price2']

        similarity = jaccard_similarity(stem_item.split(' '), ps.stem(product_name).lower().split(' '))

        if similarity >= 0.5: # can tweak threshold but this is a good one for now  
            matches.append({ 'list_item':item, 'store':store, 'product_name':product_name, 'price':price, 'per_unit_price':per_unit_price, 'similarity':similarity, 'source': 'reg' })

    if not matches:
        # Nothing passed the threshold: leave the item out instead of failing on iloc[0].
        continue

    item_selection = pd.DataFrame(matches, columns=result_columns)

    # Cheapest per-unit price first; among equal prices prefer the closer name match.
    cheapest_item = item_selection.sort_values(by=['per_unit_price', 'similarity'], ascending = [True, False])

    selected_rows.append(cheapest_item.iloc[0].to_dict())

final_selection = pd.DataFrame(selected_rows, columns=result_columns)

In [11]:
final_selection

Unnamed: 0,list_item,store,product_name,price,per_unit_price,source,similarity
0,2% milk,zehrs,2% Milk,4.99,0.0025,reg,1.0
1,Cheddar Cheese,zehrs,"Beef Burger, Cheddar Cheese",14.99,0.0133,reg,0.5
2,white sliced bread,zehrs,White Bread,2.69,0.004,reg,0.666667
3,ground beef,zehrs,Lean Ground Beef,10.0,0.011,reg,0.666667
4,clementines,zehrs,Organic Clementines,9.99,0.011,reg,0.5
5,chicken breast,zehrs,"Chicken Breast Fillets, Boneless",8.72,0.01982,reg,0.5
6,potatos,zehrs,Russet Potatoes,2.99,0.0007,reg,0.5


## Putting it all together

In [39]:
# build index for each store 

ps = PorterStemmer() # stemming for better results 

stores = ['zehrs', 'no_frills', 'valu_mart']


# make variables 
# The indexes are published as module-level names "<store>_regular_index" and
# "<store>_flyer_index", which is how the search cell looks them up.
for store in stores:
    globals()[f"{store}_regular_index"] = {} # dynamically create variable names 
    globals()[f"{store}_flyer_index"] = {}

for store in stores:

    regular_priced = pd.read_csv(f'clean_data/{store}/regular_prices.csv')
    flyer = pd.read_csv(f'clean_data/{store}/flyer_deals.csv')

    # Resolve the dynamic names once per store instead of on every word.
    regular_index = globals()[f"{store}_regular_index"]
    flyer_index = globals()[f"{store}_flyer_index"]

    # The two price tables only differ in the column that holds the product name.
    for frame, name_column, word_index in (
        (regular_priced, 'product', regular_index),
        (flyer, 'product_name', flyer_index),
    ):
        for index, product_name in frame[name_column].items():
            # Non-string names (e.g. NaN from a blank CSV cell) cannot be stemmed;
            # skip them explicitly rather than swallowing every exception.
            if not isinstance(product_name, str):
                continue

            # Each whitespace-separated token of the stemmed, lower-cased name
            # maps to the list of row labels whose name contains it.
            product = ps.stem(product_name).lower()

            for word in product.split():
                word_index.setdefault(word, []).append(index)

In [57]:
def squared_sum(x):
    """Return the square root of the sum of squares of x, rounded to 3 decimals.

    This is the Euclidean length of the vector x; an empty input gives 0.0.
    """
    sum_of_squares = sum(value * value for value in x)
    magnitude = sqrt(sum_of_squares)
    return round(magnitude, 3)


def cos_similarity(x,y):
    """Return the cosine similarity between two equal-length numeric vectors.

    Parameters:
        x, y: sequences of numbers; pairs are formed with zip, so any surplus
            elements of the longer one are ignored in the dot product.

    Returns:
        The cosine of the angle between x and y, rounded to 3 decimals.
        If either vector has zero length the angle is undefined and 0.0 is
        returned instead of dividing by zero.
    """
    numerator = sum(a*b for a,b in zip(x,y))
    # Use the exact vector lengths here: rounding each length to 3 decimals
    # before multiplying distorts the result for small-magnitude vectors
    # (identical vectors could score well above 1).
    denominator = sqrt(sum(a*a for a in x)) * sqrt(sum(b*b for b in y))
    if denominator == 0:
        return 0.0
    return round(numerator/float(denominator),3)


In [62]:
# search and find 

start_time = time.time()

# The spaCy model is only needed by the embedding-based scorer
# (nlp(text).vector fed to cos_similarity), which is currently disabled in
# favour of Jaccard similarity on the stemmed words.
nlp = spacy.load('en_core_web_sm')

print('loaded sim model in', time.time() - start_time, 'seconds')

ps = PorterStemmer() # stemming for better results 

grocery_list = ['2% milk', 'Cheddar Cheese', 'white sliced bread', 'ground beef', 
                'clementines', 'chicken breast', 'potatoes']


stores = ['zehrs', 'no_frills', 'valu_mart']

# Column order of every result table.
result_columns = ['list_item', 'store', 'product_name', 'price', 'per_unit_price', 'source', 'similarity']
price_columns = ['price2', 'per_unit_price2']
similarity_threshold = 0.5 # can tweak threshold but this is a good one for now


# make variables 
for store in stores:
    globals()[f"{store}_results"] = pd.DataFrame() # dynamically create variable names 


for store in stores:
    regular_priced = pd.read_csv(f'clean_data/{store}/regular_prices.csv')
    flyer = pd.read_csv(f'clean_data/{store}/flyer_deals.csv')

    # (source label, price table, suffix of the index variable, product-name column)
    price_sources = (
        ('reg', regular_priced, 'regular_index', 'product'),
        ('flyer', flyer, 'flyer_index', 'product_name'),
    )

    # Keep only the tables that have the columns read below, and pair each with
    # the word index built for THAT table. (Flyer candidates must be selected
    # from `flyer`, not from `regular_priced`.)
    usable_sources = []
    for source, table, index_suffix, name_column in price_sources:
        missing = [column for column in [name_column] + price_columns if column not in table.columns]
        if missing:
            print(f'{store}: skipping {source} prices, missing columns {missing}')
            continue
        # A store without a prebuilt index simply yields no candidates.
        word_index = globals().get(f"{store}_{index_suffix}", {})
        usable_sources.append((source, table, word_index, name_column))

    # One record per grocery item; the DataFrame is built once at the end
    # because DataFrame.append was removed in pandas 2.0.
    best_rows = []

    for item in grocery_list:
        stem_item = ps.stem(item).lower()
        matches = []

        for source, table, word_index, name_column in usable_sources:
            # Rows whose indexed name shares at least one word with the query.
            positions = []
            for word in stem_item.split():
                positions.extend(word_index.get(word, [])) # list item word not in index -> no candidates
            # A product matching several query words is listed once per word; score it once.
            positions = list(dict.fromkeys(positions))

            for _, row in table.iloc[positions].iterrows():
                product_name = row[name_column]
                if not isinstance(product_name, str):
                    # e.g. NaN from a blank CSV cell: nothing to compare against
                    continue
                if source == 'flyer':
                    product_name = product_name.replace(',', '')

                similarity = jaccard_similarity(stem_item.split(' '), ps.stem(product_name).lower().split(' '))

                if similarity >= similarity_threshold:
                    matches.append({ 'list_item':item, 'store':store, 'product_name':product_name, 'price':row['price2'], 'per_unit_price':row['per_unit_price2'], 'similarity':similarity, 'source': source })

        if not matches:
            # Nothing similar enough at this store: the item is left out of its results.
            continue

        # find lowest price from top similarities

        # ************** need to decide between per unit pricing and total price difference
        # ************** maybe some units need per unit some dont 
        item_selection = pd.DataFrame(matches, columns=result_columns)
        cheapest_item = item_selection.sort_values(by=['per_unit_price', 'similarity'], ascending = [True, False])

        best_rows.append(cheapest_item.iloc[0].to_dict())

    final_selection = pd.DataFrame(best_rows, columns=result_columns)
    globals()[f"{store}_results"] = final_selection

print(time.time() - start_time, 'seconds')

loaded sim model in 0.6743087768554688 seconds
21.887358903884888 seconds


# outputs

In [63]:
zehrs_results

Unnamed: 0,list_item,store,product_name,price,per_unit_price,source,similarity
0,2% milk,zehrs,Partly Skimmed Milk 1% MF,5.69,0.0014,reg,0.576
1,Cheddar Cheese,zehrs,1% M.F. Cottage Cheese,5.99,0.008,reg,0.581
2,white sliced bread,zehrs,White Potatoes,5.49,0.0024,reg,0.602
3,ground beef,zehrs,Beef Marrow Soup Bones,6.31,0.00769,reg,0.705
4,clementines,zehrs,Organic Clementines,9.99,0.011,reg,0.579
5,chicken breast,zehrs,Chicken Hot Dogs,2.49,0.0055,reg,0.597
6,potatoes,zehrs,Russet Potatoes,2.99,0.0007,reg,0.573


In [64]:
no_frills_results

Unnamed: 0,list_item,store,product_name,price,per_unit_price,source,similarity
0,2% milk,no_frills,Partly Skimmed Milk 2% MF,5.69,0.0014,reg,0.591
1,Cheddar Cheese,no_frills,1% M.F. Cottage Cheese,4.59,0.0061,reg,0.581
2,white sliced bread,no_frills,White Potatoes,4.99,0.0022,reg,0.602
3,ground beef,no_frills,"Lean Ground Pork, Club Pack",10.93,0.00945,reg,0.619
4,clementines,no_frills,Clementines,3.99,0.0017,reg,1.0
5,chicken breast,no_frills,"Chicken Leg with Bone, Club Pack",4.16,0.00328,reg,0.653
6,potatoes,no_frills,Sweet Potato,1.37,0.00218,reg,0.567


In [50]:
valu_mart_results

Unnamed: 0,list_item,store,product_name,price,per_unit_price,source,similarity
0,2% milk,valu_mart,2% Milk,4.99,0.0025,reg,1.0
1,Cheddar Cheese,valu_mart,Old Cheddar Cheese,9.79,0.014,reg,0.666667
2,white sliced bread,valu_mart,White Bread,2.79,0.0041,reg,0.666667
3,ground beef,valu_mart,Lean Ground Beef,4.98,0.011,reg,0.666667
4,clementines,valu_mart,Clementines,6.99,0.003,reg,1.0
5,potatos,valu_mart,Russet Potatoes,3.99,0.0009,reg,0.5


In [44]:
# Sum of the selected products' per-unit prices for each store, rounded to 3 decimals.
zehrs_per_unit_subtotal, no_frills_per_unit_subtotal, valu_mart_per_unit_subtotal = (
    round(results.per_unit_price.sum(), 3)
    for results in (zehrs_results, no_frills_results, valu_mart_results)
)

# Sum of the selected products' shelf prices for each store, rounded to 2 decimals.
zehrs_subtotal, no_frills_subtotal, valu_mart_subtotal = (
    round(results.price.sum(), 2)
    for results in (zehrs_results, no_frills_results, valu_mart_results)
)

# Smallest per-unit subtotal across the three stores.
lowest_price = min([
    zehrs_per_unit_subtotal,
    no_frills_per_unit_subtotal,
    valu_mart_per_unit_subtotal,
])

# Report the minimum, then "<per-unit subtotal>, <price subtotal>" for each store.
print(lowest_price)
print(f'Zehrs: {zehrs_per_unit_subtotal}, {zehrs_subtotal}')
print(f'No Frills: {no_frills_per_unit_subtotal}, {no_frills_subtotal}')
print(f'Valu Mart: {valu_mart_per_unit_subtotal}, {valu_mart_subtotal}')

0.035
Zehrs: 0.062, 54.37
No Frills: 0.046, 38.12
Valu Mart: 0.035, 33.53
