In [8]:
import pandas as pd
from fuzzywuzzy import fuzz
import time 

In [9]:
# Stores to search

stores = ['zehrs', 'no_frills', 'valu_mart']

all_data = pd.DataFrame()  # starts empty; this cell never adds rows to it


# Read both price files for each store. The two names are rebound on every
# pass, so after the loop only the last store's frames are still referenced.
for store in stores:
    regular_priced, flyer = (
        pd.read_csv(f'clean_data/{store}/regular_prices.csv'),
        pd.read_csv(f'clean_data/{store}/flyer_deals.csv'),
    )
    

In [10]:
# https://newscatcherapi.com/blog/ultimate-guide-to-text-similarity-with-python

def jaccard_similarity(x, y):
    """Return the Jaccard similarity of two token collections.

    The score is ``len(X & Y) / len(X | Y)`` where ``X`` and ``Y`` are the
    sets of distinct items in ``x`` and ``y``. Duplicates and ordering are
    ignored.

    Parameters
    ----------
    x, y : iterable of hashable
        Token collections to compare, e.g. the words of a product name.

    Returns
    -------
    float
        A value between 0.0 (nothing shared) and 1.0 (identical sets).
        Two empty inputs score 0.0: the ratio is undefined there, and
        reporting "no match" avoids a ZeroDivisionError.
    """
    tokens_x, tokens_y = set(x), set(y)
    union = tokens_x | tokens_y
    if not union:
        return 0.0
    return len(tokens_x & tokens_y) / len(union)

In [11]:
# Search one store's regular-price data for the cheapest (per unit) product
# matching each grocery-list item.
start_time = time.time()

store = 'zehrs'

regular_priced = pd.read_csv(f'clean_data/{store}/regular_prices.csv')
flyer = pd.read_csv(f'clean_data/{store}/flyer_deals.csv')


# important to put descriptive items on your list - 2% milk vs milk, cheddar cheese vs cheddar cheese slices
# the more descriptive the more likely you are to get what you're looking for
# even adding Old/Medium to Cheddar cheese makes it more descriptive

# in the data, it is likely that the items are also ordered by relevance (ie: cheddar cheese block vs slices)
# the stable sort below keeps that original order among equally priced matches

grocery_list = ['2% milk', 'Cheddar Cheese', 'white sliced bread', 'ground beef']

# Column order of the result table.
result_columns = ['store', 'product_name', 'price', 'full_product', 'per_unit_price', 'simmilarity']

# Rows are collected in plain lists and turned into a DataFrame once;
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
selected_rows = []

for item in grocery_list:
    item_tokens = item.lower().split(' ')
    candidates = []

    for product_name, full_product, price, per_unit_price in zip(
            regular_priced['product'], regular_priced['product_name'],
            regular_priced['price2'], regular_priced['per_unit_price2']):

        # missing names are read as NaN (a float) and cannot be tokenised
        if not isinstance(product_name, str):
            continue

        # find items
        similarity = jaccard_similarity(item_tokens, product_name.lower().split(' '))

        if similarity >= 0.5:  # can tweak threshold but this is a good one for now
            candidates.append({
                'store': store,
                'product_name': product_name,
                'full_product': full_product,
                'price': price,
                'per_unit_price': per_unit_price,
                'simmilarity': similarity,
            })

    # nothing in this store is similar enough: leave the item out of the result
    if not candidates:
        continue

    # find lowest price from top similarities
    cheapest_item = pd.DataFrame(candidates).sort_values(by='per_unit_price', kind='mergesort')
    selected_rows.append(cheapest_item.iloc[0].to_dict())

final_selection = pd.DataFrame(selected_rows, columns=result_columns)


print(time.time() - start_time, 'seconds')

final_selection

1.1345748901367188 seconds


Unnamed: 0,store,product_name,price,full_product,per_unit_price,simmilarity
0,zehrs,2% Milk,4.99,Neilson2% Milk2 l,0.0025,1.0
1,zehrs,Cheddar Cheese Thin Slices,4.79,Black DiamondCheddar Cheese Thin Slices410 g,0.0117,0.5
2,zehrs,White Bread,2.69,Old MillWhite Bread675 g,0.004,0.666667
3,zehrs,Lean Ground Beef,10.0,Al SafaLean Ground Beef908 g,0.011,0.666667


In [12]:
# Search one store's flyer deals for the cheapest (per unit) product matching
# each grocery-list item.
start_time = time.time()

store = 'zehrs'

regular_priced = pd.read_csv(f'clean_data/{store}/regular_prices.csv')
flyer = pd.read_csv(f'clean_data/{store}/flyer_deals.csv')

grocery_list = ['clementines', 'chicken breast', 'potatos', 'SHREDDED CHEESE']

# Column order of the result table.
result_columns = ['store', 'product_name', 'price', 'per_unit_price', 'simmilarity']

# Rows are collected in plain lists and turned into a DataFrame once;
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
selected_rows = []

for item in grocery_list:
    item_tokens = item.lower().split(' ')
    candidates = []

    for raw_name, price, per_unit_price in zip(
            flyer['product_name'], flyer['price2'], flyer['per_unit_price2']):

        # Rows without a usable name (NaN is a float) are skipped explicitly.
        # This replaces a bare `except` that also hid every other error,
        # such as a missing column, which now raises KeyError.
        if not isinstance(raw_name, str):
            continue
        product_name = raw_name.replace(',', '')

        # find items
        similarity = jaccard_similarity(item_tokens, product_name.lower().split(' '))

        if similarity >= 0.3:  # can tweak threshold but this is a good one for now
            candidates.append({
                'store': store,
                'product_name': product_name,
                'price': price,
                'per_unit_price': per_unit_price,
                'simmilarity': similarity,
            })

    # no flyer deal is similar enough: leave the item out of the result
    if not candidates:
        continue

    # find lowest price from top similarities
    cheapest_item = pd.DataFrame(candidates).sort_values(by='per_unit_price', kind='mergesort')
    selected_rows.append(cheapest_item.iloc[0].to_dict())

final_selection = pd.DataFrame(selected_rows, columns=result_columns)


print(time.time() - start_time, 'seconds')

final_selection

0.14462685585021973 seconds


Unnamed: 0,store,product_name,price,per_unit_price,simmilarity
0,zehrs,LARGE CLEMENTINES,3.99,3.99,0.5
1,zehrs,ZERTO SHREDDED CHEESE 140 g,7.49,0.0535,0.4


In [112]:
# Sample row being checked:
# Yellow Potato 10lb Bag,Farmer's MarketYellow Potato 10lb Bag10 lb bag
# Show both sides of the comparison, then the score for the single-word item.
print('potato'.lower().split(' '))
print('Yellow Potato 10lb Bag'.lower())
jaccard_similarity('potato'.lower().split(' '), 'Yellow Potato 10lb Bag'.lower().split(' '))



['potato']
yellow potato 10lb bag


0.25

In [120]:
! python -m pip install --upgrade pip

Collecting pip
  Downloading pip-22.3.1-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 4.4 MB/s eta 0:00:01
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 20.0.2
    Uninstalling pip-20.0.2:
      Successfully uninstalled pip-20.0.2
Successfully installed pip-22.3.1


In [113]:
# Experiment: run NLTK's Porter stemmer on the word 'potatos'.
# NOTE: nltk has to be installed in the kernel's environment; the recorded run
# of this cell stopped with ModuleNotFoundError.
# NOTE(review): presumably meant to let 'potatos' match 'potato' - confirm.
from nltk.stem import PorterStemmer

ps = PorterStemmer()
ps.stem('potatos')

ModuleNotFoundError: No module named 'nltk'

## Pulling it all together

In [96]:
# For every store, pick the cheapest (per unit) product for each grocery-list
# item, searching both the regular-price data and the flyer deals.
start_time = time.time()

stores = ['zehrs', 'no_frills', 'valu_mart']

zehrs_results = pd.DataFrame()
no_frills_results = pd.DataFrame()
valu_mart_results = pd.DataFrame()

grocery_list = ['2% milk', 'Cheddar Cheese', 'white sliced bread', 'ground beef',
                'clementines', 'chicken breast', 'potato', 'shredded cheese', 'ribs']

# Column order of each per-store result table.
result_columns = ['list_item', 'store', 'product_name', 'price', 'per_unit_price', 'source', 'similarity']


def _find_candidates(item, store, names, prices, per_unit_prices, threshold, source, strip_commas=False):
    """Return one record per product whose name is similar enough to ``item``.

    Parameters
    ----------
    item : str
        Grocery-list entry being searched for.
    store : str
        Store label copied into every record.
    names, prices, per_unit_prices : iterable
        Parallel product names, prices and per-unit prices.
    threshold : float
        Minimum Jaccard similarity (inclusive) for a product to be kept.
    source : str
        Label copied into every record ('reg' or 'flyer').
    strip_commas : bool, default False
        Remove commas from the product name before comparing and storing it.

    Returns
    -------
    list of dict
        Records keyed by list_item, store, product_name, price,
        per_unit_price, similarity and source. Rows whose name is not a
        string (e.g. NaN) are skipped.
    """
    item_tokens = item.lower().split(' ')
    candidates = []
    for product_name, price, per_unit_price in zip(names, prices, per_unit_prices):
        # missing names are read as NaN (a float) and cannot be tokenised
        if not isinstance(product_name, str):
            continue
        if strip_commas:
            product_name = product_name.replace(',', '')

        similarity = jaccard_similarity(item_tokens, product_name.lower().split(' '))
        if similarity >= threshold:
            candidates.append({
                'list_item': item,
                'store': store,
                'product_name': product_name,
                'price': price,
                'per_unit_price': per_unit_price,
                'similarity': similarity,
                'source': source,
            })
    return candidates


results_by_store = {}

for store in stores:
    regular_priced = pd.read_csv(f'clean_data/{store}/regular_prices.csv')
    flyer = pd.read_csv(f'clean_data/{store}/flyer_deals.csv')

    # Rows are collected in a list and turned into a DataFrame once per store;
    # DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
    selected_rows = []

    for item in grocery_list:
        ##### search regular price data #####
        # can tweak threshold but 0.5 is a good one for now
        candidates = _find_candidates(
            item, store,
            regular_priced['product'], regular_priced['price2'], regular_priced['per_unit_price2'],
            threshold=0.5, source='reg',
        )

        ##### search flyer data #####
        # flyer names are matched with a lower threshold and without commas
        candidates += _find_candidates(
            item, store,
            flyer['product_name'], flyer['price2'], flyer['per_unit_price2'],
            threshold=0.2, source='flyer', strip_commas=True,
        )

        # Nothing similar enough in either source: leave the item out.
        # This explicit check replaces a bare `except` that hid every error.
        if not candidates:
            continue

        # find lowest price from top similarities

        # ************** need to decide between per unit pricing and total price difference
        # ************** maybe some units need per unit some dont
        cheapest_item = pd.DataFrame(candidates).sort_values(
            by=['per_unit_price', 'similarity'], ascending=[True, False])

        selected_rows.append(cheapest_item.iloc[0].to_dict())

    final_selection = pd.DataFrame(selected_rows, columns=result_columns)
    results_by_store[store] = final_selection

# Expose the per-store tables under the names the later cells use.
zehrs_results = results_by_store.get('zehrs', zehrs_results)
no_frills_results = results_by_store.get('no_frills', no_frills_results)
valu_mart_results = results_by_store.get('valu_mart', valu_mart_results)

print(time.time() - start_time, 'seconds')

4.019967079162598 seconds


In [97]:
# appears that loblaw sites show sale price regardless 

In [98]:
# Selected product per grocery-list item for Zehrs
zehrs_results

Unnamed: 0,list_item,store,product_name,price,per_unit_price,source,similarity
0,2% milk,zehrs,2% Milk,4.99,0.0025,reg,1.0
1,Cheddar Cheese,zehrs,Cheddar Cheese Thin Slices,4.79,0.0117,reg,0.5
2,white sliced bread,zehrs,White Bread,2.69,0.004,reg,0.666667
3,ground beef,zehrs,Lean Ground Beef,10.0,0.011,reg,0.666667
4,clementines,zehrs,Organic Clementines,9.99,0.011,reg,0.5
5,chicken breast,zehrs,"Chicken Breast Fillets, Boneless",8.72,0.01982,reg,0.5
6,shredded cheese,zehrs,Pizza Mozzarella Shredded Cheese,4.79,0.015,reg,0.5
7,ribs,zehrs,PORK BACK RIBS 2'S,2.99,0.006592,flyer,0.25


In [93]:
# Selected product per grocery-list item for No Frills
no_frills_results

Unnamed: 0,list_item,store,product_name,price,per_unit_price,source,similarity
0,2% milk,no_frills,Partly Skimmed Milk 2% MF,5.69,0.0014,reg,0.4
1,Cheddar Cheese,no_frills,Cheese Buns,2.99,0.0085,reg,0.333333
2,white sliced bread,no_frills,White Bread,1.79,0.0027,reg,0.666667
3,ground beef,no_frills,Lean Ground Beef,10.0,0.011,reg,0.666667
4,clementines,no_frills,Clementines,3.99,0.0017,reg,1.0
5,chicken breast,no_frills,Chicken Strips,10.0,0.0067,reg,0.333333
6,shredded cheese,no_frills,Cheese Buns,2.99,0.0085,reg,0.333333
7,ribs,no_frills,Pork Back Ribs,19.81,0.01585,reg,0.333333


In [94]:
# Selected product per grocery-list item for Valu Mart
valu_mart_results

Unnamed: 0,list_item,store,product_name,price,per_unit_price,source,similarity
0,2% milk,valu_mart,Skim Milk,5.99,0.0015,reg,0.333333
1,Cheddar Cheese,valu_mart,Cheddar Flavour Processed Cheese Product Slices,4.29,0.0107,reg,0.333333
2,white sliced bread,valu_mart,White Bread Greek Pitas,1.59,0.0037,reg,0.4
3,ground beef,valu_mart,"Beef Ground Medium, Club Pack",11.26,0.0088,reg,0.4
4,clementines,valu_mart,Clementines,6.99,0.003,reg,1.0
5,chicken breast,valu_mart,Chicken Wiener,3.39,0.0075,reg,0.333333
6,shredded cheese,valu_mart,Cottage Cheese,3.29,0.0132,reg,0.333333
7,ribs,valu_mart,BBQ Pork Ribs,15.99,0.0267,reg,0.333333


In [95]:
# Basket totals for each store: the summed per-unit prices (3 decimals) and
# the summed shelf prices (2 decimals) of the selected products.
zehrs_per_unit_subtotal = round(zehrs_results.per_unit_price.sum(), 3)
zehrs_subtotal = round(zehrs_results.price.sum(), 2)

no_frills_per_unit_subtotal = round(no_frills_results.per_unit_price.sum(), 3)
no_frills_subtotal = round(no_frills_results.price.sum(), 2)

valu_mart_per_unit_subtotal = round(valu_mart_results.per_unit_price.sum(), 3)
valu_mart_subtotal = round(valu_mart_results.price.sum(), 2)

# The comparison between stores is made on the per-unit totals only.
lowest_price = min((
    zehrs_per_unit_subtotal,
    no_frills_per_unit_subtotal,
    valu_mart_per_unit_subtotal,
))

print(lowest_price)
print(f'Zehrs: {zehrs_per_unit_subtotal}, {zehrs_subtotal}')
print(f'No Frills: {no_frills_per_unit_subtotal}, {no_frills_subtotal}')
print(f'Valu Mart: {valu_mart_per_unit_subtotal}, {valu_mart_subtotal}')

0.056
Zehrs: 0.061, 54.44
No Frills: 0.056, 57.26
Valu Mart: 0.075, 52.79
