In [1]:
import pandas as pd
from nltk.stem import PorterStemmer
import time
import sys
import ast
from pickle import load
import json 

ps = PorterStemmer() # stemming for better results 

In [2]:
def jaccard_similarity(x,y):
    intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
    union_cardinality = len(set.union(*[set(x), set(y)]))
    return intersection_cardinality/float(union_cardinality)

In [6]:
def search(grocery_list, ps):
    stores = ['zehrs', 'no_frills', 'valu_mart', 'sobeys', 'freshco'] #, 'walmart', 'food_basics']

    # make variables 
    for store in stores:
        globals()[f"{store}_results"] = pd.DataFrame() # dynamically create variable names 

        
    for store in stores: # retrieval for each store 
        
        # load data
        store_data = pd.read_csv(f'clean_data/{store}/{store}_data.csv')

        # load index
        globals()[f"{store}_index"] = load(open(f"catalogue_index/{store}_index.pkl",'rb'))
        
        final_selection = pd.DataFrame(columns = ['list_item', 'store', 'product_name', 'price', 'per_unit_price', 'is_sale'])
        
        for item in grocery_list:
            item_selection = pd.DataFrame(columns = ['list_item', 'store', 'product_name', 'price', 'per_unit_price', 'is_sale'])

            stem_item = ps.stem(item).lower()
            idxs = []

            for word in stem_item.split():
                try: # list item word not in index
                    word_idxs = globals()[f"{store}_index"][word]
                    idxs.extend(word_idxs)
                except: continue

            if not idxs: # no indicies returned
                store_df = pd.DataFrame() # no results: return empty df
            else:
                store_df = store_data.iloc[idxs]


            ##### search items #####
            for index, row in store_df.iterrows():
                
                product_name = row['product']
                is_sale = row.is_sale

                if is_sale:
                    price = row.sale_price
                    per_unit_price = row.sale_per_unit_price
                else:
                    price = row.price
                    per_unit_price = row.per_unit_price

                similarity = jaccard_similarity(stem_item.split(' '), ps.stem(product_name).lower().split(' '))
        
                if similarity >= 0.5: # can tweak threshold but this is a good one for now  
                    data = { 'list_item':item, 'store':store, 'product_name':product_name, 'price':price, 'per_unit_price':per_unit_price, 'similarity':similarity, 'is_sale': is_sale}
                    item_selection = item_selection.append(data, ignore_index=True)    
           
            try:
                # find lowest price from top similarities
                cheapest_item = item_selection.sort_values(by=['per_unit_price', 'similarity'], ascending = [True, False])

                final_selection = final_selection.append(dict(cheapest_item.iloc[0]), ignore_index=True)
            except: continue
                
        globals()[f"{store}_results"] = final_selection
    
        # dump results to csv 
        globals()[f"{store}_results"].to_csv(f'search_output/{store}_results.csv')
    
    return {'zehrs': zehrs_results
        , 'no_frills': no_frills_results
        , 'valu_mart': valu_mart_results
        # , 'walmart': walmart_results
        , 'sobeys': sobeys_results
        , 'freshco': freshco_results}
        # , 'food_basics': food_basics_results}

In [7]:
def find_n_cheapest_stores(n, results):
    per_unit_subtotals = {}
    subtotals = {}
    cheapest_stores = {}

    for store in results.keys():
        result = results[store]
        per_unit_subtotal = round(result.per_unit_price.sum(),3)
        subtotal = round(result.price.sum(),3)

        per_unit_subtotals[store] = per_unit_subtotal
        subtotals[store] = subtotal

    for i in range(n):
        min_store = min(per_unit_subtotals, key=per_unit_subtotals.get)
        
        cost = subtotals[min_store]

        cheapest_stores[i+1] = {'store':store
                            , 'file_name': f'{store}_results.csv'
                            , 'subtotal': cost}

        remove_key = per_unit_subtotals.pop(min_store)
    
    return cheapest_stores

In [8]:
grocery_list = ['2% milk', 'Cheddar Cheese', 'white sliced bread', 'ground beef', 'clementines', 'chicken breast', 'potatoes']

results_dict = search(grocery_list, ps)

In [9]:
results_dict

{'zehrs':             list_item  store           product_name  price  per_unit_price  \
 0             2% milk  zehrs                2% Milk   4.99            0.25   
 1      Cheddar Cheese  zehrs  Medium Cheddar Cheese   9.79            1.40   
 2  white sliced bread  zehrs            White Bread   2.69            0.40   
 3         ground beef  zehrs       Lean Ground Beef  10.00            1.10   
 4         clementines  zehrs    Organic Clementines   9.99            1.10   
 5      chicken breast  zehrs  Chicken Breast Strips  13.99            1.75   
 6            potatoes  zehrs           Red Potatoes   5.99            0.26   
 
   is_sale  similarity  
 0   False    1.000000  
 1   False    0.666667  
 2   False    0.666667  
 3   False    0.666667  
 4   False    0.500000  
 5   False    0.666667  
 6   False    0.500000  ,
 'no_frills':             list_item      store                 product_name  price  \
 0             2% milk  no_frills          True Taste, 2% Milk   5.49 

# 2 store combination 

In [82]:
from itertools import combinations

In [83]:
list(combinations(results_dict.keys(), 2))

[('zehrs', 'no_frills'),
 ('zehrs', 'valu_mart'),
 ('zehrs', 'sobeys'),
 ('zehrs', 'freshco'),
 ('no_frills', 'valu_mart'),
 ('no_frills', 'sobeys'),
 ('no_frills', 'freshco'),
 ('valu_mart', 'sobeys'),
 ('valu_mart', 'freshco'),
 ('sobeys', 'freshco')]

In [84]:
out = pd.data_frame()

possibilities = list(combinations(results_dict.keys(), 2))
for combs in possibilities:
    for item in grocery_list: 
        df1 = pd.read_csv('search_output/valu_mart_results.csv')
        df2 = pd.read_csv('search_output/freshco_results.csv')

        

In [87]:
def choose_lowest(df1, df2, column_name):
    # Create a new DataFrame with the minimum value of the column for each row
    df = pd.DataFrame(columns=[column_name])
    df['list_item'] = df1.list_item
    df[column_name] = df1[column_name].combine(df2[column_name], min)
    return df

df1 = pd.read_csv('search_output/valu_mart_results.csv')
df2 = pd.read_csv('search_output/freshco_results.csv')

choose_lowest(df1, df2, 'comparable_price')

Unnamed: 0,comparable_price,list_item
0,0.309254,2% milk
1,1.4,Cheddar Cheese
2,0.50507,white sliced bread
3,1.392842,ground beef
4,0.39,clementines


In [74]:
df1[['list_item', 'comparable_price']]

Unnamed: 0,list_item,comparable_price
0,2% milk,0.35
1,Cheddar Cheese,1.4
2,white sliced bread,0.56
3,ground beef,15.41
4,clementines,0.39


In [75]:
df2[['list_item', 'comparable_price']]

Unnamed: 0,list_item,comparable_price
0,2% milk,0.309254
1,Cheddar Cheese,1.955578
2,white sliced bread,0.50507
3,ground beef,1.392842
4,chicken breast,1.741052


In [None]:
import itertools

def lowest_price_multiple_df(df_list, column_name):
    results = []
    for df1, df2 in itertools.combinations(df_list, 2):
        result = lowest_price(df1, df2, column_name)
        results.append(result)
    return results

df1 = pd.DataFrame({'per_unit_price': [1, 2, 3, 4]})
df2 = pd.DataFrame({'per_unit_price': [4, 3, 2, 1]})
df3 = pd.DataFrame({'per_unit_price': [2, 3, 4, 5]})
df_list = [df1, df2, df3]
result_df = lowest_price_multiple_df(df_list, 'per_unit_price')


In [89]:
df2.head(1)

Unnamed: 0,store,category,brand,product,price,sale_price,per_unit_price,sale_per_unit_price,units,is_sale,list_item,similarity,comparable_price
0,freshco,dairy_and_eggs,Neilson,2% Milk,3.083447,,0.309254,,100ml,0.0,2% milk,100.0,0.309254


In [99]:
df1 = pd.read_csv('search_output/valu_mart_results.csv')
df2 = pd.read_csv('search_output/freshco_results.csv')

df = df1.append(df2, ignore_index=True)

dfc = df.groupby('list_item')['comparable_price']

df = df.assign(min_cost=dfc.transform(min))

df = df[df['comparable_price'] == df['min_cost']]

df

Unnamed: 0.1,Unnamed: 0,store,category,brand,product,price,sale_price,per_unit_price,sale_per_unit_price,units,is_sale,list_item,similarity,comparable_price,min_cost
1,24.0,valu_mart,dairy_and_eggs,No Name,Old Cheddar Cheese,9.79,,1.4,,100g,0.0,Cheddar Cheese,66.666667,1.4,1.4
4,1432.0,valu_mart,produce,,Clementines,6.99,,0.39,,100g,0.0,clementines,100.0,0.39,0.39
5,,freshco,dairy_and_eggs,Neilson,2% Milk,3.083447,,0.309254,,100ml,0.0,2% milk,100.0,0.309254,0.309254
7,,freshco,bakery,Wonder,"Bread, Sliced White",3.418239,,0.50507,,100g,0.0,white sliced bread,75.0,0.50507,0.50507
8,,freshco,meat,,Lean Ground Beef,6.96421,,1.392842,,100g,0.0,ground beef,66.666667,1.392842,1.392842
9,,freshco,meat,Zabiha Halal,Chicken Breast Strips,13.918471,,1.741052,,100g,0.0,chicken breast,66.666667,1.741052,1.741052


In [37]:
from nltk.metrics.distance import jaccard_distance

jaccard_distance(set('Natrel 2% milk 4 L'.lower().split(' ')), set('2% milk'.lower().split(' ')))


0.6

In [21]:
set('Natrel 2% milk 4 L'.lower().split(' '))

{'2%', '4', 'l', 'milk', 'natrel'}

In [35]:
def jaccard_similarity(x,y):
    intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
    union_cardinality = len(set.union(*[set(x), set(y)]))
    return intersection_cardinality/float(union_cardinality)

jaccard_similarity('Natrel 2% milk 4 L'.lower().split(' '), '2% milk'.lower().split(' '))

0.4

In [24]:
! pip install fuzzywuzzy

Collecting fuzzywuzzy
  Using cached fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [41]:
def jaccard_similarity(string1, string2):
    similarity = jaccard_distance(set(string1.lower()), set(string2.lower()))
    return int(similarity * 100)

jaccard_similarity('natrel 2% milk 4 L', 'milk 2%')

46

In [62]:
from fuzzywuzzy import process, fuzz
# import fuzzywuzzy as fuzz
import pandas as pd
from nltk.stem import PorterStemmer
ps = PorterStemmer()

df = pd.read_csv('clean_data/zehrs/zehrs_data.csv')

def jaccard_similarity(string1, string2):
    return (1 - jaccard_distance(set(string1.lower().split(' ')), set(string2.lower().split(' ')))) * 100


# Define the string to compare against
query = "2% milk"

# Use the process.extract() function to find the most similar items
results = process.extract(query, df['product'], scorer=jaccard_similarity, limit=30)

idxs = []
for item in results: 
    if item[1] >= 52: 
        idxs.append(item[2])

df = df.iloc[idxs]

import numpy as np 

df["comparable_price"] = np.where(df["is_sale"] == True, df["sale_price"], df["price"])

df


Unnamed: 0.1,Unnamed: 0,store,category,brand,product,price,sale_price,per_unit_price,sale_per_unit_price,units,is_sale,comparable_price
283,283,zehrs,dairy_and_eggs,Neilson,2% Milk,3.39,,0.34,,100ml,False,3.39
347,347,zehrs,dairy_and_eggs,Neilson,2% Milk,2.09,,0.88,,100ml,False,2.09
394,394,zehrs,dairy_and_eggs,Neilson,2% Milk,4.99,,0.25,,100ml,False,4.99
474,474,zehrs,dairy_and_eggs,Neilson,2% Milk,2.59,,0.55,,100ml,False,2.59
258,258,zehrs,dairy_and_eggs,Neilson,TruTaste 2% Milk,4.09,,0.41,,100ml,False,4.09
553,553,zehrs,dairy_and_eggs,PC Organics,Organic 2% Milk,4.59,,0.46,,100ml,False,4.59
68,68,zehrs,dairy_and_eggs,Rolling Meadow,Grass Fed 2% Milk,6.29,,0.31,,100ml,False,6.29
934,934,zehrs,dairy_and_eggs,Neilson,"True Taste, 2% Milk",4.99,,0.25,,100ml,False,4.99


In [51]:
results

[('2% Milk', 100.0, 283),
 ('2% Milk', 100.0, 347),
 ('2% Milk', 100.0, 394),
 ('2% Milk', 100.0, 474),
 ('TruTaste 2% Milk', 75.0, 258),
 ('Organic 2% Milk', 75.0, 553),
 ('Grass Fed 2% Milk', 60.0, 68),
 ('True Taste, 2% Milk', 60.0, 934),
 ('TruTaste Lactose Free 2% Milk', 50.0, 282),
 ('Organic Partly Skimmed 2% Milk', 50.0, 321),
 ('Organic Partly Skimmed 2% Milk', 50.0, 322),
 ('1% Milk', 50.0, 473),
 ('Trutaste Lactose Free 2% Milk', 50.0, 569),
 ('Trutaste Lactose Free 2% Milk', 50.0, 612),
 ('2% Lactose Free Enriched Milk', 50.0, 662),
 ('2% Lactose Free Chocolate Enriched Milk', 42.85714285714286, 245),
 ('2% M.F. Partly Skimmed Milk', 42.85714285714286, 953),
 ('Flavoured Milk, Chocolate', 40.0, 92),
 ('TruTaste 1% Milk', 40.0, 259),
 ('Flavoured Milk, Strawberry', 40.0, 351),
 ('Yogurt, Plain 2%', 40.0, 385),
 ('Flavoured Milk, Chocolate', 40.0, 475),
 ('TruTaste 1% Milk', 40.0, 614),
 ('TruTaste 1% Milk', 40.0, 708),
 ('Flavoured Milk, Chocolate', 40.0, 859),
 ('Milk, Ultr

In [107]:
from itertools import combinations

def item_selection(df1, df2):
    df = df1.append(df2, ignore_index=True)
    dfc = df.groupby('list_item')['comparable_price']
    df = df.assign(min_cost=dfc.transform(min))
    df = df[df['comparable_price'] == df['min_cost']]

    per_unit_subtotal = sum(df['comparable_price'])

    return df, per_unit_subtotal

results = {}

def n_store_selection(n, results_dict):
    # if n = 1:

    possibilities = list(combinations(results_dict.keys(), n))
    for combin in possibilities:
        df1 = pd.read_csv(f'search_output/{combin[0]}_results.csv')
        df2 = pd.read_csv(f'search_output/{combin[1]}_results.csv')

        optimal_selection, per_unit_subtotal = item_selection(df1, df2)

        results[combin] = per_unit_subtotal

    return results

n_store_selection(3, results_dict)

{('zehrs', 'no_frills', 'valu_mart'): 4.36,
 ('zehrs', 'no_frills', 'sobeys'): 6.603324443831867,
 ('zehrs', 'no_frills', 'freshco'): 6.0631486721469505,
 ('zehrs', 'valu_mart', 'sobeys'): 6.695622226465402,
 ('zehrs', 'valu_mart', 'freshco'): 5.738218348093484,
 ('zehrs', 'sobeys', 'freshco'): 5.9037967803848135,
 ('no_frills', 'valu_mart', 'sobeys'): 6.695622226465402,
 ('no_frills', 'valu_mart', 'freshco'): 5.738218348093484,
 ('no_frills', 'sobeys', 'freshco'): 5.9037967803848135,
 ('valu_mart', 'sobeys', 'freshco'): 5.9037967803848135}

In [112]:
d = [1,2,3]
print(d[0])
d.remove(d[0])
d

1


[2, 3]

In [117]:
from itertools import combinations

def item_selection(dfs):
    df = dfs[0]
    dfs.remove(dfs[0])
    for df_n in dfs:
        df = df.append(df_n, ignore_index=True)
    # df = df1.append(df2, ignore_index=True)
    df_grp = df.groupby('list_item')['comparable_price']
    df = df.assign(min_cost=df_grp.transform(min))
    df = df[df['comparable_price'] == df['min_cost']]

    per_unit_subtotal = sum(df['comparable_price'])

    return df, per_unit_subtotal

results = {}

def n_store_selection(n, results_dict):
    # if n = 1:

    possibilities = list(combinations(results_dict.keys(), n))
    for combin in possibilities:
        dfs = []
        for store in combin:
            df = pd.read_csv(f'search_output/{store}_results.csv')
            dfs.append(df)

        optimal_selection, per_unit_subtotal = item_selection(dfs)

        results[combin] = per_unit_subtotal

    return results

n_store_selection(1, results_dict)

{('zehrs',): 6.199999999999999,
 ('no_frills',): 4.53,
 ('valu_mart',): 18.11,
 ('sobeys',): 7.2838562670982165,
 ('freshco',): 5.9037967803848135}

In [118]:
len(results_dict)

5

In [2]:
import pandas as pd

In [9]:
df = pd.read_csv('search_output/zehrs_results.csv')
df = df.drop(columns=['Unnamed: 0'])
df

Unnamed: 0,store,category,brand,product,price,sale_price,per_unit_price,sale_per_unit_price,units,is_sale,list_item,similarity,comparable_PUP,comparable_price
0,zehrs,dairy_and_eggs,Neilson,2% Milk,3.39,,0.34,,100ml,0.0,2% milk,100.0,0.34,3.39
1,zehrs,dairy_and_eggs,No Name,Medium Cheddar Cheese,4.29,,2.15,,100g,0.0,Cheddar Cheese,66.666667,2.15,4.29
2,zehrs,bakery,Wonder,"Bread, Sliced White",3.79,,0.56,,100g,0.0,white sliced bread,75.0,0.56,3.79


In [13]:
df.to_dict('list')

{'store': ['zehrs', 'zehrs', 'zehrs'],
 'category': ['dairy_and_eggs', 'dairy_and_eggs', 'bakery'],
 'brand': ['Neilson', 'No Name', 'Wonder'],
 'product': ['2% Milk', 'Medium Cheddar Cheese', 'Bread, Sliced White'],
 'price': [3.39, 4.29, 3.79],
 'sale_price': [nan, nan, nan],
 'per_unit_price': [0.34, 2.15, 0.56],
 'sale_per_unit_price': [nan, nan, nan],
 'units': ['100ml', '100g', '100g'],
 'is_sale': [0.0, 0.0, 0.0],
 'list_item': ['2% milk', 'Cheddar Cheese', 'white sliced bread'],
 'similarity': [100.0, 66.66666666666667, 75.0],
 'comparable_PUP': [0.34, 2.15, 0.56],
 'comparable_price': [3.39, 4.29, 3.79]}

In [10]:
import pandas as pd

stores = ['walmart', 'freshco', 'sobeys', 'food_basics']

store = 'freshco'
# for store in stores:
flipp = pd.read_csv(f'clean_data/{store}/flyer_deals.csv')
synth = pd.read_csv(f'clean_data/{store}/synthetic_data.csv')

all_data = flipp.append(synth)


all_data.head()
# all_data.to_csv(f'clean_data/{store}/{store}_data.csv', index=False)

In [None]:
all_data.tail()

In [7]:
from itertools import combinations, chain

# list(itertools.chain.from_iterable(itertools.combinations(lst, r) for r in range(1, len(lst)+1)))

stores = ['freshco', 'no_frills', 'zehrs', 'sobeys']
n = 2

list(chain.from_iterable(combinations(stores, r) for r in range(1, n+1)))

[('freshco',),
 ('no_frills',),
 ('zehrs',),
 ('sobeys',),
 ('freshco', 'no_frills'),
 ('freshco', 'zehrs'),
 ('freshco', 'sobeys'),
 ('no_frills', 'zehrs'),
 ('no_frills', 'sobeys'),
 ('zehrs', 'sobeys')]

In [18]:
import re 

brands = ['Neilson', 'IOGO', 'No Name']

strs = ['NeilsonPartly Skimmed Milk 2% MF4 l'
        , 'IOGONano Drinkable Yogurt, Strawberry6x93.0 ml'
        , 'No NameShredded Nacho Cheese Blend320 g']

for i in range(len(strs)):
    # add space after brand 
    s = strs[i].replace(brands[i], f'{brands[i]} ')
    
    # search for all numbers with letter directly in front 
    find_num = re.search('(?<=\w)\d', s)
    # index of that pattern match 
    idx = find_num.start()
    # split on the pattern index 
    s = s[:idx] + ' ' + s[idx:]
    print(s)


    

Neilson Partly Skimmed Milk 2% MF 4 l
IOGO Nano Drinkable Yogurt, Strawberry 6x93.0 ml
No Name Shredded Nacho Cheese Blend 320 g


In [27]:
import pandas as pd

study = pd.read_csv('empirical study.csv')
study.head()

Unnamed: 0,Item,Sobeys,FreshCo,Food Basics,Walmart,zehrs (found online),sobeys % difference,freshco % difference,food_basics % difference,walmart % difference,department
0,bananas,0.69,0.59,0.59,0.58,0.69,1.0,0.855072,0.855072,0.84058,produce
1,ambrosia apple,2.99,2.49,2.48,1.97,2.99,1.0,0.832776,0.829431,0.658863,produce
2,broccoli,3.99,2.49,2.88,2.97,2.99,1.334448,0.832776,0.963211,0.993311,produce
3,regular tomatoes,2.49,1.99,3.48,2.97,2.49,1.0,0.799197,1.39759,1.192771,produce
4,romaine lettuce (single),1.99,1.79,1.99,2.33,3.49,0.570201,0.512894,0.570201,0.667622,produce


In [47]:
# scipy.stats.gmean(array, axis=0, dtype=None)
from scipy import stats

departments  = study.department.unique()

stores = ['freshco', 'sobeys', 'food_basics', 'walmart']

factors = pd.DataFrame()

for department in departments:
    df = study[study['department'] == department]

    for store in stores: 
        geo_mean = stats.gmean(df[f'{store} % difference'], axis=0, dtype=None)
        # print(store, department, geo_mean)
        if department == 'meat + seafood':
            factors = factors.append({'store':store, 'department':'meat', 'geo_mean':geo_mean}, ignore_index=True)
            factors = factors.append({'store':store, 'department':'seafood', 'geo_mean':geo_mean}, ignore_index=True)
        elif department == 'home goods':
            factors = factors.append({'store':store, 'department':'household_items', 'geo_mean':geo_mean}, ignore_index=True)
        elif department == 'dairy':
            factors = factors.append({'store':store, 'department':'dairy_and_eggs', 'geo_mean':geo_mean}, ignore_index=True)
        elif department == 'dry goods':
            factors = factors.append({'store':store, 'department':'pantry', 'geo_mean':geo_mean}, ignore_index=True)
            factors = factors.append({'store':store, 'department':'snacks', 'geo_mean':geo_mean}, ignore_index=True)
            factors = factors.append({'store':store, 'department':'drinks', 'geo_mean':geo_mean}, ignore_index=True)
            factors = factors.append({'store':store, 'department':'frozen', 'geo_mean':geo_mean}, ignore_index=True)
        else:
            factors = factors.append({'store':store, 'department':department, 'geo_mean':geo_mean}, ignore_index=True)



  log_a = np.log(np.array(a, dtype=dtype))


In [48]:
factors

Unnamed: 0,department,geo_mean,store
0,produce,0.753613,freshco
1,produce,0.946817,sobeys
2,produce,0.885488,food_basics
3,produce,0.847831,walmart
4,meat,0.904069,freshco
5,seafood,0.904069,freshco
6,meat,1.066879,sobeys
7,seafood,1.066879,sobeys
8,meat,1.048851,food_basics
9,seafood,1.048851,food_basics


In [43]:
d = factors[factors['department'] == 'meat + seafood']
d['department'] = ['meat']*len(d)

d = d.append({'department':['seafood']*len(d), 'store':d.store, 'geo_mean':d.geo_mean}, ignore_index=True)
d


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['department'] = ['meat']*len(d)


Unnamed: 0,department,geo_mean,store
0,meat,0.904069,freshco
1,meat,1.06688,sobeys
2,meat,1.04885,food_basics
3,meat,0.99681,walmart
4,"[seafood, seafood, seafood, seafood]",4 0.904069 5 1.066879 6 1.048851 7 ...,4 freshco 5 sobeys 6 food_ba...


In [38]:
factors

Unnamed: 0,department,geo_mean,store
0,produce,0.753613,freshco
1,produce,0.946817,sobeys
2,produce,0.885488,food_basics
3,produce,0.847831,walmart
4,meat + seafood,0.904069,freshco
5,meat + seafood,1.066879,sobeys
6,meat + seafood,1.048851,food_basics
7,meat + seafood,0.99681,walmart
8,bakery,0.862581,freshco
9,bakery,1.08814,sobeys


In [81]:
f = '2023-02-16T00:00:00-05:00'
l = 'Ends 02/22'

l = l.split('Ends ')[1]

from datetime import datetime

dt_obj = datetime.strptime(f, "%Y-%m-%dT%H:%M:%S%z")
formatted_date = dt_obj.strftime("%m/%d")

formatted_date, l


('02/16', '02/22')

In [1]:
# IMPORTS 
import pandas as pd
import time 
from bs4 import BeautifulSoup
import scrapers.scraper as scraper

# variable store 
store_vars = { 
                'zehrs_produce':{'store':'zehrs',
                    'category_name':'produce',
                    'link':'https://www.zehrs.ca/food/fruits-vegetables/c/28000?navid=flyout-L2-fruits-vegetables',
                    'load_more_xpath':'//*[@id="site-content"]/div/div/div[6]/div/div[2]/div[4]/div/button'}
              }

# START RUN 
start_time = time.time()


# SET OPTIONS 
options = scraper.set_options()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
# options.add_argument('--headless')

# STARTUP DRIVER 
driver = scraper.initiate_driver(options)

# run each store for each department 
for store_prod in store_vars.keys():
    print(f'Starting {store_prod}')

    store_details = store_vars[store_prod]

    try:
        scraper.nav_to_page(driver, store_details['link'])
    except:
        continue

    time.sleep(8) # wait for page load 

    scraper.close_cookies_blocker(driver, '//*[@id="privacy-policy"]/div/div/button')

    time.sleep(5) # wait for page load 

    scraper.click_load_more(driver, store_details['load_more_xpath'])

    time.sleep(5) # wait for page load 

    scraper.check_that_bottom_is_reached(driver, store_details['load_more_xpath'])

    # extract html
    page_source = scraper.grab_html(driver)
    soup = BeautifulSoup(page_source, 'lxml')

    # find all product divs for that page 
    product_divs = soup.find_all("div", {"class": "product-tracking"})

Starting zehrs_produce
	 closed cookie blocker
	 1 page(s) loaded
	 more to load


In [3]:
len(product_divs)

96

In [4]:
df = pd.DataFrame(columns = ['category', 'brand', 'product_text', 'product_name', 'price_text', 'sale_price', 'per_unit_price_text', 'is_sale'])


In [19]:
for div in product_divs:
    prod_details = div.find_all("div", {"class": "product-tile__details"})
    
    name = prod_details[0].find_all("h3", {"class": "text text--small4 text--left text--default-color product-tile__details__info__name"})[0].text

    if 'roma ' in name.lower():
        try: # ex: fruit doesnt have a brand so allow pass 
            brand = prod_details[0].find_all("span", {"class": "product-name__item product-name__item--brand"})[0].text
        except:
            brand = ''

        product = prod_details[0].find_all("span", {"class": "product-name__item product-name__item--name"})[0].text
        
        prod_details2 = prod_details[0].find_all("div", {"class": "product-tile__details__info__section"})
        
        prod_info = prod_details[0].find_all("div", {"class": "product-prices product-prices--product-tile"})

        if prod_info == []: # household items follow diff div format 
            prod_info = prod_details[0].find_all("div", {"class": "selling-price-list selling-price-list--product-tile"})

In [20]:
product

'Roma Tomatoes'

In [29]:
prod_details2[0]

<div class="product-tile__details__info__section"><div class="product-tile-deal-badge"><div class="product-badge__icon product-badge__icon--sale product-badge__icon--product-tile"><div class="product-badge__icon__text product-badge__icon__text--sale">sale</div><div class="product-badge__icon__expiry product-badge__icon__expiry--sale">Ends 02/22</div></div></div><div class="product-tile__details__info__text-badge"><div class="product-badge__text product-badge__text--product-tile">SAVE $0.18</div></div><div class="product-prices product-prices--product-tile"><div aria-label="$0.53 each, it was $0.71" class="selling-price-list selling-price-list--sale selling-price-list--product-tile" role="group"><div class="selling-price-list__item"><span class="price selling-price-list__item__price selling-price-list__item__price--now-price"><span class="price__value selling-price-list__item__price selling-price-list__item__price--now-price__value">$0.53</span><span class="price__type selling-price-lis

In [32]:
 # if there is a was-price tag then it is on sale currently
# try: 
valid_to = prod_details2[0].find_all("div", {"class": "product-tile-deal-badge"})[0].text
print(valid_to)

saleEnds 02/22


In [34]:
# all_badge = sale_info[0].find_all("span", {"class": "product-badge__icon product-badge__icon--sale product-badge__icon--product-tile"})[0].text
# valid_to = all_badge[0].find_all("span", {"class": "product-badge__icon__expiry product-badge__icon__expiry--sale"})[0].text

price_text = prod_info[0].find_all("span", {"class": "price selling-price-list__item__price selling-price-list__item__price--was-price"})[0].text
sale_price = prod_info[0].find_all("span", {"class": "price selling-price-list__item__price selling-price-list__item__price--now-price"})[0].text
# sale = True

# print(product)
# except:
#     print('nope')

In [36]:
price_text,sale_price

('$0.71(est.)ea', '$0.53(est.)ea')

In [41]:
from datetime import date


today = date.today()
formatted_date = today.strftime("%m/%d/%Y")
print(formatted_date)  


02/20/2023


In [42]:
import pandas as pd
from scipy import stats
import random

stores = ['freshco', 'sobeys', 'food_basics', 'walmart']

def generate_scaling_factors(study_data):
    '''geometric mean of % diff values to be our _scaling_factor_'''

    departments = study_data.department.unique()

    factors = pd.DataFrame()

    for department in departments:
        df = study_data[study_data['department'] == department]

        for store in stores: 

            geo_mean = stats.gmean(df[f'{store} % difference'], axis=0, dtype=None)

            if department == 'meat + seafood':
                factors = factors.append({'store':store, 'department':'meat', 'geo_mean':geo_mean}, ignore_index=True)
                factors = factors.append({'store':store, 'department':'seafood', 'geo_mean':geo_mean}, ignore_index=True)
            elif department == 'home goods':
                factors = factors.append({'store':store, 'department':'household_items', 'geo_mean':geo_mean}, ignore_index=True)
            elif department == 'dairy':
                factors = factors.append({'store':store, 'department':'dairy_and_eggs', 'geo_mean':geo_mean}, ignore_index=True)
            elif department == 'dry goods':
                factors = factors.append({'store':store, 'department':'pantry', 'geo_mean':geo_mean}, ignore_index=True)
                factors = factors.append({'store':store, 'department':'snacks', 'geo_mean':geo_mean}, ignore_index=True)
                factors = factors.append({'store':store, 'department':'drinks', 'geo_mean':geo_mean}, ignore_index=True)
                factors = factors.append({'store':store, 'department':'frozen', 'geo_mean':geo_mean}, ignore_index=True)
            else:
                factors = factors.append({'store':store, 'department':department, 'geo_mean':geo_mean}, ignore_index=True)

    return factors 


study = pd.read_csv('empirical study.csv')
baseline = pd.read_csv('clean_data/zehrs/zehrs_data.csv')

factors = generate_scaling_factors(study)

scaling_factors = {}


for store in stores:
    globals()[f"{store}_synthetic_data"] = pd.DataFrame()
    scaling_factors[store] = {}
    for category in factors.department.unique():
        category_data = baseline[baseline['category'] == category]

        columns = category_data.columns

        geo_mean = list(factors.loc[ (factors['store']==store) & (factors['department'] == category) ].geo_mean)[0]
        noise = random.uniform(0, 0.05)
        scaling_factor = geo_mean+noise
        # print(store, noise, geo_mean, scaling_factor)
        
        scaling_factors[store][category] = scaling_factor

        # print(scaling_factor)
        # print(category_data['price'])/
        # print(category_data['price'], category_data['price']*scaling_factor)

        category_data['price'] = category_data['price']*scaling_factor
        category_data['per_unit_price'] = category_data['per_unit_price']*scaling_factor
        category_data['sale_price'] = [None]*len(category_data)
        category_data['sale_per_unit_price'] = [None]*len(category_data)
        category_data['store'] = [store]*len(category_data)
        category_data['is_sale'] = [False]*len(category_data)

        category_data = category_data[columns]

        globals()[f"{store}_synthetic_data"] = globals()[f"{store}_synthetic_data"].append(category_data, ignore_index=True)
    
    # globals()[f"{store}_synthetic_data"].pop(globals()[f"{store}_synthetic_data"].columns[0])
    # globals()[f"{store}_synthetic_data"].to_csv(f'clean_data/{store}/synthetic_data.csv', index=False)





freshco 0.002183629149207722 0.7536125363840291 0.7557961655332368
2074    1.75
2075    1.99
2076    4.99
2077    1.99
2078    8.98
        ... 
2754    7.99
2755    5.99
2756    5.99
2757    9.99
2758    3.95
Name: price, Length: 685, dtype: float64 2074    1.322643
2075    1.504034
2076    3.771423
2077    1.504034
2078    6.787050
          ...   
2754    6.038811
2755    4.527219
2756    4.527219
2757    7.550404
2758    2.985395
Name: price, Length: 685, dtype: float64
freshco 0.02048568995446682 0.9040689227396467 0.9245546126941135
1597    12.00
1598    35.77
1599     8.00
1600     8.79
1601    15.00
        ...  
2069    15.99
2070     7.99
2071    13.99
2072    16.99
2073     5.99
Name: price, Length: 477, dtype: float64 1597    11.094655
1598    33.071318
1599     7.396437
1600     8.126835
1601    13.868319
          ...    
2069    14.783628
2070     7.387191
2071    12.934519
2072    15.708183
2073     5.538082
Name: price, Length: 477, dtype: float64
freshco 0.00070151541

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_data['price'] = category_data['price']*scaling_factor
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_data['per_unit_price'] = category_data['per_unit_price']*scaling_factor
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_data['sale_price'] = [None]*len(category_data)
A val

sobeys 0.02153488684395884 1.0668788370193973 1.0884137238633562
2759    10.99
2760    20.23
2761     8.43
2762    11.99
2763     7.99
        ...  
2897     3.99
2898    13.99
2899     9.99
2900     8.99
2901    13.99
Name: price, Length: 143, dtype: float64 2759    11.961667
2760    22.018610
2761     9.175328
2762    13.050081
2763     8.696426
          ...    
2897     4.342771
2898    15.226908
2899    10.873253
2900     9.784839
2901    15.226908
Name: price, Length: 143, dtype: float64
sobeys 0.0389914784402052 1.0881400251080986 1.1271315035483038
990     4.29
991     4.29
992     4.29
993     5.99
994     3.09
        ... 
1592    3.29
1593    3.79
1594    4.49
1595    6.99
1596    3.99
Name: price, Length: 607, dtype: float64 990     4.835394
991     4.835394
992     4.835394
993     6.751518
994     3.482836
          ...   
1592    3.708263
1593    4.271828
1594    5.060820
1595    7.878649
1596    4.497255
Name: price, Length: 607, dtype: float64
sobeys 0.0394311319826505