In [24]:
import os

import numpy as np
import pandas as pd
import requests
from tqdm.auto import tqdm

In [2]:
# Load the semicolon-separated receipt dump. Numbers in the file use a decimal
# comma, Quantity is forced to float64 and TransactionDate is parsed as a date.
receipts_csv_path = './Junction-Kesko-Receipt-Data/Junction_data.csv'
data = pd.read_csv(
    receipts_csv_path,
    sep=';',
    decimal=',',
    dtype={'Quantity': 'float64'},
    parse_dates=['TransactionDate'],
)

In [3]:
# (rows, columns) of the full receipt table.
data.shape

(52941708, 10)

In [4]:
# Preview the first rows to check that the columns were parsed as intended.
data.head()

Unnamed: 0,AreaId,Receipt,TransactionDate,BeginHour,EAN,Quantity,PersonAgeGrp,KCustomer,QualClass,EasyClass
0,1,356601823178935,2017-11-07,17,5410103915654,1.0,55-64,6715,Q_1-3,E_4-7
1,1,356341113181337,2017-11-05,12,6413466126704,1.0,35-44,6712,Q_1-3,E_4-7
2,1,356629240622521,2017-11-07,18,24000017677,1.0,45-54,6715,Q_1-3,E_4-7
3,1,356269217607293,2017-11-04,13,6412000033188,2.0,55-64,6712,Q_4-7,E_4-7
4,1,356215448049286,2017-11-04,11,2000940900000,1.0,45-54,6713,Q_1-3,E_8-10


In [5]:
# Column dtypes after loading (note that EAN and Receipt are held as integers).
data.dtypes

AreaId                      int64
Receipt                     int64
TransactionDate    datetime64[ns]
BeginHour                   int64
EAN                         int64
Quantity                  float64
PersonAgeGrp               object
KCustomer                   int64
QualClass                  object
EasyClass                  object
dtype: object

In [6]:
# First customer segment: PersonAgeGrp '18-24', QualClass 'Q_1-3',
# EasyClass 'E_8-10', restricted to AreaId 2.
young_mask = (
    data['PersonAgeGrp'].eq('18-24')
    & data['QualClass'].eq('Q_1-3')
    & data['EasyClass'].eq('E_8-10')
    & data['AreaId'].eq(2)
)
data_young = data[young_mask]

In [7]:
# Second customer segment: PersonAgeGrp '25-34', QualClass 'Q_8-10',
# EasyClass 'E_1-3', restricted to AreaId 2.
# NOTE(review): the variable is called "old" but the filter selects the
# '25-34' age group - confirm the name still matches the intent.
old_mask = (
    data['PersonAgeGrp'].eq('25-34')
    & data['QualClass'].eq('Q_8-10')
    & data['EasyClass'].eq('E_1-3')
    & data['AreaId'].eq(2)
)
data_old = data[old_mask]

In [8]:
# (rows, columns) of the data_young segment.
data_young.shape

(151085, 10)

In [9]:
# Number of distinct Receipt values in the data_young segment.
data_young['Receipt'].nunique()

27679

In [10]:
# Distinct Receipt count per AreaId over the full table, for comparison with the segment sizes.
data.groupby(['AreaId'])['Receipt'].nunique()

AreaId
1    1297364
2    2106076
3    1329669
4    1418564
Name: Receipt, dtype: int64

In [11]:
# (rows, columns) of the data_old segment.
data_old.shape

(156153, 10)

In [12]:
# Number of distinct Receipt values in the data_old segment.
data_old['Receipt'].nunique()

16828

In [13]:
# Distinct Receipt count per KCustomer value within data_young.
data_young.groupby(['KCustomer'])['Receipt'].nunique()

KCustomer
6711    23590
6712     1174
6714     2915
Name: Receipt, dtype: int64

In [14]:
# Distinct Receipt count per KCustomer value within data_old.
data_old.groupby(['KCustomer'])['Receipt'].nunique()

KCustomer
6711    7558
6712    6251
6713    2501
6714     518
Name: Receipt, dtype: int64

In [15]:
# Distinct EAN count per KCustomer value within data_young.
data_young.groupby(['KCustomer'])['EAN'].nunique()

KCustomer
6711    14985
6712     2256
6714     3102
Name: EAN, dtype: int64

In [16]:
# Unique EAN codes bought in each segment, as Python sets for cheap unions.
data_young_eans = set(data_young['EAN'].tolist())
data_old_eans = set(data_old['EAN'].tolist())

In [17]:
# Number of distinct EANs seen in either segment.
len(data_young_eans | data_old_eans)

26010

In [18]:
# Union of both segments' EANs as a list; element order is the set's iteration order.
eans = list(data_young_eans | data_old_eans)

In [19]:
# Keep only EANs whose decimal representation does not begin with the digit 2.
# NOTE(review): EAN is stored as int64, so leading zeros are lost before this
# prefix check; a shorter code such as 24000017677 is dropped as well - confirm
# that this is intended.
eans_selected = [ean for ean in eans if str(ean)[:1] != '2']

In [20]:
# Row count per EAN over the full table; value_counts() sorts most frequent first.
ean_value_counts = data['EAN'].value_counts()

In [30]:
# Number of EANs left after the prefix filter.
len(eans_selected)

23971

In [27]:
# Take the 15000 most frequent EANs overall, then apply the same
# "does not start with 2" prefix filter used for eans_selected.
most_frequent_eans = ean_value_counts.index[:15000]
popular_eans = [ean for ean in most_frequent_eans if str(ean)[:1] != '2']

In [26]:
# Number of popular EANs left after the prefix filter.
len(popular_eans)

5512

In [31]:
# Query the Kesko product-search API once per selected EAN and keep each parsed
# JSON reply in `ean_to_dict`, keyed by the EAN as a string.
#
# * The subscription key is read from an environment variable so the secret is
#   not stored in (or leaked through) the notebook. A key that was ever
#   committed in plain text should be treated as compromised and rotated.
# * Replies are stored regardless of HTTP status, as long as the body is JSON;
#   downstream cells decide which replies are usable.
# * Lookups that fail at the network level or return a non-JSON body are
#   collected in `failed_eans` instead of aborting the whole (long) loop.

PRODUCT_SEARCH_URL = 'https://kesko.azure-api.net/v1/search/products'
API_KEY_ENV_VAR = 'KESKO_API_KEY'
# Seconds to wait for a reply. requests applies no timeout by default, so
# without one a single stalled connection would block the loop indefinitely.
REQUEST_TIMEOUT_S = 30
STORE_AVAILABILITY_IDS = [
    "A208", "A210", "A212", "A214", "A216", "A218",
    "A300", "A301", "A302", "A303", "A304", "A305", "A306", "A308", "A310",
    "A205", "A311", "A313", "A314",
]


def build_search_payload(ean):
    """Return the JSON request body that searches the product API for one EAN."""
    return {
        "filters": {"ean": str(ean)},
        "view": {
            "offset": 0,
            "limit": 10,
            "showFacets": {"facets": ["string"], "limit": 250},
            "showAvailability": {
                "storeAvailability": STORE_AVAILABILITY_IDS,
                "webstoreAvailability": [],
            },
        },
    }


ean_to_dict = {}
failed_eans = []  # (ean, error description) for lookups that yielded no JSON

# A single Session reuses the HTTPS connection instead of reconnecting for
# every one of the requests.
with requests.Session() as session:
    session.headers.update({
        'Ocp-Apim-Subscription-Key': os.environ[API_KEY_ENV_VAR],
        'Content-Type': 'application/json',
    })
    for ean in tqdm(eans_selected):
        try:
            response = session.post(PRODUCT_SEARCH_URL,
                                    json=build_search_payload(ean),
                                    timeout=REQUEST_TIMEOUT_S)
            ean_to_dict[str(ean)] = response.json()
        except (requests.RequestException, ValueError) as exc:
            # Connection error, timeout, or a body that is not valid JSON:
            # record it and continue so the replies gathered so far survive.
            failed_eans.append((ean, repr(exc)))

if failed_eans:
    print(f'{len(failed_eans)} of {len(eans_selected)} lookups failed; see failed_eans')

HBox(children=(IntProgress(value=0, max=23971), HTML(value='')))




In [32]:
# Number of EANs for which an API reply was stored.
len(ean_to_dict)

23971

In [52]:
def _optional(mapping, key):
    """Return ``mapping[key]`` when ``key`` is in ``mapping``, else ``None``."""
    return mapping[key] if key in mapping else None


def _attribute_value(attributes, code):
    """Return ``attributes[code]['value']['value']``, or ``None`` when ``code`` is absent."""
    return attributes[code]['value']['value'] if code in attributes else None


def _product_record(ean, payload):
    """Flatten the first entry of ``payload['results']`` into one row dict.

    Keys are inserted in the order the resulting DataFrame columns should
    appear in.
    """
    hit = payload['results'][0]

    record = {'ean': ean, 'isAlcohol': hit['isAlcohol']}

    pictures = hit['pictureUrls']
    record['pictureUrl'] = pictures[0]['original'] if pictures else None
    record['isConsumerGood'] = _optional(hit, 'isConsumerGood')

    names = hit['marketingName']
    record['marketingName_finnish'] = _optional(names, 'finnish')
    record['marketingName_english'] = _optional(names, 'english')

    attributes = hit['attributes']
    record['ingredients_finnish'] = _attribute_value(attributes, 'MATERIAL_U')
    record['ingredients_english'] = _attribute_value(attributes, 'MATERIAL_E')
    record['net_weight'] = hit['measurements']['netWeight'] if 'measurements' in hit else None

    # Output column -> attribute code it is read from.
    for column, code in (('proteins', 'PROTEG'),
                         ('fats', 'RASVAA'),
                         ('carbohydrates', 'HIHYDR'),
                         ('kcal', 'ENERKC'),
                         ('fats_saturated', 'TYYDRH'),
                         ('sugar', 'SOKERI')):
        record[column] = _attribute_value(attributes, code)

    return record


# One row per EAN whose search reply reports exactly one hit.
products_df = pd.DataFrame([
    _product_record(ean, payload)
    for ean, payload in ean_to_dict.items()
    if 'totalHits' in payload and payload['totalHits'] == 1
])


In [53]:
# Share of non-missing values per column, i.e. how well each field is populated.
products_df.notna().mean()

ean                      1.000000
isAlcohol                1.000000
pictureUrl               0.958707
isConsumerGood           0.994772
marketingName_finnish    0.996319
marketingName_english    0.575011
ingredients_finnish      0.846137
ingredients_english      0.257736
net_weight               0.999840
proteins                 0.696596
fats                     0.693502
carbohydrates            0.694889
kcal                     0.686780
fats_saturated           0.657810
sugar                    0.668961
dtype: float64

In [57]:
# Find every token of the form E + 3-4 digits in the upper-cased Finnish
# ingredient text. extractall yields one row per match, indexed by
# (product row, match number).
e_number_matches = (
    products_df['ingredients_finnish']
    .str.upper()
    .str.extractall(r"(?P<e_number>\bE\d{3,4}\b)")
)

# Pivot the matches into a product-row x e_number table of occurrence counts,
# with the columns sorted by e_number label.
res = (
    e_number_matches
    .assign(count=1)
    .set_index('e_number', append=True)
    .unstack('e_number', fill_value=0)
    .groupby(level=0, as_index=True, group_keys=False)
    .sum()
    .droplevel(0, axis=1)
    .sort_index(axis=1)
)

In [58]:
# Total number of E-number matches across all products.
res.sum().sum()

13577

In [59]:
# The distinct E-number labels that became columns.
res.columns

Index(['E100', 'E101', 'E102', 'E104', 'E110', 'E1103', 'E1105', 'E120',
       'E1200', 'E124',
       ...
       'E951', 'E952', 'E954', 'E955', 'E957', 'E960', 'E965', 'E967', 'E995',
       'E999'],
      dtype='object', name='e_number', length=188)

In [60]:
# Attach the E-number count columns to the product table. concat with axis=1
# aligns on the row index, so products missing from `res` get NaN there.
products_with_e = pd.concat([products_df, res], axis=1)

In [61]:
# Replace the NaNs in the E-number columns with 0 and cast the counts to int.
products_with_e[res.columns] = products_with_e[res.columns].fillna(0).astype(int)

In [63]:
# Write the enriched product table to CSV without the row index.
products_with_e.to_csv('products_v4.csv', index=False)