DIGINETICA

https://competitions.codalab.org/competitions/11161#learn_the_details-evaluation

In [None]:
# This is sample baseline for CIKM Personalization Cup 2016
# by Alexander Laktionov & Vladislav Grozin

import numpy as np
import pandas as pd
import datetime

start_time = datetime.datetime.now()
print("Running baseline. Now it's", start_time.isoformat())

# Loading queries (assuming data placed in <dataset-train/>
queries = pd.read_csv('./data/diginetica/dataset-train-diginetica/train-queries.csv', sep=';')[['queryId', 'items', 'is.test']]
print('Total queries', len(queries))

# Leaving only test queries (the ones which items we have to sort)
queries = queries[queries['is.test'] == True][['queryId', 'items']]
print('Test queries', len(queries))
queries.reset_index(inplace=True)
queries.drop(['index'], axis=1, inplace=True)

# Loading item views; taking itemId column
item_views = pd.read_csv('./data/diginetica/dataset-train-diginetica/train-item-views.csv', sep=';')[['itemId']]
print('Item views', len(item_views))

# Loading clicks; taking itemId column
clicks = pd.read_csv('./data/diginetica/dataset-train-diginetica/train-clicks.csv', sep=';')[['itemId']]
print('Clicks', len(clicks))

# Loading purchases; taking itemId column
purchases = pd.read_csv('./data/diginetica/dataset-train-diginetica/train-purchases.csv', sep=';')[['itemId']]
print('Purchases', len(purchases))

# Calculating popularity as [Amount of views] * 1 + Amount of clicks * 2 + [Amount of purchases] * 3
print('Scoring popularity for each item ...')
prod_pop = {}
for cost, container in enumerate([item_views, clicks, purchases]):
    for prod in container.values:
        product = str(prod[0])
        if product not in prod_pop:
            prod_pop[product] = cost
        else:
            prod_pop[product] += cost

print('Popularity scored for', len(prod_pop), 'products')

# For each query:
#   parse items (comma-separated values in last column)
#   sort them by score;
#   write them to the submission file.
# This is longest part; it usually takes around 5 minutes.
print('Sorting items per query by popularity...')

answers = []
step = int(len(queries) / 20)

with open('submission.txt', 'w+') as submission:
    for i, q in enumerate(queries.values):

        # Fancy progressbar
        if i % step == 0:
            print(5 * i / step, '%...')

        # Splitting last column which contains comma-separated items
        items = q[-1].split(',')
        # Getting scores for each item. Also, inverting scores here, so we can use argsort
        items_scores = list(map(lambda x: -prod_pop.get(x, 0), items))
        # Sorting items using items_scores order permutation
        sorted_items = np.array(items)[np.array(items_scores).argsort()]
        # Squashing items together
        s = ','.join(sorted_items)
        # and writing them to submission
        submission.write(str(q[0]) + " " + s + "\n")

end_time = datetime.datetime.now()
print("Done. Now it's ", end_time.isoformat())
print("Calculated baseline in ", (end_time - start_time).seconds, " seconds")


In [1]:
import pandas as pd
import numpy as np
import util

In [121]:
q = pd.read_csv('./data/diginetica/dataset-train-diginetica/train-queries.csv', sep=';')
v = pd.read_csv('./data/diginetica/dataset-train-diginetica/train-item-views.csv', sep=';')
c = pd.read_csv('./data/diginetica/dataset-train-diginetica/train-clicks.csv', sep=';')
p = pd.read_csv('./data/diginetica/dataset-train-diginetica/train-purchases.csv', sep=';')
products = pd.read_csv('./data/diginetica/dataset-train-diginetica/products.csv', sep=';')
product_cat = pd.read_csv('./data/diginetica/dataset-train-diginetica/product-categories.csv', sep=';')

  interactivity=interactivity, compiler=compiler, result=result)


In [122]:
print(f'q:           {list(q.columns)}')
print(f'v:           {list(v.columns)}')
print(f'c:           {list(c.columns)}')
print(f'p:           {list(p.columns)}')
print(f'products:    {list(products.columns)}')
print(f'product_cat: {list(product_cat.columns)}')


q:           ['queryId', 'sessionId', 'userId', 'timeframe', 'duration', 'eventdate', 'searchstring.tokens', 'categoryId', 'items', 'is.test']
v:           ['sessionId', 'userId', 'itemId', 'timeframe', 'eventdate']
c:           ['queryId', 'timeframe', 'itemId']
p:           ['sessionId', 'userId', 'timeframe', 'eventdate', 'ordernumber', 'itemId']
products:    ['itemId', 'pricelog2', 'product.name.tokens']
product_cat: ['itemId', 'categoryId']


In [123]:
qq = q.iloc[:10_000].copy()

In [124]:
# query items
qi = []
for _, row in qq.iterrows():
    qi += [[row['userId'], row['sessionId'], row['queryId'], i, int(item)] for i, item in enumerate(row['items'].split(','))]
qi = pd.DataFrame(qi, columns=['userId', 'sessionId', 'queryId', 'position', 'itemId'])
# clicks
qi = pd.merge(qi, c, how='left', on=['queryId', 'itemId'])
qi = qi.rename(columns={'timeframe': 'clicked'})
qi.loc[pd.notna(qi['clicked']), 'clicked'] = 1
qi.loc[pd.isna(qi['clicked']), 'clicked'] = 0
# purchases
qi = pd.merge(qi, p[['userId', 'sessionId', 'itemId', 'timeframe']], how='left', on=['userId', 'sessionId', 'itemId'])
qi = qi.rename(columns={'timeframe': 'purchased'})
qi.loc[pd.notna(qi['purchased']), 'purchased'] = 1
qi.loc[pd.isna(qi['purchased']), 'purchased'] = 0
# product prices
qi = pd.merge(qi, products[['itemId', 'pricelog2']], how='left', on=['itemId'])
qi = qi.rename(columns={'pricelog2': 'price'})
# totals
totals = qi.groupby('queryId')[['clicked', 'purchased']].sum().reset_index().rename(columns={'clicked':'clicked_total', 'purchased':'purchased_total', })
qi = qi.merge(totals, on='queryId')

In [132]:
len(qi[~no_clicks].groupby('queryId'))

6462

In [133]:
qi.groupby('position')['c_ap_cumsum'].sum() / len(qi[~no_clicks].groupby('queryId'))

position
0     0.050139
1     0.013734
2     0.007015
3     0.003962
4     0.002848
5     0.001690
6     0.001431
7     0.001506
8     0.001103
9     0.001040
10    0.000838
11    0.000532
12    0.000426
13    0.000371
14    0.000511
15    0.000287
16    0.000335
17    0.000420
18    0.000251
19    0.000224
Name: c_ap_cumsum, dtype: float64

In [136]:
# 6.2.2
no_clicks = qi['clicked_total'] == 0
no_purchases = qi['purchased_total'] == 0
qi_groupby_cumsum = qi.groupby('queryId').cumsum()
metrics = pd.DataFrame()

qi.loc[no_clicks, 'c_map'] = 0
qi.loc[no_clicks, 'c_ndcg'] = 0
qi.loc[~no_clicks, 'cum_clicks'] = qi.groupby('queryId')['clicked'].cumsum()
qi.loc[~no_clicks, 'c_ap'] = (qi['cum_clicks'] / (qi['position'] + 1))
qi.loc[~no_clicks, 'c_ap_cumsum'] = (1 / (qi['position'] + 1)) * qi.groupby('queryId')['c_ap'].cumsum()
qi.loc[(~no_clicks) & (qi['clicked'] == 0), 'c_ap_cumsum'] = 0
# qi.loc[~no_clicks, 'c_map'] = qi.groupby('queryId')['c_ap_cumsum'].cumsum()
metrics['c_map'] = qi.groupby('position')['c_ap_cumsum'].sum() / len(qi[~no_clicks].groupby('queryId'))

qi.loc[~no_clicks, 'c_dcg'] = (np.power(2, qi['cum_clicks']) - 1) / np.log2((qi['position'] + 1) + 1)
qi.loc[~no_clicks, 'c_dcg_cumsum'] = qi.groupby('queryId')['c_dcg'].cumsum()
qi.loc[~no_clicks, 'c_idcg'] = (np.power(2, (qi['position'] + 1)) - 1) / np.log2((qi['position'] + 1) + 1)
qi.loc[~no_clicks, 'c_idcg_cumsum'] = qi.groupby('queryId')['c_idcg'].cumsum()
qi.loc[~no_clicks, 'c_ndcg'] = qi['c_dcg_cumsum'] / qi['c_idcg_cumsum']
metrics['c_ndcg'] = qi.groupby('position')['c_ndcg'].sum() / len(qi[~no_clicks].groupby('queryId'))

# qi = qi.drop(columns=['c_ap', 'c_dcg', 'c_idcg', 'c_ap_cumsum', 'cum_clicks', 'c_dcg_cumsum', 'c_idcg_cumsum'])

qi.loc[no_purchases, 'g_map'] = 0
qi.loc[no_purchases, 'g_ndcg'] = 0
qi.loc[~no_purchases, 'cum_purchases'] = qi.groupby('queryId')['purchased'].cumsum()
qi.loc[~no_purchases, 'g_ap'] = (qi['cum_purchases'] / (qi['position'] + 1))
qi.loc[~no_purchases, 'g_ap_cumsum'] = (1 / (qi['position'] + 1)) * qi.groupby('queryId')['g_ap'].cumsum()
qi.loc[(~no_purchases) & (qi['purchased'] == 0), 'g_ap_cumsum'] = 0
# qi.loc[~no_purchases, 'g_map'] = qi.groupby('queryId')['g_ap_cumsum'].cumsum()
metrics['g_map'] = qi.groupby('position')['g_ap_cumsum'].sum() / len(qi[~no_purchases].groupby('queryId'))

qi.loc[~no_purchases, 'g_dcg'] = qi['price'] * (np.power(2, qi['cum_purchases']) - 1) / np.log2((qi['position'] + 1) + 1)
qi.loc[~no_purchases, 'g_dcg_cumsum'] = qi.groupby('queryId')['g_dcg'].cumsum()
qi.loc[~no_purchases, 'g_idcg'] = qi['price'] * (np.power(2, (qi['position'] + 1)) - 1) / np.log2((qi['position'] + 1) + 1)
qi.loc[~no_purchases, 'g_idcg_cumsum'] = qi.groupby('queryId')['g_idcg'].cumsum()
qi.loc[(~no_purchases) & (qi['price'] == 0), 'g_ndcg'] = 0
qi.loc[(~no_purchases) & (qi['price'] > 0), 'g_ndcg'] = qi['g_dcg'] / qi['g_idcg']
metrics['g_ndcg'] = qi.groupby('position')['g_ndcg'].sum() / len(qi[~no_purchases].groupby('queryId'))

# qi = qi.drop(columns=['g_ap', 'g_dcg', 'g_idcg', 'g_ap_cumsum', 'cum_purchases', 'g_dcg_cumsum', 'g_idcg_cumsum'])

# qi = qi.drop(columns=['userId', 'sessionId', 'clicked_total', 'purchased_total'])

In [139]:
metrics

Unnamed: 0_level_0,c_map,c_ndcg,g_map,g_ndcg
position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.050139,0.049211,0.065217,0.06521739
1,0.013734,0.039309,0.01087,0.0326087
2,0.007015,0.030631,0.003623,0.01863354
3,0.003962,0.022665,0.004076,0.01304348
4,0.002848,0.016192,0.003043,0.007713885
5,0.00169,0.011083,0.003019,0.005866115
6,0.001431,0.007363,0.000665,0.003166724
7,0.001506,0.004828,0.00051,0.001662404
8,0.001103,0.003034,0.000671,0.0009572024
9,0.00104,0.001885,0.000543,0.0004993837


TypeError: 'numpy.int64' object is not callable

In [189]:
len(pi)

23504

In [198]:
# 5
pi = pd.DataFrame()
pi['n'] = qi.groupby('itemId').size()
pi[['clicked', 'purchased', 'price']] = qi.groupby('itemId')[['clicked', 'purchased', 'price']].mean()

pi.loc[pi['clicked'] > 0, 'l_ctr'] = np.log(pi['clicked'] / pi['n'])
pi.loc[pi['clicked'] == 0, 'l_ctr'] = 0
l_ctr = (-1 / len(pi)) * pi['l_ctr'].sum()

pi.loc[pi['purchased'] > 0, 'l_gmv'] = pi['price'] * np.log(pi['purchased'] / pi['n'])
# pi.loc[pi['purchased'] > 0, 'l_gmv'] = pi['price'] * np.log(pi['purchased'] / pi['n']) + np.log(pi['purchased'] / pi['clicked'])
pi.loc[pi['purchased'] == 0, 'l_gmv'] = 0
l_gmv = (-1 / len(pi)) * pi['l_gmv'].sum()

pi = pi.reset_index()
l_ctr, l_gmv

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


(0.6744283718288002, 0.16114042309066468)

In [230]:
##################################################################
#----------------------------------------------------------------#
#---------------------- LOADS DATASETS --------------------------#
#----------------------------------------------------------------#
##################################################################

# Loading queries (assuming data placed in <dataset-train/>
queries = pd.read_csv('./data/diginetica/dataset-train-diginetica/train-queries.csv', sep=';')
queries["eventdate"] = pd.to_datetime(queries["eventdate"])

# Loading only the queries that have keyword search.
queries = queries[~queries["searchstring.tokens"].isnull()]

# Extract a mapping of each query and which items appeared
query_item = []
for query, items in queries[["queryId", "items"]].values:
    items = map(np.int64,items.split(','))
    for i in items:
        query_item.append( (query, i) )
query_item = pd.DataFrame().from_records(query_item, columns=["queryId","itemId"])

# Loading item views
item_views = pd.read_csv('./data/diginetica/dataset-train-diginetica/train-item-views.csv', sep=';')
item_views.sort_values(["sessionId", "userId", "eventdate", "timeframe", "itemId"], inplace=True)
print('Item views', len(item_views))

# Loading clicks
clicks = pd.read_csv('./data/diginetica/dataset-train-diginetica/train-clicks.csv', sep=';')
clicks.sort_values(["queryId", "timeframe", "itemId"], inplace=True)
print('Clicks', len(clicks))

# Loading purchases
purchases = pd.read_csv('./data/diginetica/dataset-train-diginetica/train-purchases.csv', sep=';')
print('Purchases', len(purchases))
purchases.sort_values(["sessionId", "userId", "eventdate", "timeframe", "itemId", "ordernumber"], inplace=True)

# Loading products
products = pd.read_csv('./data/diginetica/dataset-train-diginetica/products.csv', sep=';')
print('Products', len(products))
products.sort_values(["itemId"], inplace=True)

# Loading product category
products_category = pd.read_csv('./data/diginetica/dataset-train-diginetica/product-categories.csv', sep=';')
print('Products Categories', len(products))
products_category.sort_values(["itemId"], inplace=True)

# Add info regarding sessionid
query_item = pd.merge(query_item, queries[["queryId", "sessionId"]], how="left")
query_item = pd.merge(query_item,  clicks, how="left")
query_item.rename(columns={"timeframe":"clickTime"}, inplace=True)
query_item = pd.merge(query_item,  item_views, how="left")

query_item.rename(columns={"eventdate":"eventdateView", "timeframe":"viewTime", "userId": "userView"}, inplace=True)
query_item = pd.merge(query_item, purchases, how="left")
query_item.rename(columns={"eventdate":"eventdatePurchase", "timeframe":"purchaseTime", "userId": "userPurchase"}, inplace=True)

"""
    'rank' is a value between 0 and 1, with 1 if the item is at the top of a list and 0 if this is the last value of a result list.
    Later we will calculate the division of this value by the number of items for each query and finally do 1 minus this value,
    such as the first position item will have rank value of 1, the second will have value of 1 - (1/N), where N is the number of
    items in a given result list.
"""
query_item["rank"] = 1
query_item["rank"] = query_item[["queryId","rank"]].groupby("queryId")["rank"].cumsum()
query_item["rank"] = query_item["rank"] - 1

items_per_query = query_item[["queryId","rank"]].groupby("queryId")["rank"].max()
items_per_query.name = "rank_size"

query_item = pd.merge(query_item, items_per_query.reset_index(), how="left")
query_item["rank"] = 1.0 - (query_item["rank"] / query_item["rank_size"])

# labels:
query_item["clicked"] = ~query_item["clickTime"].isnull()
query_item["viewed"] = ~query_item["viewTime"].isnull()
query_item["purchased"] = ~query_item["purchaseTime"].isnull()

# products info
products_info = pd.merge(query_item[["queryId", "itemId"]].drop_duplicates(), queries[["queryId", "searchstring.tokens"]], on="queryId", how="left").merge(products, on="itemId", how="left")


  queries = pd.read_csv('./data/diginetica/dataset-train-diginetica/train-queries.csv', sep=';')


Item views 1235380
Clicks 1127764
Purchases 18025
Products 184047
Products Categories 184047


In [231]:
query_item

Unnamed: 0,queryId,itemId,sessionId,clickTime,userView,viewTime,eventdateView,userPurchase,purchaseTime,eventdatePurchase,ordernumber,rank,rank_size,clicked,viewed,purchased
0,1,7518,1,,,,,,,,,1.000000,19,False,False,False
1,1,71,1,,,,,,,,,0.947368,19,False,False,False
2,1,30311,1,,,,,,,,,0.894737,19,False,False,False
3,1,7837,1,,,,,,,,,0.842105,19,False,False,False
4,1,30792,1,,,,,,,,,0.789474,19,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017757,53453,3084,41471,,,,,,,,,0.210526,19,False,False,False
1017758,53453,33440,41471,,,,,,,,,0.157895,19,False,False,False
1017759,53453,11265,41471,,,,,,,,,0.105263,19,False,False,False
1017760,53453,11842,41471,,,,,,,,,0.052632,19,False,False,False
