In [2]:
import pandas as pd
import numpy as np


queries = pd.read_csv('DATA/train-queries.csv', sep=';')
item_views = pd.read_csv('DATA/train-item-views.csv', sep=';')
clicks = pd.read_csv('DATA/train-clicks.csv', sep=';')
purchases = pd.read_csv('DATA/train-purchases.csv', sep=';')
products = pd.read_csv('DATA/products.csv', sep=';')
products_category = pd.read_csv('DATA/product-categories.csv', sep=';')

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
#Statistics OF DATASET :

query_full_queries_count  = len(queries[~queries["searchstring.tokens"].isnull()])
query_less_queries_count  = len(queries[queries["searchstring.tokens"].isnull()])
sessions_count = len(queries["sessionId"].unique())

#presented products 130,987  : not sure.

clicks_log_count = len(clicks)
views_log_count = len(item_views)
purchase_log_count  = len(purchases)

In [4]:
#User Statistics of Dataset

real_users_count = len(queries[~queries["userId"].isnull()]["userId"].unique())
anonymous_users_count = len(queries[queries["userId"].isnull()])
train_real_users_count = len(queries[queries["is.test"] == False][~queries["userId"].isnull()]["userId"].unique())
test_real_users_count = len(queries[queries["is.test"] == True][~queries["userId"].isnull()]["userId"].unique())

#intersection between test and trAIN USERS 

unique_train_users = queries[queries["is.test"] == False][~queries["userId"].isnull()]["userId"].unique()
uniquer_test_users = queries[queries["is.test"] == True][~queries["userId"].isnull()]["userId"].unique()
train_inter_test_usercount = len(np.intersect1d(unique_train_users,uniquer_test_users))

In [5]:
#print statistics


print("query_full_queries_count : ",query_full_queries_count)
print("query_less_queries_count : ",query_less_queries_count)
print("sessions_count : ",sessions_count)
print("clicks_log_count : ",clicks_log_count)
print("views_log_count : ",views_log_count)
print("purchase_log_count : ",purchase_log_count)

#print user statistics
print("\n")

print("real_users_count : ",real_users_count)
print("anonymous_users_count : ",anonymous_users_count)
print("train_real_users_count : ",train_real_users_count)
print("test_real_users_count : ",test_real_users_count)
print("train_inter_test_usercount : ",train_inter_test_usercount)

query_full_queries_count :  51888
query_less_queries_count :  871239
sessions_count :  573935
clicks_log_count :  1127764
views_log_count :  1235380
purchase_log_count :  18025


real_users_count :  232816
anonymous_users_count :  574887
train_real_users_count :  140387
test_real_users_count :  116630
train_inter_test_usercount :  24201


In [6]:
#PRE-PROCESSING FOR FEATURE EXTRACTION

queries = queries[~queries["searchstring.tokens"].isnull()]

# Extract a mapping of each query and which items appeared
query_item = []
for query, items in queries[["queryId", "items"]].values:
    items = map(np.int64,items.split(','))
    for i in items:
        query_item.append( (query, i) )
query_item = pd.DataFrame().from_records(query_item, columns=["queryId","itemId"])

item_views.sort_values(["sessionId", "userId", "eventdate", "timeframe", "itemId"], inplace=True)
clicks.sort_values(["queryId", "timeframe", "itemId"], inplace=True)
purchases.sort_values(["sessionId", "userId", "eventdate", "timeframe", "itemId", "ordernumber"], inplace=True)
products.sort_values(["itemId"], inplace=True)
products_category.sort_values(["itemId"], inplace=True)


query_item = pd.merge(query_item, queries[["queryId", "sessionId"]], how="left")
query_item = pd.merge(query_item,  clicks, how="left")
query_item.rename(columns={"timeframe":"clickTime"}, inplace=True)
query_item = pd.merge(query_item,  item_views, how="left")

query_item.rename(columns={"eventdate":"eventdateView", "timeframe":"viewTime", "userId": "userView"}, inplace=True)
query_item = pd.merge(query_item, purchases, how="left")
query_item.rename(columns={"eventdate":"eventdatePurchase", "timeframe":"purchaseTime", "userId": "userPurchase"}, inplace=True)


query_item["clicked"] = ~query_item["clickTime"].isnull()
query_item["viewed"] = ~query_item["viewTime"].isnull()
query_item["purchased"] = ~query_item["purchaseTime"].isnull()


user_item = []
for user, items in queries[["userId", "items"]].values:
    items = map(np.int64,items.split(','))
    for i in items:
        user_item.append( (user, i) )
user_item = pd.DataFrame().from_records(user_item, columns=["userShow","itemId"])
user_item = user_item[~user_item["userShow"].isnull()]

In [7]:
from functools import reduce

#Extracting global statistical features for each product

item_clicks = clicks.groupby("itemId").count().rename(columns = {"queryId" :"Item_clicks"})
product_views = item_views.groupby("itemId").count().rename(columns = {"sessionId" : "Item_view_count"})
item_purchases = purchases.groupby("itemId").count().rename(columns = {"sessionId" : "Item_purchase_count"})
item_show = query_item.groupby("itemId").count().rename(columns = {"queryId" : "item_show_count"})

userShow = user_item[["itemId","userShow"]].groupby("itemId")["userShow"].unique().apply(lambda x:len(x)).to_frame()
userView = pd.DataFrame(query_item[["itemId","userView"]].groupby("itemId")["userView"].unique().apply(lambda x:len(x)))
userPurchase = pd.DataFrame(query_item[["itemId","userPurchase"]].groupby("itemId")["userPurchase"].unique().apply(lambda x:len(x)))

dfs = [products[["itemId"]],item_show[["item_show_count"]],item_clicks[["Item_clicks"]],product_views[["Item_view_count"]],item_purchases[["Item_purchase_count"]],userShow,userView,userPurchase]
product_stats = reduce(lambda left,right: pd.merge(left,right,on='itemId',how="left"), dfs).sort_values(["itemId"]).reset_index().drop("index",axis = 1)

In [16]:
product_stats["CTR"] = product_stats["Item_clicks"]/product_stats["item_show_count"]
product_stats["View Rate"] = product_stats["Item_view_count"]/product_stats["item_show_count"]
product_stats["Click Value Rate"] = product_stats["Item_purchase_count"]/product_stats["item_show_count"]

In [48]:
def countwords(row):
    return row.count(",")+1

product_stats["wordLength"] = products["product.name.tokens"].apply(countwords)

In [51]:
product_stats  #features for each product

Unnamed: 0,itemId,item_show_count,Item_clicks,Item_view_count,Item_purchase_count,userShow,userView,userPurchase,CTR,View Rate,Click Value Rate,wordLength,wordlength
0,1,,,2.0,,,,,,,,6,6
1,2,3.0,36.0,41.0,2.0,2.0,1.0,1.0,12.000000,13.666667,0.666667,5,5
2,3,,34.0,22.0,,,,,,,,5,5
3,4,,,3.0,,,,,,,,5,5
4,6,1.0,13.0,9.0,,1.0,1.0,1.0,13.000000,9.000000,,4,4
5,7,63.0,75.0,70.0,,4.0,1.0,1.0,1.190476,1.111111,,4,4
6,9,14.0,51.0,53.0,,1.0,1.0,1.0,3.642857,3.785714,,6,6
7,10,6.0,1.0,3.0,,,1.0,1.0,0.166667,0.500000,,8,8
8,11,,,5.0,,,,,,,,6,6
9,12,41.0,190.0,193.0,6.0,3.0,1.0,1.0,4.634146,4.707317,0.146341,4,4
