In [21]:
import pandas as pd
import numpy as np


def _read_table(path):
    """Read one ';'-separated CSV file into a DataFrame."""
    return pd.read_csv(path, sep=';')


# Query log, the three interaction logs, and the two product lookup tables.
queries = _read_table('DATA/train-queries.csv')
item_views = _read_table('DATA/train-item-views.csv')
clicks = _read_table('DATA/train-clicks.csv')
purchases = _read_table('DATA/train-purchases.csv')
products = _read_table('DATA/products.csv')
products_category = _read_table('DATA/product-categories.csv')

In [22]:
import warnings
warnings.filterwarnings('ignore')

In [23]:
# Dataset statistics

# Split the query log into rows with and without search tokens.
has_search_tokens = queries["searchstring.tokens"].notnull()
query_full_queries_count = len(queries[has_search_tokens])
query_less_queries_count = len(queries[~has_search_tokens])
sessions_count = len(queries["sessionId"].unique())

#presented products 130,987  : not sure.

# Row counts of the click, item-view and purchase logs.
clicks_log_count, views_log_count, purchase_log_count = map(
    len, (clicks, item_views, purchases)
)

In [24]:
# User statistics of the dataset

# Build each row mask once and combine masks with `&`. Chaining two boolean
# selections (queries[a][b]) applies a full-length mask to an already filtered
# frame, which pandas has to re-align and warns about.
has_user = queries["userId"].notnull()
is_train_query = queries["is.test"] == False
is_test_query = queries["is.test"] == True

real_users_count = len(queries.loc[has_user, "userId"].unique())
# NOTE: this is the number of query rows without a userId, not a number of
# distinct users (rows without an id cannot be told apart).
anonymous_users_count = len(queries[~has_user])

# Distinct identified users in the train and test splits (computed once and
# reused for both the counts and the intersection).
unique_train_users = queries.loc[is_train_query & has_user, "userId"].unique()
uniquer_test_users = queries.loc[is_test_query & has_user, "userId"].unique()
train_real_users_count = len(unique_train_users)
test_real_users_count = len(uniquer_test_users)

# Intersection between train and test users
train_inter_test_usercount = len(np.intersect1d(unique_train_users, uniquer_test_users))

In [25]:
# Print the dataset statistics, one labelled value per line.
dataset_stats = [
    ("query_full_queries_count : ", query_full_queries_count),
    ("query_less_queries_count : ", query_less_queries_count),
    ("sessions_count : ", sessions_count),
    ("clicks_log_count : ", clicks_log_count),
    ("views_log_count : ", views_log_count),
    ("purchase_log_count : ", purchase_log_count),
]
for label, value in dataset_stats:
    print(label, value)

# Separator, then the user statistics in the same format.
print("\n")

user_stats = [
    ("real_users_count : ", real_users_count),
    ("anonymous_users_count : ", anonymous_users_count),
    ("train_real_users_count : ", train_real_users_count),
    ("test_real_users_count : ", test_real_users_count),
    ("train_inter_test_usercount : ", train_inter_test_usercount),
]
for label, value in user_stats:
    print(label, value)

query_full_queries_count :  51888
query_less_queries_count :  871239
sessions_count :  573935
clicks_log_count :  1127764
views_log_count :  1235380
purchase_log_count :  18025


real_users_count :  232816
anonymous_users_count :  574887
train_real_users_count :  140387
test_real_users_count :  116630
train_inter_test_usercount :  24201


In [26]:
#PRE-PROCESSING FOR FEATURE EXTRACTION

# Keep only the queries that carry search tokens.
queries = queries[queries["searchstring.tokens"].notnull()]

# Long-format (queryId, itemId) table: one row for every entry of a query's
# comma-separated "items" field.
query_item = pd.DataFrame.from_records(
    [
        (query_id, np.int64(item_token))
        for query_id, item_list in queries[["queryId", "items"]].values
        for item_token in item_list.split(',')
    ],
    columns=["queryId", "itemId"],
)

# Sort each log / lookup table in place by its key columns.
for frame, sort_keys in (
    (item_views, ["sessionId", "userId", "eventdate", "timeframe", "itemId"]),
    (clicks, ["queryId", "timeframe", "itemId"]),
    (purchases, ["sessionId", "userId", "eventdate", "timeframe", "itemId", "ordernumber"]),
    (products, ["itemId"]),
    (products_category, ["itemId"]),
):
    frame.sort_values(sort_keys, inplace=True)


# Enrich every (queryId, itemId) row with its session and with any matching
# click, item view and purchase.
# None of these merges passes `on=`, so pandas joins on whatever column names
# the two frames share at that moment. The renames between the merges therefore
# also decide which columns act as join keys -- keep this statement order.
# NOTE(review): presumably the keys are queryId for the session lookup,
# (queryId, itemId) for clicks and (sessionId, itemId) for views / purchases --
# confirm against the CSV headers.
# NOTE(review): a left join emits one row per match, so a (queryId, itemId)
# pair may appear several times after these merges.
query_item = pd.merge(query_item, queries[["queryId", "sessionId"]], how="left")
query_item = pd.merge(query_item,  clicks, how="left")
# Free the generic "timeframe" name before the next merge brings in its own.
query_item.rename(columns={"timeframe":"clickTime"}, inplace=True)
query_item = pd.merge(query_item,  item_views, how="left")

# Tag the view columns so the purchase columns of the same name can be merged in.
query_item.rename(columns={"eventdate":"eventdateView", "timeframe":"viewTime", "userId": "userView"}, inplace=True)
query_item = pd.merge(query_item, purchases, how="left")
query_item.rename(columns={"eventdate":"eventdatePurchase", "timeframe":"purchaseTime", "userId": "userPurchase"}, inplace=True)


# Boolean flags: True where the corresponding left join found a match
# (i.e. the merged time column is not null).
query_item["clicked"] = ~query_item["clickTime"].isnull()
query_item["viewed"] = ~query_item["viewTime"].isnull()
query_item["purchased"] = ~query_item["purchaseTime"].isnull()


# Long-format (userShow, itemId) table: one row for every entry of a query's
# comma-separated "items" field, keyed by the user who issued the query.
user_item = pd.DataFrame.from_records(
    [
        (user_id, np.int64(item_token))
        for user_id, item_list in queries[["userId", "items"]].values
        for item_token in item_list.split(',')
    ],
    columns=["userShow", "itemId"],
)
# Drop the rows that have no user id.
user_item = user_item[user_item["userShow"].notnull()]

In [27]:
from functools import reduce

#Extracting global statistical features for each product

# Per-item event counts taken straight from the raw logs.
item_clicks = clicks.groupby("itemId").count().rename(columns={"queryId": "Item_clicks"})
product_views = item_views.groupby("itemId").count().rename(columns={"sessionId": "Item_view_count"})
item_purchases = purchases.groupby("itemId").count().rename(columns={"sessionId": "Item_purchase_count"})

# query_item was built with left joins against the click / view / purchase
# logs, so one (queryId, itemId) pair can occupy several rows. Count each pair
# once so repeated events do not inflate the number of impressions.
item_show = (
    query_item[["queryId", "itemId"]]
    .drop_duplicates()
    .groupby("itemId")
    .count()
    .rename(columns={"queryId": "item_show_count"})
)

# Distinct users per item. nunique() skips missing values, whereas
# unique() followed by len() counted NaN (rows with no view / purchase) as an
# extra user.
userShow = user_item.groupby("itemId")["userShow"].nunique().to_frame()
userView = query_item.groupby("itemId")["userView"].nunique().to_frame()
userPurchase = query_item.groupby("itemId")["userPurchase"].nunique().to_frame()

# Left-join every per-item statistic (indexed by itemId) onto the product list.
dfs = [
    products[["itemId"]],
    item_show[["item_show_count"]],
    item_clicks[["Item_clicks"]],
    product_views[["Item_view_count"]],
    item_purchases[["Item_purchase_count"]],
    userShow,
    userView,
    userPurchase,
]
product_stats = (
    reduce(lambda left, right: pd.merge(left, right, on='itemId', how="left"), dfs)
    .sort_values(["itemId"])
    .reset_index(drop=True)
)

In [28]:
# Per-impression rates: each event count divided by the item's show count.
shown = product_stats["item_show_count"]
for rate_column, count_column in (
    ("CTR", "Item_clicks"),
    ("View Rate", "Item_view_count"),
    ("Click Value Rate", "Item_purchase_count"),
):
    product_stats[rate_column] = product_stats[count_column] / shown

In [10]:
def countwords(row):
    """Return the number of comma-separated entries in ``row`` (commas + 1)."""
    comma_total = row.count(",")
    return comma_total + 1

# Align the token counts by itemId instead of by row label: product_stats is
# re-indexed 0..n-1 after being sorted, while products keeps its original row
# labels after its in-place sort, so a plain column assignment could attach a
# count to the wrong item.
_name_token_counts = (
    products.drop_duplicates("itemId")
    .set_index("itemId")["product.name.tokens"]
    .apply(countwords)
)
product_stats["wordLength"] = product_stats["itemId"].map(_name_token_counts)

In [29]:
# One row per distinct (queryId, itemId) pair, enriched with the query's
# search tokens and then with the product attributes.
shown_pairs = query_item[["queryId", "itemId"]].drop_duplicates()
query_tokens = queries[["queryId", "searchstring.tokens"]]
products_info = (
    pd.merge(shown_pairs, query_tokens, on="queryId", how="left")
    .merge(products, on="itemId", how="left")
)

In [12]:
# Train Doc2Vec embedding models for the query tokens and the product-name tokens, each in its own space, treating every query / product name as one document. If the results are not significant, use word2vec instead.
# TODO: save these models and load them later rather than re-training them every time.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def func2(df_t):
    """Split a comma-separated token string into a list of tokens."""
    tokens = df_t.split(',')
    return tokens

# One token list per query.
reviews_gensim = list(queries["searchstring.tokens"].apply(func2))

# Tag every document with its position in the list.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(reviews_gensim)]
# NOTE(review): `verbose` is not among Doc2Vec's documented parameters; it
# appears to be ignored by older gensim releases and rejected as an unexpected
# keyword by newer ones, so it is no longer passed -- confirm for the
# installed version.
model = Doc2Vec(documents, vector_size=50, min_count=1, workers=4, epochs=100)


def findVector(df_val): #finding vector for the reviews based on doc2vec model
    """Infer a Doc2Vec vector for one comma-separated token string.

    Parameters
    ----------
    df_val : str
        Comma-separated tokens, in the same format as the strings the
        training documents were built from.

    Returns
    -------
    The value returned by ``model.infer_vector`` for the token list.
    """
    # Tokenise on commas, the delimiter used when the training documents were
    # built. Splitting on whitespace would pass a comma-separated string that
    # contains no spaces to the model as one single token.
    review_words = df_val.split(',')
    return model.infer_vector(review_words)

# Infer one vector per query token string and store it next to the query.
query_token_strings = queries["searchstring.tokens"]
queries['vector_gensim_query'] = query_token_strings.apply(findVector)

In [16]:
def func2(df_t):
    """Tokenise ``df_t`` by splitting it on commas."""
    separator = ','
    return df_t.split(separator)

# One token list per product name.
reviews_gensim1 = list(products["product.name.tokens"].apply(func2))

# Tag every document with its position in the list.
documents1 = [TaggedDocument(doc, [i]) for i, doc in enumerate(reviews_gensim1)]
# NOTE(review): `verbose` is not among Doc2Vec's documented parameters; it
# appears to be ignored by older gensim releases and rejected as an unexpected
# keyword by newer ones, so it is no longer passed -- confirm for the
# installed version.
model2 = Doc2Vec(documents1, vector_size=50, min_count=1, workers=4, epochs=100)


def findVector1(df_val): #finding vector for the reviews based on doc2vec model
    """Infer a Doc2Vec vector for one comma-separated product-name token string.

    Parameters
    ----------
    df_val : str
        Comma-separated tokens, in the same format as the strings the
        training documents were built from.

    Returns
    -------
    The value returned by ``model2.infer_vector`` for the token list.
    """
    # Tokenise on commas, the delimiter used when the training documents were
    # built. Splitting on whitespace would pass a comma-separated string that
    # contains no spaces to the model as one single token.
    review_words = df_val.split(',')
    return model2.infer_vector(review_words)

# Infer one vector per product-name token string and store it on the product.
product_token_strings = products["product.name.tokens"]
products['vector_gensim_product'] = product_token_strings.apply(findVector1)

In [46]:
# Attach the query vectors to the query-item pairs and the product vectors to
# the per-product statistics.
query_item = query_item.merge(
    queries[["queryId", "vector_gensim_query"]], on="queryId", how="left"
)
product_stats = product_stats.merge(
    products[["itemId", "vector_gensim_product"]], on="itemId", how="left"
)

In [47]:
#defining relevance label

def relevance(row):
    """Grade one row: 2 if purchased, otherwise 1 if clicked, otherwise 0."""
    # A purchase outranks a click, so it is checked first.
    if row["purchased"] == True:
        return 2
    if row["clicked"] == True:
        return 1
    return 0

# Vectorised form of relevance(): purchased -> 2, otherwise clicked -> 1,
# otherwise 0 (np.select takes the first condition that holds). This avoids a
# Python-level call and a per-row Series for every query-item row.
query_item["Relevance"] = np.select(
    [query_item["purchased"].values == True, query_item["clicked"].values == True],
    [2, 1],
    default=0,
).astype(np.int64)

In [48]:
# Model input: ids, query vector and relevance label of every query-item row,
# plus the train/test flag of the query.
model_input_columns = ["queryId", "itemId", "vector_gensim_query", "Relevance"]
query_item_input = query_item[model_input_columns].merge(
    queries[["queryId", "is.test"]], on="queryId", how="left"
)
# Original item order of every query.
queries_input = queries[["queryId", "items"]]

In [66]:
product_stats.to_pickle("product_stats.pkl")

In [69]:
query_item_input.to_pickle("query_item.pkl")

In [70]:
queries_input.to_pickle("queries_input.pkl")

In [74]:
#Processed data stored in three pickle objects. product_stats.pkl, query_item.pkl, queries_input.pkl

### The pickle files are shared on Drive because the data is too large to upload to GitHub. PATH : "info_project_files/CSCE-670---COURSE-PROJECT/Feature Extraction: pickle objects " ###

#queries_input.pkl : the original queryID together with the order of its corresponding items. The output should be in this format; also refer to "baseline_submission.txt" for the exact format.
#query_item.pkl : the queryID and itemID of each model input. It contains FEATURE 1 (the query embedding vector) and the OUTPUT LABELS (a relevance score for each query-item pair).
#product_stats.pkl : contains the product features.
    
    
# STEPS:
# 1) Read the pickles. Format : df = pd.read_pickle("product_stats.pkl")
# 2) Separate train, test from query_item.pkl and the output labels.
# 3) Take a data point from train. 
#   Eg: (a) 1st train point is queryID : 1, ItemID : 7518 .
#       (b) FEATURE 1: Query Embedding vector : vector_gensim_query
#       (c) For itemID : 7518, get its Product Embedding vector from product_stats.pkl dataframe ie, 
#           product_stats[product_stats["itemId"]== 7518]["vector_gensim_product"]
#       (d) Get all other product features except vector_gensim_product of item id = 7518, ie,
#          product_stats[product_stats["itemId"]== 7518] except vector_gensim_product
#       (e) FEATURE 1: WORD EMBEDDING FOR QUERY, FEATURE 2: WORD EMBEDDING FOR PRODUCT, FEATURE 3: PRODUCT FEATURES, LABEL : RELEVANCE
#       (f) Train the model on features built this way, then run it on the test data and compare the predicted relevance.


In [87]:
#Additional features yet to do: 

# 1) Session based features
# 2) Time based Global statistical features