In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path # working with paths
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import bisect
from itertools import product
import shutil
import concurrent.futures
import pickle
import time
import random


# Setting data paths with pathlib.
# The data directory can be overridden through the OTTO_DATA_DIR environment
# variable so the notebook also runs on machines other than the original one;
# when the variable is unset the original location is used.
data_path = Path(os.environ.get('OTTO_DATA_DIR', '/home/mai22042/otto/data'))
train_path = data_path / 'train.jsonl'
test_path = data_path / 'test.jsonl'
# Relative path: resolved against the current working directory.
sample_sub_path = Path('sample_submission.csv')

from enum import Enum
class RName(Enum):
    """Event types of a session.

    The member values are the type labels looked up with ``RName(label)``;
    both ``str()`` and ``repr()`` of a member give the bare member name.
    """

    CLICK = 'clicks'
    CART = 'carts'
    ORDER = 'orders'

    def __repr__(self):
        # Bare member name, e.g. "CLICK" (not the default "<RName.CLICK: 'clicks'>").
        return self.name

    def __str__(self):
        # Same text as repr().
        return repr(self)
        
# Module-level shorthands for the three RName members.
CLICK, CART, ORDER = RName.CLICK, RName.CART, RName.ORDER

# DT3 builds a dict keyed by (aid, event type) whose value is the set of sessions containing that event

from collections import defaultdict

class DT3:
    """Index from ``(aid, event type)`` to the set of sessions containing it.

    The index is stored in ``self.products``, a plain dict whose keys are
    ``(aid, RName)`` tuples and whose values are sets of session ids.
    """

    def init_helper(self, chunk):
        """Fold one chunk into ``self.products``.

        Args:
            chunk: object indexable with ``'session'`` and ``'events'``; both
                columns must provide ``tolist()``. Each entry of ``'events'``
                is an iterable of mappings with ``'aid'`` and ``'type'`` keys.
                (Presumably a pandas DataFrame chunk read from the .jsonl
                files - confirm against the caller.)
        """
        for session, events in zip(chunk['session'].tolist(), chunk['events'].tolist()):
            for event in events:
                key = event["aid"], RName(event['type'])
                # setdefault creates the set on first sight in a single lookup.
                self.products.setdefault(key, set()).add(session)

    def __init__(self, chunks, i=None, j=None, sampling=False, n_chunks=34):
        """Build the index from an iterable of chunks.

        Args:
            chunks: iterable of chunks accepted by ``init_helper``.
            i: index of the first chunk to use (inclusive); ``None`` means
                "from the first chunk".
            j: index one past the last chunk to use (exclusive); ``None``
                means "to the end". ``i`` and ``j`` may be given
                independently of each other.
            sampling: when true, only one randomly chosen chunk index is
                processed (and only if it also lies inside ``[i, j)``).
            n_chunks: number of chunk indices the random sample is drawn
                from; the default keeps the previously hard-coded value.
        """
        self.products = {}
        samples = []
        if sampling:
            samples = random.sample(range(n_chunks), 1)
            samples.sort()
            print(samples)
        for e, chunk in enumerate(chunks):
            if i is not None and e < i:
                continue
            if j is not None and e >= j:
                # Chunk indices only grow, so nothing later can be in range.
                break
            if sampling and e not in samples:
                continue
            print("test chunk number",e)
            self.init_helper(chunk)

class DT2:
    """Index from ``(session, event type)`` to the set of aids in that session.

    The index is stored in ``self.sessions``, a plain dict whose keys are
    ``(session, RName)`` tuples and whose values are sets of aids.
    """

    def init_helper(self, chunk):
        """Fold one chunk into ``self.sessions``.

        Args:
            chunk: object indexable with ``'session'`` and ``'events'``; both
                columns must provide ``tolist()``. Each entry of ``'events'``
                is an iterable of mappings with ``'aid'`` and ``'type'`` keys.
                (Presumably a pandas DataFrame chunk read from the .jsonl
                files - confirm against the caller.)
        """
        for session, events in zip(chunk['session'].tolist(), chunk['events'].tolist()):
            for event in events:
                aid = event["aid"]
                key = session, RName(event['type'])
                # setdefault creates the set on first sight in a single lookup.
                self.sessions.setdefault(key, set()).add(aid)

    def __init__(self, chunks, i=None, j=None, sampling=False, n_chunks=34):
        """Build the index from an iterable of chunks.

        Args:
            chunks: iterable of chunks accepted by ``init_helper``.
            i: index of the first chunk to use (inclusive); ``None`` means
                "from the first chunk".
            j: index one past the last chunk to use (exclusive); ``None``
                means "to the end". ``i`` and ``j`` may be given
                independently of each other.
            sampling: when true, only one randomly chosen chunk index is
                processed (and only if it also lies inside ``[i, j)``).
            n_chunks: number of chunk indices the random sample is drawn
                from; the default keeps the previously hard-coded value.
        """
        self.sessions = {}
        samples = []
        if sampling:
            samples = random.sample(range(n_chunks), 1)
            samples.sort()
            print(samples)
        for e, chunk in enumerate(chunks):
            if i is not None and e < i:
                continue
            if j is not None and e >= j:
                # Chunk indices only grow, so nothing later can be in range.
                break
            if sampling and e not in samples:
                continue
            print("test chunk number",e)
            self.init_helper(chunk)

# Loading .jsonl file and creating DT object
#start =time.time()
#test_chunks  = pd.read_json(data_path / 'test.jsonl', lines=True, chunksize=400000)
#test = DT3(test_chunks)
#print ("Time elapsed:", time.time() - start)
#del test_chunks # Delete chunks

#                   CREATING TRAIN DT3

# Loading .jsonl file and creating DT object
# start =time.time()
# train_chunks = pd.read_json(data_path / 'train.jsonl', lines=True, chunksize=400000)
# train = DT3(train_chunks)
# print ("Time elapsed:", time.time() - start)
# del train_chunks # Delete chunks

#                   SAVING TRAIN DT3

# Saving object on pickle file

# start =time.time()
# with open('train_full_dt3.dat', 'wb') as f:
#     pickle.dump(train, f)
# print ("Time elapsed:", time.time() - start)



#                   LOAD TRAIN DT3 FROM PICKLE FILE

# Loading object from pickle file

# Load the DT3 object for the train data from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code - only load files
# that were produced by this notebook.
start = time.time()
with open("train_full_dt3.dat", "rb") as pickle_file:  # closes the handle after loading
    train = pickle.load(pickle_file)
print ("Time elapsed:", time.time() - start)

#                   CREATING TRAIN DT2

# Loading .jsonl file and creating DT object
# start =time.time()
# train_chunks = pd.read_json(data_path / 'train.jsonl', lines=True, chunksize=400000)
# traindt2 = DT2(train_chunks)
# print ("Time elapsed:", time.time() - start)
# del train_chunks # Delete chunks
#                   SAVING TRAIN DT2

# Saving object on pickle file

# start =time.time()
# with open('train_full_dt2.dat', 'wb') as f:
#     pickle.dump(traindt2, f)
# print ("Time elapsed:", time.time() - start)

#                   LOAD TRAIN DT2 FROM PICKLE FILE

# Loading object from pickle file

# Load the DT2 object for the train data from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code - only load files
# that were produced by this notebook.
start = time.time()
with open("train_full_dt2.dat", "rb") as pickle_file:  # closes the handle after loading
    traindt2 = pickle.load(pickle_file)
print ("Time elapsed:", time.time() - start)

Time elapsed: 48.762688398361206
Time elapsed: 75.98323082923889


In [2]:
# Loading .jsonl file and creating DT object
# start =time.time()
# test_chunks  = pd.read_json(data_path / 'test.jsonl', lines=True, chunksize=400000)
# test = DT3(test_chunks)
# print ("Time elapsed:", time.time() - start)
# del test_chunks # Delete chunks

# start =time.time()
# with open('test_dt3.dat', 'wb') as f:
#     pickle.dump(test, f)
# print ("Time elapsed:", time.time() - start)

# Loading .jsonl file and creating DT object
# start =time.time()
# test_chunks  = pd.read_json(data_path / 'test.jsonl', lines=True, chunksize=400000)
# testdt2 = DT2(test_chunks)
# print ("Time elapsed:", time.time() - start)
# del test_chunks # Delete chunks
# Saving object on pickle file

# start =time.time()
# with open('test_dt2.dat', 'wb') as f:
#     pickle.dump(testdt2, f)
# print ("Time elapsed:", time.time() - start)

# Load the DT2 object for the test data from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code - only load files
# that were produced by this notebook.
start = time.time()
with open("test_dt2.dat", "rb") as pickle_file:  # closes the handle after loading
    testdt2 = pickle.load(pickle_file)
print ("Time elapsed:", time.time() - start)

Time elapsed: 1.188821792602539


In [3]:
import math

class Sugest:
    """Weighted vote counter that ranks products.

    ``product_count`` maps a product to its accumulated score. Every vote for
    a product adds ``weight(factor)`` to that score. After ``top()`` the
    counter holds only the best products with negative placeholder scores
    (-1 for the best, -2 for the next, ...): the first real vote for such a
    product replaces the placeholder instead of adding to it, so placeholders
    only keep their relative order among themselves.

    NOTE(review): any negative stored score is treated as a placeholder, so a
    ``weight`` that returns negative values would be overwritten by the next
    vote rather than accumulated.
    """

    def __init__(self, list_of_sets, factor=1, weight=lambda x:x):
        """Create the counter and count every product of ``list_of_sets``.

        Args:
            list_of_sets: iterable of product collections; ``None`` entries
                are skipped.
            factor: value handed to ``weight`` to obtain the per-vote score.
            weight: callable mapping ``factor`` to the score of one vote.
        """
        self.product_count = {}
        # Baseline saved by top(); None until top() has been called.
        self.backup = None
        self.factor = factor
        self.weight = weight
        # Forward the weight explicitly: add_more() installs whatever weight
        # it receives, and its own default would otherwise discard ours.
        self.add_more(list_of_sets, weight)

    def add_more(self, list_of_sets, weight=lambda x:x):
        """Vote for every product of every collection in ``list_of_sets``.

        Installs ``weight`` as the current weight and uses the current
        ``factor``. ``None`` entries are skipped.
        """
        self.weight = weight
        for products in list_of_sets:
            self._add_all(products)

    def add_one(self, set, factor, weight=lambda x:x):
        """Vote for every product of one collection.

        Installs ``factor`` and ``weight`` as the current ones. A ``None``
        collection adds nothing.
        """
        self.factor = factor
        self.weight = weight
        self._add_all(set)

    def reset_to_top(self):
        """Restore the scores saved by the last ``top()`` call.

        Raises:
            RuntimeError: if ``top()`` has never been called.
        """
        if self.backup is None:
            raise RuntimeError("reset_to_top() called before top(): no baseline to restore")
        self.product_count = self.backup.copy()

    def _add_all(self, products):
        """Vote once for each product, evaluating the weight a single time."""
        if not products:
            # None or empty: nothing to add, and the weight is not evaluated.
            return
        increment = self.weight(self.factor)
        for product in products:
            self._bump(product, increment)

    def _bump(self, product, increment):
        """Apply one vote worth ``increment`` to ``product``."""
        current = self.product_count.get(product)
        if current is None or current < 0:
            # Unseen product or baseline placeholder: start from the vote itself.
            self.product_count[product] = increment
        else:
            self.product_count[product] = current + increment

    def _add(self, product):
        """Vote once for ``product`` using the current factor and weight."""
        self._bump(product, self.weight(self.factor))

    def top(self, number=30):
        """Freeze the ``number`` best products as the baseline.

        The counter is replaced by those products with placeholder scores
        -1, -2, ... in ranking order, and a copy is kept for
        ``reset_to_top()``.
        """
        baseline = {product: -1 - rank for rank, product in enumerate(self.result(number))}
        self.product_count = baseline
        self.backup = baseline.copy()

    def result(self, number=20):
        """Return up to ``number`` products ordered by descending score."""
        ranked = sorted(self.product_count.items(), key=lambda item: item[1], reverse=True)
        return [product for product, _score in ranked[:number]]

# Build, for each event type, a counter of how many train sessions contain
# each product, then freeze its best products as the baseline.
def _train_sets_of(kind):
    """Return the product sets of all traindt2 entries whose event type is `kind`."""
    return [
        products
        for (_session, event_kind), products in traindt2.sessions.items()
        if event_kind == kind
    ]

train_click_sets = _train_sets_of(CLICK)
top_click = Sugest(train_click_sets)
print(top_click.result(30))
train_cart_sets = _train_sets_of(CART)
top_cart = Sugest(train_cart_sets)
train_order_sets = _train_sets_of(ORDER)
top_order = Sugest(train_order_sets)

for counter in (top_click, top_cart, top_order):
    counter.top()
print(top_click.product_count.items())


[108125, 1460571, 29735, 184976, 95488, 1502122, 1733943, 959208, 322370, 231487, 756588, 832192, 554660, 801774, 1083665, 166037, 1603001, 673407, 1586171, 1196256, 620545, 332654, 819288, 199409, 1236775, 986164, 1645990, 102345, 794192, 1022566]
dict_items([(108125, -1), (1460571, -2), (29735, -3), (184976, -4), (95488, -5), (1502122, -6), (1733943, -7), (959208, -8), (322370, -9), (231487, -10), (756588, -11), (832192, -12), (554660, -13), (801774, -14), (1083665, -15), (166037, -16), (1603001, -17), (673407, -18), (1586171, -19), (1196256, -20), (620545, -21), (332654, -22), (819288, -23), (199409, -24), (1236775, -25), (986164, -26), (1645990, -27), (102345, -28), (794192, -29), (1022566, -30)])


In [110]:

# Show the current (product, score) pairs held by the click counter.
print(top_click.product_count.items())


dict_items([(108125, -1), (1460571, -2), (29735, -3), (184976, -4), (95488, -5), (1502122, -6), (1733943, -7), (959208, -8), (322370, -9), (231487, -10), (756588, -11), (832192, -12), (554660, -13), (801774, -14), (1083665, -15), (166037, -16), (1603001, -17), (673407, -18), (1586171, -19), (1196256, -20), (620545, -21), (332654, -22), (819288, -23), (199409, -24), (1236775, -25), (986164, -26), (1645990, -27), (102345, -28), (794192, -29), (1022566, -30)])


In [53]:
from tqdm import tqdm
import csv

def r_type(set_a, set_b):
    """Return |A & B| / |A | B| for two sets, or 0 when there is no overlap.

    0 is also returned when either argument is None or equal to 0.
    """
    for operand in (set_a, set_b):
        if operand is None or operand == 0:
            return 0
    shared = len(set_a & set_b)
    if not shared:
        # No common element (covers two empty sets as well).
        return 0
    return shared / len(set_a | set_b)

# headerList = ['session_type','labels']
# with open("last"+".csv", 'w') as file:
#     dw = csv.DictWriter(file, delimiter=',', fieldnames=headerList)
#     dw.writeheader()

# rdict = []
# For every test session, find the train sessions that share at least one
# (product, event type) with it, score each of them with a weighted sum of
# the per-type r_type similarities and append the best ones to a text file.
#
# Output line format (space separated, trailing space before the newline):
#   <test session> then, per kept train session:
#   <train session> <r_click> <r_cart> <r_order> <r_score>

EVENT_KINDS = (CLICK, CART, ORDER)
MAX_NEIGHBOURS = 1000    # train sessions kept per test session
PRUNE_THRESHOLD = 5000   # prune the candidate list whenever it grows past this

# Unique test session ids in ascending order.
test_sessions = sorted({key[0] for key in testdt2.sessions})

# The file is opened once for the whole run instead of once per session.
# NOTE(review): append mode is kept from the original, so re-running the cell
# adds a second copy of every line - remove the file before a re-run.
with open("rscores_1000"+".txt", 'a+') as f:
    for session in tqdm(test_sessions):
        # Products of this test session per event type (empty set if none).
        test_products = {
            kind: testdt2.sessions.get((session, kind)) or set()
            for kind in EVENT_KINDS
        }

        # Candidate train sessions: any that share a product with the same event type.
        train_session_set = set()
        for kind in EVENT_KINDS:
            for aid in test_products[kind]:
                sharing_sessions = train.products.get((aid, kind))
                if sharing_sessions is not None:
                    train_session_set |= sharing_sessions

        rscores = []
        for session_train in train_session_set:
            r_click = r_type(test_products[CLICK], traindt2.sessions.get((session_train, CLICK)))
            r_cart = r_type(test_products[CART], traindt2.sessions.get((session_train, CART)))
            r_order = r_type(test_products[ORDER], traindt2.sessions.get((session_train, ORDER)))
            r_score = r_click*0.1+r_cart*0.3+r_order*0.6
            rscores.append((session_train, r_click, r_cart, r_order, r_score))

            # Keep memory bounded for sessions with very many candidates.
            if len(rscores) > PRUNE_THRESHOLD:
                rscores = sorted(rscores, key=lambda x: x[4], reverse=True)[:MAX_NEIGHBOURS]
        rscores = sorted(rscores, key=lambda x: x[4], reverse=True)[:MAX_NEIGHBOURS]

        fields = "".join(str(field) + " " for row in rscores for field in row)
        f.write(str(session) + " " + fields + "\n")
    

        

100%|██████████| 1671803/1671803 [12:15:55<00:00, 37.86it/s]   


In [12]:
from tqdm import tqdm
import csv
import numpy as np
def get_set_length(s):
    """Return the number of elements of `s`, or 0 when `s` is None or empty."""
    if not s:
        return 0
    return len(s)

# For each alpha, write one submission CSV: every test session gets three rows
# (clicks, carts, orders) with the best-ranked products. Votes come from
#   * the test session's own products, carried over to other event types with
#     large fixed factors, and
#   * the products of its nearest train sessions, weighted by a function of
#     the neighbour's r_score.
# top_click / top_cart / top_order are reused for every session and restored
# to their baseline with reset_to_top() after each one.

CLICK_CARRY_FACTOR = 100   # own clicks voted into the cart and order rankings
CART_CARRY_FACTOR = 500    # own carts voted into the order ranking

alpha = [0.5, 0.3, 0.108574, 0.1, 0.02] # list of alpha values
for val in alpha: # loop through the values
    print("Value of alpha: {:.3f}".format(val))
    filename = "ctc_ccto_20_rbf{:.3f}.csv".format(val) # create a filename using alpha value
    headerList = ['session_type','labels']

    # Vote weight as a function of the neighbour's r_score x.
    weight=lambda x:math.exp(-(1-x)**2 / val)

    # Input: one line per test session holding its nearest train sessions as
    # groups of five fields (train id, r_click, r_cart, r_order, r_score).
    # NOTE(review): this reads 'rscores_20.txt' while the scoring cell writes
    # 'rscores_1000.txt' - confirm which file is intended.
    # The output file is opened once per alpha instead of once per input line.
    with open('rscores_20.txt', 'r') as f, open(filename, 'w', newline='') as h:
        dw = csv.DictWriter(h, delimiter=',', fieldnames=headerList)
        dw.writeheader()

        for line in tqdm(f):
            values = line.strip().split()
            if not values:
                # Blank line: no session id to parse.
                continue
            test_id = int(values[0])
            own_clicks = testdt2.sessions.get((test_id,CLICK))
            own_carts = testdt2.sessions.get((test_id,CART))
            top_cart.add_one(own_clicks, CLICK_CARRY_FACTOR, lambda x:x)  # clicks to cart
            top_order.add_one(own_clicks, CLICK_CARRY_FACTOR, lambda x:x) # clicks to order
            top_order.add_one(own_carts, CART_CARRY_FACTOR, lambda x:x)   # carts to order

            # Field 0 is the test id; neighbours follow in groups of five.
            for pos in range(1, len(values), 5):
                train_id = int(values[pos])
                r_score = float(values[pos+4])
                top_click.add_one(traindt2.sessions.get((train_id,CLICK)), r_score, weight)
                top_cart.add_one( traindt2.sessions.get((train_id,CART)),  r_score, weight)
                top_order.add_one(traindt2.sessions.get((train_id,ORDER)), r_score, weight)

            sugest_click=top_click.result()
            sugest_cart=top_cart.result()
            sugest_order=top_order.result()
            top_click.reset_to_top()
            top_cart.reset_to_top()
            top_order.reset_to_top()

            for kind, suggestions in ((CLICK, sugest_click), (CART, sugest_cart), (ORDER, sugest_order)):
                h.write(str(test_id)+"_"+kind.value+","+" ".join(map(str,suggestions))+"\n")


Value of alpha: 0.120


1671803it [04:06, 6778.41it/s]


Value of alpha: 0.100


1671803it [04:07, 6743.52it/s]
