In [12]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path # working with paths
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import bisect
from itertools import product
import shutil
import concurrent.futures
import pickle
import time
import random


# Setting data paths with pathlib
data_path = Path('/home/mai22042/otto/data')
train_path = data_path/'train.jsonl'
test_path = data_path/'test.jsonl'
sample_sub_path = Path('sample_submission.csv')

from enum import Enum
class RName(Enum):
    CLICK = 'clicks'
    CART = 'carts'
    ORDER = 'orders'

    def __str__(self):
        return '%s' % self.name
    def __repr__(self):
        return '%s' % self.name
        
# Set aliases
CLICK=RName.CLICK
CART=RName.CART
ORDER=RName.ORDER

# create a dict with key [aid,type], val (session)

from collections import defaultdict

class DT3:
    def init_helper(self, chunk):
        for session, events in zip(chunk['session'].tolist(), chunk['events'].tolist()):
            for event in events:
                key = event["aid"], RName(event['type'])
                if key not in self.products.keys():
                    self.products[key] = set()
                self.products[key].add(session)

    def __init__(self, chunks, i=None, j=None, sampling=False):
        self.products = {}
        samples = []
        if sampling:
            samples = random.sample(range(34),1)
            samples.sort()
            print(samples)
        for e, chunk in enumerate(chunks):
            if (i==None and j==None) or (i<=e and j>e):
                if sampling:
                    if e in samples:
                        print("test chunk number",e)
                        self.init_helper(chunk)
                else:
                    print("test chunk number",e)
                    self.init_helper(chunk)

class DT2:
    def init_helper(self, chunk):
        for session, events in zip(chunk['session'].tolist(), chunk['events'].tolist()):
            for event in events:
                aid = event["aid"]
                key = session, RName(event['type'])
                if key not in self.sessions.keys():
                    self.sessions[key] = set()
                self.sessions[key].add(aid)

    def __init__(self, chunks, i=None, j=None, sampling=False):
        self.sessions = {}
        samples = []
        if sampling:
            samples = random.sample(range(34),1)
            samples.sort()
            print(samples)
        for e, chunk in enumerate(chunks):
            if (i==None and j==None) or (i<=e and j>e):
                if sampling:
                    if e in samples:
                        print("test chunk number",e)
                        self.init_helper(chunk)
                else:
                    print("test chunk number",e)
                    self.init_helper(chunk)

# Loading .jsonl file and creating DT object
#start =time.time()
#test_chunks  = pd.read_json(data_path / 'test.jsonl', lines=True, chunksize=400000)
#test = DT3(test_chunks)
#print ("Time elapsed:", time.time() - start)
#del test_chunks # Delete chunks

#                   CREATING TRAIN DT3

# Loading .jsonl file and creating DT object
# start =time.time()
# train_chunks = pd.read_json(data_path / 'train.jsonl', lines=True, chunksize=400000)
# train = DT3(train_chunks)
# print ("Time elapsed:", time.time() - start)
# del train_chunks # Delete chunks

#                   SAVING TRAIN DT3

# Saving object on pickle file

# start =time.time()
# with open('train_full_dt3.dat', 'wb') as f:
#     pickle.dump(train, f)
# print ("Time elapsed:", time.time() - start)



#                   LOAD TRAIN DT3 FROM PICKLE FILE

# Loading object from pickle file

start =time.time()
train = pickle.load(open("train_full_dt3.dat", "rb"))
print ("Time elapsed:", time.time() - start)

# Loading .jsonl file and creating DT object
# start =time.time()
# train_chunks = pd.read_json(data_path / 'train.jsonl', lines=True, chunksize=400000)
# traindt2 = DT2(train_chunks)
# print ("Time elapsed:", time.time() - start)
# del train_chunks # Delete chunks
#                   SAVING TRAIN DT3

# Saving object on pickle file

# start =time.time()
# with open('train_full_dt2.dat', 'wb') as f:
#     pickle.dump(traindt2, f)
# print ("Time elapsed:", time.time() - start)

#                   LOAD TRAIN DT2 FROM PICKLE FILE

# Loading object from pickle file

start =time.time()
traindt2 = pickle.load(open("train_full_dt2.dat", "rb"))
print ("Time elapsed:", time.time() - start)

Time elapsed: 34.96925115585327
Time elapsed: 62.69053626060486


In [13]:
from tqdm import tqdm
import csv

test_chunks  = pd.read_json(data_path / 'test.jsonl', lines=True, chunksize=400000)


def r_type(set_a, set_b):
    if set_a==None or set_b==None:
        return 0
    if set_a == 0 or set_b == 0 :
        return 0
    intersection_len = len(set_a&set_b)
    union_len = len(set_a|set_b)
    if intersection_len == 0 or union_len == 0 :
        return 0
    else:
        return intersection_len / union_len

# headerList = ['session_type','labels']
# with open("last"+".csv", 'w') as file:
#     dw = csv.DictWriter(file, delimiter=',', fieldnames=headerList)
#     dw.writeheader()
# np.set_printoptions(precision=5)
np.set_printoptions()  
rdict = {}
for chunk in test_chunks:
    for session, events in tqdm(zip(chunk['session'].tolist(), chunk['events'].tolist())):
        train_session_set=set()
        test_product_click=set()
        test_product_cart=set()
        test_product_order=set()
        for event in events:
            event_type = RName(event['type'])
            aid = event["aid"]
            if event_type==CART:
                test_product_cart.add(aid)
            elif event_type==ORDER:
                test_product_order.add(aid)
            else:
                test_product_click.add(aid)
            
            key_train = aid,event_type
            key_test = session,event_type
            if key_train in train.products.keys():
                train_session_set|=train.products[key_train]

        rscores = np.array(np.zeros(shape=(0,4))).astype(object)

        for session_train in train_session_set:
            r_click = r_type(test_product_click,traindt2.sessions.get((session_train,CLICK)))
            r_cart = r_type(test_product_cart,traindt2.sessions.get((session_train,CART)))
            r_order = r_type(test_product_order,traindt2.sessions.get((session_train,ORDER)))
            new_row = np.array([session_train, r_click, r_cart, r_order])
            # print("TRAIN SESSION:", session_train)
            # print("NEW ROW:",new_row)
            rscores = np.vstack((rscores, new_row))

        rscores[:,0] = rscores[ :, 0].astype(int)
        rscores = np.hstack((rscores, (np.dot(rscores[ :, 1:4],np.array([0.1, 0.3, 0.6]))).reshape(-1,1)))
        rscores = rscores[(-rscores[:, -1]).argsort()][:20]



        # for i in range(rscores[i].shape[0]): #for each row
        #     rscores[i] = np.insert(rscores[i], -1, np.dot(rscores[i][1:4], np.array([0.1, 0.3, 0.6])))


            # r_score = np.dot(np.array([r_click,r_cart,r_order]), np.array([0.1, 0.3, 0.6]))
            # np.append(rscores, r_score, axis = 1)
            # r_score = np.dot(np.array([r_click,r_cart,r_order]), np.array([0.1, 0.3, 0.6]))
            
        # rscores = rscores[(-rscores[:, -1]).argsort()][:20]
        # print(rscores)
        
        # print(session)
        # rdict[session]=rscores
        

[108125, 1460571, 29735, 184976, 95488, 1502122, 1733943, 959208, 322370, 231487, 756588, 832192, 554660, 801774, 1083665, 166037, 1603001, 673407, 1586171, 1196256, 620545, 332654, 819288, 199409, 1236775, 986164, 1645990, 102345, 794192, 1022566]
dict_items([(108125, -1), (1460571, -2), (29735, -3), (184976, -4), (95488, -5), (1502122, -6), (1733943, -7), (959208, -8), (322370, -9), (231487, -10), (756588, -11), (832192, -12), (554660, -13), (801774, -14), (1083665, -15), (166037, -16), (1603001, -17), (673407, -18), (1586171, -19), (1196256, -20), (620545, -21), (332654, -22), (819288, -23), (199409, -24), (1236775, -25), (986164, -26), (1645990, -27), (102345, -28), (794192, -29), (1022566, -30)])


In [110]:

print(top_click.product_count.items())


dict_items([(108125, -1), (1460571, -2), (29735, -3), (184976, -4), (95488, -5), (1502122, -6), (1733943, -7), (959208, -8), (322370, -9), (231487, -10), (756588, -11), (832192, -12), (554660, -13), (801774, -14), (1083665, -15), (166037, -16), (1603001, -17), (673407, -18), (1586171, -19), (1196256, -20), (620545, -21), (332654, -22), (819288, -23), (199409, -24), (1236775, -25), (986164, -26), (1645990, -27), (102345, -28), (794192, -29), (1022566, -30)])


In [22]:
from tqdm import tqdm
import csv

test_chunks  = pd.read_json(data_path / 'test.jsonl', lines=True, chunksize=400000)


def r_type(set_a, set_b):
    if set_a==None or set_b==None:
        return 0
    if set_a == 0 or set_b == 0 :
        return 0
    intersection_len = len(set_a&set_b)
    union_len = len(set_a|set_b)
    if intersection_len == 0 or union_len == 0 :
        return 0
    else:
        return intersection_len / union_len

# headerList = ['session_type','labels']
# with open("last"+".csv", 'w') as file:
#     dw = csv.DictWriter(file, delimiter=',', fieldnames=headerList)
#     dw.writeheader()

rdict = {}
for chunk in test_chunks:
    for session, events in tqdm(zip(chunk['session'].tolist(), chunk['events'].tolist())):
        train_session_set=set()
        test_product_click=set()
        test_product_cart=set()
        test_product_order=set()
        for event in events:
            event_type = RName(event['type'])
            aid = event["aid"]
            if event_type==CART:
                test_product_cart.add(aid)
            elif event_type==ORDER:
                test_product_order.add(aid)
            else:
                test_product_click.add(aid)
            
            key_train = aid,event_type
            key_test = session,event_type
            if key_train in train.products.keys():
                train_session_set|=train.products[key_train]

        rscores=[]
        for session_train in train_session_set:
            r_click=r_type(test_product_click,traindt2.sessions.get((session_train,CLICK)))
            r_cart=r_type(test_product_cart,traindt2.sessions.get((session_train,CART)))
            r_order=r_type(test_product_order,traindt2.sessions.get((session_train,ORDER)))
            r_score=r_click*0.1+r_cart*0.3+r_order*0.6
            rscores.append((session_train, r_click,r_cart,r_order,r_score))
            
            if len(rscores)>20000:
                rscores=sorted(rscores, key= lambda x:x[4], reverse=True)[:10000]
        rscores=sorted(rscores, key= lambda x:x[4], reverse=True)[:10000]
        # print(session)
        rdict[session]=rscores
        print(session,rscores)
        break
        

0it [00:00, ?it/s]


12899779 [(6495761, 0.3333333333333333, 0, 0, 0.03333333333333333), (9459022, 0.125, 0, 0, 0.0125), (1007106, 0.1, 0, 0, 0.010000000000000002), (4606227, 0.09090909090909091, 0, 0, 0.009090909090909092), (3540779, 0.058823529411764705, 0, 0, 0.0058823529411764705), (9363880, 0.047619047619047616, 0, 0, 0.004761904761904762), (7141243, 0.047619047619047616, 0, 0, 0.004761904761904762), (1707610, 0.014084507042253521, 0, 0, 0.0014084507042253522), (714105, 0.00980392156862745, 0, 0, 0.000980392156862745), (106478, 0.009523809523809525, 0, 0, 0.0009523809523809525), (2287043, 0.007633587786259542, 0, 0, 0.0007633587786259542)]


0it [00:00, ?it/s]


13299779 [(7462948, 1.0, 0, 0, 0.1), (4472909, 1.0, 0, 0, 0.1), (11640934, 1.0, 0, 0, 0.1), (11067506, 1.0, 0, 0, 0.1), (11378813, 1.0, 0, 0, 0.1), (9281683, 1.0, 0, 0, 0.1), (10510484, 1.0, 0, 0, 0.1), (11673783, 1.0, 0, 0, 0.1), (11526462, 1.0, 0, 0, 0.1), (8118612, 1.0, 0, 0, 0.1), (8749425, 1.0, 0, 0, 0.1), (11280991, 1.0, 0, 0, 0.1), (9339538, 1.0, 0, 0, 0.1), (7013067, 1.0, 0, 0, 0.1), (9380573, 1.0, 0, 0, 0.1), (10503364, 1.0, 0, 0, 0.1), (9487608, 1.0, 0, 0, 0.1), (11126351, 1.0, 0, 0, 0.1), (11044603, 1.0, 0, 0, 0.1), (7817011, 1.0, 0, 0, 0.1), (10733503, 1.0, 0, 0, 0.1), (9562054, 1.0, 0, 0, 0.1), (9611270, 1.0, 0, 0, 0.1), (10987629, 1.0, 0, 0, 0.1), (6547675, 1.0, 0, 0, 0.1), (8825121, 1.0, 0, 0, 0.1), (9005423, 1.0, 0, 0, 0.1), (5900712, 1.0, 0, 0, 0.1), (8473157, 1.0, 0, 0, 0.1), (9734822, 1.0, 0, 0, 0.1), (10775250, 1.0, 0, 0, 0.1), (10373872, 1.0, 0, 0, 0.1), (11201325, 1.0, 0, 0, 0.1), (11316166, 1.0, 0, 0, 0.1), (9284627, 1.0, 0, 0, 0.1), (8121762, 1.0, 0, 0, 0.1), (9

0it [00:00, ?it/s]


13699779 [(10376928, 0.5, 0, 0, 0.05), (9113813, 0.3333333333333333, 0, 0, 0.03333333333333333), (9701714, 0.3333333333333333, 0, 0, 0.03333333333333333), (12857898, 0.3333333333333333, 0, 0, 0.03333333333333333), (11863213, 0.3333333333333333, 0, 0, 0.03333333333333333), (9399613, 0.3333333333333333, 0, 0, 0.03333333333333333), (7806321, 0.3333333333333333, 0, 0, 0.03333333333333333), (9745996, 0.3333333333333333, 0, 0, 0.03333333333333333), (11497103, 0.3333333333333333, 0, 0, 0.03333333333333333), (12822176, 0.3333333333333333, 0, 0, 0.03333333333333333), (9473822, 0.3333333333333333, 0, 0, 0.03333333333333333), (11921255, 0.3333333333333333, 0, 0, 0.03333333333333333), (8581022, 0.3333333333333333, 0, 0, 0.03333333333333333), (9097265, 0.25, 0, 0, 0.025), (5603606, 0.25, 0, 0, 0.025), (10916253, 0.25, 0, 0, 0.025), (1618463, 0.25, 0, 0, 0.025), (8641091, 0.25, 0, 0, 0.025), (1940079, 0.25, 0, 0, 0.025), (10452103, 0.25, 0, 0, 0.025), (5320094, 0.25, 0, 0, 0.025), (2184641, 0.25, 0,

0it [00:00, ?it/s]


14099779 [(10587157, 0.09090909090909091, 1.0, 0, 0.3090909090909091), (340417, 0, 1.0, 0, 0.3), (7574224, 0, 1.0, 0, 0.3), (2143165, 0, 1.0, 0, 0.3), (10605546, 0, 1.0, 0, 0.3), (11847458, 0, 1.0, 0, 0.3), (10434474, 0, 1.0, 0, 0.3), (12769326, 0, 1.0, 0, 0.3), (899290, 0, 1.0, 0, 0.3), (8817001, 0, 1.0, 0, 0.3), (11049779, 0, 1.0, 0, 0.3), (1895882, 0, 1.0, 0, 0.3), (9355047, 0, 1.0, 0, 0.3), (1130936, 0, 0.5, 0, 0.15), (1991300, 0, 0.5, 0, 0.15), (742091, 0, 0.5, 0, 0.15), (2098114, 0, 0.5, 0, 0.15), (1701093, 0, 0.5, 0, 0.15), (9480, 0, 0.5, 0, 0.15), (1676898, 0, 0.5, 0, 0.15), (4974197, 0, 0.5, 0, 0.15), (1702003, 0, 0.5, 0, 0.15), (6359490, 0, 0.5, 0, 0.15), (6818324, 0, 0.5, 0, 0.15), (3456041, 0, 0.5, 0, 0.15), (4390137, 0, 0.5, 0, 0.15), (10800562, 0, 0.5, 0, 0.15), (2342333, 0, 0.5, 0, 0.15), (2260614, 0, 0.5, 0, 0.15), (1007314, 0, 0.5, 0, 0.15), (12500828, 0, 0.5, 0, 0.15), (11303345, 0.125, 0.3333333333333333, 0, 0.11249999999999999), (11630424, 1.0, 0, 0, 0.1), (10976744

0it [00:00, ?it/s]

14499779 [(11272343, 0.5, 0, 0, 0.05), (4274006, 0.5, 0, 0, 0.05), (11112091, 0.5, 0, 0, 0.05), (796001, 0.42857142857142855, 0, 0, 0.04285714285714286), (10537390, 0.42857142857142855, 0, 0, 0.04285714285714286), (8460434, 0.42857142857142855, 0, 0, 0.04285714285714286), (1355904, 0.375, 0, 0, 0.037500000000000006), (11127683, 0.375, 0, 0, 0.037500000000000006), (10081986, 0.375, 0, 0, 0.037500000000000006), (3015945, 0.375, 0, 0, 0.037500000000000006), (7603006, 0.3333333333333333, 0, 0, 0.03333333333333333), (792477, 0.3333333333333333, 0, 0, 0.03333333333333333), (4210788, 0.3333333333333333, 0, 0, 0.03333333333333333), (7357936, 0.3333333333333333, 0, 0, 0.03333333333333333), (3425925, 0.3333333333333333, 0, 0, 0.03333333333333333), (11553571, 0.3333333333333333, 0, 0, 0.03333333333333333), (809300, 0.3333333333333333, 0, 0, 0.03333333333333333), (12868663, 0.3333333333333333, 0, 0, 0.03333333333333333), (10510267, 0.3333333333333333, 0, 0, 0.03333333333333333), (12083384, 0.33333




In [9]:
import sys
print("The size of the dictionary is {} bytes".format(sys.getsizeof(rdict)))

The size of the dictionary is 224 bytes


In [21]:
print(len(rdict.keys()))
print(rdict.keys())

5
dict_keys([12899779, 13299779, 13699779, 14099779, 14499779])


In [4]:
#                   SAVING 1000 best sessions on pickle file

# Saving rdict on pickle file

start =time.time()
with open('r_dict_10000.dat', 'wb') as f:
    pickle.dump(rdict, f)
print ("Time elapsed:", time.time() - start)

Time elapsed: 0.0010797977447509766


[(11272343, 0.5, 0, 0, 0.05), (4274006, 0.5, 0, 0, 0.05), (11112091, 0.5, 0, 0, 0.05), (796001, 0.42857142857142855, 0, 0, 0.04285714285714286), (10537390, 0.42857142857142855, 0, 0, 0.04285714285714286), (8460434, 0.42857142857142855, 0, 0, 0.04285714285714286), (1355904, 0.375, 0, 0, 0.037500000000000006), (11127683, 0.375, 0, 0, 0.037500000000000006), (10081986, 0.375, 0, 0, 0.037500000000000006), (3015945, 0.375, 0, 0, 0.037500000000000006), (7603006, 0.3333333333333333, 0, 0, 0.03333333333333333), (792477, 0.3333333333333333, 0, 0, 0.03333333333333333), (4210788, 0.3333333333333333, 0, 0, 0.03333333333333333), (7357936, 0.3333333333333333, 0, 0, 0.03333333333333333), (3425925, 0.3333333333333333, 0, 0, 0.03333333333333333), (11553571, 0.3333333333333333, 0, 0, 0.03333333333333333), (809300, 0.3333333333333333, 0, 0, 0.03333333333333333), (12868663, 0.3333333333333333, 0, 0, 0.03333333333333333), (10510267, 0.3333333333333333, 0, 0, 0.03333333333333333), (12083384, 0.33333333333333

In [66]:
print(sugest_click)
print(sugest_click[:20-2])

[582150, 1095682, 108125, 1460571, 29735, 184976, 95488, 1502122, 1733943, 959208, 322370, 231487, 756588, 832192, 554660, 801774, 1083665, 166037, 1603001, 673407]
[582150, 1095682, 108125, 1460571, 29735, 184976, 95488, 1502122, 1733943, 959208, 322370, 231487, 756588, 832192, 554660, 801774, 1083665, 166037]


In [None]:
# Printing stuff from dt objects
for i in range(5):
    print('Session i:',traindt.products[i][0])
    print('Contains this list:')
    list=traindt.products[i]
    for j, item in enumerate(list):
        if j == 0:
            print('Session i:',item)
        elif j == 1:
            print('Clicks:',item)
        elif j == 2:
            print('Carts:',item)
        else:
            print('Orders:',item)
            
for i in range(5):
    print('or like this:')
    print(traindt.products[i])

In [33]:
print(list(test.products.keys())[0])

(59625, CLICK)


In [5]:
print("test sessions: ",len(test.sessions))
print("train sessions: ",len(train.sessions))

test sessions:  1671803
train sessions:  400000


In [32]:
print("test.sessions: ",(test.sessions[0:5]))
print("test.aid: ",(test.aid[0:10]))


print("train.sessions: ",(train.sessions[0:5]))
print("train.aid: ",(train.aid[0:10]))

NameError: name 'test' is not defined

In [23]:
print(train.products[110])

print(test.products[200])

[1048733, {11840352, 11978145, 11895717, 11916787, 11669717, 11953302}, {11953302}, None]
[470, {13260258, 12949881, 13233561}, None, None]
