In [13]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path # working with paths
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import bisect
from itertools import product
import shutil
import concurrent.futures
import pickle
import time
import random
# https://www.kaggle.com/code/columbia2131/otto-read-a-chunk-of-jsonl-to-manageable-df/

# Setting data paths with pathlib.
# The data directory can be overridden through the OTTO_DATA_DIR environment
# variable; the previously hard-coded location stays the default.
data_path = Path(os.environ.get('OTTO_DATA_DIR', '/home/mai22042/otto/data'))
train_path = data_path/'train.jsonl'
test_path = data_path/'test.jsonl'
sample_sub_path = Path('sample_submission.csv')

from enum import Enum
class RName(Enum):
    """Event types, with the values 'clicks', 'carts' and 'orders'."""
    CLICK = 'clicks'
    CART = 'carts'
    ORDER = 'orders'

    def __str__(self):
        # Render as the bare member name (e.g. "CLICK") instead of "RName.CLICK".
        return str(self.name)

    def __repr__(self):
        # Same bare member name as __str__, so members read compactly inside containers.
        return f'{self.name}'
        
# Module-level shorthands for the three event types.
CLICK, CART, ORDER = RName.CLICK, RName.CART, RName.ORDER

# create a dict with key [aid,type], val (session)

from collections import defaultdict

class DT3:
    """Index from (aid, event type) to the set of sessions containing that event.

    Attributes:
        products: dict mapping ``(aid, RName)`` to a ``set`` of session ids.
    """

    def init_helper(self, chunk):
        """Add every event of every session in ``chunk`` to ``self.products``.

        ``chunk['session']`` and ``chunk['events']`` must expose ``.tolist()``;
        each event is a mapping with the keys ``"aid"`` and ``"type"``.
        """
        for session, events in zip(chunk['session'].tolist(), chunk['events'].tolist()):
            for event in events:
                key = event["aid"], RName(event['type'])
                self.products.setdefault(key, set()).add(session)

    def __init__(self, chunks, i=None, j=None, sampling=False):
        """Build the index from an iterable of chunks.

        Args:
            chunks: iterable of chunks accepted by ``init_helper``.
            i: index of the first chunk to process (inclusive); ``None`` means
                no lower bound.
            j: index after the last chunk to process (exclusive); ``None``
                means no upper bound.
            sampling: when true, only one randomly drawn chunk index is
                processed (still subject to ``i``/``j``).
        """
        self.products = {}
        samples = []
        if sampling:
            # NOTE(review): 34 is hard-coded; presumably the number of chunks
            # the train file yields at the chunk size used here. TODO confirm.
            samples = random.sample(range(34),1)
            samples.sort()
            print(samples)
        for e, chunk in enumerate(chunks):
            # Each bound is checked independently, so passing only ``i`` or
            # only ``j`` no longer compares None with an int (TypeError).
            if i is not None and e < i:
                continue
            if j is not None and e >= j:
                continue
            if sampling and e not in samples:
                continue
            print("test chunk number",e)
            self.init_helper(chunk)

class DT2:
    """Index from (session, event type) to the set of aids seen in that session.

    Attributes:
        sessions: dict mapping ``(session, RName)`` to a ``set`` of aids.
    """

    def init_helper(self, chunk):
        """Add every event of every session in ``chunk`` to ``self.sessions``.

        ``chunk['session']`` and ``chunk['events']`` must expose ``.tolist()``;
        each event is a mapping with the keys ``"aid"`` and ``"type"``.
        """
        for session, events in zip(chunk['session'].tolist(), chunk['events'].tolist()):
            for event in events:
                aid = event["aid"]
                key = session, RName(event['type'])
                self.sessions.setdefault(key, set()).add(aid)

    def __init__(self, chunks, i=None, j=None, sampling=False):
        """Build the index from an iterable of chunks.

        Args:
            chunks: iterable of chunks accepted by ``init_helper``.
            i: index of the first chunk to process (inclusive); ``None`` means
                no lower bound.
            j: index after the last chunk to process (exclusive); ``None``
                means no upper bound.
            sampling: when true, only one randomly drawn chunk index is
                processed (still subject to ``i``/``j``).
        """
        self.sessions = {}
        samples = []
        if sampling:
            # NOTE(review): 34 is hard-coded; presumably the number of chunks
            # the train file yields at the chunk size used here. TODO confirm.
            samples = random.sample(range(34),1)
            samples.sort()
            print(samples)
        for e, chunk in enumerate(chunks):
            # Each bound is checked independently, so passing only ``i`` or
            # only ``j`` no longer compares None with an int (TypeError).
            if i is not None and e < i:
                continue
            if j is not None and e >= j:
                continue
            if sampling and e not in samples:
                continue
            print("test chunk number",e)
            self.init_helper(chunk)

# Loading .jsonl file and creating DT object
#start =time.time()
#test_chunks  = pd.read_json(data_path / 'test.jsonl', lines=True, chunksize=400000)
#test = DT3(test_chunks)
#print ("Time elapsed:", time.time() - start)
#del test_chunks # Delete chunks

#                   CREATING TRAIN DT3

# Loading .jsonl file and creating DT object
# start =time.time()
# train_chunks = pd.read_json(data_path / 'train.jsonl', lines=True, chunksize=400000)
# train = DT3(train_chunks)
# print ("Time elapsed:", time.time() - start)
# del train_chunks # Delete chunks

#                   SAVING TRAIN DT3

# Saving object on pickle file

# start =time.time()
# with open('train_full_dt3.dat', 'wb') as f:
#     pickle.dump(train, f)
# print ("Time elapsed:", time.time() - start)



#                   LOAD TRAIN DT3 FROM PICKLE FILE

# Loading object from pickle file

# SECURITY: unpickling can execute arbitrary code; only load files produced by this notebook.
start = time.time()
# The context manager closes the file handle once loading finishes.
with open("train_full_dt3.dat", "rb") as f:
    train = pickle.load(f)
print ("Time elapsed:", time.time() - start)

# Loading .jsonl file and creating DT object
# start =time.time()
# train_chunks = pd.read_json(data_path / 'train.jsonl', lines=True, chunksize=400000)
# traindt2 = DT2(train_chunks)
# print ("Time elapsed:", time.time() - start)
# del train_chunks # Delete chunks
#                   SAVING TRAIN DT2

# Saving object on pickle file

# start =time.time()
# with open('train_full_dt2.dat', 'wb') as f:
#     pickle.dump(traindt2, f)
# print ("Time elapsed:", time.time() - start)

#                   LOAD TRAIN DT2 FROM PICKLE FILE

# Loading object from pickle file

# SECURITY: unpickling can execute arbitrary code; only load files produced by this notebook.
start = time.time()
# The context manager closes the file handle once loading finishes.
with open("train_full_dt2.dat", "rb") as f:
    traindt2 = pickle.load(f)
print ("Time elapsed:", time.time() - start)

Time elapsed: 14.613240003585815
Time elapsed: 83.48160600662231


In [108]:
class Sugest:
    """Score-ranked product suggestions with a restorable baseline.

    ``product_count`` maps a product id to a score. Positive scores are
    occurrence counts accumulated through ``add_more``. After ``top()`` the
    kept products carry negative placeholder scores (-1 for the best, -2 for
    the next, ...), so they rank below every counted product while keeping
    their relative order.
    """

    def __init__(self, list_of_sets):
        """Count the products of every collection in ``list_of_sets``."""
        self.product_count = {}
        # Snapshot restored by reset_to_top(). Created here so that calling
        # reset_to_top() before top() no longer raises AttributeError; until
        # top() runs the snapshot is empty.
        self.backup = {}
        self.add_more(list_of_sets)

    def add_more(self,list_of_sets):
        """Record one occurrence per product per collection; ``None`` entries are skipped."""
        for products in list_of_sets:
            if products is None:
                continue
            for product in products:
                self._add(product)

    def reset_to_top(self):
        """Replace the current scores with a copy of the snapshot saved by ``top()``."""
        self.product_count=self.backup.copy()

    def _add(self, product):
        """Record one occurrence of ``product``."""
        current = self.product_count.get(product, 0)
        # A negative score is a baseline placeholder, not a count, so the
        # first real occurrence starts the count at 1.
        self.product_count[product] = 1 if current < 0 else current + 1

    def top(self):
        """Keep only the 30 best products and turn their scores into rank placeholders.

        The best product gets -1, the second -2, and so on. The result is also
        saved as the snapshot used by ``reset_to_top``.
        """
        ranked = self.result(30)
        baseline = {product: -(rank + 1) for rank, product in enumerate(ranked)}
        self.product_count = baseline
        self.backup = baseline.copy()  # independent copy: later counting must not alter the snapshot

    def result(self, number=20):
        """Return up to ``number`` product ids, highest score first.

        The sort is stable, so products with equal scores keep the order in
        which they were first inserted into ``product_count``.
        """
        ordered = sorted(
            self.product_count.items(), key=lambda item: item[1], reverse=True
        )
        return [product for product, _score in ordered][:number]

# Build one popularity ranking per event type from the train sessions.
train_click_sets = [aids for key, aids in traindt2.sessions.items() if key[1] == CLICK]
top_click = Sugest(train_click_sets)
print(top_click.result(30))

train_cart_sets = [aids for key, aids in traindt2.sessions.items() if key[1] == CART]
top_cart = Sugest(train_cart_sets)

train_order_sets = [aids for key, aids in traindt2.sessions.items() if key[1] == ORDER]
top_order = Sugest(train_order_sets)

# Reduce each ranking to its 30 best products and store that as the baseline.
for ranking in (top_click, top_cart, top_order):
    ranking.top()
print(top_click.product_count.items())


[108125, 1460571, 29735, 184976, 95488, 1502122, 1733943, 959208, 322370, 231487, 756588, 832192, 554660, 801774, 1083665, 166037, 1603001, 673407, 1586171, 1196256, 620545, 332654, 819288, 199409, 1236775, 986164, 1645990, 102345, 794192, 1022566]
dict_items([(108125, -1), (1460571, -2), (29735, -3), (184976, -4), (95488, -5), (1502122, -6), (1733943, -7), (959208, -8), (322370, -9), (231487, -10), (756588, -11), (832192, -12), (554660, -13), (801774, -14), (1083665, -15), (166037, -16), (1603001, -17), (673407, -18), (1586171, -19), (1196256, -20), (620545, -21), (332654, -22), (819288, -23), (199409, -24), (1236775, -25), (986164, -26), (1645990, -27), (102345, -28), (794192, -29), (1022566, -30)])


In [110]:

# Show the current click scores (product id, score).
click_score_items = top_click.product_count.items()
print(click_score_items)


dict_items([(108125, -1), (1460571, -2), (29735, -3), (184976, -4), (95488, -5), (1502122, -6), (1733943, -7), (959208, -8), (322370, -9), (231487, -10), (756588, -11), (832192, -12), (554660, -13), (801774, -14), (1083665, -15), (166037, -16), (1603001, -17), (673407, -18), (1586171, -19), (1196256, -20), (620545, -21), (332654, -22), (819288, -23), (199409, -24), (1236775, -25), (986164, -26), (1645990, -27), (102345, -28), (794192, -29), (1022566, -30)])


In [None]:
from tqdm import tqdm
import csv

# Lazily read the test sessions in chunks of 400000 lines each.
test_chunks = pd.read_json(
    data_path / 'test.jsonl',
    lines=True,
    chunksize=400000,
)


def r_type(set_a, set_b):
    """Return the Jaccard similarity of two sets: intersection size / union size.

    Args:
        set_a: a set, or ``None``/``0`` when there is nothing to compare.
        set_b: a set, or ``None``/``0`` when there is nothing to compare.

    Returns:
        ``0`` when either argument is ``None``, ``0`` or empty, or when the
        sets share no element; otherwise a float in (0, 1].
    """
    # One falsy check covers None, 0 and the empty set, all of which score 0.
    if not set_a or not set_b:
        return 0
    intersection_len = len(set_a & set_b)
    if intersection_len == 0:
        return 0
    # Union size follows from the three known sizes, so no union set is built.
    union_len = len(set_a) + len(set_b) - intersection_len
    return intersection_len / union_len

# Start a fresh output file that contains only the header row.
# The rows appended to this file later end in "\n", so the header uses the same
# terminator instead of the csv module's default "\r\n"; newline='' keeps the
# text layer from translating what the csv writer emits.
headerList = ['session_type','labels']
with open("test_output_full"+".csv", 'w', newline='') as file:
    dw = csv.DictWriter(file, delimiter=',', fieldnames=headerList, lineterminator='\n')
    dw.writeheader()

# For each test session: gather the train sessions that share at least one
# (aid, type) event with it, keep the 20 with the highest weighted similarity,
# and append 20 suggested aids per event type to the output file.
# The file is opened once for the whole run instead of once per session; rows
# are appended below the header written beforehand.
with open("test_output_full"+".csv", 'a') as f:
    for chunk in test_chunks:
        for session, events in tqdm(zip(chunk['session'].tolist(), chunk['events'].tolist()), total=len(chunk)):
            train_session_set=set()
            test_product_click=set()
            test_product_cart=set()
            test_product_order=set()
            for event in events:
                event_type = RName(event['type'])
                aid = event["aid"]
                if event_type==CART:
                    test_product_cart.add(aid)
                elif event_type==ORDER:
                    test_product_order.add(aid)
                else:
                    test_product_click.add(aid)

                # Every train session containing this exact (aid, type) event
                # becomes a candidate neighbour.
                train_session_set.update(train.products.get((aid,event_type), ()))

            rscores=[]
            for session_train in train_session_set:
                r_click=r_type(test_product_click,traindt2.sessions.get((session_train,CLICK)))
                r_cart=r_type(test_product_cart,traindt2.sessions.get((session_train,CART)))
                r_order=r_type(test_product_order,traindt2.sessions.get((session_train,ORDER)))
                # Weighted similarity: orders count most, clicks least.
                r_score=r_click*0.1+r_cart*0.3+r_order*0.6
                rscores.append((session_train, r_click,r_cart,r_order,r_score))
                # Trim periodically so the candidate list stays bounded.
                if len(rscores)>10000:
                    rscores=sorted(rscores, key= lambda x:x[4], reverse=True)[:20]
            if len(rscores)>20:
                rscores=sorted(rscores, key= lambda x:x[4], reverse=True)[:20]

            # Products of the best train sessions, one list of sets per event
            # type. Each list reads the products of its OWN event type: the
            # click and cart lookups were previously swapped.
            train_top_session_id=[scored[0] for scored in rscores]
            train_rscore_products_click=[traindt2.sessions.get((tid,CLICK)) for tid in train_top_session_id]#list of sets
            train_rscore_products_cart=[traindt2.sessions.get((tid,CART)) for tid in train_top_session_id]#list of sets
            train_rscore_products_order=[traindt2.sessions.get((tid,ORDER)) for tid in train_top_session_id]#list of sets

            # Count the neighbours' products on top of the stored baselines.
            top_click.add_more(train_rscore_products_click)
            top_cart.add_more(train_rscore_products_cart)
            top_order.add_more(train_rscore_products_order)

            sugest_click=top_click.result()
            sugest_cart=top_cart.result()
            sugest_order=top_order.result()

            # Restore the baselines so the next session starts clean.
            top_click.reset_to_top()
            top_cart.reset_to_top()
            top_order.reset_to_top()

            f.write(str(session)+"_"+CLICK.value+","+" ".join(map(str,sugest_click))+"\n")
            f.write(str(session)+"_"+CART.value+","+" ".join(map(str,sugest_cart))+"\n")
            f.write(str(session)+"_"+ORDER.value+","+" ".join(map(str,sugest_order))+"\n")
        

0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


In [66]:
# Print the click suggestions currently held in `sugest_click`, then only the first 18 of them.
print(sugest_click)
head_count = 20 - 2
print(sugest_click[:head_count])

[582150, 1095682, 108125, 1460571, 29735, 184976, 95488, 1502122, 1733943, 959208, 322370, 231487, 756588, 832192, 554660, 801774, 1083665, 166037, 1603001, 673407]
[582150, 1095682, 108125, 1460571, 29735, 184976, 95488, 1502122, 1733943, 959208, 322370, 231487, 756588, 832192, 554660, 801774, 1083665, 166037]


In [None]:
# Printing stuff from dt objects
# NOTE(review): `traindt` is not assigned in any visible cell; confirm which
# object this cell expects before running it on a fresh kernel.
# Labels for the first three positions of a record; every later position is
# printed as 'Orders:'.
FIELD_LABELS = ('Session i:', 'Clicks:', 'Carts:')
for i in range(5):
    # Named `record` rather than `list`: rebinding `list` shadows the builtin
    # and breaks later calls such as list(...).
    record = traindt.products[i]
    print('Session i:',record[0])
    print('Contains this list:')
    for j, item in enumerate(record):
        label = FIELD_LABELS[j] if j < len(FIELD_LABELS) else 'Orders:'
        print(label,item)

for i in range(5):
    print('or like this:')
    print(traindt.products[i])

In [33]:
# Print the first key of `test.products` without materialising every key.
# This also avoids calling `list`, which an earlier cell may have rebound.
# NOTE(review): `test` is not assigned in any visible (non-commented) cell.
print(next(iter(test.products.keys())))

(59625, CLICK)


In [5]:
# Print the length of the `sessions` attribute of `test` and of `train`.
# NOTE(review): `test` is not assigned in any visible (non-commented) cell, and
# the DT3 class defined in this notebook only sets `products` in __init__ -
# confirm which objects this cell expects.
print("test sessions: ",len(test.sessions))
print("train sessions: ",len(train.sessions))

test sessions:  1671803
train sessions:  400000


In [32]:
# Print leading slices of the `sessions` and `aid` attributes of `test` and `train`.
# NOTE(review): the recorded output of this cell is a NameError for `test`, and
# neither DT3 nor DT2 as defined in this notebook sets an `aid` attribute -
# this cell appears to target an earlier data structure; confirm or remove.
print("test.sessions: ",(test.sessions[0:5]))
print("test.aid: ",(test.aid[0:10]))


print("train.sessions: ",(train.sessions[0:5]))
print("train.aid: ",(train.aid[0:10]))

NameError: name 'test' is not defined

In [4]:
# Serialise the in-memory `train` object to train_full_dt3.dat (an existing file is overwritten).
with open('train_full_dt3.dat', 'wb') as pickle_file:
    pickle.dump(train, pickle_file)

In [23]:
# Print one entry of `train.products` and one of `test.products`, looked up by integer.
# NOTE(review): the recorded output shows list-valued entries, whereas DT3 in
# this notebook keys `products` by (aid, type) tuples and stores sets - this
# cell appears to predate the current DT3; confirm. `test` is also not
# assigned in any visible (non-commented) cell.
print(train.products[110])

print(test.products[200])

[1048733, {11840352, 11978145, 11895717, 11916787, 11669717, 11953302}, {11953302}, None]
[470, {13260258, 12949881, 13233561}, None, None]
