Goal:

Build a single entry to predict click-through, add-to-cart, and conversion rates based on previous same-session events.


    train.jsonl - the training data, which contains full session data
        session - the unique session id
        events - the time ordered sequence of events in the session
            aid - the article id (product code) of the associated event
            ts - the Unix timestamp of the event
            type - the event type, i.e., whether a product was clicked, added to the user's cart, or ordered during the session
    test.jsonl - the test data, which contains truncated session data
        your task is to predict the next aid clicked after the session truncation, as well as the remaining aids that are added to carts and orders; you may predict up to 20 values for each session type
    sample_submission.csv - a sample submission file in the correct format


In [None]:
import json
import re
import warnings
from multiprocessing import Pool

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller

sns.set()
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive under /content/gdrive (this cell needs the google.colab module,
# i.e. it is meant to run on Colab). force_remount=True is passed explicitly.
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


In [None]:
import os
# Point KAGGLE_CONFIG_DIR at a folder on the mounted Drive.
# NOTE(review): presumably this folder holds the kaggle.json credentials used by the
# Kaggle CLI in the next cell — confirm. The path is specific to this Drive layout.
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/MyDrive/Datascience Projects/Kaggle OTTO Recommender"

In [None]:
# Download the competition archive (otto-recommender-system.zip) with the Kaggle CLI.
!kaggle competitions download 'otto-recommender-system'

Downloading otto-recommender-system.zip to /content
100% 1.93G/1.94G [00:21<00:00, 102MB/s] 
100% 1.94G/1.94G [00:21<00:00, 96.1MB/s]


In [None]:
# Extract the downloaded archive into the current working directory (-q: quiet, -d: destination).
!unzip -q /content/otto-recommender-system.zip -d .

In [None]:
# Load and display the sample submission to see the expected columns (session_type, labels).
pd.read_csv('sample_submission.csv')

Unnamed: 0,session_type,labels
0,12899779_clicks,129004 126836 118524
1,12899779_carts,129004 126836 118524
2,12899779_orders,129004 126836 118524
3,12899780_clicks,129004 126836 118524
4,12899780_carts,129004 126836 118524
...,...,...
5015404,14571580_carts,129004 126836 118524
5015405,14571580_orders,129004 126836 118524
5015406,14571581_clicks,129004 126836 118524
5015407,14571581_carts,129004 126836 118524


In [None]:
%%time
with open('test.jsonl') as f:
  out = []
  i=0
  while i<10:
    out.append(eval(f.readline()))
    print(out[-1])
    i+=1

{'session': 12899779, 'events': [{'aid': 59625, 'ts': 1661724000278, 'type': 'clicks'}]}
{'session': 12899780, 'events': [{'aid': 1142000, 'ts': 1661724000378, 'type': 'clicks'}, {'aid': 582732, 'ts': 1661724058352, 'type': 'clicks'}, {'aid': 973453, 'ts': 1661724109199, 'type': 'clicks'}, {'aid': 736515, 'ts': 1661724136868, 'type': 'clicks'}, {'aid': 1142000, 'ts': 1661724155248, 'type': 'clicks'}]}
{'session': 12899781, 'events': [{'aid': 141736, 'ts': 1661724000559, 'type': 'clicks'}, {'aid': 199008, 'ts': 1661724022851, 'type': 'clicks'}, {'aid': 57315, 'ts': 1661724170835, 'type': 'clicks'}, {'aid': 194067, 'ts': 1661724246188, 'type': 'clicks'}, {'aid': 199008, 'ts': 1661780623778, 'type': 'clicks'}, {'aid': 199008, 'ts': 1661781274081, 'type': 'clicks'}, {'aid': 199008, 'ts': 1661781409993, 'type': 'carts'}, {'aid': 199008, 'ts': 1661804151788, 'type': 'clicks'}, {'aid': 199008, 'ts': 1662060028567, 'type': 'clicks'}, {'aid': 199008, 'ts': 1662060064706, 'type': 'clicks'}, {'ai

In [None]:
%%time
with open('test.jsonl') as f:
  out = []
  i=0
  while i<10:
    out.append(f.readline())
    print(out[-1])
    i+=1

{"session":12899779,"events":[{"aid":59625,"ts":1661724000278,"type":"clicks"}]}

{"session":12899780,"events":[{"aid":1142000,"ts":1661724000378,"type":"clicks"},{"aid":582732,"ts":1661724058352,"type":"clicks"},{"aid":973453,"ts":1661724109199,"type":"clicks"},{"aid":736515,"ts":1661724136868,"type":"clicks"},{"aid":1142000,"ts":1661724155248,"type":"clicks"}]}

{"session":12899781,"events":[{"aid":141736,"ts":1661724000559,"type":"clicks"},{"aid":199008,"ts":1661724022851,"type":"clicks"},{"aid":57315,"ts":1661724170835,"type":"clicks"},{"aid":194067,"ts":1661724246188,"type":"clicks"},{"aid":199008,"ts":1661780623778,"type":"clicks"},{"aid":199008,"ts":1661781274081,"type":"clicks"},{"aid":199008,"ts":1661781409993,"type":"carts"},{"aid":199008,"ts":1661804151788,"type":"clicks"},{"aid":199008,"ts":1662060028567,"type":"clicks"},{"aid":199008,"ts":1662060064706,"type":"clicks"},{"aid":918667,"ts":1662060160406,"type":"clicks"}]}

{"session":12899782,"events":[{"aid":1669402,"ts":16

In [None]:
# Show the first collected line as a raw string (its repr makes the trailing newline visible).
out[0]

'{"session":12899779,"events":[{"aid":59625,"ts":1661724000278,"type":"clicks"}]}\n'

### how many data points ?

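    test size :  1671803  --> 14.45%
    train size :  9901335 --> 85.55%

    NOTE: the per-session frames built later in this notebook contain 12899779 train rows, which disagrees with the train count above; re-run the counting cell to confirm which figure is current.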


In [None]:
# Share of train sessions among all sessions: train / (test + train),
# using the line counts printed by the two counting cells below.
9901335/(1671803+9901335)

0.855544537704467

In [None]:
%%time
with open('test.jsonl') as f:
  i = 0
  for line in f:
    i+=1
print('test size : ', i)

test size :  1671803
CPU times: user 738 ms, sys: 217 ms, total: 955 ms
Wall time: 2.06 s


In [None]:
%%time
with open('train.jsonl') as f:
  i = 0
  for line in f:
    i+=1

print('train size : ',i)

train size :  9901335
CPU times: user 9.9 s, sys: 5.46 s, total: 15.4 s
Wall time: 39.9 s


### how many types of events ?

    test : {'clicks': 6292632, 'carts': 570011, 'orders': 65480}
    train : {'clicks': 170383809, 'carts': 14635736, 'orders': 4397389}

    Ratios relative to the previous stage (carts as % of clicks, orders as % of carts):
    test : {'clicks': 100%, 'carts': 9.058%, 'orders': 11.487%}
    train : {'clicks': 100%, 'carts': 8.589%, 'orders': 30.045%}

    

In [None]:
# Show the fourth collected line (index 3): a longer session that mixes clicks and carts.
out[3]

'{"session":12899782,"events":[{"aid":1669402,"ts":1661724000568,"type":"clicks"},{"aid":1494780,"ts":1661724163530,"type":"clicks"},{"aid":1494780,"ts":1661724190624,"type":"clicks"},{"aid":1494780,"ts":1661724203140,"type":"clicks"},{"aid":1494780,"ts":1661724244341,"type":"carts"},{"aid":1674681,"ts":1661724816749,"type":"clicks"},{"aid":602722,"ts":1661724885670,"type":"clicks"},{"aid":1596098,"ts":1661725306961,"type":"clicks"},{"aid":45034,"ts":1661725434870,"type":"clicks"},{"aid":603159,"ts":1661725567598,"type":"clicks"},{"aid":413962,"ts":1661765601645,"type":"clicks"},{"aid":413962,"ts":1661765608861,"type":"carts"},{"aid":413962,"ts":1661765683962,"type":"clicks"},{"aid":779477,"ts":1661765990636,"type":"clicks"},{"aid":1037537,"ts":1661766045371,"type":"clicks"},{"aid":779477,"ts":1661766058833,"type":"clicks"},{"aid":779477,"ts":1661766162910,"type":"carts"},{"aid":476063,"ts":1661766167646,"type":"clicks"},{"aid":562753,"ts":1661766178974,"type":"carts"},{"aid":779477,"t

In [None]:
# Try the event-type regex on one raw line: every '"type":"<name>"' fragment
# found in out[1] is printed on its own row.
pattern = re.compile(r'"type":".*?"')
type_fragments = pattern.findall(out[1])
print('\n'.join(type_fragments))

"type":"clicks"
"type":"clicks"
"type":"clicks"
"type":"clicks"
"type":"clicks"


In [None]:
%%time
with open('test.jsonl') as f:
  types = {}
  pattern = re.compile(r'"type":".*?"')
  for line in f:
    for i in re.findall(pattern, line):
      this_type = i.split(':')[1].strip('"')
      if this_type in types:
        types[this_type] += 1
      else:
        types[this_type] = 1
print(types)

{'clicks': 6292632, 'carts': 570011, 'orders': 65480}
CPU times: user 10.3 s, sys: 341 ms, total: 10.6 s
Wall time: 11.7 s


In [None]:
%%time
with open('train.jsonl') as f:
  types = {}
  pattern = re.compile(r'"type":".*?"')
  for line in f:
    for i in re.findall(pattern, line):
      this_type = i.split(':')[1].strip('"')
      if this_type in types:
        types[this_type] += 1
      else:
        types[this_type] = 1
print(types)

{'clicks': 170383809, 'carts': 14635736, 'orders': 4397389}
CPU times: user 3min 23s, sys: 11.2 s, total: 3min 34s
Wall time: 3min 39s


### how many clicks, carts, orders for each session ?

In [None]:
# Extract the session id from a raw line: match '"session":<id>,', then keep the text
# between ':' and the trailing ','. The result is a string, not an int.
re.findall(r'"session":.*?,', out[3])[0].split(':')[1].rstrip(',')

'12899782'

In [None]:
%%time
with open('test.jsonl') as f:
  sessions = []
  clicks = []
  carts = []
  orders = []
  
  pattern = re.compile(r'"type":".*?"')
  pattern2 = re.compile(r'"session":.*?,')

  for line in f:
    session = re.findall(pattern2, line)[0].split(':')[1].rstrip(',')
    num_clicks = 0
    num_carts = 0
    num_orders = 0

    for i in re.findall(pattern, line):
      this_type = i.split(':')[1].strip('"')
      if this_type=='clicks':
        num_clicks += 1
      elif this_type=='carts':
        num_carts += 1
      elif this_type=='orders':
        num_orders += 1
      else:
        raise ValueError

    sessions.append(session)
    clicks.append(num_clicks)
    carts.append(num_carts)
    orders.append(num_orders)

df_test_frequency = pd.DataFrame()
df_test_frequency['session'] = sessions
df_test_frequency['clicks'] = clicks
df_test_frequency['carts'] = carts
df_test_frequency['orders'] = orders

print(df_test_frequency.shape)
df_test_frequency.head()

(1671803, 4)
CPU times: user 14.2 s, sys: 549 ms, total: 14.7 s
Wall time: 15 s


Unnamed: 0,session,clicks,carts,orders
0,12899779,1,0,0
1,12899780,5,0,0
2,12899781,10,1,0
3,12899782,46,16,8
4,12899783,11,0,0


In [None]:
%%time
with open('train.jsonl') as f:
  sessions = []
  clicks = []
  carts = []
  orders = []
  
  pattern = re.compile(r'"type":".*?"')
  pattern2 = re.compile(r'"session":.*?,')

  for line in f:
    session = re.findall(pattern2, line)[0].split(':')[1].rstrip(',')
    num_clicks = 0
    num_carts = 0
    num_orders = 0

    for i in re.findall(pattern, line):
      this_type = i.split(':')[1].strip('"')
      if this_type=='clicks':
        num_clicks += 1
      elif this_type=='carts':
        num_carts += 1
      elif this_type=='orders':
        num_orders += 1
      else:
        raise ValueError

    sessions.append(session)
    clicks.append(num_clicks)
    carts.append(num_carts)
    orders.append(num_orders)

df_train_frequency = pd.DataFrame()
df_train_frequency['session'] = sessions
df_train_frequency['clicks'] = clicks
df_train_frequency['carts'] = carts
df_train_frequency['orders'] = orders

print(df_train_frequency.shape)
df_train_frequency.head()

(12899779, 4)
CPU times: user 4min 12s, sys: 9.96 s, total: 4min 22s
Wall time: 4min 28s


Unnamed: 0,session,clicks,carts,orders
0,0,255,17,4
1,1,24,8,0
2,2,32,1,0
3,3,200,21,5
4,4,15,3,1


In [None]:
# Largest train session id. The 'session' column holds the ids as text, and a plain
# .max() on text compares character by character (so '9999999' ranks above '12899777').
# Casting to int64 first makes the comparison numeric.
df_train_frequency['session'].astype('int64').max()

'9999999'

In [None]:
# Smallest test session id. The 'session' column holds the ids as text, so it is cast
# to int64 first: the comparison is then numeric and the result is an integer that can
# be compared directly with the largest train id.
df_test_frequency['session'].astype('int64').min()

'12899779'

In [None]:
# Display the per-session event counts built from train.jsonl (first and last rows are rendered).
df_train_frequency

Unnamed: 0,session,clicks,carts,orders
0,0,255,17,4
1,1,24,8,0
2,2,32,1,0
3,3,200,21,5
4,4,15,3,1
...,...,...,...,...
12899774,12899774,2,0,0
12899775,12899775,2,0,0
12899776,12899776,2,0,0
12899777,12899777,2,0,0


### Get the "DNA" of each session (its event-type sequence: c = click, a = cart, o = order)

In [None]:
%%time
with open('test.jsonl') as f:
  sessions = []
  dnas = []
  
  pattern = re.compile(r'"type":".*?"')
  pattern2 = re.compile(r'"session":.*?,')

  for line in f:
    session = re.findall(pattern2, line)[0].split(':')[1].rstrip(',')
    dna = ''

    for i in re.findall(pattern, line):
      this_type = i.split(':')[1].strip('"')
      if this_type=='clicks':
        dna += 'c'
      elif this_type=='carts':
        dna += 'a'
      elif this_type=='orders':
        dna += 'o'
      else:
        raise ValueError

    sessions.append(session)
    dnas.append(dna)

df_test_dna = pd.DataFrame()
df_test_dna['session'] = sessions
df_test_dna['dna'] = dnas

print(df_test_dna.shape)
df_test_dna.head()

(1671803, 2)
CPU times: user 13.3 s, sys: 526 ms, total: 13.9 s
Wall time: 14.3 s


Unnamed: 0,session,dna
0,12899779,c
1,12899780,ccccc
2,12899781,ccccccacccc
3,12899782,ccccaccccccaccccacaccacaccccacaccccacccaaccccc...
4,12899783,ccccccccccc


In [None]:
%%time
with open('train.jsonl') as f:
  sessions = []
  dnas = []
  
  pattern = re.compile(r'"type":".*?"')
  pattern2 = re.compile(r'"session":.*?,')

  for line in f:
    session = re.findall(pattern2, line)[0].split(':')[1].rstrip(',')
    dna = ''

    for i in re.findall(pattern, line):
      this_type = i.split(':')[1].strip('"')
      if this_type=='clicks':
        dna += 'c'
      elif this_type=='carts':
        dna += 'a'
      elif this_type=='orders':
        dna += 'o'
      else:
        raise ValueError

    sessions.append(session)
    dnas.append(dna)

df_train_dna = pd.DataFrame()
df_train_dna['session'] = sessions
df_train_dna['dna'] = dnas

print(df_train_dna.shape)
df_train_dna.head()

(12899779, 2)
CPU times: user 4min 44s, sys: 14.2 s, total: 4min 59s
Wall time: 5min 16s


Unnamed: 0,session,dna
0,0,ccccccaaoocccccccccccccccccccccccccccccccccacc...
1,1,acacacacccccccaccaccaccacccccccc
2,2,ccccccccccccccccccccacccccccccccc
3,3,acccaccccccccccccacooocccccccccccccccccccccccc...
4,4,ccoccccccccacacaccc


### How many unique aids are there?

    test  aid : 783486 -> 46.864% of test size
    train aid : 1855603 -> 18.741% of train size
    test size  : 1671803
    train size : 9901335

    All aids in test are a subset of the aids in train


In [None]:
%%time
with open('test.jsonl') as f:
  unique_aids_test = set()
  
  pattern = re.compile(r'"aid":.*?,')
  pattern2 = re.compile(r'"session":.*?,')

  for line in f:
    session = re.findall(pattern2, line)[0].split(':')[1].rstrip(',')

    for i in re.findall(pattern, line):
      this_aid = int(i.split(':')[1].rstrip(','))
      unique_aids_test.add(this_aid)

print(len(unique_aids_test))

783486
CPU times: user 15 s, sys: 407 ms, total: 15.4 s
Wall time: 15.6 s


In [None]:
%%time
with open('train.jsonl') as f:
  unique_aids_train = set()
  
  pattern = re.compile(r'"aid":.*?,')
  pattern2 = re.compile(r'"session":.*?,')

  for line in f:
    session = re.findall(pattern2, line)[0].split(':')[1].rstrip(',')

    for i in re.findall(pattern, line):
      this_aid = int(i.split(':')[1].rstrip(','))
      unique_aids_train.add(this_aid)

print(len(unique_aids_train))

1855603
CPU times: user 5min 51s, sys: 10.4 s, total: 6min 1s
Wall time: 6min 24s


### Are all test aids present in train? ---> Yes

In [None]:
# True when every aid seen in test also occurs in train (set inclusion).
unique_aids_test <= unique_aids_train

True

### How many cart events are for an aid clicked earlier in the same session? How many order events are for an aid clicked (or otherwise carted) earlier?
    
    test :
      num_carts = 570011
      num_clicked_carts:num_notclicked_carts = 458561:111450 = 4.11 : 1
      num_orders = 65480
      num_clicked_orders:num_carted_orders:num_only_orders = 49146:8523:7811 = 6.29 : 1.09 : 1
    train :
      num_carts = 16896191
      num_clicked_carts:num_notclicked_carts = 13798482:3097709 = 4.45 : 1
      num_orders = 5098951
      num_clicked_orders:num_carted_orders:num_only_orders = 4145828:677760:275363 = 15.06 : 2.46 : 1


In [None]:
%%time
with open('test.jsonl') as f:
  num_clicked_carts = 0
  num_notclicked_carts = 0
  num_carts = 0
  num_clicked_orders = 0
  num_carted_orders = 0
  num_only_orders = 0
  num_orders = 0

  pattern0 = re.compile('"events":\[.*\]')
  pattern = re.compile('{.*?}')
  pattern2 = re.compile(r'"aid":.*?,')
  pattern3 = re.compile(r'"type":".*?"')

  for line in f:
    events = re.findall(pattern0, line)[0]
    events = re.findall(pattern, events)
    clicked_aids = []
    carted_aids = []

    for event in events:
      this_aid = re.findall(pattern2, event)[0].split(':')[1].rstrip(',')
      this_type = re.findall(pattern3, event)[0].split(':')[1].strip('"')
 
      if this_type == 'clicks':
        clicked_aids.append(this_aid)

      elif this_type == 'carts':
        carted_aids.append(this_aid)
        num_carts += 1
        if this_aid in clicked_aids:
          num_clicked_carts += 1
        else:
          num_notclicked_carts += 1
      
      elif this_type == 'orders':
        num_orders += 1
        if this_aid in clicked_aids:
          num_clicked_orders += 1
        elif this_aid in carted_aids:
          num_carted_orders += 1
        else:
          num_only_orders += 1

print(num_carts, ':', num_clicked_carts, num_notclicked_carts)
print(num_orders, ':', num_clicked_orders, num_carted_orders, num_only_orders)

570011 : 458561 111450
65480 : 49146 8523 7811
CPU times: user 42.2 s, sys: 943 ms, total: 43.1 s
Wall time: 43 s


In [None]:
%%time
with open('train.jsonl') as f:
  num_clicked_carts = 0
  num_notclicked_carts = 0
  num_carts = 0
  num_clicked_orders = 0
  num_carted_orders = 0
  num_only_orders = 0
  num_orders = 0

  pattern0 = re.compile('"events":\[.*\]')
  pattern = re.compile('{.*?}')
  pattern2 = re.compile(r'"aid":.*?,')
  pattern3 = re.compile(r'"type":".*?"')

  for line in f:
    events = re.findall(pattern0, line)[0]
    events = re.findall(pattern, events)
    clicked_aids = []
    carted_aids = []

    for event in events:
      this_aid = re.findall(pattern2, event)[0].split(':')[1].rstrip(',')
      this_type = re.findall(pattern3, event)[0].split(':')[1].strip('"')
 
      if this_type == 'clicks':
        clicked_aids.append(this_aid)

      elif this_type == 'carts':
        carted_aids.append(this_aid)
        num_carts += 1
        if this_aid in clicked_aids:
          num_clicked_carts += 1
        else:
          num_notclicked_carts += 1
      
      elif this_type == 'orders':
        num_orders += 1
        if this_aid in clicked_aids:
          num_clicked_orders += 1
        elif this_aid in carted_aids:
          num_carted_orders += 1
        else:
          num_only_orders += 1

print(num_carts, ':', num_clicked_carts, num_notclicked_carts)
print(num_orders, ':', num_clicked_orders, num_carted_orders, num_only_orders)

16896191 : 13798482 3097709
5098951 : 4145828 677760 275363
CPU times: user 19min 26s, sys: 15.1 s, total: 19min 41s
Wall time: 20min 8s
