In [1]:
import os
import gc
import heapq
import pickle
import numba as nb
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

tail = 30
parallel = 1024
topn = 20
ops_weights = np.array([1.0, 6.0, 3.0])
OP_WEIGHT = 0; TIME_WEIGHT = 1
parallel = 1024
test_ops_weights = np.array([1.0, 6.0, 3.0])

In [2]:
path = 'C:/Users/ghtyu/OneDrive/Desktop/OTTO/otto_data/LB_574_Dataset_Tony'
df = pd.read_csv(os.path.join(path,"train.csv"))
df_test = pd.read_csv(os.path.join(path,"test.csv"))
df = pd.concat([df, df_test]).reset_index(drop = True)
npz = np.load(os.path.join(path,"train.npz"))
npz_test = np.load(os.path.join(path,"test.npz"))
aids = np.concatenate([npz['aids'], npz_test['aids']])
ts = np.concatenate([npz['ts'], npz_test['ts']])
ops = np.concatenate([npz['ops'], npz_test['ops']])

df["idx"] = np.cumsum(df.length) - df.length
df["end_time"] = df.start_time + ts[df.idx + df.length - 1]

In [3]:
df.head(50)  # session / start_time / length / idx / end_time

Unnamed: 0,session,start_time,length,idx,end_time
0,0,1659304800,276,0,1661684983
1,1,1659304800,32,276,1661714854
2,2,1659304800,33,308,1661714215
3,3,1659304800,226,341,1661109666
4,4,1659304800,19,567,1661586681
5,5,1659304800,15,586,1660348787
6,6,1659304800,204,601,1661549531
7,7,1659304800,23,805,1660538518
8,8,1659304800,4,828,1659304839
9,9,1659304800,7,832,1659648132


In [4]:
# get pair dict {(aid1, aid2): weight} for each session
# The maximum time span between two points is 1 day = 24 * 60 * 60 sec
@nb.jit(nopython = True, cache = True)
def get_single_pairs(pairs, aids, ts, ops, idx, length, start_time, ops_weights, mode):
    max_idx = idx + length
    min_idx = max(max_idx - tail, idx)
    for i in range(min_idx, max_idx):
        for j in range(i + 1, max_idx):
            if ts[j] - ts[i] >= 24 * 60 * 60: break
            if aids[i] == aids[j]: continue
            if mode == OP_WEIGHT:
                w1 = ops_weights[ops[j]]
                w2 = ops_weights[ops[i]]
            elif mode == TIME_WEIGHT:
                w1 = 1 + 3 * (ts[i] + start_time - 1659304800) / (1662328791 - 1659304800)
                w2 = 1 + 3 * (ts[j] + start_time - 1659304800) / (1662328791 - 1659304800)
            pairs[(aids[i], aids[j])] = w1
            pairs[(aids[j], aids[i])] = w2

# get pair dict of each session in parallel
# merge pairs into a nested dict format (cnt)
@nb.jit(nopython = True, parallel = True, cache = True)
def get_pairs(aids, ts, ops, row, cnts, ops_weights, mode):
    par_n = len(row)
    pairs = [{(0, 0): 0.0 for _ in range(0)} for _ in range(par_n)]
    for par_i in nb.prange(par_n):
        _, idx, length, start_time = row[par_i]
        get_single_pairs(pairs[par_i], aids, ts, ops, idx, length, start_time, ops_weights, mode)
    for par_i in range(par_n):
        for (aid1, aid2), w in pairs[par_i].items():
            if aid1 not in cnts: cnts[aid1] = {0: 0.0 for _ in range(0)}
            cnt = cnts[aid1]
            if aid2 not in cnt: cnt[aid2] = 0.0
            cnt[aid2] += w
    
# util function to get most common keys from a counter dict using min-heap
# overwrite == 1 means the later item with equal weight is more important
# otherwise, means the former item with equal weight is more important
# the result is ordered from higher weight to lower weight
@nb.jit(nopython = True, cache = True)
def heap_topk(cnt, overwrite, cap):
    q = [(0.0, 0, 0) for _ in range(0)]
    for i, (k, n) in enumerate(cnt.items()):
        if overwrite == 1:
            heapq.heappush(q, (n, i, k))
        else:
            heapq.heappush(q, (n, -i, k))
        if len(q) > cap:
            heapq.heappop(q)
    return [heapq.heappop(q)[2] for _ in range(len(q))][::-1]
   
# save top-k aid2 for each aid1's cnt
@nb.jit(nopython = True, cache = True)
def get_topk(cnts, topk, k):
    for aid1, cnt in cnts.items():
        topk[aid1] = np.array(heap_topk(cnt, 1, k))

In [5]:
topks = {}

# for two modes
for mode in [OP_WEIGHT, TIME_WEIGHT]:
    # get nested counter
    cnts = nb.typed.Dict.empty(
        key_type = nb.types.int64,
        value_type = nb.typeof(nb.typed.Dict.empty(key_type = nb.types.int64, value_type = nb.types.float64)))
    max_idx = len(df)
    for idx in tqdm(range(0, max_idx, parallel)):
        row = df.iloc[idx:min(idx + parallel, max_idx)][['session', 'idx', 'length', 'start_time']].values
        get_pairs(aids, ts, ops, row, cnts, ops_weights, mode)

    # get topk from counter
    topk = nb.typed.Dict.empty(
            key_type = nb.types.int64,
            value_type = nb.types.int64[:])
    get_topk(cnts, topk, topn)

    del cnts; gc.collect()
    topks[mode] = topk

  0%|          | 0/14231 [00:00<?, ?it/s]

  0%|          | 0/14231 [00:00<?, ?it/s]

In [42]:
print(len(df))

14571582


In [10]:
topks

In [7]:
@nb.jit(nopython = True, cache = True)
def inference_(aids, ops, row, result, topk, test_ops_weights, seq_weight):
    for session, idx, length in row:
        unique_aids = nb.typed.Dict.empty(key_type = nb.types.int64, value_type = nb.types.float64)
        cnt = nb.typed.Dict.empty(key_type = nb.types.int64, value_type = nb.types.float64)
        
        candidates = aids[idx:idx + length][::-1]
        candidates_ops = ops[idx:idx + length][::-1]
        for a in candidates:
            unique_aids[a] = 0
                
        if len(unique_aids) >= 20:
            sequence_weight = np.power(2, np.linspace(seq_weight, 1, len(candidates)))[::-1] - 1
            for a, op, w in zip(candidates, candidates_ops, sequence_weight):
                if a not in cnt: cnt[a] = 0
                cnt[a] += w * test_ops_weights[op]
            result_candidates = heap_topk(cnt, 0, 20)
        else:
            result_candidates = list(unique_aids)
            for a in result_candidates:
                if a not in topk: continue
                for b in topk[a]:
                    if b in unique_aids: continue
                    if b not in cnt: cnt[b] = 0
                    cnt[b] += 1
            result_candidates.extend(heap_topk(cnt, 0, 20 - len(result_candidates)))
        result[session] = np.array(result_candidates)
        
@nb.jit(nopython = True)
def inference(aids, ops, row, 
              result_clicks, result_buy,
              topk_clicks, topk_buy,
              test_ops_weights):
    inference_(aids, ops, row, result_clicks, topk_clicks, test_ops_weights, 0.1)
    inference_(aids, ops, row, result_buy, topk_buy, test_ops_weights, 0.5)

In [52]:
sum = 0
for result, op in zip([result_clicks, result_buy, result_buy], op_names):
    print(result.keys(), result.values())
    sum +=1 
    if sum == 30:
        break

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [53]:
type(result_buy)

numba.typed.typeddict.Dict

In [36]:
# result place holder
result_clicks = nb.typed.Dict.empty(
    key_type = nb.types.int64,
    value_type = nb.types.int64[:])
result_buy = nb.typed.Dict.empty(
    key_type = nb.types.int64,
    value_type = nb.types.int64[:])
for idx in tqdm(range(len(df) - len(df_test), len(df), parallel)):
    row = df.iloc[idx:min(idx + parallel, len(df))][['session', 'idx', 'length']].values
    inference(aids, ops, row, result_clicks, result_buy, topks[TIME_WEIGHT], topks[OP_WEIGHT], test_ops_weights)

  0%|          | 0/1633 [00:00<?, ?it/s]

In [9]:
subs = []
op_names = ["clicks", "carts", "orders"]
for result, op in zip([result_clicks, result_buy, result_buy], op_names):

    sub = pd.DataFrame({"session_type": result.keys(), "labels": result.values()})
    sub.session_type = sub.session_type.astype(str) + f"_{op}"
    sub.labels = sub.labels.apply(lambda x: " ".join(x.astype(str)))
    subs.append(sub)
    
sub = pd.concat(subs).reset_index(drop = True)
sub.to_csv('submission.csv', index = False)
sub.head()

Unnamed: 0,session_type,labels
0,12899779_clicks,59625 1253524 737445 438191 731692 1790770 166...
1,12899780_clicks,1142000 736515 973453 582732 1502122 889686 48...
2,12899781_clicks,918667 199008 194067 57315 141736 1460571 7594...
3,12899782_clicks,834354 595994 740494 889671 987399 779477 1344...
4,12899783_clicks,1817895 607638 1754419 1216820 1729553 300127 ...


## 576 CODE

In [48]:
conda create env


Note: you may need to restart the kernel to use updated packages.



CondaValueError: The target prefix is the base prefix. Aborting.



In [47]:
!pip install cudf-cuda100

Collecting cudf-cuda100
  Downloading cudf-cuda100-0.6.1.post1.tar.gz (1.1 kB)
Building wheels for collected packages: cudf-cuda100
  Building wheel for cudf-cuda100 (setup.py): started
  Building wheel for cudf-cuda100 (setup.py): finished with status 'error'
  Running setup.py clean for cudf-cuda100
Failed to build cudf-cuda100
Installing collected packages: cudf-cuda100
    Running setup.py install for cudf-cuda100: started
    Running setup.py install for cudf-cuda100: finished with status 'error'


  ERROR: Command errored out with exit status 1:
   command: 'C:\Users\ghtyu\anaconda3\python.exe' -u -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\ghtyu\\AppData\\Local\\Temp\\pip-install-8agb93d6\\cudf-cuda100_e6fab67280654a948c2128d24d12e108\\setup.py'"'"'; __file__='"'"'C:\\Users\\ghtyu\\AppData\\Local\\Temp\\pip-install-8agb93d6\\cudf-cuda100_e6fab67280654a948c2128d24d12e108\\setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' bdist_wheel -d 'C:\Users\ghtyu\AppData\Local\Temp\pip-wheel-v2tzykax'
       cwd: C:\Users\ghtyu\AppData\Local\Temp\pip-install-8agb93d6\cudf-cuda100_e6fab67280654a948c2128d24d12e108\
  Complete output (33 lines):
  running bdist_wheel
  running build
  installing to build\bdist.win-amd64\wheel
  running install

In [45]:
VER = 6

import pandas as pd, numpy as np
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
import cudf, itertools
print('We will use RAPIDS version',cudf.__version__)

ModuleNotFoundError: No module named 'cudf'