In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

##### This version is adapted from [this notebook](https://www.kaggle.com/code/radek1/co-visitation-matrix-simplified-imprvd-logic). The co-visitation logic is discussed [here](https://www.kaggle.com/competitions/otto-recommender-system/discussion/364210), and more background on co-visitation matrices is available [here](https://www.kaggle.com/code/vslaykovsky/co-visitation-matrix).

In [None]:
#First implementation is to try to create a co-visitation matrix of products,
#meaning products that are frequently viewed and bought together (close in time).
#Imports
import pandas as pd 
from tqdm.notebook import tqdm
from collections import defaultdict
import os 
import multiprocessing
from collections import Counter
import sys
import gc
import glob


In [None]:
# Work on a 10% sample of the sessions in training parquet 0 for now.
import random

# Fixed seed so the sampled sessions (and everything derived from them) are the
# same after every "Restart & Run All".
SAMPLE_SEED = 42

train_0 = pd.read_parquet('/kaggle/input/training-data/training/training0.parquet')
print('Training now looks like this:\n',train_0.head())
sessions = train_0['session'].unique()
print('Sessions are:',sessions)
# A private Random instance keeps the seeding local to this cell instead of
# resetting the global `random` state.
sample = random.Random(SAMPLE_SEED).sample(sessions.tolist(),len(sessions)//10)
print('Some sample sessions are: \n',sample[0:5])

In [None]:
# Restrict the events to the sampled sessions and preview the first rows.
data_sample = train_0[train_0['session'].isin(sample)]
data_sample.head(n=5)

In [None]:
# (rows, columns) of the sampled subset.
data_sample.shape

In [None]:
# Create the pairs, aka the 'candidates'.
# For every session keep ONLY its last `last_n` events (this keeps the self-join small),
# join the session with itself so that every two events sit side by side, and keep the
# pairs of different aids whose events are less than `max_gap` apart in time.
# Every (session, aid_x, aid_y) combination is kept once, so there are no duplicates
# within a session.
def gen_pairs(df, last_n=30, max_gap=24*60*60*1000):
    """Return the co-visited (aid_x, aid_y) pairs found inside each session of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Events with at least the columns ``session``, ``aid`` and ``ts``.
    last_n : int, default 30
        Number of trailing rows kept per session before pairing.
    max_gap : int, default 24*60*60*1000
        Exclusive upper bound on ``abs(ts_x - ts_y)`` for a pair to count.
        The default is one day if ``ts`` is in milliseconds -- TODO confirm the
        unit of ``ts`` in the parquet files.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_pairs, 2) holding ``aid_x`` and ``aid_y``. Both
        directions of a pair are present, and a pair appears once per session
        in which it occurs.
    """
    # groupby(...).tail() is vectorised; it replaces a per-session Python lambda
    # passed to groupby.apply, which is slow on many sessions. Rows keep their
    # original order, which matches the previous output order whenever the rows
    # of a session are contiguous.
    recent = df.groupby('session', sort=False).tail(last_n)
    merged = recent.merge(recent, on='session')
    close_in_time = (merged['ts_x'] - merged['ts_y']).abs() < max_gap
    different_aid = merged['aid_x'] != merged['aid_y']
    pairs = merged.loc[close_in_time & different_aid, ['session', 'aid_x', 'aid_y']].drop_duplicates()

    return pairs[['aid_x', 'aid_y']].values

In [None]:
%%time
# Pair generation on the sample only; gen_pairs returns a 2-column NumPy array of (aid_x, aid_y).
data = gen_pairs(data_sample)


In [None]:
# gen_pairs returns a NumPy array, which has no .head(); slice the first rows instead.
data[:5]

In [None]:
# Build the co-visitation lookup from the training parquet files, using glob to find
# them and the garbage collector (gc) to release each file's data before the next one.
# For one parquet the CPU reached almost 400% and RAM 8 GB, so this implementation takes
# a lot of time; that is why only `max_files` file(s) are processed by default.
def gen_aid_pairs(pattern='/kaggle/input/training-data/training/*.parquet',
                  max_files=1, n_splits=120, top_k=40):
    """Return, for every aid, its ``top_k`` most frequently co-visited aids.

    Parameters
    ----------
    pattern : str
        Glob pattern of the training parquet files.
    max_files : int or None, default 1
        How many files (in sorted order) to process; ``None`` processes all.
        The default keeps the original behaviour of stopping after one file.
    n_splits : int, default 120
        Number of row slices each file is cut into before the slices are
        handed to the worker pool.
    top_k : int, default 40
        Number of co-visited aids kept per aid.

    Returns
    -------
    dict
        ``aid -> [(co_visited_aid, count), ...]``, most frequent first.

    Raises
    ------
    FileNotFoundError
        If no file matches ``pattern``.
    """
    # glob returns paths in arbitrary order; sort so the same file(s) are
    # processed on every run.
    files = sorted(glob.glob(pattern))
    if max_files is not None:
        files = files[:max_files]
    if not files:
        raise FileNotFoundError(f'No parquet files match {pattern!r}')

    all_pair_chunks = []
    # One pool for the whole run instead of a fresh pool per file.
    with multiprocessing.Pool() as pool:
        for file in tqdm(files):
            chunk = pd.read_parquet(file)
            print(file)
            # Cut the frame into row slices with iloc. np.array_split on a
            # DataFrame goes through the deprecated DataFrame.swapaxes.
            # NOTE: slices are cut by row count, so a session that straddles a
            # boundary is paired as two partial sessions (the same approximation
            # the np.array_split version made).
            bounds = np.linspace(0, len(chunk), n_splits + 1, dtype=int)
            parts = [chunk.iloc[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:]) if hi > lo]
            if parts:
                # Hand the slices to the pool and stack the resulting pair arrays.
                all_pair_chunks.append(np.concatenate(pool.map(gen_pairs, parts), axis=0))
            del chunk, parts
            gc.collect()

    if not all_pair_chunks:
        return {}

    df = pd.DataFrame(data=np.concatenate(all_pair_chunks), columns=['aid1', 'aid2'])
    top_aids = df.groupby('aid1')['aid2'].apply(lambda s: Counter(s).most_common(top_k)).to_dict()
    return top_aids

In [None]:
%%time
# Build the co-visitation lookup: aid -> its most frequent co-visited aids with their counts.
top_40 = gen_aid_pairs()

In [None]:
# top_40 is a dict: aid -> list of (co-visited aid, count) tuples, most frequent first
# (at most 40 per aid). Aid 3 is only an example key; this raises KeyError if aid 3
# has no pairs in the processed data.
top_40[3]

In [None]:
# Turn each aid's list of (co-visited aid, count) tuples into a Counter, so the
# counts coming from several source aids can be added together later.
top_40_cnt = dict(
    (source_aid, Counter(dict(ranked_pairs)))
    for source_aid, ranked_pairs in top_40.items()
)

In [None]:
# Same example aid, now as a Counter mapping co-visited aid -> count.
top_40_cnt[3]

In [None]:
#Let's check this with our testing data
# Only a single testing parquet file is loaded here.
test = pd.read_parquet('/kaggle/input/testing-data/testing/testing_final0.parquet')
test

In [None]:
import itertools
def suggest_aids(df, covisit_counts=None, n=20):
    """Suggest up to ``n`` aids for one session, filling up by summed co-visit counts.

    NOTE: a later cell defines another ``suggest_aids``; whichever definition
    ran last is the one the prediction cell uses.

    Parameters
    ----------
    df : pandas.DataFrame
        Events of a single session with an ``aid`` column. Rows are read from
        the last one backwards.
    covisit_counts : mapping, optional
        ``aid -> {co_visited_aid: count}``. Defaults to the notebook-level
        ``top_40_cnt``.
    n : int, default 20
        Maximum number of aids returned.

    Returns
    -------
    list
        The session's own unique aids (last row first), followed by the
        best-scoring co-visited aids not already in the session.
    """
    if covisit_counts is None:
        # Looked up at call time so the notebook-level lookup can be rebuilt
        # without redefining this function.
        covisit_counts = top_40_cnt
    # Unique aids, last row first. The previous version returned the raw tail,
    # duplicates included, when the session had n or more events, and otherwise
    # lost the order by going through set().
    aids = list(dict.fromkeys(df.aid.tolist()[::-1]))
    if len(aids) >= n:
        return aids[:n]
    seen = set(aids)
    scores = Counter()
    for aid in aids:
        # Add up the counts contributed by every aid of the session.
        scores.update(covisit_counts.get(aid, {}))
    top_aids2 = [aid2 for aid2, _cnt in scores.most_common(n) if aid2 not in seen]
    return aids + top_aids2[:n - len(aids)]
#     print(aids)

In [None]:
#alternative implementation with a boost 
import itertools

def suggest_aids(df, covisit=None, n=20):
    """Suggest up to ``n`` aids for one session, filling up by co-visitation votes.

    This definition replaces any ``suggest_aids`` defined in an earlier cell.

    Parameters
    ----------
    df : pandas.DataFrame
        Events of a single session with an ``aid`` column. Rows are read from
        the last one backwards.
    covisit : mapping, optional
        ``aid -> [(co_visited_aid, count), ...]``. Defaults to the
        notebook-level ``top_40``. The function used to read ``top_20``, which
        no cell of this notebook defines, so it raised NameError for every
        session with fewer than 20 unique aids.
    n : int, default 20
        Maximum number of aids returned.

    Returns
    -------
    list
        The session's own unique aids (last row first), followed by the
        co-visited aids that were voted for most often and are not already in
        the session.
    """
    if covisit is None:
        # Looked up at call time so the notebook-level lookup can be rebuilt
        # without redefining this function.
        covisit = top_40

    # REMOVE DUPLICATE AIDS AND REVERSE ORDER OF LIST
    aids = list(dict.fromkeys(df.aid.tolist()[::-1]))

    if len(aids) >= n:
        # We have enough events in the test session
        return aids[:n]

    # Append it with AIDs from the co-visitation matrix: every session aid casts
    # one vote for each of its co-visited aids (the stored counts are ignored).
    seen = set(aids)
    votes = Counter(
        aid2
        for aid in aids if aid in covisit
        for aid2, _cnt in covisit[aid]
    )
    top_aids2 = [aid2 for aid2, _votes in votes.most_common(n) if aid2 not in seen]
    return aids + top_aids2[:n - len(aids)]

Extra optimizations are described [here](https://www.kaggle.com/code/cdeotte/test-data-leak-lb-boost/notebook?scriptVersionId=110154433).

In [None]:
%%time
# Rank aids for every test session: order the events, then hand each session's rows
# to suggest_aids. The result is a Series indexed by session whose values are lists of aids.
# NOTE(review): the sort puts "type" before "ts", so rows inside a session are ordered by
# event type first -- confirm this is the intended order for the function being applied.
predictions_df = (
    test
    .sort_values(by=["session", "type", "ts"])
    .groupby('session')
    .apply(suggest_aids)
)
predictions_df.count()

In [None]:
# Preview the per-session predictions: the index is the session, the value is the list of suggested aids.
# NOTE(review): the original note here read "do that to create more instances - if we don't
# have events in a session based on" and stops mid-sentence -- finish or remove it.
predictions_df.head()

In [None]:
# The same predictions are reused for all three event types: the suffix is appended to
# the session label, the Series becomes a one-column frame named "labels", and the
# session label moves from the index into a column.
clicks_pred_df, orders_pred_df, carts_pred_df = [
    pd.DataFrame(predictions_df.add_suffix(type_suffix), columns=["labels"]).reset_index()
    for type_suffix in ("_clicks", "_orders", "_carts")
]

In [None]:
# One row per session: a "<session>_clicks" key plus the list of suggested aids.
clicks_pred_df

In [None]:
# Stack the three per-type frames on top of each other: every session appears three times,
# once each with the _clicks, _orders and _carts suffix, all sharing the same suggested aids.
pred_df = pd.concat(
    [clicks_pred_df, orders_pred_df, carts_pred_df]
)
pred_df


In [None]:
# Rename the two columns to the names that end up in the header of submission.csv.
pred_df.columns = ["session_type", "labels"]
pred_df


In [None]:
# Serialise each list of aids as one space-separated string.
# Values that are already strings are left alone, so re-running this cell does not
# split previously joined labels into single characters.
pred_df["labels"] = pred_df.labels.apply(
    lambda x: x if isinstance(x, str) else " ".join(map(str, x))
)
pred_df

In [None]:
# Column dtypes, non-null counts and memory usage of the frame about to be written.
pred_df.info()


In [None]:
# Write the submission file; index=False keeps the row index out of the CSV.
pred_df.to_csv("submission.csv", index=False)
# This is the format the submission has to be in.
# Next step: generate different kinds of pairs from all of the training data and make a first submission.

#### Continue with another co-visitation approach, based on [this notebook](https://www.kaggle.com/code/radek1/co-visitation-matrix-simplified-imprvd-logic)
Additional information about possible weighting by type [here](https://www.kaggle.com/code/ingvarasgalinskas/item-type-vs-multiple-clicks-vs-latest-items)           
Info about voting ensemble [here](https://www.kaggle.com/code/radek1/2-methods-how-to-ensemble-predictions)              
Info about simple version [here](https://www.kaggle.com/code/tomooinubushi/test-dataset-is-all-we-need/notebook)              

In [None]:
!pip install polars #memory efficient library 
#also good for parallel work with functions like apply, use pandarallel
!pip install pandarallel
from pandarallel import pandarallel
pandarallel.initialize(progress_bar = True)
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import polars as pl
#to read files and ensemble predictions later for all the parquet files 
#works like sql

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Quick look at the sampled training events again; relies on data_sample built in an earlier cell.
data_sample.head(3)