In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from src.preprocess.utils import rename_cols
from src.preprocess.pipeline import preprocess

from src.splits import LeaveOneOutSplitter, GlobalTimeSplitter

from src.stats.base import base_stats, compare_subsets
from src.stats.duplicates import duplicates_stats
from src.stats.leaks import get_leaks, leak_counts
from src.stats.cold import get_cold, cold_counts, cold_stats
from src.stats.temporal import time_counts
from src.stats.plots import plot_inters_px, plot_inters_scatter

In [110]:
# Load the raw Diginetica interactions and map the source columns onto the
# names used throughout this notebook
# (sessionId -> user_id, itemId -> item_id, datetime -> timestamp).
raw = rename_cols(
    pd.read_csv('data/Diginetica/raw.csv'),
    user_id='sessionId',
    item_id='itemId',
    timestamp='datetime',
)

# Base statistics

In [111]:
# Display basic statistics of the raw dataset: number of users, items and
# interactions, average sequence length, density and timeframe.
base_stats(raw)

Unnamed: 0,n_users,n_items,n_interactions,avg_seq_length,density,timeframe
0,310324,122993,1235380,3.980936,3.2e-05,13133990.0


As can be seen, the data is very sparse (density of about 3.2e-05). At the same time, the average sequence length per user is quite short (about 4 interactions), making it challenging to train sequential models effectively due to insufficient context.



# Duplicates analysis

In [112]:
# Analyze duplicates in the raw dataset: consecutive item duplicates and
# non-unique item interactions (user counts, shares and per-user averages).
duplicates_stats(raw)

Unnamed: 0,Consecutive item duplicates,Non-unique item interactions
Number of Users,50057.0,86439.0
Share of Users,0.161306,0.278544
Avg. Number per user,1.443654,1.947188
Avg. Share per user,0.294775,0.307073


The Diginetica dataset contains a noticeable number of duplicate interactions: 16% of users have consecutive item repeats, and 28% have item repeats in general.

On average, users with consecutive duplicates have around 1.4 repeated items, which make up about 29% of their interactions. This could be problematic for sequential models, as they might end up learning to predict the same item multiple times.

Let's apply a 5-core filter to drop short sequences, and remove consecutive item repeats.

In [113]:
# Preprocessing: 5-core filter to have longer sequences, remove consecutive duplicates.
# drop_conseq_repeats=True requests removal of consecutive item repeats.
# NOTE(review): no core-filter argument is passed here, so the 5-core threshold is
# presumably a default of `preprocess` — confirm against src.preprocess.pipeline.
preprocessed = preprocess(data=raw, relevance=None, drop_conseq_repeats=True, verbose=False)

In [114]:
# Analyze duplicates in the preprocessed dataset to check the effect of
# preprocessing with drop_conseq_repeats=True.
duplicates_stats(preprocessed)

Unnamed: 0,Consecutive item duplicates,Non-unique item interactions
Number of Users,0.0,30845.0
Share of Users,0.0,0.503354
Avg. Number per user,0.0,2.078003
Avg. Share per user,0.0,0.236929


The updated statistics show that consecutive item duplicates have been completely removed (0% of users). 

However, we still see plenty of non-consecutive repeats. About half of all users (50.3%) revisit items at some point, with an average of about 2 repeats per user. Interestingly, while a larger share of users shows this behavior now compared to before (50.3% vs. 27.9%), these repeats make up a smaller portion of their overall activity (23.7% vs. the previous 30.7%). This is probably due to increased sequence lengths and improved variability.

Finally, the near-elimination of timestamp duplicates confirms better data integrity.

In [115]:
# Compare base statistics for both data variants side by side;
# rows of the resulting table are labelled with subset_names.
compare_subsets([raw, preprocessed], subset_names=['Raw', 'Prepr.'])

Unnamed: 0,n_users,n_items,n_interactions,avg_seq_length,density,timeframe
Raw,310324.0,122993.0,1235380.0,3.980936,3.2e-05,13133990.0
Prepr.,61279.0,25593.0,485903.0,7.929356,0.00031,13133970.0


After preprocessing, we still have a sufficient number of user-item interactions to properly train a model. Moreover, the data is now cleaner for sequential modeling: the consecutive-repeat noise is gone, while signals of revisitation behavior are preserved.

# Splitting: Leave-One-Out

Let's consider leave-last-out splitting, one of the most popular splitting strategies. In this setup, the data is divided as follows:
- Train: All interactions except the last two.
- Validation holdout: The second-to-last interaction.
- Test holdout: The last interaction.


In [116]:
# Leave-one-out split of the preprocessed interactions. The splitter returns
# five parts, unpacked in order: train, validation input/target, test input/target.
splitter = LeaveOneOutSplitter()
(
    train,
    validation_input,
    validation_target,
    test_input,
    test_target,
) = splitter.split(preprocessed)

# Data leakage analysis

Let's analyze the presence of data leaks in the resulting split. A leak occurs when a test or validation interaction takes place on or before the most recent timestamp of the same item_id in the reference_data (usually the training set).

In [117]:
# Detect leaks between test_target and train and preview the first 5 rows.
# Each row carries the interaction timestamp, the reference maximum
# (timestamp_ref_max) and the resulting is_leak flag.
get_leaks(test_target, train).head(5)

Unnamed: 0,user_id,item_id,timestamp,timestamp_ref_max,is_leak
0,0,9743,2016-05-09 00:18:32.408000000,2016-05-11 00:06:52.336999936,True
1,1,22666,2016-05-09 00:17:25.017999872,2016-05-24 00:00:22.652000000,True
2,2,11812,2016-05-09 00:07:12.980000000,2016-05-24 00:00:21.941999872,True
3,3,4488,2016-04-09 00:13:43.477999872,2016-05-24 00:17:20.144999936,True
4,4,18827,2016-04-03 00:07:53.943000064,2016-05-12 00:18:10.152000000,True


In [118]:
# Count total number of leaks: 89% of test_target interactions are leaks.
# The result holds total_interactions, leak_interactions and leak_share.
leak_counts(test_target, train)

{'total_interactions': 61279,
 'leak_interactions': 54550,
 'leak_share': 0.8901907668206074}

In [119]:
# Count number of leaks per month in the test_target subset. The third
# positional argument sets the time granularity ('m' = month; other
# granularities may be used instead).
leak_counts(test_target, train, 'm')

{'total_interactions': timestamp
 2016-01-31      273
 2016-02-29     9355
 2016-03-31    13043
 2016-04-30    19100
 2016-05-31    18809
 2016-06-30      699
 Freq: M, Name: item_id, dtype: int64,
 'leak_interactions': timestamp
 2016-01-31      273
 2016-02-29     9272
 2016-03-31    12665
 2016-04-30    17942
 2016-05-31    14347
 2016-06-30       51
 Freq: M, Name: is_leak, dtype: int64,
 'leak_share': timestamp
 2016-01-31    1.000000
 2016-02-29    0.991128
 2016-03-31    0.971019
 2016-04-30    0.939372
 2016-05-31    0.762773
 2016-06-30    0.072961
 Freq: M, dtype: float64}

# Splitting: Global Temporal Split

Due to data leakage, the model gains access to future or otherwise unavailable information, which may cause unrealistically high performance during evaluation but poor real-world generalization.

To prevent that, let's consider global temporal split that divides the dataset **based on timestamps** rather than user sessions. 
Here, all interactions before a certain quantile of the global timestamp distribution are assigned to the **training set**, while the rest are used for **validation** and **test**. This approach better reflects real-world scenarios, where future events should not influence model training.

By default, `target_type="all"`, which means that the holdout set includes all interactions occurring after the temporal threshold. In this setup, the test set consists of users who have at least one interaction before the threshold (input part) and one or more interactions after it (target part).

In [120]:
# Global temporal split: quantile=0.9 uses the 90th percentile as the
# threshold, and target_type='all' is passed explicitly.
splitter = GlobalTimeSplitter(quantile=0.9, target_type='all')
(
    train,
    validation_input,
    validation_target,
    test_input,
    test_target,
) = splitter.split(preprocessed)

# Leak check of the new test target against train (shown as the cell output).
leak_counts(test_target, train)

{'total_interactions': 48591, 'leak_interactions': 0, 'leak_share': 0.0}

Indeed, we managed to eliminate data leakage! 

But what about data statistics?

In [121]:
# Compare the split parts with the full preprocessed dataset: for every
# statistic the table shows the absolute value and its percentage of the
# corresponding value in `preprocessed`.
compare_subsets([test_target, test_input, train], preprocessed, subset_names=['test target', 'test input', 'train'])

Unnamed: 0_level_0,n_users,n_users,n_items,n_items,n_interactions,n_interactions,avg_seq_length,avg_seq_length,density,density,timeframe,timeframe
Unnamed: 0_level_1,Abs. value,%,Abs. value,%,Abs. value,%,Abs. value,%,Abs. value,%,Abs. value,%
test target,6346.0,10.36,13885.0,54.25,48591.0,10.0,7.656949,96.56,0.000551,177.99,777849.6,5.92
test input,216.0,0.35,1182.0,4.62,1541.0,0.32,7.134259,89.97,0.006036,1948.12,942.777,0.01
train,54648.0,89.18,25575.0,99.93,433360.0,89.19,7.930025,100.01,0.00031,100.08,12356120.0,94.08


# Cold entities analysis

Since user interaction sequences in the Diginetica dataset are relatively short in time, only a small number of users have interactions that occur before the temporal threshold and can therefore serve as input.

The remaining users lack any historical activity prior to the split and consequently provide no usable context — these are referred to as **cold users**, meaning users who appear only in the holdout (test) portion of the data without prior interactions in the training set.


In [122]:
# Get cold flags for users: test_target compared to train. The is_cold column
# marks the flagged rows; only the first 5 rows are shown.
get_cold(test_target, train, 'user_id').head(5)

Unnamed: 0,user_id,item_id,timestamp,is_cold
356,37,2820,1464134000.0,True
357,37,2697,1464135000.0,True
358,37,10897,1464135000.0,True
359,37,5301,1464135000.0,True
360,37,5510,1464135000.0,True


In [123]:
# Cold user / cold item statistics of test_target relative to train.
# Too many cold users: they account for 99% of interactions.
cold_stats(test_target, train)

Unnamed: 0,Number,Share (by count),Share (by interactions)
Cold Users,6133,0.966436,0.990101
Cold Items,18,0.001296,0.002387


In [124]:
# Get the number and share of cold interactions in test_target, where
# coldness is determined on user_id relative to train.
cold_counts(test_target, train, 'user_id')

{'total_interactions': 48591,
 'cold_interactions': 48110,
 'cold_share': 0.9901010475190879}

## Adjusting for cold users and items

We attempt to **remove cold users and items** entirely from the test split to create a cleaner evaluation scenario.  
However, this usually reduces the number of interactions drastically, so we’ll verify how much data remains after such filtering.

In [125]:
# Same 0.9-quantile temporal split, now asking the splitter to remove both
# cold users and cold items.
splitter = GlobalTimeSplitter(
    quantile=0.9,
    remove_cold_users=True,
    remove_cold_items=True,
)
(
    train,
    validation_input,
    validation_target,
    test_input,
    test_target,
) = splitter.split(preprocessed)

In [126]:
# Check how much data remains after cold-entity removal: absolute values and
# percentages of the corresponding statistics of `preprocessed`.
compare_subsets([test_target, test_input, train], preprocessed, subset_names=['test target', 'test input', 'train'])

Unnamed: 0_level_0,n_users,n_users,n_items,n_items,n_interactions,n_interactions,avg_seq_length,avg_seq_length,density,density,timeframe,timeframe
Unnamed: 0_level_1,Abs. value,%,Abs. value,%,Abs. value,%,Abs. value,%,Abs. value,%,Abs. value,%
test target,216.0,0.35,454.0,1.77,492.0,0.1,2.277778,28.73,0.005017,1619.34,249.288,0.0
test input,216.0,0.35,1182.0,4.62,1541.0,0.32,7.134259,89.97,0.006036,1948.12,942.777,0.01
train,54648.0,89.18,25575.0,99.93,433360.0,89.19,7.930025,100.01,0.00031,100.08,12356120.0,94.08


In [127]:
# Re-check cold users and items in test_target relative to train after the
# split with remove_cold_users=True and remove_cold_items=True.
cold_stats(test_target, train)

Unnamed: 0,Number,Share (by count),Share (by interactions)
Cold Users,3,0.013889,0.02439
Cold Items,0,0.0,0.0


In [128]:
# Number and share of test_target interactions still flagged as cold on
# user_id relative to train.
cold_counts(test_target, train, 'user_id')

{'total_interactions': 492,
 'cold_interactions': 12,
 'cold_share': 0.024390243902439025}

## Alternative approach: Changing target type

Instead of filtering out cold entities, we modify the **target selection strategy**.


By setting `target_type='random'`, a random interaction is chosen as the target for each user, while all preceding interactions are used as input.  
This approach preserves enough interaction history in the test_input set for meaningful prediction, while still maintaining overall temporal consistency.

Other possible target types are `first` and `last`, which select the earliest or most recent interaction after the threshold as the target, respectively.

In [129]:
# 0.9-quantile temporal split with a randomly chosen target per user; cold
# items are removed, cold users are kept.
# NOTE(review): target_type='random' makes the split stochastic and no seed is
# set here — if GlobalTimeSplitter accepts a seed argument, pass it so that
# the outputs below are reproducible (confirm against the splitter signature).
splitter = GlobalTimeSplitter(
    quantile=0.9,
    remove_cold_items=True,
    target_type='random',
)
(
    train,
    validation_input,
    validation_target,
    test_input,
    test_target,
) = splitter.split(preprocessed)

In [130]:
cold_stats(test_target, train) # compare test_target against train

Unnamed: 0,Number,Share (by count),Share (by interactions)
Cold Users,6133,0.966436,0.966436
Cold Items,0,0.0,0.0


In [131]:
# Compare test_target against test_input: cold users/items here are those
# flagged as cold relative to the test input rather than to train.
cold_stats(test_target, test_input)

Unnamed: 0,Number,Share (by count),Share (by interactions)
Cold Users,0,0.0,0.0
Cold Items,1357,0.29936,0.232903


As expected, some users in the target set now have all of their interactions occurring after the temporal threshold. These users appear in the `test_input` portion but are absent from the training set.

In [132]:
# Statistics of the random-target split parts: absolute values and
# percentages of the corresponding statistics of `preprocessed`.
compare_subsets([test_target, test_input, train], preprocessed, subset_names=['test target', 'test input', 'train'])

Unnamed: 0_level_0,n_users,n_users,n_items,n_items,n_interactions,n_interactions,avg_seq_length,avg_seq_length,density,density,timeframe,timeframe
Unnamed: 0_level_1,Abs. value,%,Abs. value,%,Abs. value,%,Abs. value,%,Abs. value,%,Abs. value,%
test target,6346.0,10.36,4533.0,17.71,6346.0,1.31,1.0,12.61,0.000221,71.2,777846.1,5.92
test input,6346.0,10.36,9895.0,38.66,25735.0,5.3,4.05531,51.14,0.00041,132.28,778723.6,5.93
train,54648.0,89.18,25575.0,99.93,433360.0,89.19,7.930025,100.01,0.00031,100.08,12356120.0,94.08


# Temporal statistics

Here, we observe another potential issue — the Diginetica dataset covers only the first six months of the year. This limited time span hinders recommendation quality in the long run, as it prevents the model from capturing seasonal patterns or yearly trends.

In [133]:
# Share of train interactions per month (normalize=True yields fractions
# rather than raw counts).
time_counts(train, granularity='m', normalize=True)

Unnamed: 0,month,n_inters
0,1,0.004724
1,2,0.168996
2,3,0.232656
3,4,0.352058
4,5,0.241566


What’s even more problematic is that the target set contains interactions from June — a month that never appears in the training data.

In [134]:
# Share of test_target interactions per month, for comparison with the
# monthly distribution of train.
time_counts(test_target, granularity='m', normalize=True)

Unnamed: 0,month,n_inters
0,5,0.889852
1,6,0.110148


In [135]:
# Plot train and test-target interactions over time at monthly granularity,
# labelled "Train" and "Test Target".
plot_inters_px([train, test_target], granularity="m", labels=["Train", "Test Target"])