In [1]:

!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25l- \ | / - \ | / - \ | / - \ | / done
[?25h  Getting requirements to build wheel ... [?25l- \ done
[?25h  Preparing metadata (pyproject.toml) ... [?25l- done
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l- \ | / - \ | / - done
[?25h  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp39-cp39-macosx_10_15_x86_64.whl size=568159 sha256=870fbc2a956d5601bb307f98286467fb81960c70dcab352b9a2310da4e6af51d
  Stored i

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import mode
import random

from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

from tqdm import tqdm

  user_id  article_id  scroll_percentage
0  user_1         101                 50
0  user_1         102                 60
0  user_1         103                 70
1  user_2         104                 80
1  user_2         105                 90


In [10]:
# Load the training impressions and the per-user reading history.
df_b = pd.read_parquet('ebnerd_small/train/behaviors.parquet')
df_h = pd.read_parquet('ebnerd_small/train/history.parquet')
print(df_b.shape)

# Diagnostic shapes only: one row per history article / per in-view article.
df_h_e = df_h.explode('article_id_fixed')
print(df_h_e.shape)
df_b_exploded = df_b.explode('article_ids_inview')
print(df_b_exploded.shape)

# Build one (user, article, scroll percentage) row per history entry.
# Exploding both list columns in a single call keeps them paired without a
# row-wise apply/zip and without adding a scratch column to df_h. Rows whose
# lists are empty explode to NaN and are removed by the dropna below.
# NOTE: multi-column explode needs pandas >= 1.3 and raises ValueError if the
# two lists of a row differ in length -- assumes they are aligned; verify.
df_exploded = (
    df_h[['user_id', 'article_id_fixed', 'scroll_percentage_fixed']]
    .explode(['article_id_fixed', 'scroll_percentage_fixed'])
    # Entries without a recorded scroll percentage cannot serve as a rating.
    .dropna(subset=['scroll_percentage_fixed'])
    # explode leaves object-dtype columns; recover numeric dtypes where possible.
    .infer_objects()
    # Rename by name (not by position) to the column names the model cell expects.
    .rename(columns={
        'article_id_fixed': 'article_ids_inview',
        'scroll_percentage_fixed': 'rating',
    })
)

# Display the DataFrame
print(df_exploded)

(232887, 17)
(2426247, 5)
(2585747, 17)
       user_id  article_id  scroll_percentage
0        13538     9738663              100.0
0        13538     9738569               35.0
0        13538     9738663              100.0
0        13538     9738490               24.0
0        13538     9738663              100.0
...        ...         ...                ...
15142  1710834     9770741               20.0
15142  1710834     9770594               43.0
15142  1710834     9728166               99.0
15142  1710834     9769433               99.0
15142  1710834     9770452               48.0

[2171171 rows x 3 columns]


In [11]:
# The scroll percentage is used as the rating, hence the 0-100 scale.
reader = Reader(rating_scale=(0, 100))

data = Dataset.load_from_df(df_exploded[['user_id', 'article_ids_inview', 'rating']], reader)

# Use every history interaction for training (no hold-out is taken from this frame).
trainset = data.build_full_trainset()

# Define the SVD algorithm. Its factors are initialised randomly, so fix the
# seed to make the fitted model (and every number derived from it) reproducible.
algo = SVD(random_state=42)

# Train the algorithm on the training set
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1775d2370>

In [12]:
# Validation impressions. Despite its name, `predicted_article_id` is filled with
# the first *clicked* article of each impression, not with a model prediction.
df_val = pd.read_parquet('ebnerd_small/validation/behaviors.parquet')
df_val = df_val.assign(
    predicted_article_id=df_val['article_ids_clicked'].map(lambda clicked: clicked[0])
)

In [13]:
# Seed for the random tie-break so repeated runs report the same counts.
TIE_BREAK_SEED = 42
# Print the running counters every this many impressions.
PROGRESS_EVERY = 25000


def _print_eval_counters(exact, right, wrong, single_wrong):
    """Print the evaluation counters in the notebook's report format.

    exact        -- impressions with a unique top-scored article that was clicked
    right        -- impressions where a random pick among tied top articles was clicked
    wrong        -- impressions where the chosen article was not clicked
    single_wrong -- subset of `wrong` where the top-scored article was unique
    """
    print(f"Predicted 100%: {exact}")
    print(f"Right: {right}")
    print(f"Wrong: {wrong}")
    print(f"All correct: {exact + right}")
    print(f"Guessed only one, but it is wrong: {single_wrong}")


predicted100p = 0
guessed_right = 0
guessed_wrong = 0
guessed1_but_wrong = 0

# Dedicated RNG: ties between equally scored articles are broken reproducibly.
tie_rng = random.Random(TIE_BREAK_SEED)

impressions = zip(
    df_val["user_id"],
    df_val["article_ids_inview"],
    df_val["article_ids_clicked"],
)

for position, (user, inview, actuals) in enumerate(tqdm(impressions, total=len(df_val))):
    # Plain Python scalars, so raw ids hash/compare like the ids seen in training.
    candidates = np.asarray(inview).tolist()

    # Score every in-view article; the 0 is a placeholder true rating (unused for `est`).
    estimates = [algo.predict(user, article_id, 0).est for article_id in candidates]

    # Keep all articles sharing the highest estimate (e.g. several unseen
    # articles that receive the same score).
    if estimates:
        max_est = max(estimates)
        top_articles = [a for a, est in zip(candidates, estimates) if est == max_est]
    else:
        top_articles = []

    if not top_articles:
        # Nothing in view to recommend: count as a miss instead of crashing.
        guessed_wrong += 1
    elif len(top_articles) == 1:
        if top_articles[0] in actuals:
            predicted100p += 1
        else:
            guessed1_but_wrong += 1
            guessed_wrong += 1
    elif tie_rng.choice(top_articles) in actuals:
        guessed_right += 1
    else:
        guessed_wrong += 1

    # Progress is keyed on the loop position, not on the DataFrame index label.
    if position % PROGRESS_EVERY == 0:
        _print_eval_counters(predicted100p, guessed_right, guessed_wrong, guessed1_but_wrong)

_print_eval_counters(predicted100p, guessed_right, guessed_wrong, guessed1_but_wrong)


61it [00:01, 56.88it/s]

Predicted 100%: 0
Right: 1
Wrong: 0
All correct: 1
Guessed only one, but it is wrong: 0


25215it [00:32, 768.20it/s]

Predicted 100%: 7
Right: 2648
Wrong: 22346
All correct: 2655
Guessed only one, but it is wrong: 994


50175it [01:04, 782.63it/s]

Predicted 100%: 20
Right: 5352
Wrong: 44629
All correct: 5372
Guessed only one, but it is wrong: 1973


75100it [01:35, 792.40it/s]

Predicted 100%: 33
Right: 8074
Wrong: 66894
All correct: 8107
Guessed only one, but it is wrong: 2970


100244it [02:06, 827.19it/s]

Predicted 100%: 39
Right: 10861
Wrong: 89101
All correct: 10900
Guessed only one, but it is wrong: 4009


125110it [02:36, 842.58it/s]

Predicted 100%: 54
Right: 13592
Wrong: 111355
All correct: 13646
Guessed only one, but it is wrong: 5009


150253it [03:11, 835.41it/s]

Predicted 100%: 70
Right: 16242
Wrong: 133689
All correct: 16312
Guessed only one, but it is wrong: 6024


175170it [03:42, 841.26it/s]

Predicted 100%: 80
Right: 18935
Wrong: 155986
All correct: 19015
Guessed only one, but it is wrong: 6957


200250it [04:13, 844.84it/s]

Predicted 100%: 94
Right: 21587
Wrong: 178320
All correct: 21681
Guessed only one, but it is wrong: 8029


225156it [04:43, 821.82it/s]

Predicted 100%: 109
Right: 24163
Wrong: 200729
All correct: 24272
Guessed only one, but it is wrong: 9048


244647it [05:08, 792.50it/s]

Predicted 100%: 120
Right: 26281
Wrong: 218246
All correct: 26401





In [14]:
# Overall hit rate: correctly chosen impressions over all evaluated impressions.
n_correct = predicted100p + guessed_right
n_evaluated = n_correct + guessed_wrong
print(n_correct / n_evaluated)


0.10791466889027865
