In [1]:

!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25l- \ | / - \ | / - \ | / - \ | / done
[?25h  Getting requirements to build wheel ... [?25l- \ done
[?25h  Preparing metadata (pyproject.toml) ... [?25l- done
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l- \ | / - \ | / - done
[?25h  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp39-cp39-macosx_10_15_x86_64.whl size=568159 sha256=870fbc2a956d5601bb307f98286467fb81960c70dcab352b9a2310da4e6af51d
  Stored i

In [53]:
import pandas as pd
import numpy as np
from scipy.stats import mode
import random

from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

from tqdm import tqdm

In [58]:

df = pd.read_parquet('ebnerd_small/train/behaviors.parquet')

df['predicted_article_id'] = df['article_ids_clicked'].apply(lambda ids: ids[0])
df_exploded = df.explode('article_ids_inview')

# Add a column to indicate if the article was recommended
df_exploded['rating'] = (df_exploded['predicted_article_id'] == df_exploded['article_ids_inview']).astype(int)

def find_mode(series):
    return mode(series).mode

mode_ratings = df_exploded.groupby(['user_id', 'article_ids_inview'])['rating'].agg(find_mode).reset_index()

# Rename columns for clarity
mode_ratings.columns = ['user_id', 'article_ids_inview', 'rating']

In [61]:
reader = Reader(rating_scale=(0, 1))

data = Dataset.load_from_df(mode_ratings[['user_id', 'article_ids_inview', 'rating']], reader)

trainset = data.build_full_trainset()

# Define the SVD algorithm
algo = SVD()

# Train the algorithm on the training set
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2110dc820>

In [34]:


df_val = pd.read_parquet('ebnerd_small/validation/behaviors.parquet')

df_val['predicted_article_id'] = df_val['article_ids_clicked'].apply(lambda ids: ids[0])
df_exploded_val = df_val.explode('article_ids_inview')

# Add a column to indicate if the article was recommended
df_exploded_val['rating'] = (df_exploded_val['predicted_article_id'] == df_exploded_val['article_ids_inview']).astype(int)

def find_mode(series):
    return mode(series).mode

mode_ratings_val = df_exploded_val.groupby(['user_id', 'article_ids_inview'])['rating'].agg(find_mode).reset_index()

# Rename columns for clarity
mode_ratings_val.columns = ['user_id', 'article_ids_inview', 'rating']

KeyboardInterrupt: 

In [24]:
mode_ratings_val2 = list(mode_ratings_val.itertuples(index=False, name=None))


In [35]:
# Predict ratings for the test set
predictions_val = algo.test(mode_ratings_val2)

# Calculate and print the accuracy (RMSE)
accuracy.rmse(predictions_val)

RMSE: 0.2654


0.26544084878215846

In [62]:
df_val = pd.read_parquet('ebnerd_small/validation/behaviors.parquet')

df_val['predicted_article_id'] = df_val['article_ids_clicked'].apply(lambda ids: ids[0])

In [63]:
predicted100p = 0
guessed_right = 0
guessed_wrong = 0
guessed1_but_wrong = 0
for i, row in tqdm(df_val.iterrows()):
    user = row["user_id"]
    inview = row["article_ids_inview"]
    actuals = row["article_ids_clicked"]
    exploded = pd.DataFrame({
    'user_id': [user] * len(inview),
    'article_ids_inview': inview,
    'fake': 0
    })
    for_pred = list(exploded.itertuples(index=False, name=None))

    predictions_val = algo.test(for_pred)
    predictions_df_ans = pd.DataFrame([(pred.uid, pred.iid, pred.est) for pred in predictions_val],
                              columns=['uid', 'iid', 'est'])
    predictions_df_ans = predictions_df_ans.sort_values(by='est', ascending=False).reset_index(drop=True)
    max_est = predictions_df_ans['est'].max()
    filtered_df = predictions_df_ans[predictions_df_ans['est'] == max_est].reset_index(drop=True)

    if len(filtered_df['iid']) == 1 and filtered_df['iid'][0] in actuals:
        predicted100p += 1
    elif len(filtered_df['iid']) == 1:
        guessed1_but_wrong += 1
        guessed_wrong += 1
    elif random.choice(filtered_df['iid']) in actuals:
        guessed_right += 1
    else:
        guessed_wrong += 1
    if i % 25000 == 0:
        print(f"Predicted 100%: {predicted100p}")
        print(f"Right: {guessed_right}")
        print(f"Wrong: {guessed_wrong}")
        print(f"All correct: {predicted100p + guessed_right}")
        print(f"Guessed only one, but it is wrong: {guessed1_but_wrong}")

print(f"Predicted 100%: {predicted100p}")
print(f"Right: {guessed_right}")
print(f"Wrong: {guessed_wrong}")
print(f"All correct: {predicted100p + guessed_right}")


123it [00:01, 162.47it/s]

Predicted 100%: 0
Right: 0
Wrong: 1
All correct: 0
Guessed only one, but it is wrong: 0


25150it [00:35, 746.51it/s]

Predicted 100%: 427
Right: 2561
Wrong: 22013
All correct: 2988
Guessed only one, but it is wrong: 7154


50207it [01:11, 706.88it/s]

Predicted 100%: 913
Right: 5067
Wrong: 44021
All correct: 5980
Guessed only one, but it is wrong: 14531


75117it [01:47, 755.44it/s]

Predicted 100%: 1365
Right: 7673
Wrong: 65963
All correct: 9038
Guessed only one, but it is wrong: 21621


100074it [02:21, 541.39it/s]

Predicted 100%: 1857
Right: 10198
Wrong: 87946
All correct: 12055
Guessed only one, but it is wrong: 28434


125091it [02:56, 725.17it/s]

Predicted 100%: 2319
Right: 12811
Wrong: 109871
All correct: 15130
Guessed only one, but it is wrong: 35613


150125it [03:32, 633.60it/s]

Predicted 100%: 2795
Right: 15429
Wrong: 131777
All correct: 18224
Guessed only one, but it is wrong: 42414


175222it [04:07, 738.08it/s]

Predicted 100%: 3285
Right: 17955
Wrong: 153761
All correct: 21240
Guessed only one, but it is wrong: 49717


200183it [04:42, 749.77it/s]

Predicted 100%: 3669
Right: 20590
Wrong: 175742
All correct: 24259
Guessed only one, but it is wrong: 56473


225173it [05:17, 721.09it/s]

Predicted 100%: 4139
Right: 23224
Wrong: 197638
All correct: 27363
Guessed only one, but it is wrong: 63410


244647it [05:43, 711.57it/s]

Predicted 100%: 4443
Right: 25237
Wrong: 214967
All correct: 29680





In [64]:
print ((predicted100p + guessed_right) / (predicted100p + guessed_right + guessed_wrong))


0.12131765359885877
