In [15]:
import pandas as pd
import numpy as np
import pickle
import os
import sys
from contextlib import contextmanager
from tqdm import tqdm
tqdm.pandas()
# Load model
from tensorflow.keras.models import load_model
import tensorflow

import logging
tensorflow.get_logger().setLevel(logging.ERROR)


In [16]:
@contextmanager
def suppress_stdout_stderr():
    """
    A context manager to suppress stdout and stderr.
    """
    with open(os.devnull, 'w') as devnull:
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = devnull
        sys.stderr = devnull
        try:
            yield
        finally:
            sys.stdout = old_stdout
            sys.stderr = old_stderr

In [17]:
#Load model


model = load_model('./files/models/model_20240706-115948.h5py')

# Read interaction matrix pickle file
size = 'demo'
type_ = 'validation'
fillna_value = '0'
interaction_matrix_file_path = f'./files/pickle/interaction_matrix_{type_}_{size}_{fillna_value}.pkl'
interaction_matrix_df = pd.read_pickle(interaction_matrix_file_path)
print('Interaction matrix df shape:                      ',interaction_matrix_df.shape)

# Read User and article embeddings dataframes
user_matrix_df_file_path = f'./files/pickle/user_matrix_{type_}_{size}_{fillna_value}.pkl'
article_matrix_df_file_path = f'./files/pickle/article_matrix_{type_}_{size}_{fillna_value}.pkl'

user_matrix_df = pd.read_pickle(user_matrix_df_file_path)
article_matrix_df = pd.read_pickle(article_matrix_df_file_path)

# Read history and behavior files
behaviors_path = f'./files/parquet/ebnerd_{size}/{type_}/behaviors.parquet'
behaviors_df = pd.read_parquet(behaviors_path)
print(f'Raw {size} validation behaviors df shape:          ',behaviors_df.shape)

# Load the recoammendation list
# factorized_size = 'small'
# fillnan_value = 'mean_column'
#recommendations_full_lst_file_path = f'./files/pickle/recommendations_behaviors_{val_size}_factorized_{factorized_size}_fillnan_{fil lnan_value}_new.pkl'
# with open(recommendations_full_lst_file_path, 'rb') as f:
#    recommendations_full_lst = pickle.load(f)

Interaction matrix df shape:                       (1562, 1144)
Raw demo validation behaviors df shape:           (25356, 17)


In [18]:
behaviors_df.head(2)

Unnamed: 0,impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,next_read_time,next_scroll_percentage
0,144772,,2023-05-30 14:21:34,29.0,,2,"[9788239, 9780702, 9553264, 9787499, 6741781, ...",[9783042],76658,False,,,,False,29,7.0,59.0
1,144777,,2023-05-30 14:22:11,10.0,,2,"[9788521, 9786217, 9553264, 9788361, 9788352, ...",[9788125],76658,False,,,,False,29,58.0,98.0


In [19]:
behaviors_df = behaviors_df[['user_id', 'article_ids_inview', 'article_ids_clicked']].copy()
behaviors_df.head(2)

Unnamed: 0,user_id,article_ids_inview,article_ids_clicked
0,76658,"[9788239, 9780702, 9553264, 9787499, 6741781, ...",[9783042]
1,76658,"[9788521, 9786217, 9553264, 9788361, 9788352, ...",[9788125]


In [20]:
article_matrix_df.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
6741781,-0.127794,-0.147906,-0.10521,0.153051,-0.03779,-0.086721,-0.058904,-0.164646,0.055883,-0.075121,...,-0.001194,0.003516,0.003031,-0.002613,0.000239,0.004105,-0.009573,0.00366,0.002319,0.000517
9052390,-0.715679,-0.178794,0.088663,0.388322,0.321916,-0.274348,0.212475,-0.957596,-0.539658,-0.727945,...,-0.71121,0.158471,0.344084,-0.163557,-0.014942,0.244927,-0.568294,0.214493,0.286194,0.128811


In [21]:
user_matrix_df.head(2)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19181,-0.780059,-2.054111,-1.69678,0.746559,0.015565,-0.043905,-0.717221,-1.531689,-0.24802,-1.501236,...,-0.048026,0.095239,0.101396,-0.059632,0.249939,0.147139,-0.229079,0.128577,0.157083,0.034539
21271,-0.115381,0.06531,0.226244,-0.07212,0.069986,-0.060279,0.165075,-0.007987,-0.086834,-0.079301,...,0.092411,0.124033,0.111425,-0.165869,0.050325,-0.008703,-0.104547,0.033624,0.190335,0.014424


In [22]:
# Example for one row
X = np.hstack((user_matrix_df.loc[19181], article_matrix_df.loc[6741781]))
X= X.reshape((-1, 600))
model.predict(X)[0][0]

16.257328

In [23]:
# Function predict_read_time
def predict_read_time(model,article_id_list,user_id):
    with suppress_stdout_stderr():
        expected_read_times = []
        for article_id in article_id_list:
            try:
                vector = np.hstack((user_matrix_df.loc[user_id], article_matrix_df.loc[article_id]))
                vector= vector.reshape((-1, 600))
                try:
                    predict= model.predict(vector)[0][0]
                except:
                    predict =0
                expected_read_times.append(predict)
            except:
                pass
        return expected_read_times


def sort_zip_lists(row):
    # Zip the two lists
    zipped = list(zip(row['article_ids_inview'], row['Predicted_read_times']))
    # Sort by the second element of each tuple
    sorted_zipped = sorted(zipped, key=lambda x: x[1],reverse=True)[:9]
    return sorted_zipped

def get_final_predicted_article_ids(tuples_list):
    # Extract the first item from each tuple
    return [t[0] for t in tuples_list]

In [24]:
amount = 1000
predictions_df_top100 = behaviors_df.head(amount).copy()

In [25]:
predictions_df_top100['Predicted_read_times']= predictions_df_top100.progress_apply(lambda row: predict_read_time(model,row['article_ids_inview'],row['user_id']),axis=1)

100%|██████████| 1000/1000 [04:08<00:00,  4.03it/s]


In [12]:
predictions_df_top100['Predicted_tuples_sorted'] = predictions_df_top100.progress_apply(sort_zip_lists, axis=1)


100%|██████████| 100/100 [00:00<00:00, 49286.77it/s]


In [13]:
predictions_df_top100['Predicted_article_ids'] = predictions_df_top100['Predicted_tuples_sorted'].progress_apply(get_final_predicted_article_ids)

100%|██████████| 100/100 [00:00<00:00, 99414.65it/s]


In [26]:
predictions_df_top100.head()

Unnamed: 0,user_id,article_ids_inview,article_ids_clicked,Predicted_read_times
0,76658,"[9788239, 9780702, 9553264, 9787499, 6741781, ...",[9783042],"[23.87838, 21.764265, 20.772623, 24.495409, 11..."
1,76658,"[9788521, 9786217, 9553264, 9788361, 9788352, ...",[9788125],"[14.02666, 13.230712, 20.772623, 12.180795, 18..."
2,760446,"[9279095, 9784273, 9784275, 9784506, 9784444, ...",[9782806],"[16.768415, 16.131424, 21.56328, 20.98611, 21...."
3,760446,"[9784575, 9784607, 9784559, 9784662, 9783852, ...",[9782656],"[23.246275, 16.398487, 21.16556, 22.175184, 29..."
4,760446,"[9784137, 9784298, 9779370, 9782517, 9777324, ...",[9777324],"[14.078645, 21.498259, 18.818512, 26.8837, 17...."


In [14]:
predictions_df_path = f'./files/pickle/predictions_df_{size}_{type_}_{str(amount)}.pkl'

predictions_df_top100.to_pickle(predictions_df_path)

#### Other

In [None]:
# # ----- NOT SURE IF THESE TRANSFORMATION ARE NEEDED NEITHER IF WE MAKE THE RECOMMENDATIONS IN THIS DATASET --------
# behaviors_val_df = behaviors_val_df[['user_id','article_ids_inview', 'article_ids_clicked']]
# behaviors_val_df = behaviors_val_df.explode('article_ids_clicked')

# behaviors_val_grouped_clicked = df = behaviors_val_df.groupby('user_id')['article_ids_clicked'].apply(list).reset_index()

# behaviors_val_df = behaviors_val_df.explode('article_ids_inview')

# behaviors_val_grouped_inview_df = behaviors_val_df.groupby('user_id')['article_ids_inview'].apply(list).reset_index()

# behaviors_val_df = pd.merge(behaviors_val_grouped_inview_df, behaviors_val_grouped_clicked, on='user_id', how='inner')

# behaviors_val_df['article_ids_inview_setted_lst'] = behaviors_val_df['article_ids_inview'].apply(lambda lst: list(set(lst)))

In [None]:
# print(behaviors_val_df.shape)
# behaviors_val_df.head(2)

In [90]:
# article_ids_clicked_lst = behaviors_val_df['article_ids_clicked'].tolist()
# user_ids_lst = behaviors_val_df['user_id'].tolist()

In [91]:
# counter, precisions, recalls, ndcgs, K = 0, [], [], [], 10



# for user_id, clicked_lst, recommends_lst  in tqdm(list(zip(user_ids_lst, article_ids_clicked_lst, recommendations_full_lst))):
    
#     y_true = clicked_lst
#     y_pred = recommends_lst#[:K]

#     precision = len(set(y_true).intersection(set(y_pred))) / len(y_pred) if len(y_pred) > 0 else 0
#     recall = len(set(y_true).intersection(set(y_pred))) / len(y_true) if len(y_true) > 0 else 0

#     precisions.append(precision)
#     recalls.append(recall)
#     #ndcgs.append(ndcg_score([y_true], [y_pred], k=K))

#     if precision == 0:
#         counter +=1

#     print(f"User id: {user_id}, Length: {len(recommends_lst)}, Percision: {precision}, Recall: {recall}  ")

# print({
#     'precision@K': sum(precisions) / len(precisions)
#     ,'recall@K': sum(recalls) / len(recalls)
#     #,'ndcg@K': sum(ndcgs) / len(ndcgs)
#     })