In [74]:
import pandas as pd
import numpy as np
import pickle
import os
import sys
from contextlib import contextmanager
from tqdm import tqdm
tqdm.pandas()
# Load model
from tensorflow.keras.models import load_model
import tensorflow

import logging
tensorflow.get_logger().setLevel(logging.ERROR)


In [75]:
@contextmanager
def suppress_stdout_stderr():
    """
    A context manager to suppress stdout and stderr.
    """
    with open(os.devnull, 'w') as devnull:
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = devnull
        sys.stderr = devnull
        try:
            yield
        finally:
            sys.stdout = old_stdout
            sys.stderr = old_stderr

In [76]:
#Load model

model = load_model('./files/models/model_20240708-215323.h5py')

# Read interaction matrix pickle file
size = 'demo'
type_ = 'validation'
fillna_value = '0'
interaction_matrix_file_path = f'./files/pickle/interaction_matrix_{type_}_{size}_{fillna_value}.pkl'
interaction_matrix_df = pd.read_pickle(interaction_matrix_file_path)
print('Interaction matrix df shape:                      ',interaction_matrix_df.shape)

# Read User and article embeddings dataframes
user_matrix_df_file_path = f'./files/pickle/user_matrix_{type_}_{size}_{fillna_value}.pkl'
article_matrix_df_file_path = f'./files/pickle/article_matrix_{type_}_{size}_{fillna_value}.pkl'

user_matrix_df = pd.read_pickle(user_matrix_df_file_path)
article_matrix_df = pd.read_pickle(article_matrix_df_file_path)

# Read history and behavior files
behaviors_path = f'./files/parquet/ebnerd_{size}/{type_}/behaviors.parquet'
behaviors_df = pd.read_parquet(behaviors_path)
print(f'Raw {size} validation behaviors df shape:          ',behaviors_df.shape)

# Load the recoammendation list
# factorized_size = 'small'
# fillnan_value = 'mean_column'
#recommendations_full_lst_file_path = f'./files/pickle/recommendations_behaviors_{val_size}_factorized_{factorized_size}_fillnan_{fil lnan_value}_new.pkl'
# with open(recommendations_full_lst_file_path, 'rb') as f:
#    recommendations_full_lst = pickle.load(f)

Interaction matrix df shape:                       (1562, 1144)
Raw demo validation behaviors df shape:           (25356, 17)


In [77]:
behaviors_df.head(2)

Unnamed: 0,impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,next_read_time,next_scroll_percentage
0,144772,,2023-05-30 14:21:34,29.0,,2,"[9788239, 9780702, 9553264, 9787499, 6741781, ...",[9783042],76658,False,,,,False,29,7.0,59.0
1,144777,,2023-05-30 14:22:11,10.0,,2,"[9788521, 9786217, 9553264, 9788361, 9788352, ...",[9788125],76658,False,,,,False,29,58.0,98.0


In [78]:
behaviors_df = behaviors_df[['user_id', 'article_ids_inview', 'article_ids_clicked']].copy()
behaviors_df.head(2)

Unnamed: 0,user_id,article_ids_inview,article_ids_clicked
0,76658,"[9788239, 9780702, 9553264, 9787499, 6741781, ...",[9783042]
1,76658,"[9788521, 9786217, 9553264, 9788361, 9788352, ...",[9788125]


In [79]:
article_matrix_df.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
9777026,-0.906362,0.184629,-0.077364,0.141804,0.126838,0.026483,0.069664,-0.248059,-0.191986,0.643667,...,0.145201,0.044429,0.076704,-0.035611,0.017898,0.030903,-0.119346,0.139691,-0.088009,0.100579
9777705,-0.140725,-0.121132,0.095686,-0.040868,-0.336481,0.222581,0.075044,-0.307448,-0.115687,-0.137366,...,0.523301,0.411797,0.485202,-0.124307,-0.487193,-0.801425,-1.817942,0.447228,-0.520895,0.084514


In [80]:
user_matrix_df.head(2)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
750497,-0.13153,-0.032818,0.003344,0.079261,-0.024774,-0.028753,0.068344,0.139933,-0.084523,-0.143698,...,-1.020635,3.978416,1.838555,-0.011407,-9.585172,-10.657123,-22.858745,5.425402,-3.873599,0.602534
22779,0.053796,0.112225,-0.077414,-0.059653,-0.122209,-0.014754,0.288462,-0.104199,0.102438,0.203092,...,0.037867,0.052257,0.05674,-0.046414,0.021236,-0.004172,-0.089435,0.039701,-0.023936,0.00895


In [81]:
# Example for one row
X = np.hstack((user_matrix_df.loc[750497], article_matrix_df.loc[9777026]))
X= X.reshape((-1, 600))
model.predict(X)[0][0]

18.83693

In [82]:
# Function predict_read_time

def predict_read_time(model,article_id_list,user_id):
    counter_zero=0
    counter_pass = 0
    with suppress_stdout_stderr():
        expected_read_times = []
        for article_id in article_id_list:
            try:
                vector = np.hstack((user_matrix_df.loc[user_id], article_matrix_df.loc[article_id]))
                vector= vector.reshape((-1, 600))
                try:
                    predict= model.predict(vector)[0][0]
                except:
                    predict =0
                    counter_zero+=1
                expected_read_times.append(predict)
            except:
                counter_pass+=1
                pass
                
        return expected_read_times


def sort_zip_lists(row):
    # Zip the two lists
    zipped = list(zip(row['article_ids_inview'], row['Predicted_read_times']))
    # Sort by the second element of each tuple
    sorted_zipped = sorted(zipped, key=lambda x: x[1],reverse=True)[:9]
    return sorted_zipped

def get_final_predicted_article_ids(tuples_list):
    # Extract the first item from each tuple
    return [t[0] for t in tuples_list]

In [83]:
amount = 1000
predictions_df_top100 = behaviors_df.head(amount).copy()

In [84]:
predictions_df_top100['Predicted_read_times']= predictions_df_top100.progress_apply(lambda row: predict_read_time(model,row['article_ids_inview'],row['user_id']),axis=1)
print(f'Counter_zero {counter_zero}')
print(f'Counter_pass {counter_pass}')

100%|██████████| 1000/1000 [04:32<00:00,  3.67it/s]

Counter_zero 0
Counter_pass 0





In [85]:
predictions_df_top100['Predicted_tuples_sorted'] = predictions_df_top100.progress_apply(sort_zip_lists, axis=1)


100%|██████████| 1000/1000 [00:00<00:00, 60067.08it/s]


In [86]:
predictions_df_top100['Predicted_article_ids'] = predictions_df_top100['Predicted_tuples_sorted'].progress_apply(get_final_predicted_article_ids)

100%|██████████| 1000/1000 [00:00<00:00, 292939.24it/s]


In [87]:
predictions_df_top100.head()

Unnamed: 0,user_id,article_ids_inview,article_ids_clicked,Predicted_read_times,Predicted_tuples_sorted,Predicted_article_ids
0,76658,"[9788239, 9780702, 9553264, 9787499, 6741781, ...",[9783042],"[25.716236, 31.24826, 14.998746, 17.066832, 13...","[(9780702, 31.24826), (9788239, 25.716236), (9...","[9780702, 9788239, 9783042, 9787499, 9553264, ..."
1,76658,"[9788521, 9786217, 9553264, 9788361, 9788352, ...",[9788125],"[15.385333, 16.048363, 14.998746, 14.847691, 2...","[(9788352, 20.711372), (9786217, 16.048363), (...","[9788352, 9786217, 9788125, 9788521, 9553264, ..."
2,760446,"[9279095, 9784273, 9784275, 9784506, 9784444, ...",[9782806],"[9.50034, 9.45484, 13.706255, 9.0395, 13.42578...","[(9784583, 14.046202), (9784275, 13.706255), (...","[9784583, 9784275, 9784444, 9784160, 9784406, ..."
3,760446,"[9784575, 9784607, 9784559, 9784662, 9783852, ...",[9782656],"[17.212942, 10.036463, 11.20752, 9.966087, 13....","[(9784591, 23.063957), (9784575, 17.212942), (...","[9784591, 9784575, 9783852, 9782656, 9784559, ..."
4,760446,"[9784137, 9784298, 9779370, 9782517, 9777324, ...",[9777324],"[11.579791, 12.526391, 14.266549, 17.2709, 9.9...","[(9782517, 17.2709), (9779370, 14.266549), (97...","[9782517, 9779370, 9784298, 9784137, 9784430, ..."


In [88]:
predictions_df_path = f'./files/pickle/predictions_df_{size}_{type_}_{str(amount)}.pkl'

predictions_df_top100.to_pickle(predictions_df_path)

#### Other

In [None]:
# # ----- NOT SURE IF THESE TRANSFORMATION ARE NEEDED NEITHER IF WE MAKE THE RECOMMENDATIONS IN THIS DATASET --------
# behaviors_val_df = behaviors_val_df[['user_id','article_ids_inview', 'article_ids_clicked']]
# behaviors_val_df = behaviors_val_df.explode('article_ids_clicked')

# behaviors_val_grouped_clicked = df = behaviors_val_df.groupby('user_id')['article_ids_clicked'].apply(list).reset_index()

# behaviors_val_df = behaviors_val_df.explode('article_ids_inview')

# behaviors_val_grouped_inview_df = behaviors_val_df.groupby('user_id')['article_ids_inview'].apply(list).reset_index()

# behaviors_val_df = pd.merge(behaviors_val_grouped_inview_df, behaviors_val_grouped_clicked, on='user_id', how='inner')

# behaviors_val_df['article_ids_inview_setted_lst'] = behaviors_val_df['article_ids_inview'].apply(lambda lst: list(set(lst)))

In [None]:
# print(behaviors_val_df.shape)
# behaviors_val_df.head(2)

In [90]:
# article_ids_clicked_lst = behaviors_val_df['article_ids_clicked'].tolist()
# user_ids_lst = behaviors_val_df['user_id'].tolist()

In [91]:
# counter, precisions, recalls, ndcgs, K = 0, [], [], [], 10



# for user_id, clicked_lst, recommends_lst  in tqdm(list(zip(user_ids_lst, article_ids_clicked_lst, recommendations_full_lst))):
    
#     y_true = clicked_lst
#     y_pred = recommends_lst#[:K]

#     precision = len(set(y_true).intersection(set(y_pred))) / len(y_pred) if len(y_pred) > 0 else 0
#     recall = len(set(y_true).intersection(set(y_pred))) / len(y_true) if len(y_true) > 0 else 0

#     precisions.append(precision)
#     recalls.append(recall)
#     #ndcgs.append(ndcg_score([y_true], [y_pred], k=K))

#     if precision == 0:
#         counter +=1

#     print(f"User id: {user_id}, Length: {len(recommends_lst)}, Percision: {precision}, Recall: {recall}  ")

# print({
#     'precision@K': sum(precisions) / len(precisions)
#     ,'recall@K': sum(recalls) / len(recalls)
#     #,'ndcg@K': sum(ndcgs) / len(ndcgs)
#     })