In [76]:
import pandas as pd
import numpy as np
import pickle
import os
import sys
from contextlib import contextmanager
from tqdm import tqdm
tqdm.pandas()
# Load model
from tensorflow.keras.models import load_model
import tensorflow
import joblib
from sklearn.model_selection import train_test_split

import logging
tensorflow.get_logger().setLevel(logging.ERROR)


In [42]:
@contextmanager
def suppress_stdout_stderr():
    """
    A context manager to suppress stdout and stderr.
    """
    with open(os.devnull, 'w') as devnull:
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = devnull
        sys.stderr = devnull
        try:
            yield
        finally:
            sys.stdout = old_stdout
            sys.stderr = old_stderr

In [43]:
# Ready embeddings

# embeddings_path = f'./files/parquet/google_bert_base_multilingual_cased/bert_base_multilingual_cased.parquet'
embeddings_path = f'./files/parquet/Ekstra_Bladet_word2vec/document_vector.parquet'
embeddings_df = pd.read_parquet(embeddings_path)
print(f'embeddings df shape:          ',embeddings_df.shape)

# Unpack th lists in the column
def unpack_lists(df, column_name):
    # Create new columns by unpacking lists in the specified column
    unpacked_cols = pd.DataFrame(df[column_name].tolist(), index=df.index)
    # Rename the columns with a prefix based on the original column name
    unpacked_cols = unpacked_cols.add_prefix(column_name + '_')
    # Concatenate the unpacked columns back to the original DataFrame
    df = pd.concat([df, unpacked_cols], axis=1)
    return df

# Specify which columns to process
columns_to_process = [embeddings_df.columns.tolist()[1]]

# Unpack lists in each specified column
for col in columns_to_process:
    embeddings_df = unpack_lists(embeddings_df, col)

embeddings_df.drop(embeddings_df.columns.tolist()[1], axis=1, inplace=True)

embeddings_df.set_index('article_id',inplace=True)

embeddings df shape:           (125541, 2)


In [75]:
#Load model

model = load_model('./files/models/model_20240715-205155.h5py')

# # Loading the scalers
# scaler_X = joblib.load('./files/models/scaler_X.pkl')
# scaler_y = joblib.load('./files/models/scaler_y.pkl')

# Read interaction matrix pickle file
size = 'demo'
type_ = 'validation'
fillna_value = '0'
# interaction_matrix_file_path = f'./files/pickle/interaction_matrix_{type_}_{size}_{fillna_value}.pkl'
# interaction_matrix_df = pd.read_pickle(interaction_matrix_file_path)
# print('Interaction matrix df shape:                      ',interaction_matrix_df.shape)

# Read User and article embeddings dataframes
user_matrix_df_file_path = f'./files/pickle/user_matrix_{type_}_{size}_{fillna_value}.pkl'
article_matrix_df_file_path = f'./files/pickle/article_matrix_{type_}_{size}_{fillna_value}.pkl'

user_matrix_df = pd.read_pickle(user_matrix_df_file_path)
article_matrix_df = pd.read_pickle(article_matrix_df_file_path)

# Read history and behavior files


behaviors_path = f'./files/parquet/ebnerd_{size}/{type_}/behaviors.parquet'
behaviors_df = pd.read_parquet(behaviors_path)
print(f'Raw {size} validation behaviors df shape:          ',behaviors_df.shape)


Raw demo validation behaviors df shape:           (25356, 17)


In [73]:
behaviors_df.head(2)

Unnamed: 0,impression_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,is_beyond_accuracy
0,6451339,2023-06-05 15:02:49,8.0,,2,"[9796527, 7851321, 9798805, 9795150, 9531110, ...",35982,False,,,,False,388,False
1,6451363,2023-06-05 15:03:56,20.0,,2,"[9798532, 9791602, 9798975, 9791334, 9793856, ...",36012,False,,,,False,804,False


In [74]:
behaviors_df = behaviors_df[['user_id', 'article_ids_inview', 'article_ids_clicked']].copy()
behaviors_df.head(2)

KeyError: "['article_ids_clicked'] not in index"

In [47]:
article_matrix_df.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
9777026,-0.906362,0.184629,-0.077364,0.141804,0.126838,0.026483,0.069664,-0.248059,-0.191986,0.643667,...,0.145201,0.044429,0.076704,-0.035611,0.017898,0.030903,-0.119346,0.139691,-0.088009,0.100579
9777705,-0.140725,-0.121132,0.095686,-0.040868,-0.336481,0.222581,0.075044,-0.307448,-0.115687,-0.137366,...,0.523301,0.411797,0.485202,-0.124307,-0.487193,-0.801425,-1.817942,0.447228,-0.520895,0.084514


In [48]:
user_matrix_df.head(2)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
750497,-0.13153,-0.032818,0.003344,0.079261,-0.024774,-0.028753,0.068344,0.139933,-0.084523,-0.143698,...,-1.020635,3.978416,1.838555,-0.011407,-9.585172,-10.657123,-22.858745,5.425402,-3.873599,0.602534
22779,0.053796,0.112225,-0.077414,-0.059653,-0.122209,-0.014754,0.288462,-0.104199,0.102438,0.203092,...,0.037867,0.052257,0.05674,-0.046414,0.021236,-0.004172,-0.089435,0.039701,-0.023936,0.00895


In [49]:
embeddings_df.head(2)

Unnamed: 0_level_0,document_vector_0,document_vector_1,document_vector_2,document_vector_3,document_vector_4,document_vector_5,document_vector_6,document_vector_7,document_vector_8,document_vector_9,...,document_vector_290,document_vector_291,document_vector_292,document_vector_293,document_vector_294,document_vector_295,document_vector_296,document_vector_297,document_vector_298,document_vector_299
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3000022,0.065424,-0.047425,0.063849,-0.001418,0.033669,-0.067176,0.026216,0.132957,0.065193,0.013772,...,0.038521,0.052188,-0.004786,0.040849,0.11221,0.027287,-0.021366,-0.117615,-0.014373,0.035706
3000063,0.028815,-0.000166,0.055057,0.043669,0.02064,-0.115931,0.035702,0.189682,0.049887,-0.008056,...,0.010698,0.040982,0.022549,-0.003128,0.08781,0.01297,0.015932,-0.043625,0.049085,0.027167


In [50]:
# Example for one row
X = np.hstack((user_matrix_df.loc[750497], article_matrix_df.loc[9777026], embeddings_df.loc[9777026]))
X= X.reshape((-1, X.shape[0]))

model.predict(X)[0][0]


19.62653

In [51]:
# Function predict_read_time

def predict_read_time(model,article_id_list,user_id, embeddings_df):
    # counter_zero=0
    # counter_pass = 0
    with suppress_stdout_stderr():
        expected_read_times = []
        for article_id in article_id_list:
            try:
                vector = np.hstack((user_matrix_df.loc[user_id], article_matrix_df.loc[article_id], embeddings_df.loc[article_id]))
                vector= vector.reshape((-1, vector.shape[0]))
                # vector = scaler_X.transform(vector)
                try:
                    predict= model.predict(vector)[0][0]
                except:
                    predict =0
                    # counter_zero+=1
                expected_read_times.append(predict)
            except:
                # counter_pass+=1
                pass
                
        return expected_read_times


def sort_zip_lists(row):
    # Zip the two lists
    zipped = list(zip(row['article_ids_inview'], row['Predicted_read_times']))
    # Sort by the second element of each tuple
    sorted_zipped = sorted(zipped, key=lambda x: x[1],reverse=True)[:9]
    return sorted_zipped

def get_final_predicted_article_ids(tuples_list):
    # Extract the first item from each tuple
    return [t[0] for t in tuples_list]

In [52]:
amount = 100
predictions_df = behaviors_df.head(amount).copy()

In [53]:
predictions_df['Predicted_read_times']= predictions_df.progress_apply(lambda row: predict_read_time(model,row['article_ids_inview'],row['user_id'], embeddings_df),axis=1)
# print(f'Counter_zero {counter_zero}')
# print(f'Counter_pass {counter_pass}')

100%|██████████| 100/100 [00:21<00:00,  4.76it/s]


In [54]:
predictions_df.head()

Unnamed: 0,user_id,article_ids_inview,article_ids_clicked,Predicted_read_times
0,76658,"[9788239, 9780702, 9553264, 9787499, 6741781, ...",[9783042],"[24.599737, 22.211462, 20.520473, 19.059458, 1..."
1,76658,"[9788521, 9786217, 9553264, 9788361, 9788352, ...",[9788125],"[16.65062, 18.525475, 20.520473, 17.501417, 18..."
2,760446,"[9279095, 9784273, 9784275, 9784506, 9784444, ...",[9782806],"[9.51793, 9.169382, 13.936178, 9.342287, 10.72..."
3,760446,"[9784575, 9784607, 9784559, 9784662, 9783852, ...",[9782656],"[14.783598, 9.520455, 17.671883, 9.62603, 11.4..."
4,760446,"[9784137, 9784298, 9779370, 9782517, 9777324, ...",[9777324],"[11.921396, 11.17131, 14.326653, 15.458048, 10..."


In [55]:
predictions_df['Predicted_tuples_sorted'] = predictions_df.progress_apply(sort_zip_lists, axis=1)


100%|██████████| 100/100 [00:00<00:00, 32765.44it/s]


In [56]:
predictions_df['Predicted_article_ids'] = predictions_df['Predicted_tuples_sorted'].progress_apply(get_final_predicted_article_ids)

100%|██████████| 100/100 [00:00<?, ?it/s]


In [57]:
predictions_df.head()

Unnamed: 0,user_id,article_ids_inview,article_ids_clicked,Predicted_read_times,Predicted_tuples_sorted,Predicted_article_ids
0,76658,"[9788239, 9780702, 9553264, 9787499, 6741781, ...",[9783042],"[24.599737, 22.211462, 20.520473, 19.059458, 1...","[(9788239, 24.599737), (9780702, 22.211462), (...","[9788239, 9780702, 9553264, 9787499, 6741781, ..."
1,76658,"[9788521, 9786217, 9553264, 9788361, 9788352, ...",[9788125],"[16.65062, 18.525475, 20.520473, 17.501417, 18...","[(9553264, 20.520473), (9786217, 18.525475), (...","[9553264, 9786217, 6741781, 9788352, 9788361, ..."
2,760446,"[9279095, 9784273, 9784275, 9784506, 9784444, ...",[9782806],"[9.51793, 9.169382, 13.936178, 9.342287, 10.72...","[(9784583, 16.746685), (9784275, 13.936178), (...","[9784583, 9784275, 9784160, 9784406, 9784444, ..."
3,760446,"[9784575, 9784607, 9784559, 9784662, 9783852, ...",[9782656],"[14.783598, 9.520455, 17.671883, 9.62603, 11.4...","[(9784559, 17.671883), (9784591, 17.281261), (...","[9784559, 9784591, 9784575, 9783852, 9782656, ..."
4,760446,"[9784137, 9784298, 9779370, 9782517, 9777324, ...",[9777324],"[11.921396, 11.17131, 14.326653, 15.458048, 10...","[(9782517, 15.458048), (9779370, 14.326653), (...","[9782517, 9779370, 9784137, 9784298, 9777324, ..."


In [58]:
# Function to add relevance column
def add_relevance_column(df):
    relevances = []
    for idx, row in df.iterrows():
        clicked_article = row['article_ids_clicked'][0]
        predicted_articles = row['Predicted_article_ids']
        
        # Generate relevance scores for the current row
        row_relevances = [1 if article == clicked_article else 0 for article in predicted_articles]
        relevances.append(row_relevances)
    # Assign the relevance scores to a new column
    df['Relevance'] = relevances
    return df

In [59]:
predictions_df = add_relevance_column(predictions_df)

In [70]:
predictions_df_path = f'./files/pickle/predictions_df_{size}_{type_}_{str(amount)}.pkl'

predictions_df.to_pickle(predictions_df_path)

#### Other

In [None]:
# # ----- NOT SURE IF THESE TRANSFORMATION ARE NEEDED NEITHER IF WE MAKE THE RECOMMENDATIONS IN THIS DATASET --------
# behaviors_val_df = behaviors_val_df[['user_id','article_ids_inview', 'article_ids_clicked']]
# behaviors_val_df = behaviors_val_df.explode('article_ids_clicked')

# behaviors_val_grouped_clicked = df = behaviors_val_df.groupby('user_id')['article_ids_clicked'].apply(list).reset_index()

# behaviors_val_df = behaviors_val_df.explode('article_ids_inview')

# behaviors_val_grouped_inview_df = behaviors_val_df.groupby('user_id')['article_ids_inview'].apply(list).reset_index()

# behaviors_val_df = pd.merge(behaviors_val_grouped_inview_df, behaviors_val_grouped_clicked, on='user_id', how='inner')

# behaviors_val_df['article_ids_inview_setted_lst'] = behaviors_val_df['article_ids_inview'].apply(lambda lst: list(set(lst)))

In [None]:
# print(behaviors_val_df.shape)
# behaviors_val_df.head(2)

In [90]:
# article_ids_clicked_lst = behaviors_val_df['article_ids_clicked'].tolist()
# user_ids_lst = behaviors_val_df['user_id'].tolist()

In [91]:
# counter, precisions, recalls, ndcgs, K = 0, [], [], [], 10



# for user_id, clicked_lst, recommends_lst  in tqdm(list(zip(user_ids_lst, article_ids_clicked_lst, recommendations_full_lst))):
    
#     y_true = clicked_lst
#     y_pred = recommends_lst#[:K]

#     precision = len(set(y_true).intersection(set(y_pred))) / len(y_pred) if len(y_pred) > 0 else 0
#     recall = len(set(y_true).intersection(set(y_pred))) / len(y_true) if len(y_true) > 0 else 0

#     precisions.append(precision)
#     recalls.append(recall)
#     #ndcgs.append(ndcg_score([y_true], [y_pred], k=K))

#     if precision == 0:
#         counter +=1

#     print(f"User id: {user_id}, Length: {len(recommends_lst)}, Percision: {precision}, Recall: {recall}  ")

# print({
#     'precision@K': sum(precisions) / len(precisions)
#     ,'recall@K': sum(recalls) / len(recalls)
#     #,'ndcg@K': sum(ndcgs) / len(ndcgs)
#     })