In [40]:
import pandas as pd
import numpy as np
import pickle
import os
import sys
from contextlib import contextmanager
from tqdm import tqdm
tqdm.pandas()
# Load model
from tensorflow.keras.models import load_model
import tensorflow
import joblib
from sklearn.model_selection import train_test_split

import logging
tensorflow.get_logger().setLevel(logging.ERROR)

@contextmanager
def suppress_stdout_stderr():
    """
    A context manager to suppress stdout and stderr.
    """
    with open(os.devnull, 'w') as devnull:
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = devnull
        sys.stderr = devnull
        try:
            yield
        finally:
            sys.stdout = old_stdout
            sys.stderr = old_stderr

# Ready embeddings

# embeddings_path = f'./files/parquet/google_bert_base_multilingual_cased/bert_base_multilingual_cased.parquet'
embeddings_path = f'./files/parquet/Ekstra_Bladet_word2vec/document_vector.parquet'
embeddings_df = pd.read_parquet(embeddings_path)
print(f'embeddings df shape:          ',embeddings_df.shape)

# Unpack th lists in the column
def unpack_lists(df, column_name):
    # Create new columns by unpacking lists in the specified column
    unpacked_cols = pd.DataFrame(df[column_name].tolist(), index=df.index)
    # Rename the columns with a prefix based on the original column name
    unpacked_cols = unpacked_cols.add_prefix(column_name + '_')
    # Concatenate the unpacked columns back to the original DataFrame
    df = pd.concat([df, unpacked_cols], axis=1)
    return df

# Specify which columns to process
columns_to_process = [embeddings_df.columns.tolist()[1]]

# Unpack lists in each specified column
for col in columns_to_process:
    embeddings_df = unpack_lists(embeddings_df, col)

embeddings_df.drop(embeddings_df.columns.tolist()[1], axis=1, inplace=True)

embeddings_df.set_index('article_id',inplace=True)

embeddings df shape:           (125541, 2)


In [41]:
#Load model

model = load_model('./files/models/model_20240716-221043.h5py')

# # Loading the scalers
# scaler_X = joblib.load('./files/models/scaler_X.pkl')
# scaler_y = joblib.load('./files/models/scaler_y.pkl')

# Read interaction matrix pickle file
size = 'demo'
type_ = 'validation'
fillna_value = '0'

# Read User and article embeddings dataframes
user_matrix_df_file_path = f'./files/pickle/user_matrix_{type_}_{size}_{fillna_value}.pkl'
article_matrix_df_file_path = f'./files/pickle/article_matrix_{type_}_{size}_{fillna_value}.pkl'

user_matrix_df = pd.read_pickle(user_matrix_df_file_path)
article_matrix_df = pd.read_pickle(article_matrix_df_file_path)

# Read history and behavior files


behaviors_path = f'./files/parquet/ebnerd_{size}/{type_}/behaviors.parquet'
behaviors_df = pd.read_parquet(behaviors_path)
print(f'Raw {size} validation behaviors df shape:          ',behaviors_df.shape)


Raw demo validation behaviors df shape:           (17749, 17)


In [42]:
behaviors_df.head(2)

Unnamed: 0,impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,next_read_time,next_scroll_percentage
0,117028734,,2023-05-26 15:18:19,5.0,,2,"[9777910, 9779285, 9142564, 9754241, 9782319, ...",[9782361],1579040,False,,,,False,79328,59.0,100.0
1,542013976,9788823.0,2023-05-30 16:33:57,14.0,100.0,1,"[9788497, 9788188, 9788661, 8392487, 9787499, ...",[9788661],715376,False,,,,False,187715,61.0,60.0


In [43]:
behaviors_df = behaviors_df[['user_id', 'article_ids_inview', 'article_ids_clicked']].copy()
behaviors_df.head(2)

Unnamed: 0,user_id,article_ids_inview,article_ids_clicked
0,1579040,"[9777910, 9779285, 9142564, 9754241, 9782319, ...",[9782361]
1,715376,"[9788497, 9788188, 9788661, 8392487, 9787499, ...",[9788661]


In [44]:
article_matrix_df.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
9749628,-3.307374,0.445233,-2.70933,-4.749975,-1.219494,3.722344,2.099001,3.437852,-1.987949,3.490878,...,-0.238463,-1.349837,-0.737692,0.424557,1.101546,0.394015,1.550014,-0.492118,-1.075111,-2.997566
9762114,-1.257639,-6.678197,-6.740248,-3.320006,0.725085,2.557267,-0.298304,2.272891,-1.395085,1.607784,...,1.240616,2.592725,-0.765813,-1.654907,3.743151,1.133336,0.857505,-1.21523,-1.500867,-3.182475


In [45]:
user_matrix_df.head(2)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2515364,-0.049423,0.073484,-0.127462,-0.54457,0.004586,-0.028122,-0.050352,0.224211,-0.162141,0.017274,...,0.07162,-0.054645,-0.059238,0.103721,0.096251,0.147019,0.087606,-0.052229,-0.077832,-0.163039
299355,-1.018642,0.405334,-0.071454,-0.722963,-0.71935,1.140239,-2.25217,-0.569453,0.770422,-0.494271,...,0.109785,-0.759528,-0.200684,-0.120576,0.058308,0.23706,-0.094378,-0.227971,-0.347991,-0.851085


In [46]:
embeddings_df.head(2)

Unnamed: 0_level_0,document_vector_0,document_vector_1,document_vector_2,document_vector_3,document_vector_4,document_vector_5,document_vector_6,document_vector_7,document_vector_8,document_vector_9,...,document_vector_290,document_vector_291,document_vector_292,document_vector_293,document_vector_294,document_vector_295,document_vector_296,document_vector_297,document_vector_298,document_vector_299
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3000022,0.065424,-0.047425,0.063849,-0.001418,0.033669,-0.067176,0.026216,0.132957,0.065193,0.013772,...,0.038521,0.052188,-0.004786,0.040849,0.11221,0.027287,-0.021366,-0.117615,-0.014373,0.035706
3000063,0.028815,-0.000166,0.055057,0.043669,0.02064,-0.115931,0.035702,0.189682,0.049887,-0.008056,...,0.010698,0.040982,0.022549,-0.003128,0.08781,0.01297,0.015932,-0.043625,0.049085,0.027167


In [47]:
# Example for one row
X = np.hstack((user_matrix_df.loc[750497], article_matrix_df.loc[9777026], embeddings_df.loc[9777026]))
X= X.reshape((-1, X.shape[0]))

model.predict(X)[0][0]




28.534416

In [48]:
# Function predict_read_time

def predict_read_time(model,article_id_list,user_id, embeddings_df):
    # counter_zero=0
    # counter_pass = 0
    with suppress_stdout_stderr():
        expected_read_times = []
        for article_id in article_id_list:
            try:
                vector = np.hstack((user_matrix_df.loc[user_id], article_matrix_df.loc[article_id], embeddings_df.loc[article_id]))
                vector= vector.reshape((-1, vector.shape[0]))
                # vector = scaler_X.transform(vector)
                try:
                    predict= model.predict(vector)[0][0]
                except:
                    predict =0
                    # counter_zero+=1
                expected_read_times.append(predict)
            except:
                # counter_pass+=1
                pass
                
        return expected_read_times


def sort_zip_lists(row):
    # Zip the two lists
    zipped = list(zip(row['article_ids_inview'], row['Predicted_read_times']))
    # Sort by the second element of each tuple
    sorted_zipped = sorted(zipped, key=lambda x: x[1],reverse=True)[:9]
    return sorted_zipped

def get_final_predicted_article_ids(tuples_list):
    # Extract the first item from each tuple
    return [t[0] for t in tuples_list]

In [49]:
amount = 100
predictions_df = behaviors_df.head(amount).copy()

In [50]:
predictions_df['Predicted_read_times']= predictions_df.progress_apply(lambda row: predict_read_time(model,row['article_ids_inview'],row['user_id'], embeddings_df),axis=1)

100%|██████████| 100/100 [03:09<00:00,  1.90s/it]


In [51]:
predictions_df.head()

Unnamed: 0,user_id,article_ids_inview,article_ids_clicked,Predicted_read_times
0,1579040,"[9777910, 9779285, 9142564, 9754241, 9782319, ...",[9782361],"[28.534416, 28.534416, 28.534416, 28.534416, 2..."
1,715376,"[9788497, 9788188, 9788661, 8392487, 9787499, ...",[9788661],"[28.534416, 28.534416, 28.534416, 28.534416, 2..."
2,1918494,"[9784696, 9783865, 9784591, 9784710, 9784702]",[9784696],"[28.534416, 28.534416, 28.534416, 28.534416, 2..."
3,1532446,"[9789702, 9789810, 9789747, 9789694, 9789711, ...",[9789702],"[28.534416, 28.534416, 28.534416, 28.534416, 2..."
4,374121,"[9789479, 9789427, 9789537, 9789481, 9780702, ...",[9789141],"[28.534416, 28.534416, 28.534416, 28.534416, 2..."


In [52]:
predictions_df['Predicted_tuples_sorted'] = predictions_df.progress_apply(sort_zip_lists, axis=1)


100%|██████████| 100/100 [00:00<00:00, 24994.36it/s]


In [53]:
predictions_df['Predicted_article_ids'] = predictions_df['Predicted_tuples_sorted'].progress_apply(get_final_predicted_article_ids)

100%|██████████| 100/100 [00:00<00:00, 49748.59it/s]


In [54]:
predictions_df.head()

Unnamed: 0,user_id,article_ids_inview,article_ids_clicked,Predicted_read_times,Predicted_tuples_sorted,Predicted_article_ids
0,1579040,"[9777910, 9779285, 9142564, 9754241, 9782319, ...",[9782361],"[28.534416, 28.534416, 28.534416, 28.534416, 2...","[(9777910, 28.534416), (9779285, 28.534416), (...","[9777910, 9779285, 9142564, 9754241, 9782319, ..."
1,715376,"[9788497, 9788188, 9788661, 8392487, 9787499, ...",[9788661],"[28.534416, 28.534416, 28.534416, 28.534416, 2...","[(9788497, 28.534416), (9788188, 28.534416), (...","[9788497, 9788188, 9788661, 8392487, 9787499, ..."
2,1918494,"[9784696, 9783865, 9784591, 9784710, 9784702]",[9784696],"[28.534416, 28.534416, 28.534416, 28.534416, 2...","[(9784696, 28.534416), (9783865, 28.534416), (...","[9784696, 9783865, 9784591, 9784710, 9784702]"
3,1532446,"[9789702, 9789810, 9789747, 9789694, 9789711, ...",[9789702],"[28.534416, 28.534416, 28.534416, 28.534416, 2...","[(9789702, 28.534416), (9789810, 28.534416), (...","[9789702, 9789810, 9789747, 9789694, 9789711, ..."
4,374121,"[9789479, 9789427, 9789537, 9789481, 9780702, ...",[9789141],"[28.534416, 28.534416, 28.534416, 28.534416, 2...","[(9789479, 28.534416), (9789427, 28.534416), (...","[9789479, 9789427, 9789537, 9789481, 9780702, ..."


In [55]:
# Function to add relevance column
def add_relevance_column(df):
    relevances = []
    for idx, row in df.iterrows():
        clicked_article = row['article_ids_clicked'][0]
        predicted_articles = row['Predicted_article_ids']
        
        # Generate relevance scores for the current row
        row_relevances = [1 if article == clicked_article else 0 for article in predicted_articles]
        relevances.append(row_relevances)
    # Assign the relevance scores to a new column
    df['Relevance'] = relevances
    return df

In [56]:
predictions_df = add_relevance_column(predictions_df)

In [57]:
predictions_df_path = f'./files/pickle/predictions_df_{size}_{type_}_{str(amount)}.pkl'

predictions_df.to_pickle(predictions_df_path)