In [1]:
import pandas as pd
import numpy as np
import pickle
import os
import sys
from contextlib import contextmanager
from tqdm import tqdm
tqdm.pandas()
# Load model
from tensorflow.keras.models import load_model
import tensorflow
import joblib
from sklearn.model_selection import train_test_split
from collections import defaultdict

import logging
tensorflow.get_logger().setLevel(logging.ERROR)

@contextmanager
def suppress_stdout_stderr():
    """
    A context manager to suppress stdout and stderr.
    """
    with open(os.devnull, 'w') as devnull:
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = devnull
        sys.stderr = devnull
        try:
            yield
        finally:
            sys.stdout = old_stdout
            sys.stderr = old_stderr

# Ready embeddings

# embeddings_path = f'./files/parquet/google_bert_base_multilingual_cased/bert_base_multilingual_cased.parquet'
embeddings_path = f'./files/parquet/Ekstra_Bladet_word2vec/document_vector.parquet'
embeddings_df = pd.read_parquet(embeddings_path)
print(f'embeddings df shape:          ',embeddings_df.shape)

# Unpack th lists in the column
def unpack_lists(df, column_name):
    # Create new columns by unpacking lists in the specified column
    unpacked_cols = pd.DataFrame(df[column_name].tolist(), index=df.index)
    # Rename the columns with a prefix based on the original column name
    unpacked_cols = unpacked_cols.add_prefix(column_name + '_')
    # Concatenate the unpacked columns back to the original DataFrame
    df = pd.concat([df, unpacked_cols], axis=1)
    return df

# Specify which columns to process
columns_to_process = [embeddings_df.columns.tolist()[1]]

# Unpack lists in each specified column
for col in columns_to_process:
    embeddings_df = unpack_lists(embeddings_df, col)

embeddings_df.drop(embeddings_df.columns.tolist()[1], axis=1, inplace=True)

embeddings_df.set_index('article_id',inplace=True)

size = 'demo'

articles_path = f'./files/parquet/ebnerd_{size}/articles.parquet'
articles_df = pd.read_parquet(articles_path)
print(f'Raw {size} validation behaviors df shape:          ',articles_df.shape)

embeddings df shape:           (125541, 2)
Raw demo validation behaviors df shape:           (11777, 21)


In [2]:
categ_counts_df = articles_df['category_str'].value_counts().reset_index()
categ_counts_df['count_perc'] = round(categ_counts_df['count']/ articles_df.shape[0], 4)

unique_categories = categ_counts_df['category_str'].unique().tolist()
print(unique_categories)

categ_counts_df.head(25)

['nyheder', 'sport', 'underholdning', 'krimi', 'side9', 'forbrug', 'auto', 'sex_og_samliv', 'nationen', 'musik', 'penge', 'ferie', 'biler', 'haandvaerkeren', 'opinionen', 'play', 'horoskoper', 'incoming', 'plus', 'om_ekstra_bladet', 'vin', 'podcast', 'bibliotek', 'dagsorden', 'services']


Unnamed: 0,category_str,count,count_perc
0,nyheder,2733,0.2321
1,sport,2146,0.1822
2,underholdning,1775,0.1507
3,krimi,1566,0.133
4,side9,869,0.0738
5,forbrug,826,0.0701
6,auto,612,0.052
7,sex_og_samliv,463,0.0393
8,nationen,286,0.0243
9,musik,181,0.0154


In [3]:
#Load model

model = load_model('./files/models/model_20240719-221352.h5py')

# # Loading the scalers
# scaler_X = joblib.load('./files/models/scaler_X.pkl')
# scaler_y = joblib.load('./files/models/scaler_y.pkl')

# Read interaction matrix pickle file
size = 'demo'
type_ = 'validation'
fillna_value = '0'

# Read User and article embeddings dataframes
user_matrix_df_file_path = f'./files/pickle/user_matrix_{type_}_{size}_{fillna_value}.pkl'
article_matrix_df_file_path = f'./files/pickle/article_matrix_{type_}_{size}_{fillna_value}.pkl'

user_matrix_df = pd.read_pickle(user_matrix_df_file_path)
article_matrix_df = pd.read_pickle(article_matrix_df_file_path)

# Read history and behavior files

behaviors_path = f'./files/parquet/ebnerd_{size}/{type_}/behaviors.parquet'
behaviors_df = pd.read_parquet(behaviors_path)
print(f'Raw {size} validation behaviors df shape:          ',behaviors_df.shape)

# Read Top ten articles

top_ten_file_path = f'./files/pickle/top_10_articles.pkl'
top_ten_df = pd.read_pickle(top_ten_file_path)
print(f'top_ten_df shape:          ',                  top_ten_df.shape)

behaviors_df = behaviors_df[['user_id', 'article_ids_inview', 'article_ids_clicked']].copy()
behaviors_df.head(2)


Raw demo validation behaviors df shape:           (17749, 17)
top_ten_df shape:           (1997, 2)


Unnamed: 0,user_id,article_ids_inview,article_ids_clicked
0,1579040,"[9777910, 9779285, 9142564, 9754241, 9782319, ...",[9782361]
1,715376,"[9788497, 9788188, 9788661, 8392487, 9787499, ...",[9788661]


In [153]:
# Example for one row
X = np.hstack((user_matrix_df.loc[299355], article_matrix_df.loc[9762114])) #, embeddings_df.loc[9749628]
X= X.reshape((-1, X.shape[0]))

model.predict(X)[0][0]




14.793225

In [14]:
# Function predict_read_time

def predict_read_time(model,article_id_list,user_id, embeddings_df):
    # counter_zero=0
    # counter_pass = 0
    with suppress_stdout_stderr():
        expected_read_times = []
        for article_id in article_id_list:
            try:
                vector = np.hstack((user_matrix_df.loc[user_id], article_matrix_df.loc[article_id])) #, embeddings_df.loc[article_id]
                vector= vector.reshape((-1, vector.shape[0]))
                # vector = scaler_X.transform(vector)
                try:
                    predict= model.predict(vector)[0][0]
                except:
                    predict =0
                    # counter_zero+=1
                expected_read_times.append(predict)
            except:
                # counter_pass+=1
                pass     
        return expected_read_times


def zip_lists(row):
    # Zip the two lists
    zipped = list(zip(row['article_ids_inview'],row['Weighted_read_times']  , row['article_categories'] ))
    return zipped


def sort_lists(row):

    sorted_zipped = sorted(row['Predicted_weighted_tuples_selected'], key=lambda x: x[1],reverse=True)
    return sorted_zipped

def sort_lists_top10(row):

    sorted_zipped = sorted(row['Predicted_weighted_tuples_selected'], key=lambda x: x[1],reverse=True)[:9]
    return sorted_zipped



def get_final_predicted_article_ids(tuples_list):
    # Extract the first item from each tuple
    return [t[0] for t in tuples_list]

# Function to add relevance column
def add_relevance_column(df, col, new_col):
    relevances = []
    for idx, row in df.iterrows():
        clicked_article = row['article_ids_clicked'][0]
        predicted_articles = row[col]
        
        # Generate relevance scores for the current row
        row_relevances = [1 if article == clicked_article else 0 for article in predicted_articles]
        relevances.append(row_relevances)
    # Assign the relevance scores to a new column
    df[new_col] = relevances
    return df

# def get_best_recommendations(data):
#     # Dictionary to hold the first occurrence of each category
#     unique_category_dict = {}

#     # Iterate through the list
#     for element in data:
#         category = element[2]
#         # If the category is not already in the dictionary, add it
#         if category not in unique_category_dict:
#             unique_category_dict[category] = element

#     # Convert the dictionary values to a list of tuples
#     unique_tuples = list(unique_category_dict.values())

#     return unique_tuples

def find_article_category(articles_df, lst):
    categ_lst = []
    for article in lst:
        categ_lst.append(articles_df[articles_df['article_id'] == article]['category_str'].values[0])
    
    return categ_lst

def find_count_percent(categ_counts_df, lst):
    perc_lst = []
    for ctgr in lst:
        perc_lst.append(categ_counts_df[categ_counts_df['category_str'] == ctgr]['count_perc'].values[0])
    
    return perc_lst

    # Function to calculate the product of corresponding elements

def calculate_product(row):
    return [a * b for a, b in zip(row['Predicted_read_times'], row['count_percentaged'])]

# Function to filter and select tuples
def select_tuples(row, count_df):
    # Extract the unique categories in the Predicted_weighted_tuples
    categories = set([t[2] for t in row['Predicted_weighted_tuples']])
    
    # Filter the count_df based on the unique categories
    filtered_count_df = count_df[count_df['category_str'].isin(categories)]
    
    # list of unique categories
    ordered_categories = filtered_count_df['category_str'].tolist()
    
    # Initialize selection order
    selection_order = [3, 2, 2, 1, 1, 1]
    
    # Initialize result
    selected_tuples = []
    
    # Create a defaultdict to store tuples by category
    tuples_by_category = defaultdict(list)
    for t in row['Predicted_weighted_tuples']:
        tuples_by_category[t[2]].append(t)

    # Sort the tuples within each category by the second element (read time)
    for category in tuples_by_category:
        tuples_by_category[category].sort(key=lambda x: x[1], reverse=True)
    
    # Select tuples according to the selection order
    for i, category in enumerate(ordered_categories):
        if i < len(selection_order):
            selected_tuples.extend(tuples_by_category[category][:selection_order[i]])
            
    return selected_tuples
    

In [100]:
zipped = list(zip(predictions_df.loc[0]['article_ids_inview'],
         predictions_df.loc[0]['Predicted_read_times'],
           predictions_df.loc[0]['article_categories'],
             predictions_df.loc[0]['count_percentaged'] ))

In [15]:
amount = 100
predictions_df = behaviors_df.head(amount).copy()


predictions_df['Predicted_read_times']= predictions_df.progress_apply(lambda row: predict_read_time(model,row['article_ids_inview'],row['user_id'], embeddings_df),axis=1)

predictions_df['article_categories'] = predictions_df['article_ids_inview'].apply(lambda x: find_article_category(articles_df, x))

predictions_df['count_percentaged'] = predictions_df['article_categories'].apply(lambda x: find_count_percent(categ_counts_df, x))

# Create a new column with the calculated product
predictions_df['Weighted_read_times'] = predictions_df.apply(calculate_product, axis=1)



predictions_df['Predicted_weighted_tuples'] = predictions_df.progress_apply(zip_lists, axis=1)

predictions_df['Predicted_weighted_tuples_selected'] = predictions_df.apply(select_tuples, axis=1, count_df=categ_counts_df)



predictions_df['Predicted_weighted_tuples_sorted'] = predictions_df.progress_apply(sort_lists, axis=1)

predictions_df['Predicted_weighted_tuples_top10_sorted'] = predictions_df.progress_apply(sort_lists_top10, axis=1)



predictions_df['Predicted_article_ids_all'] = predictions_df['Predicted_weighted_tuples_sorted'].progress_apply(get_final_predicted_article_ids)

predictions_df['Predicted_article_ids'] = predictions_df['Predicted_weighted_tuples_top10_sorted'].progress_apply(get_final_predicted_article_ids)

predictions_df = add_relevance_column(predictions_df, 'Predicted_article_ids', 'Relevance')

predictions_df = add_relevance_column(predictions_df, 'Predicted_article_ids_all', 'Relevance_all')

100%|██████████| 100/100 [02:22<00:00,  1.43s/it]
100%|██████████| 100/100 [00:00<00:00, 20035.85it/s]
100%|██████████| 100/100 [00:00<00:00, 3846.19it/s]
100%|██████████| 100/100 [00:00<00:00, 14290.64it/s]
100%|██████████| 100/100 [00:00<00:00, 49961.93it/s]
100%|██████████| 100/100 [00:00<00:00, 50099.19it/s]


In [16]:
predictions_df.head(10)

Unnamed: 0,user_id,article_ids_inview,article_ids_clicked,Predicted_read_times,article_categories,count_percentaged,Weighted_read_times,Predicted_weighted_tuples,Predicted_weighted_tuples_selected,Predicted_weighted_tuples_sorted,Predicted_weighted_tuples_top10_sorted,Predicted_article_ids_all,Predicted_article_ids,Relevance,Relevance_all
0,1579040,"[9777910, 9779285, 9142564, 9754241, 9782319, ...",[9782361],"[20.752022, 18.52861, 25.207947, 20.682022, 20...","[forbrug, underholdning, underholdning, nyhede...","[0.0701, 0.1507, 0.1507, 0.2321, 0.1822, 0.182...","[1.4547167274475097, 2.7922615615844726, 3.798...","[(9777910, 1.4547167274475097, forbrug), (9779...","[(9465878, 4.820938454055786, nyheder), (97542...","[(9465878, 4.820938454055786, nyheder), (97542...","[(9465878, 4.820938454055786, nyheder), (97542...","[9465878, 9754241, 9782319, 9142564, 9782361, ...","[9465878, 9754241, 9782319, 9142564, 9782361, ...","[0, 0, 0, 0, 1, 0, 0, 0]","[0, 0, 0, 0, 1, 0, 0, 0]"
1,715376,"[9788497, 9788188, 9788661, 8392487, 9787499, ...",[9788661],"[16.395042, 17.09298, 16.918114, 18.079475, 15...","[underholdning, forbrug, nyheder, forbrug, und...","[0.1507, 0.0701, 0.2321, 0.0701, 0.1507, 0.0243]","[2.4707328926086425, 1.1982178581237792, 3.926...","[(9788497, 2.4707328926086425, underholdning),...","[(9788661, 3.9266941917419436, nyheder), (9788...","[(9788661, 3.9266941917419436, nyheder), (9788...","[(9788661, 3.9266941917419436, nyheder), (9788...","[9788661, 9788497, 9787499, 8392487, 9788188, ...","[9788661, 9788497, 9787499, 8392487, 9788188, ...","[1, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0]"
2,1918494,"[9784696, 9783865, 9784591, 9784710, 9784702]",[9784696],"[12.138173, 12.142452, 12.098904, 11.898239, 1...","[sport, musik, sport, sport, sport]","[0.1822, 0.0154, 0.1822, 0.1822, 0.1822]","[2.211575139427185, 0.18699376449584962, 2.204...","[(9784696, 2.211575139427185, sport), (9783865...","[(9784696, 2.211575139427185, sport), (9784591...","[(9784696, 2.211575139427185, sport), (9784591...","[(9784696, 2.211575139427185, sport), (9784591...","[9784696, 9784591, 9784702, 9783865]","[9784696, 9784591, 9784702, 9783865]","[1, 0, 0, 0]","[1, 0, 0, 0]"
3,1532446,"[9789702, 9789810, 9789747, 9789694, 9789711, ...",[9789702],"[18.316267, 17.292904, 16.981846, 18.209846, 1...","[nationen, penge, nyheder, sport, krimi, under...","[0.0243, 0.0103, 0.2321, 0.1822, 0.133, 0.1507...","[0.4450852884292602, 0.1781169101715088, 3.941...","[(9789702, 0.4450852884292602, nationen), (978...","[(9789747, 3.941486423110962, nyheder), (97896...","[(9789747, 3.941486423110962, nyheder), (97896...","[(9789747, 3.941486423110962, nyheder), (97896...","[9789747, 9789694, 9788043, 9789676, 9789711, ...","[9789747, 9789694, 9788043, 9789676, 9789711, ...","[0, 0, 0, 0, 0, 1, 0]","[0, 0, 0, 0, 0, 1, 0]"
4,374121,"[9789479, 9789427, 9789537, 9789481, 9780702, ...",[9789141],"[12.54566, 12.502665, 12.474083, 12.576557, 12...","[penge, nyheder, krimi, nyheder, forbrug, unde...","[0.0103, 0.2321, 0.133, 0.2321, 0.0701, 0.1507...","[0.12922029819488526, 2.901868445777893, 1.659...","[(9789479, 0.12922029819488526, penge), (97894...","[(9788126, 2.934423396110535, nyheder), (97881...","[(9788126, 2.934423396110535, nyheder), (97881...","[(9788126, 2.934423396110535, nyheder), (97881...","[9788126, 9788106, 9789481, 9789539, 9788190, ...","[9788126, 9788106, 9789481, 9789539, 9788190, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
5,2143620,"[9786159, 9786066, 9785751, 9773779, 9785017, ...",[9786066],"[17.269964, 17.159416, 17.196747, 17.2599, 17....","[nyheder, nyheder, musik, krimi, musik, sport,...","[0.2321, 0.2321, 0.0154, 0.133, 0.0154, 0.1822...","[4.008358695030212, 3.982700499725342, 0.26482...","[(9786159, 4.008358695030212, nyheder), (97860...","[(9786159, 4.008358695030212, nyheder), (97777...","[(9786159, 4.008358695030212, nyheder), (97777...","[(9786159, 4.008358695030212, nyheder), (97777...","[9786159, 9777705, 9785923, 9781001, 9785604, ...","[9786159, 9777705, 9785923, 9781001, 9785604, ...","[0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0]"
6,88591,"[9784939, 9782181, 9766514, 9775978, 9785000, ...",[9778869],"[15.2559395, 15.383433, 15.390412, 15.166272, ...","[sport, krimi, nyheder, krimi, underholdning, ...","[0.1822, 0.133, 0.2321, 0.133, 0.1507, 0.1507,...","[2.7796321739196777, 2.0459966344833376, 3.572...","[(9784939, 2.7796321739196777, sport), (978218...","[(9785112, 3.574869154071808, nyheder), (97665...","[(9785112, 3.574869154071808, nyheder), (97665...","[(9785112, 3.574869154071808, nyheder), (97665...","[9785112, 9766514, 9785113, 9785145, 9785000, ...","[9785112, 9766514, 9785113, 9785145, 9785000, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
7,1703304,"[9784275, 9783993, 9784856, 9784863, 9784805, ...",[9779807],"[14.897745, 14.964568, 15.002345, 15.00355, 14...","[underholdning, underholdning, nyheder, krimi,...","[0.1507, 0.1507, 0.2321, 0.133, 0.2321, 0.1507...","[2.245090191459656, 2.2551604184150698, 3.4820...","[(9784275, 2.245090191459656, underholdning), ...","[(9783790, 3.4830186673164367, nyheder), (9784...","[(9783790, 3.4830186673164367, nyheder), (9784...","[(9783790, 3.4830186673164367, nyheder), (9784...","[9783790, 9784856, 9784805, 9784839, 9779807, ...","[9783790, 9784856, 9784805, 9784839, 9779807, ...","[0, 0, 0, 0, 1, 0, 0, 0]","[0, 0, 0, 0, 1, 0, 0, 0]"
8,802669,"[9778804, 9760046, 9778915, 9777492, 9781621, ...",[9781621],"[14.464118, 14.738018, 15.003226, 15.415776, 1...","[penge, underholdning, nationen, nyheder, nyhe...","[0.0103, 0.1507, 0.0243, 0.2321, 0.2321, 0.0103]","[0.14898041543960572, 2.221019318008423, 0.364...","[(9778804, 0.14898041543960572, penge), (97600...","[(9781621, 3.6624987594604494, nyheder), (9777...","[(9781621, 3.6624987594604494, nyheder), (9777...","[(9781621, 3.6624987594604494, nyheder), (9777...","[9781621, 9777492, 9760046, 9778915, 9778731]","[9781621, 9777492, 9760046, 9778915, 9778731]","[1, 0, 0, 0, 0]","[1, 0, 0, 0, 0]"
9,1632935,"[9787656, 9787553, 9787679, 9787510, 9486080, ...",[9787441],"[15.540736, 16.804512, 15.760479, 16.711687, 1...","[nyheder, nyheder, sport, nyheder, underholdni...","[0.2321, 0.2321, 0.1822, 0.2321, 0.1507, 0.150...","[3.6070048716545107, 3.9003272407531737, 2.871...","[(9787656, 3.6070048716545107, nyheder), (9787...","[(9787553, 3.9003272407531737, nyheder), (9787...","[(9787553, 3.9003272407531737, nyheder), (9787...","[(9787553, 3.9003272407531737, nyheder), (9787...","[9787553, 9787510, 9787646, 9787679, 9486080, ...","[9787553, 9787510, 9787646, 9787679, 9486080, ...","[0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0]"


In [12]:
predictions_df['Predicted_weighted_tuples'][9]

[(9787656, 3.6070048716545107, 'nyheder'),
 (9787553, 3.9003272407531737, 'nyheder'),
 (9787679, 2.871559268951416, 'sport'),
 (9787510, 3.8787825731277468, 'nyheder'),
 (9486080, 2.532849847793579, 'underholdning'),
 (9786495, 2.502135432815552, 'underholdning'),
 (9787525, 2.4883367111206054, 'underholdning'),
 (9787646, 3.865040858459473, 'nyheder'),
 (9787441, 3.640672980117798, 'nyheder'),
 (9782005, 2.2951582003593445, 'underholdning'),
 (9787524, 2.226369234085083, 'krimi'),
 (9484153, 2.523007989883423, 'underholdning'),
 (9428643, 0.6244222560882569, 'sex_og_samliv'),
 (9787487, 3.800327170372009, 'nyheder')]

In [13]:
predictions_df['Predicted_weighted_tuples_selected'][9]

[(9787656, 3.6070048716545107, 'nyheder'),
 (9787553, 3.9003272407531737, 'nyheder'),
 (9787510, 3.8787825731277468, 'nyheder'),
 (9787679, 2.871559268951416, 'sport'),
 (9486080, 2.532849847793579, 'underholdning'),
 (9786495, 2.502135432815552, 'underholdning'),
 (9787524, 2.226369234085083, 'krimi'),
 (9428643, 0.6244222560882569, 'sex_og_samliv')]

In [10]:
predictions_df['Predicted_weighted_tuples_sorted'][9]

[(9787553, 3.9003272407531737, 'nyheder'),
 (9787510, 3.8787825731277468, 'nyheder'),
 (9787656, 3.6070048716545107, 'nyheder'),
 (9787679, 2.871559268951416, 'sport'),
 (9486080, 2.532849847793579, 'underholdning'),
 (9786495, 2.502135432815552, 'underholdning'),
 (9787524, 2.226369234085083, 'krimi'),
 (9428643, 0.6244222560882569, 'sex_og_samliv')]

In [8]:
predictions_df['Relevance_all'][4]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [17]:
predictions_df_path = f'./files/pickle/predictions_df_{size}_{type_}_{str(amount)}.pkl'

predictions_df.to_pickle(predictions_df_path)