In [1]:
# Environment setup (run once in a terminal, not inside this notebook).
# Option A - Anaconda prompt:
#   conda install -c conda-forge python-annoy
# -----------------------------------------------------
# Option B - pip installs:
#   pip install faiss-cpu
#   pip install annoy

In [40]:
# Import the necessary libraries
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from annoy import AnnoyIndex
from sklearn.metrics.pairwise import cosine_similarity

In [29]:
# Load and preview the word2vec document vectors as given by the challenge
# (columns: `article_id` and `document_vector`).
# The path uses forward slashes: backslashes inside a normal string literal are
# escape characters, and as path separators they only resolve on Windows.
similarity_matrix_path = "./files/parquet/Ekstra_Bladet_word2vec/document_vector.parquet"
# NOTE: despite its name, `similarity_matrix` holds one embedding per article,
# not pairwise similarities.
similarity_matrix = pd.read_parquet(similarity_matrix_path)
similarity_matrix.head()

Unnamed: 0,article_id,document_vector
0,3000022,"[0.06542388, -0.047424573, 0.06384871, -0.0014..."
1,3000063,"[0.028815078, -0.00016637295, 0.055056807, 0.0..."
2,3000613,"[0.037971217, 0.03392251, 0.027297212, 0.01708..."
3,3000700,"[0.04652399, 0.0029133065, 0.06280604, -0.0051..."
4,3000840,"[0.014736942, 0.024067875, 0.0051865038, 0.041..."


In [30]:
# Shape of the document-vector table as (rows, columns)
similarity_matrix.shape

(125541, 2)

In [31]:
# Read the articles table for the selected dataset size and show its shape
size = 'large'
articles_path = f'./files/parquet/ebnerd_{size}/articles.parquet'
articles_df = pd.read_parquet(path=articles_path)
articles_df.shape

(125541, 21)

In [33]:
# Sanity check: the vector table and the large article table have the same number
# of rows, which suggests the word2vec vectors were produced for the large dataset
len(similarity_matrix) == len(articles_df)

True

In [12]:
# Initialize the Annoy index.
# The length of the first document vector is used as the index dimensionality
# (assumes every vector has the same length - TODO confirm).
# `.iloc[0]` picks the first row by position, so this also works when the
# frame's index does not contain the label 0 (e.g. after filtering).
dimension = len(similarity_matrix['document_vector'].iloc[0])
# 'angular' selects Annoy's angular (cosine-based) distance metric.
annoy_index = AnnoyIndex(dimension, 'angular')

In [13]:
# Populate the Annoy index: each vector is stored under its row position
document_vectors = similarity_matrix['document_vector']
for i, vector in enumerate(document_vectors):
    annoy_index.add_item(i, vector)

In [14]:
# Build the index; the argument is the number of trees Annoy constructs
n_trees = 10
annoy_index.build(n_trees)

True

In [23]:
# Lookup table from article_id to its row position in `similarity_matrix`
id_to_index = dict(zip(similarity_matrix['article_id'], range(len(similarity_matrix))))

In [26]:
# Function to get top N similar articles
def get_top_n_similar_articles(annoy_index, article_id, id_to_index, n=5, article_ids=None):
    """Return the ids of the (up to) ``n`` nearest neighbours of an article.

    Parameters
    ----------
    annoy_index
        Built index exposing ``get_nns_by_item(item, k)``.
    article_id
        Id of the query article; must be a key of ``id_to_index``.
    id_to_index : dict
        Maps an article id to its item number in ``annoy_index``.
    n : int, default 5
        Number of neighbours to return.
    article_ids : sequence, optional
        Article ids ordered by item number (position ``i`` holds the id of
        item ``i``). Defaults to the notebook-level
        ``similarity_matrix['article_id']`` column.

    Returns
    -------
    numpy.ndarray
        Article ids of the neighbours, in the order returned by the index.
        The query article itself is never included.

    Raises
    ------
    ValueError
        If ``article_id`` is not present in ``id_to_index``.
    """
    if article_id not in id_to_index:
        raise ValueError(f"Article ID {article_id} not found in the dataset.")

    article_index = id_to_index[article_id]

    # Request one extra neighbour because the query item is normally part of
    # its own result list.
    neighbours = annoy_index.get_nns_by_item(article_index, n + 1)

    # Drop the query item by identity rather than by position: the search is
    # approximate, so the item is not guaranteed to come back first (or at all).
    similar_indices = [idx for idx in neighbours if idx != article_index][:n]

    if article_ids is None:
        # Fall back to the notebook-level table for backward compatibility.
        article_ids = similarity_matrix['article_id']

    # Positional lookup: item numbers are row positions, not index labels.
    return np.asarray(article_ids)[similar_indices]

In [28]:
# Example query: the five nearest neighbours of article 3000613
get_top_n_similar_articles(
    annoy_index,
    article_id=3000613,
    id_to_index=id_to_index,
    n=5,
)

array([3197030, 9532120, 9750509, 3195615, 4744395])

#### Load the user history DataFrame (large dataset, validation split)

In [36]:
# Dataset size and split that the history file is read from
size = "large"
type_ = "validation"

# NOTE: the variable is called `history_train_path`, but it points at the
# split selected by `type_` (validation here).
history_train_path = f'./files/parquet/ebnerd_{size}/{type_}/history.parquet'
history_df = pd.read_parquet(path=history_train_path)
history_df.head(5)

Unnamed: 0,user_id,impression_time_fixed,scroll_percentage_fixed,article_id_fixed,read_time_fixed
0,10033,"[2023-05-04T07:41:35.000000, 2023-05-04T07:41:...","[38.0, 100.0, 27.0, 100.0, 72.0, 100.0, 98.0, ...","[9749036, 9748976, 9747490, 9748980, 9749224, ...","[3.0, 727.0, 2.0, 55.0, 2.0, 53.0, 17.0, 16.0,..."
1,10041,"[2023-05-04T22:12:16.000000, 2023-05-05T07:51:...","[31.0, 100.0, 77.0, 100.0, 25.0, 100.0, 65.0, ...","[9750397, 9750749, 9750726, 9749184, 9751452, ...","[8.0, 20.0, 30.0, 39.0, 5.0, 65.0, 15.0, 1.0, ..."
2,10199,"[2023-05-04T10:12:30.000000, 2023-05-04T10:12:...","[23.0, 34.0, 100.0, 38.0, 26.0, 30.0, 24.0, 24...","[9749275, 9749278, 9749319, 9749869, 9750687, ...","[5.0, 2.0, 33.0, 11.0, 3.0, 4.0, 5.0, 2.0, 1.0..."
3,10220,"[2023-05-04T16:34:42.000000, 2023-05-04T16:34:...","[58.0, 31.0, 97.0, 52.0, 28.0, 53.0, 68.0, 29....","[9750161, 9749955, 9750039, 9749076, 9750209, ...","[5.0, 5.0, 38.0, 36.0, 4.0, 2.0, 1.0, 14.0, 2...."
4,10289,"[2023-05-04T16:15:50.000000, 2023-05-04T16:16:...","[70.0, nan, nan, 84.0, 54.0, 84.0, 21.0, 25.0,...","[9750064, 9750090, 9749955, 9750904, 9750687, ...","[17.0, 8.0, 142.0, 28.0, 42.0, 39.0, 15.0, 4.0..."


In [37]:
# Shape of the history table as (rows, columns)
history_df.shape

(791582, 5)

In [38]:
# Number of distinct users; if this equals history_df.shape[0], there is exactly one history row per user
history_df['user_id'].nunique()

791582

In [47]:
# Load and preview the behaviors table for the chosen dataset size and split
size = "large"
type_ = "validation"

behavior = f'./files/parquet/ebnerd_{size}/{type_}/behaviors.parquet'
# Read from `behavior` (the behaviors file), not from the history path,
# so that `behavior_df` really contains the behaviors data.
behavior_df = pd.read_parquet(behavior)
behavior_df.head()

Unnamed: 0,impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,next_read_time,next_scroll_percentage
0,96429,,2023-05-28 04:31:49,19.0,,2,"[9230405, 9784793, 9784803, 9784275, 9782726, ...",[9782884],21814,False,,,,False,12,374.0,100.0
1,96434,,2023-05-28 04:53:56,13.0,,2,"[9782884, 9783800, 9784793, 9784804, 9784702]",[9782884],21822,False,,,,False,110,234.0,100.0
2,96436,,2023-05-28 04:58:04,28.0,,2,"[9230405, 9784710, 9784696, 9695098, 9784559, ...",[9784559],21822,False,,,,False,110,9.0,38.0
3,96443,,2023-05-28 04:58:42,24.0,,2,"[9781998, 9230405, 9783405, 9784662, 9769155, ...",[9784662],21822,False,,,,False,110,0.0,
4,96462,,2023-05-28 04:15:23,10.0,,2,"[9784591, 9784702, 9782884, 9783865, 9784679]",[9784591],21824,False,,,,False,219,8.0,68.0
