# Content-based system using prebuilt article embeddings

## Imports

In [20]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import random

## Constants

In [2]:
# Base directory that all data files below are resolved against (relative path).
ROOT_PATH = '../data'
# Parquet file holding the prebuilt article embeddings (xlm_roberta_base).
ARTICLE_EMBEDDINGS_PATH = f'{ROOT_PATH}/embeddings/xlm_roberta_base.parquet'
# Per-user history files for the train and validation splits.
TRAIN_HISTORY_PATH = f'{ROOT_PATH}/train/history.parquet'
VALIDATION_HISTORY_PATH = f'{ROOT_PATH}/validation/history.parquet'
# Impression-level interaction (behaviors) files for the train and validation splits.
TRAIN_INTERACTIONS_PATH = f'{ROOT_PATH}/train/behaviors.parquet'
VALIDATION_INTERACTIONS_PATH = f'{ROOT_PATH}/validation/behaviors.parquet'

## Data loading

In [3]:
# Load the prebuilt article embeddings and the training split's user history.
article_embeddings_df = pd.read_parquet(ARTICLE_EMBEDDINGS_PATH)
history_df = pd.read_parquet(TRAIN_HISTORY_PATH)

In [4]:
# Load the training split's interactions (one row per impression).
interactions_df = pd.read_parquet(TRAIN_INTERACTIONS_PATH)

In [5]:
# Preview the interactions frame (the notebook renders a truncated head/tail view).
interactions_df

Unnamed: 0,impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,next_read_time,next_scroll_percentage
0,47727,,2023-05-21 21:35:07,20.0,,1,"[9482380, 9775183, 9744403, 9775297, 9774020, ...",[9775183],18293,False,,,,False,265,34.0,100.0
1,47731,,2023-05-21 21:32:33,13.0,,1,"[9774557, 9774516, 9775331, 9775277, 9759966]",[9759966],18293,False,,,,False,265,45.0,100.0
2,47736,,2023-05-21 21:33:32,17.0,,1,"[9759966, 9774557, 9775352, 9746360, 9772601, ...",[9774652],18293,False,,,,False,265,78.0,100.0
3,47737,,2023-05-21 21:38:17,27.0,,1,"[9774580, 9775131, 9775202, 9774789, 9774972, ...",[9775184],18293,False,,,,False,265,6.0,52.0
4,47740,,2023-05-21 21:36:02,48.0,,1,"[9774826, 9775171, 9775076, 9769624, 9775056, ...",[9774648],18293,False,,,,False,265,32.0,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12063885,580100982,,2023-05-18 10:27:05,9.0,,1,"[9142581, 9233208, 9345280, 9486080, 9769306, ...",[9769306],2111769,False,,,,False,22980659,12.0,36.0
12063886,580100984,,2023-05-18 10:30:39,13.0,,1,"[9440508, 9142581, 9486080, 9345280, 9720068]",[9720068],2111769,False,,,,False,22980660,30.0,100.0
12063887,580100987,,2023-05-18 10:27:28,13.0,,1,"[9695098, 9345280, 9142581, 9747757, 9735909, ...",[9735909],2111769,False,,,,False,22980659,19.0,24.0
12063888,580100996,,2023-05-18 10:08:18,35.0,,2,"[9769917, 9771126, 9345280, 9233208, 9771166, ...",[9771126],2111774,False,,,,False,71230199,14.0,22.0


In [6]:
# Build an article_id -> embedding lookup from article_embeddings_df so that
# embeddings can be fetched by id without scanning the frame.
value_dict = article_embeddings_df.set_index('article_id')['FacebookAI/xlm-roberta-base'].to_dict()

# Define a function to map list of ids to list of values using the dictionary
def map_ids_to_values(id_list):
    """Look up the embedding of every article id in ``id_list``.

    Parameters
    ----------
    id_list : iterable
        Article ids; each one must be a key of the module-level ``value_dict``.

    Returns
    -------
    list
        The ``value_dict`` entries for the given ids, in the same order.

    Raises
    ------
    KeyError
        If an id is not present in ``value_dict``.
    """
    embeddings = []
    for article_id in id_list:
        embeddings.append(value_dict[article_id])
    return embeddings

# For every user, translate the ids in 'article_id_fixed' into their embeddings
# (same order as the ids). NOTE: this adds the 'article_embeddings' column to
# history_df in place.
history_df['article_embeddings'] = history_df['article_id_fixed'].apply(map_ids_to_values)

history_df

Unnamed: 0,user_id,impression_time_fixed,scroll_percentage_fixed,article_id_fixed,read_time_fixed,article_embeddings
0,10029,"[2023-04-28T06:16:57.000000, 2023-04-28T06:17:...","[23.0, 69.0, 27.0, nan, 47.0, 38.0, 100.0, 12....","[9735579, 9739888, 9739471, 9739864, 9738441, ...","[28.0, 24.0, 11.0, 107.0, 8.0, 7.0, 20.0, 5.0,...","[[0.108400166, 0.12893724, 0.057115708, -0.052..."
1,10033,"[2023-04-27T11:11:32.000000, 2023-04-27T11:12:...","[33.0, 41.0, 33.0, 100.0, 68.0, 38.0, 1.0, 58....","[9738139, 9738263, 9738139, 9738760, 9738777, ...","[2.0, 2.0, 718.0, 18.0, 26.0, 78.0, 3.0, 11.0,...","[[0.08282173, 0.13022842, 0.051970687, 0.00744..."
2,10034,"[2023-04-30T09:46:57.000000, 2023-04-30T09:47:...","[nan, 88.0, 27.0, nan, 23.0, 100.0, 100.0, 22....","[9742693, 9742686, 9744016, 9743818, 9744922, ...","[21.0, 103.0, 28.0, 0.0, 5.0, 34.0, 14.0, 14.0...","[[0.14536467, 0.11943545, 0.06567032, 0.031831..."
3,10041,"[2023-04-27T15:15:28.000000, 2023-04-27T15:16:...","[78.0, 41.0, 4.0, 16.0, 22.0, 32.0, 11.0, 94.0...","[9739035, 9738303, 9737243, 9739634, 9739802, ...","[12.0, 11.0, 3.0, 3.0, 4.0, 13.0, 29.0, 24.0, ...","[[0.123115465, 0.14943, 0.052068785, -0.010544..."
4,10103,"[2023-04-27T15:37:35.000000, 2023-04-27T15:38:...","[100.0, nan, 100.0, 100.0, 100.0, 28.0, 82.0, ...","[9739035, 9739164, 9741803, 9740087, 9741986, ...","[45.0, 8.0, 61.0, 72.0, 56.0, 3.0, 22.0, 16.0,...","[[0.123115465, 0.14943, 0.052068785, -0.010544..."
...,...,...,...,...,...,...
788085,1802030,"[2023-05-13T16:20:53.000000, 2023-05-13T16:21:...","[50.0, 28.0, 13.0, 29.0, 28.0, 66.0]","[9763579, 9763448, 9763398, 9763401, 9761588, ...","[18.0, 11.0, 17.0, 25.0, 13.0, 1.0]","[[0.10347509, 0.1371567, 0.05217294, 0.0224712..."
788086,2146107,"[2023-05-10T21:44:01.000000, 2023-05-10T21:46:...","[100.0, 56.0, 100.0, 100.0, 100.0]","[9758717, 9758538, 9758717, 9758074, 9758717]","[119.0, 3.0, 5.0, 33.0, 0.0]","[[0.12370166, 0.12832843, 0.076751076, -0.0143..."
788087,1200613,"[2023-04-30T05:24:43.000000, 2023-04-30T05:25:...","[70.0, 45.0, 67.0, 40.0, 51.0, 72.0, 50.0, 53....","[9742619, 9742627, 9742586, 9742625, 9741144, ...","[14.0, 17.0, 23.0, 50.0, 33.0, 39.0, 16.0, 329...","[[0.103330605, 0.13000394, 0.05845091, 0.02054..."
788088,1375724,"[2023-05-10T12:25:50.000000, 2023-05-10T12:26:...","[91.0, 100.0, 71.0, 64.0, 61.0, 41.0, 49.0, 74...","[9757876, 9757857, 9757676, 9757639, 9757746, ...","[11.0, 5.0, 4.0, 20.0, 2.0, 6.0, 2.0, 1.0, 3.0...","[[0.13436724, 0.19167073, 0.08316318, -0.01405..."


In [10]:
# User profile lookup: user_id -> list of embeddings of the articles in that
# user's history (the 'article_embeddings' column of history_df).
user_id_to_profile_dict = history_df.set_index('user_id')['article_embeddings'].to_dict()

In [44]:
# A user's profile is the collection of embedding vectors of every article the
# user has interacted with (for example, as stored in history_df).

def get_best_rec(row):
    """Pick the in-view article that best matches the user's reading history.

    Every article in ``row['article_ids_inview']`` is scored by the sum of the
    cosine similarities between its embedding and each embedding in the user's
    profile (``user_id_to_profile_dict[row['user_id']]``). The id with the
    highest score is returned; ties go to the article listed first.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'article_ids_inview' (sequence of article ids) and
        'user_id'.

    Returns
    -------
    The element of ``row['article_ids_inview']`` with the highest score. If
    the user's profile is empty, the first in-view article is returned.

    Raises
    ------
    KeyError
        If the user has no profile or an in-view article has no embedding.
    IndexError
        If ``row['article_ids_inview']`` is empty.
    """
    article_ids_inview = row['article_ids_inview']
    article_profile_embeddings = user_id_to_profile_dict[row['user_id']]
    article_inview_embeddings = map_ids_to_values(article_ids_inview)

    # Default answer when there is nothing to compare against; indexing here
    # also surfaces an empty in-view list as an IndexError.
    first_article_id = article_ids_inview[0]
    if len(article_profile_embeddings) == 0:
        return first_article_id

    # Stack the embeddings into (n_inview, dim) and (n_profile, dim) matrices
    # so all pairwise similarities come from a single call instead of one
    # scikit-learn call per (candidate, history article) pair.
    inview_matrix = np.vstack(
        [embedding.reshape(1, -1) for embedding in article_inview_embeddings]
    )
    profile_matrix = np.vstack(
        [embedding.reshape(1, -1) for embedding in article_profile_embeddings]
    )
    similarity_sums = cosine_similarity(inview_matrix, profile_matrix).sum(axis=1)

    # argmax returns the first maximum and has no implicit floor of 0, so the
    # best candidate is chosen even when every summed similarity is <= 0.
    return article_ids_inview[int(np.argmax(similarity_sums))]

In [26]:
# Scratch check: .sum() on a (1, n) array returns a single scalar (0+1+...+5 = 15).
np.arange(6).reshape(1,-1).sum()

15

In [46]:
 # For each impression, compare the embeddings of the in-view articles with the
# embeddings of the articles the user has already interacted with, and store
# the id chosen by get_best_rec in a new 'predict' column.
# NOTE: this is a row-wise apply over the whole interactions frame and adds the
# column to interactions_df in place.

interactions_df['predict'] = interactions_df.apply(get_best_rec, axis = 1)

In [9]:
def transform_interaction_to_prediction(interaction):
    """Format one impression as a ranking line ``"<impression_id> [r1,r2,...]"``.

    The list holds one rank per in-view article, in in-view order. The article
    named by ``interaction['predict']`` gets rank 1; the remaining articles get
    the ranks 2..n in random order.

    Parameters
    ----------
    interaction : mapping (e.g. a DataFrame row)
        Must provide 'article_ids_inview' (sequence of article ids), 'predict'
        (one of those ids) and 'impression_id'.

    Returns
    -------
    str
        The formatted line, e.g. ``"47727 [3,1,2]"``.

    Raises
    ------
    ValueError
        If ``interaction['predict']`` is not among the in-view article ids.
    """
    # Convert to a list first: the in-view ids may be a numpy array, which has
    # no .index() method.
    inview_ids = list(interaction['article_ids_inview'])
    prediction_index = inview_ids.index(interaction['predict'])

    # Ranks 2..n for the non-predicted articles, in random order.
    prediction_list = list(range(2, len(inview_ids) + 1))
    random.shuffle(prediction_list)

    # Put rank 1 at the position of the predicted article.
    prediction_list.insert(prediction_index, 1)
    list_as_str = ','.join(str(rank) for rank in prediction_list)

    # Outer double quotes keep the f-string valid on Python versions older
    # than 3.12, where the quote character cannot be reused inside the braces.
    return f"{interaction['impression_id']} [{list_as_str}]"

# Build one output line per impression. The formatter needs the whole row
# (in-view ids, predicted id and impression id), so it is applied row-wise to
# the frame rather than to the 'predict' column alone. The 'predict' column
# must already exist, i.e. the cell that assigns interactions_df['predict']
# has to run before this one.
predictions = interactions_df.apply(transform_interaction_to_prediction, axis=1)

# Write the lines directly instead of using Series.to_csv: every line contains
# commas, so to_csv would wrap it in quotes, and its `line_terminator` keyword
# is not accepted by pandas 2.x (renamed to `lineterminator`).
with open('./test-predictions.txt', 'w', encoding='utf-8', newline='\n') as predictions_file:
    predictions_file.writelines(f'{line}\n' for line in predictions)

KeyError: 'predict'