In [1]:
import pandas as pd
import numpy as np

# Raw parquet inputs: the article catalogue is read from the 'large' split,
# the validation history and behaviors from the 'demo' split.
size = 'large'
articles_path = f'./files/parquet/ebnerd_{size}/articles.parquet'
articles_df = pd.read_parquet(articles_path)
print('Raw Articles df shape:                           ', articles_df.shape)

size = 'demo'
history_val_path = f'./files/parquet/ebnerd_{size}/validation/history.parquet'
behaviors_val_path = f'./files/parquet/ebnerd_{size}/validation/behaviors.parquet'

history_val_df = pd.read_parquet(history_val_path)
print(f'Raw {size} validation history df shape:             ', history_val_df.shape)

behaviors_val_df = pd.read_parquet(behaviors_val_path)
print(f'Raw {size} validation behaviors df shape:           ', behaviors_val_df.shape)

# Interaction matrix: rows are keyed by user_id and missing cells become 0.
size = 'demo'
interaction_matrix_path = f'./files/csv/interaction_matrix_{size}.csv'
interaction_matrix_df = (
    pd.read_csv(interaction_matrix_path)
    .set_index('user_id')
    .replace({np.nan: 0})
)
print('Interaction df shape:                            ', interaction_matrix_df.shape)

# Pre-computed article-to-article cosine similarity matrix (pickled DataFrame).
cosine_similarity_df_file_path = './files/pickle/cosine_similarity_matrix_tail_50K_x_50K.pkl'
cosine_similarity_df = pd.read_pickle(cosine_similarity_df_file_path)
print('Cosine similarity df shape:                      ', cosine_similarity_df.shape)

# Pickled user and article matrices used by the collaborative-filtering step.
user_matrix_df_file_path = './files/pickle/user_matrix.pkl'
article_matrix_df_file_path = './files/pickle/article_matrix_df.pkl'

user_matrix_df = pd.read_pickle(user_matrix_df_file_path)
print('User embedding df shape:                         ', user_matrix_df.shape)

article_matrix_df = pd.read_pickle(article_matrix_df_file_path)
print('Article embedding df shape:                      ', article_matrix_df.shape)



Raw Articles df shape:                            (125541, 21)
Raw demo validation history df shape:              (1562, 5)
Raw demo validation behaviors df shape:            (25356, 17)
Interaction df shape:                             (1590, 4247)
Cosine similarity df shape:                       (50000, 50000)
User embedding df shape:                          (1590, 5)
Article embedding df shape:                       (4247, 5)


In [5]:
# Show the last two rows of the raw articles frame.
articles_df.tail(2)

Unnamed: 0,article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,...,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
125539,9803560,Så slemt er det: 14.000 huse er oversvømmet,Tusindvis af huse står under vand i Kherson-re...,2023-06-29 06:49:26,False,Et område på omkring 600 kvadratkilometer står...,2023-06-08 06:25:42,,article_default,https://ekstrabladet.dk/nyheder/saa-slemt-er-d...,...,"[LOC, LOC, LOC, PROD, PER, LOC, ORG, ORG, LOC]","[International politik, Katastrofe, Større kat...",118,[],nyheder,21318.0,1237.0,67514.0,0.9927,Negative
125540,9803607,Aktion mod svindlere: Seks personer anholdt,Flere kvinder er ifølge politiet blevet svindl...,2023-06-29 06:49:26,False,Mindst otte personer er blevet anholdt og sigt...,2023-06-08 06:54:53,[9803906],article_default,https://ekstrabladet.dk/krimi/aktion-mod-svind...,...,"[PER, PER, PER, LOC, ORG, LOC, LOC, PER, PER, ...","[Kriminalitet, Bedrageri]",140,[],krimi,331057.0,79590.0,3694760.0,0.9948,Negative


In [140]:
# Preview the first rows of the validation history frame.
history_val_df.head()

Unnamed: 0,user_id,impression_time_fixed,scroll_percentage_fixed,article_id_fixed,read_time_fixed
0,750497,"[2023-05-04T09:42:39.000000, 2023-05-04T09:43:...","[100.0, 65.0, 100.0, 100.0, 100.0, 100.0, 100....","[9749224, 9749156, 9749224, 9748948, 9748980, ...","[49.0, 5.0, 7.0, 151.0, 214.0, 199.0, 22.0, 64..."
1,22779,"[2023-05-04T07:53:42.000000, 2023-05-04T15:59:...","[52.0, 39.0, 62.0, 38.0, 74.0, 19.0, 30.0, 56....","[9749025, 9750090, 9750015, 9750161, 9745750, ...","[4.0, 16.0, 2.0, 9.0, 40.0, 7.0, 9.0, 8.0, 18...."
2,373598,"[2023-05-04T07:51:58.000000, 2023-05-04T09:59:...","[nan, nan, nan, 59.0, 33.0, 75.0, nan, nan, 76...","[9514481, 9514481, 9111040, 9750389, 9750307, ...","[0.0, 0.0, 0.0, 3.0, 9.0, 117.0, 39.0, 0.0, 8...."
3,383378,"[2023-05-04T07:27:57.000000, 2023-05-04T07:29:...","[100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100...","[9747490, 9749036, 9749025, 9748792, 9748592, ...","[85.0, 18.0, 133.0, 191.0, 331.0, 56.0, 43.0, ..."
4,411733,"[2023-05-04T17:09:09.000000, 2023-05-04T17:09:...","[20.0, 14.0, 61.0, 55.0, 21.0, 81.0, 100.0, 10...","[9750081, 9750111, 9750039, 9749948, 9749729, ...","[2.0, 4.0, 6.0, 9.0, 1.0, 30.0, 37.0, 5.0, 3.0..."


In [2]:
# Preview the first two rows of the validation behaviors frame.
behaviors_val_df.head(2)

Unnamed: 0,impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,next_read_time,next_scroll_percentage
0,144772,,2023-05-30 14:21:34,29.0,,2,"[9788239, 9780702, 9553264, 9787499, 6741781, ...",[9783042],76658,False,,,,False,29,7.0,59.0
1,144777,,2023-05-30 14:22:11,10.0,,2,"[9788521, 9786217, 9553264, 9788361, 9788352, ...",[9788125],76658,False,,,,False,29,58.0,98.0


In [142]:
# Preview two rows of the interaction matrix (indexed by user_id).
interaction_matrix_df.head(2)

Unnamed: 0_level_0,9251369,9730301,9733713,9737535,9738292,9738334,9740021,9740161,9740174,9740356,...,9700074,9737345,6404190,9448400,9728595,9726404,8166777,9582969,9627627,9674356
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11313,199.5,210.5,4.0,549.0,66.0,52.0,91.0,25.5,437.0,38.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13538,0.0,12.5,7.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
# Largest label in the similarity matrix index, followed by its last two rows.
print(cosine_similarity_df.index.max())
cosine_similarity_df.tail(2)

9803607


article_id,7531908,7532054,7532065,7532165,7532210,7532217,7532313,7532384,7532390,7532511,...,9803408,9803418,9803453,9803455,9803492,9803505,9803510,9803525,9803560,9803607
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9803560,0.734602,0.790929,0.731041,0.827107,0.781072,0.733286,0.739597,0.687384,0.79246,0.737152,...,0.770039,0.738796,0.804503,0.735691,0.86321,0.711341,0.75263,0.74754,1.0,0.815973
9803607,0.889616,0.867623,0.863531,0.939573,0.872222,0.881832,0.840765,0.760522,0.896393,0.903704,...,0.828435,0.741238,0.792551,0.802894,0.767662,0.7257,0.800074,0.798586,0.815973,1.0


In [145]:
# Display the user matrix (one row per user_id).
# NOTE(review): this renders the whole frame; .head() would keep the saved output short.
user_matrix_df

Unnamed: 0_level_0,0,1,2,3,4
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
11313,0.019794,0.596934,0.000000,0.401640,0.000000
13538,0.000000,1.258250,0.067626,0.067961,0.051996
15430,0.002923,0.078138,0.000000,0.000000,0.011602
19181,0.000000,1.361679,0.000000,0.078956,0.000000
19568,0.000000,0.077383,0.146945,0.106196,0.000000
...,...,...,...,...,...
2581162,0.000000,1.333367,0.000000,0.407232,0.187185
2583035,0.000000,0.562732,0.094345,0.180542,0.000000
2583477,0.000000,0.632420,0.000000,1.040167,0.000000
2585449,0.000000,0.178322,0.083496,0.309767,0.000000


: 

In [144]:
# Preview the first two rows of the article matrix.
article_matrix_df.head(2)

Unnamed: 0,0,1,2,3,4
9251369,0.0,0.039449,0.169069,0.147101,0.0
9730301,0.152348,6.31323,0.0,0.0,0.0


In [2]:
# Report the schema and size of the behaviors frame before trimming it.
print(behaviors_val_df.dtypes, behaviors_val_df.shape, sep='\n')

# Keep only the user and the two article-id list columns, then expand
# article_ids_clicked so that every clicked id gets its own row.
behaviors_val_df = (
    behaviors_val_df
    .loc[:, ['user_id', 'article_ids_inview', 'article_ids_clicked']]
    .explode('article_ids_clicked')
)


impression_id                     uint32
article_id                       float64
impression_time           datetime64[us]
read_time                        float32
scroll_percentage                float32
device_type                         int8
article_ids_inview                object
article_ids_clicked               object
user_id                           uint32
is_sso_user                         bool
gender                           float64
postcode                         float64
age                              float64
is_subscriber                       bool
session_id                        uint32
next_read_time                   float32
next_scroll_percentage           float32
dtype: object
(25356, 17)


In [3]:
# Collect, per user_id, every clicked article id into a single list.
# NOTE(review): `df` is bound as a second name for the same frame; no visible
# cell reads it, so it looks like a leftover. Confirm before removing.
behaviors_val_grouped_clicked = df = (
    behaviors_val_df
    .groupby('user_id')['article_ids_clicked']
    .apply(list)
    .reset_index()
)

In [6]:
# Shape of the per-user clicked-articles frame.
behaviors_val_grouped_clicked.shape

(1562, 2)

In [4]:
# Expand article_ids_inview so that every in-view article id gets its own row.
behaviors_val_df = behaviors_val_df.explode(column='article_ids_inview')

In [4]:
# Preview the first two rows of the current behaviors frame.
behaviors_val_df.head(2)

Unnamed: 0,user_id,article_ids_inview,article_ids_clicked
0,76658,9788239,9783042
0,76658,9780702,9783042


In [11]:
# Check whether article id 9783042 is among the in-view ids stored under index label 0.
# NOTE(review): what `[0]` returns depends on which of the explode/merge cells
# have already run on behaviors_val_df; re-check after a fresh Run All.
9783042 in list(behaviors_val_df['article_ids_inview'][0])

True

In [5]:
# Collect, per user_id, every in-view article id into a single list.
# NOTE(review): if behaviors_val_df was already exploded on article_ids_clicked,
# an impression with more than one clicked article contributes its in-view ids
# once per click, so these lists may contain repeats. Confirm this is intended.
behaviors_val_grouped_inview_df = (
    behaviors_val_df
    .groupby('user_id')['article_ids_inview']
    .apply(list)
    .reset_index()
)

In [7]:
# Shape of the per-user in-view-articles frame.
behaviors_val_grouped_inview_df.shape

(1562, 2)

In [5]:
# Display the per-user in-view-articles frame.
behaviors_val_grouped_inview_df

Unnamed: 0,user_id,article_ids_inview
0,19181,"[9783019, 9778732, 9783122, 9783024, 9783159, ..."
1,21271,"[9784097, 7594265, 9782407, 9785267, 9784852, ..."
2,21774,"[9780815, 9786111, 9777339, 9052390, 9785500, ..."
3,22779,"[9787465, 9787261, 9787332, 9787469, 9786495, ..."
4,22895,"[9782652, 9783024, 9782202, 9783122, 9782996, ..."
...,...,...
1557,2581162,"[9782046, 9782057, 9782133, 9781998, 9782290, ..."
1558,2583035,"[9782046, 9781998, 9782108, 9782027, 9780697, ..."
1559,2583477,"[9780928, 9771367, 9779225, 9781057, 9780267, ..."
1560,2584367,"[9785030, 9780604, 9784947, 9785471, 9785593, ..."


In [8]:
# Join the per-user in-view and clicked lists on user_id; the inner join keeps
# only users present in both frames. This rebinds behaviors_val_df.
behaviors_val_df = behaviors_val_grouped_inview_df.merge(
    behaviors_val_grouped_clicked,
    on='user_id',
    how='inner',
)

In [9]:
# Preview the merged per-user frame.
behaviors_val_df.head()

Unnamed: 0,user_id,article_ids_inview,article_ids_clicked
0,19181,"[9783019, 9778732, 9783122, 9783024, 9783159, ...","[9783019, 9770145, 9782519, 9782899, 9788947, ..."
1,21271,"[9784097, 7594265, 9782407, 9785267, 9784852, ...","[9785113, 9339920, 9786222, 9782407, 9784097, ..."
2,21774,"[9780815, 9786111, 9777339, 9052390, 9785500, ...",[9785835]
3,22779,"[9787465, 9787261, 9787332, 9787469, 9786495, ...","[9787261, 9784852, 9780702, 9785111, 9782806, ..."
4,22895,"[9782652, 9783024, 9782202, 9783122, 9782996, ...","[9782996, 9790574, 9790475, 9781013, 9780514, ..."


In [12]:
# Group by 'user_id' and aggregate 'article_ids_inview' into lists
# behaviors_val_df = behaviors_val_df.groupby('user_id')['article_ids_inview'].apply(list).reset_index()
# behaviors_val_df.head()


In [10]:
# Store the length of each user's in-view list and clicked list as new columns.
behaviors_val_df['article_ids_inview_len'] = behaviors_val_df['article_ids_inview'].apply(len)
behaviors_val_df['article_ids_clicked_len'] = behaviors_val_df['article_ids_clicked'].apply(len)

In [11]:
# Preview the merged frame together with the two new length columns.
behaviors_val_df.head()

Unnamed: 0,user_id,article_ids_inview,article_ids_clicked,article_ids_inview_len,article_ids_clicked_len
0,19181,"[9783019, 9778732, 9783122, 9783024, 9783159, ...","[9783019, 9770145, 9782519, 9782899, 9788947, ...",368,27
1,21271,"[9784097, 7594265, 9782407, 9785267, 9784852, ...","[9785113, 9339920, 9786222, 9782407, 9784097, ...",98,7
2,21774,"[9780815, 9786111, 9777339, 9052390, 9785500, ...",[9785835],20,1
3,22779,"[9787465, 9787261, 9787332, 9787469, 9786495, ...","[9787261, 9784852, 9780702, 9785111, 9782806, ...",87,9
4,22895,"[9782652, 9783024, 9782202, 9783122, 9782996, ...","[9782996, 9790574, 9790475, 9781013, 9780514, ...",356,45


In [5]:
# Example pair used by the recommendation cells: an article id and a user id.
article_id, user_id = 9788239, 76658

In [33]:
# Preview the first rows of the raw articles frame.
articles_df.head()

Unnamed: 0,article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,...,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
0,3000022,Hanks beskyldt for mishandling,Tom Hanks har angiveligt mishandlet sin afdøde...,2023-06-29 06:20:32,False,Tom Hanks skulle angiveligt have mishandlet si...,2006-09-20 09:24:18,[3518381],article_default,https://ekstrabladet.dk/underholdning/udlandke...,...,[PER],"[Kriminalitet, Kendt, Underholdning, Personfar...",414,[432],underholdning,,,,0.9911,Negative
1,3000063,Bostrups aske spredt i Furesøen,Studieværten blev mindet med glad festlighed,2023-06-29 06:20:32,False,Strålende sensommersol. Jazzede toner. Glas me...,2006-09-24 07:45:30,"[3170935, 3170939]",article_default,https://ekstrabladet.dk/nyheder/samfund/articl...,...,[],"[Kendt, Underholdning, Begivenhed, Personlig b...",118,[133],nyheder,,,,0.5155,Neutral
2,3000613,Jesper Olsen ramt af hjerneblødning,Den tidligere danske landsholdsspiller i fodbo...,2023-06-29 06:20:33,False,"Jesper Olsen, der er noteret for 43 kampe på d...",2006-05-09 11:29:00,[3164998],article_default,https://ekstrabladet.dk/sport/fodbold/landshol...,...,"[LOC, PER, PER, PER]","[Kendt, Sport, Fodbold, Sundhed, Sygdom og beh...",142,"[196, 271]",sport,,,,0.9876,Negative
3,3000700,Madonna topløs med heste,47-årige Madonna poserer både topløs og sammen...,2023-06-29 06:20:33,False,Skal du have stillet Madonna-sulten inden konc...,2006-05-04 11:03:12,[3172046],article_default,https://ekstrabladet.dk/underholdning/udlandke...,...,[],"[Kendt, Livsstil, Underholdning]",414,[432],underholdning,,,,0.8786,Neutral
4,3000840,Otto Brandenburg er død,Sangeren og skuespilleren Otto Brandenburg er ...,2023-06-29 06:20:33,False,"'Og lidt for Susanne, Birgitte og Hanne... ' '...",2007-03-01 18:34:00,[3914446],article_default,https://ekstrabladet.dk/nyheder/samfund/articl...,...,[],"[Kendt, Underholdning, Begivenhed, Personlig b...",118,[133],nyheder,,,,0.9468,Negative


In [44]:
# Preview the first rows of the similarity matrix.
# NOTE(review): the saved output under this cell shows column labels 0…69999,
# which does not match the (50000, 50000) article_id-labelled matrix printed by
# the loading cell; it appears to be stale. Re-run to refresh it.
cosine_similarity_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,69990,69991,69992,69993,69994,69995,69996,69997,69998,69999
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7531908,1.0,0.881755,0.97216,0.872621,0.906384,0.899601,0.896997,0.8456,0.923239,0.872045,...,0.865316,0.706441,0.72496,0.857126,0.737717,0.717978,0.736446,0.859454,0.734602,0.889616
7532054,0.881755,1.0,0.864959,0.844612,0.914275,0.82118,0.918011,0.80327,0.892828,0.791756,...,0.89384,0.757577,0.762146,0.865063,0.767973,0.762411,0.859835,0.840881,0.790929,0.867623
7532065,0.97216,0.864959,1.0,0.848127,0.891672,0.853176,0.884038,0.863573,0.899867,0.817053,...,0.885561,0.673205,0.723935,0.871225,0.744282,0.688193,0.69344,0.856932,0.731041,0.863531
7532165,0.872621,0.844612,0.848127,1.0,0.869187,0.867203,0.817073,0.767099,0.902785,0.850833,...,0.82066,0.747894,0.813021,0.808815,0.799907,0.73956,0.761292,0.831285,0.827107,0.939573
7532210,0.906384,0.914275,0.891672,0.869187,1.0,0.855112,0.907455,0.853752,0.927329,0.824316,...,0.896679,0.748645,0.765581,0.901941,0.772688,0.766023,0.798218,0.855899,0.781072,0.872222


: 

In [35]:
# Function to get recommendations based on content similarity
# def get_recommendations(article_id, cosine_similarity_df=cosine_similarity_df):
#     #idx = articles_df[articles_df['article_id'] == article_id].index[0]
#     sim_scores = list(enumerate(cosine_similarity_df.loc[article_id, :]))
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
#     sim_scores = sim_scores[1:11]  # Top 10 recommendations
#     article_indices = [i[0] for i in sim_scores]
#     return articles_df['article_id'].iloc[article_indices]

def get_recommendations(article_id, cosine_similarity_df=cosine_similarity_df, top_n=10):
    """Return the ids of the articles most similar to ``article_id``.

    Parameters
    ----------
    article_id
        Row label to look up in ``cosine_similarity_df``.
    cosine_similarity_df : pandas.DataFrame
        Similarity matrix whose row and column labels are article ids.
        The default is bound once, when this cell is executed, to the
        notebook-level ``cosine_similarity_df``; re-run this cell after
        reloading the matrix.
    top_n : int, default 10
        Number of article ids to return.

    Returns
    -------
    list
        Up to ``top_n`` column labels, ordered by descending similarity,
        never containing ``article_id`` itself.

    Raises
    ------
    KeyError
        If ``article_id`` is not a row label of ``cosine_similarity_df``.
    """
    similarities = cosine_similarity_df.loc[article_id, :]
    # Remove the query article by label rather than assuming it sorts first:
    # ties at the top (or float error in the diagonal) could otherwise drop a
    # genuine neighbour and leave the query article in the result.
    similarities = similarities.drop(labels=article_id, errors='ignore')
    top_matches = similarities.sort_values(ascending=False).head(top_n)
    return top_matches.index.tolist()

# Example usage: content-based neighbours of the notebook-level `article_id`,
# using the function's default similarity matrix.
print(get_recommendations(article_id))

[8604023, 8598011, 8595471, 9651983, 8155193, 9661568, 9681778, 9655223, 8126780, 8052997]


In [36]:
# Combine content-based and collaborative filtering
def hybrid_recommendations(user_id, article_id, user_matrix_df, article_matrix_df, cosine_similarity_df, top_n=20):
    """Combine content-based and collaborative-filtering recommendations.

    Content-based neighbours of ``article_id`` come first, followed by the
    articles ranked by the dot product between the user's row of
    ``user_matrix_df`` and each row of ``article_matrix_df``.

    Parameters
    ----------
    user_id
        Row label of the user in ``user_matrix_df``.
    article_id
        Seed article passed to ``get_recommendations``.
    user_matrix_df : pandas.DataFrame
        One row of factors per user, indexed by user id.
    article_matrix_df : pandas.DataFrame
        One row of factors per article; its index supplies the article ids
        of the collaborative-filtering candidates.
    cosine_similarity_df : pandas.DataFrame
        Similarity matrix forwarded to ``get_recommendations``.
    top_n : int, default 20
        Maximum number of article ids to return.

    Returns
    -------
    list
        Up to ``top_n`` distinct article ids.

    Raises
    ------
    KeyError
        If ``user_id`` is not in ``user_matrix_df`` (or, via
        ``get_recommendations``, ``article_id`` is not in the similarity matrix).
    """
    # Content-based part.
    content_recs = get_recommendations(article_id, cosine_similarity_df)

    # Collaborative-filtering part: score every article for this user and
    # rank from highest to lowest score.
    user_pref = user_matrix_df.loc[user_id, :]
    cf_interaction_scores = user_pref.dot(article_matrix_df.values.T)
    ranked_positions = cf_interaction_scores.argsort()[::-1]
    # Label the scores with the index of the matrix they were computed from,
    # instead of relying on a separate notebook-level frame being aligned.
    cf_recs = [int(candidate) for candidate in article_matrix_df.index[ranked_positions]]

    # Merge both lists, keeping the first occurrence of each id so that an
    # article recommended by both parts does not take up two slots.
    combined_recs = list(dict.fromkeys(list(content_recs) + cf_recs))
    return combined_recs[:top_n]

# Example usage: pass a user_id together with an article_id that was in that
# user's view, to see whether the article the user clicked gets recommended.
combined_recommendations =  hybrid_recommendations(user_id, article_id, user_matrix_df, article_matrix_df, cosine_similarity_df)

In [37]:
# Display the combined recommendation list.
combined_recommendations

[8604023,
 8598011,
 8595471,
 9651983,
 8155193,
 9661568,
 9681778,
 9655223,
 8126780,
 8052997,
 9765410,
 9725978,
 9766042,
 9737243,
 9762377,
 9764361,
 9748576,
 9759355,
 9741144,
 9761862]

In [38]:
# Check whether article id 9783042 is among the combined recommendations.
9783042 in combined_recommendations

False

In [None]:
# NOTE(review): no visible cell assigns a variable named `article_ids_inview`
# (it only appears as a column name), so this raises NameError on a fresh
# kernel. Remove the cell or replace it with the intended expression.
article_ids_inview

In [None]:
from sklearn.metrics import ndcg_score

# Function to evaluate recommendations
def evaluate_recommendations(validation_df, hybrid_recommendations, K=10):
    """Compute mean precision@K, recall@K and NDCG@K for a recommender.

    Parameters
    ----------
    validation_df : pandas.DataFrame
        Must have a ``user_id`` and an ``article_id`` column; each row is one
        (user, ground-truth article) pair.
    hybrid_recommendations : callable
        Called as ``hybrid_recommendations(user_id, article_id, user_matrix_df,
        article_matrix_df, cosine_similarity_df)`` and expected to return a
        ranked, sliceable sequence of article ids. The three matrices are read
        from the notebook's global namespace.
    K : int, default 10
        Only the first ``K`` recommendations are scored.

    Returns
    -------
    dict
        Keys ``'precision@K'``, ``'recall@K'`` and ``'ndcg@K'`` holding the
        mean over all rows. All three are 0.0 when ``validation_df`` is empty.

    Notes
    -----
    NOTE(review): the ground-truth article is also passed to the recommender
    as its seed article. If the recommender excludes the seed from its
    content-based results and places those results first, hits within the
    first ``K`` positions can only come from what follows them. Confirm
    that this is the intended evaluation protocol.
    """
    precisions, recalls, ndcgs = [], [], []

    # Iterate the two columns directly: iterrows() builds one Series per row
    # and may coerce integer ids to float when the frame mixes numeric dtypes.
    for user_id, true_article_id in zip(validation_df['user_id'], validation_df['article_id']):
        recommendations = hybrid_recommendations(
            user_id, true_article_id, user_matrix_df, article_matrix_df, cosine_similarity_df
        )
        y_pred = list(recommendations[:K])

        # 0-based rank of the first occurrence of the true article, if any.
        hit_rank = y_pred.index(true_article_id) if true_article_id in y_pred else None
        hit = 0 if hit_rank is None else 1

        precisions.append(hit / len(y_pred) if y_pred else 0)
        # There is exactly one relevant article per row, so recall is 0 or 1.
        recalls.append(hit)
        # With a single relevant item the ideal DCG is 1, so NDCG reduces to
        # the discounted gain at the hit position. sklearn's ndcg_score is not
        # used here because it expects relevance/score arrays of equal shape,
        # not lists of article ids.
        ndcgs.append(0.0 if hit_rank is None else 1.0 / float(np.log2(hit_rank + 2)))

    if not precisions:
        # Nothing to evaluate: avoid dividing by zero.
        return {'precision@K': 0.0, 'recall@K': 0.0, 'ndcg@K': 0.0}

    n_rows = len(precisions)
    return {
        'precision@K': sum(precisions) / n_rows,
        'recall@K': sum(recalls) / n_rows,
        'ndcg@K': sum(ndcgs) / n_rows
    }

# Example evaluation
# NOTE(review): `validation_df` is not assigned in any visible cell, so this
# raises NameError on a fresh kernel. The function reads 'user_id' and
# 'article_id' columns; build a frame with those columns before calling it.
metrics = evaluate_recommendations(validation_df, hybrid_recommendations)
print(metrics)
