In [56]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm

from scipy.sparse.linalg import svds

#set the max columns to none
pd.set_option('display.max_columns', None)

# Read interaction matrix pickle file
type_ = 'validation'
fillna_value = "0"
size = 'demo'
interaction_matrix_file_path = f'./files/pickle/interaction_matrix_{type_}_{size}_{fillna_value}.pkl'
interaction_matrix = pd.read_pickle(interaction_matrix_file_path)
print(interaction_matrix.shape)
interaction_matrix.head(2)

# Perform Singular Value Decomposition (SVD) on the user-article interaction matrix
k = 50  # Number of latent factors (embedding dimensions)

U, sigma, Vt = svds(interaction_matrix.values, k=k)  # Perform SVD, obtaining U, sigma, Vt

# Diagonalize the singular values sigma to create a diagonal matrix
sigma_diag = np.diag(sigma)

# Reconstruct the user and article embeddings
user_matrix = np.dot(U, np.sqrt(sigma_diag))  # User embeddings
article_matrix = np.dot(np.sqrt(sigma_diag), Vt).T  # Article embeddings

column_lst = interaction_matrix.columns.tolist()

user_matrix_df = pd.DataFrame(user_matrix, index=interaction_matrix.index)
article_matrix_df = pd.DataFrame(article_matrix, index=column_lst)

(1539, 4545)


In [57]:
print(user_matrix_df.shape)
user_matrix_df.head(2)


(1539, 50)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1
2515364,0.078083,0.303139,0.360198,-0.159283,-0.102681,-0.177813,0.129037,0.118205,-0.134128,-0.276343,-0.048539,-0.013063,0.099632,-0.088525,0.005882,-0.111702,0.172786,0.042509,0.091557,-0.190495,0.080523,0.203802,0.150438,-0.073724,-0.126559,-0.145951,0.016084,0.057727,-0.237271,0.201764,-0.026099,-0.11049,0.138487,-0.040126,-0.07074,0.022739,-0.093328,-0.081742,0.035636,0.057784,0.07162,0.054645,-0.059238,-0.103721,0.096251,-0.147019,0.087606,-0.052229,-0.077832,0.163039
299355,0.071514,0.182252,0.589615,-0.255389,-0.008872,-0.222047,-0.107488,-0.589282,-0.673808,-0.164299,-0.143603,-0.435192,0.024784,0.243902,0.287155,-0.214721,-0.830101,0.33123,-0.503252,0.515276,0.400358,-0.378188,-0.813409,-0.662007,0.3393,-1.085425,-0.244913,0.441935,-0.977319,1.109388,0.388704,0.485875,0.000201,-0.507576,-0.991243,0.091487,-0.195436,-0.275743,-0.20878,0.295253,0.109785,0.759528,-0.200684,0.120576,0.058308,-0.23706,-0.094378,-0.227971,-0.347991,0.851085


In [58]:
print(article_matrix_df.shape)
article_matrix_df.head(2)

(4545, 50)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49
9749628,-0.20274,1.498147,2.197785,-2.085243,1.060158,-0.43437,1.077135,2.769281,-0.657267,0.046774,1.675107,0.235987,-2.348584,-1.242258,2.318159,-0.813896,-1.123786,0.588756,0.264064,0.739388,-1.117067,-0.705552,1.189804,-0.913792,-0.727414,-0.652618,0.120658,-0.402072,-1.48822,2.336479,-1.884634,3.834847,2.895321,-1.307226,-4.938019,-0.064685,-1.929311,-0.786801,-0.243466,-0.006377,-0.238463,1.349837,-0.737692,-0.424557,1.101546,-0.394015,1.550014,-0.492118,-1.075111,2.997566
9762114,1.265062,1.941634,1.173853,4.749791,-3.419044,-3.507529,-0.917483,-0.744231,3.138255,5.814579,-1.684977,-1.607617,-2.916705,-1.548657,2.155501,1.86132,-2.48272,-0.955855,4.446992,2.065097,-3.413934,-0.272205,3.24369,3.31576,-1.053606,2.72596,0.011151,-0.430234,-0.439677,0.534376,-2.076063,-0.837534,3.618293,0.510351,-0.004436,-0.895177,-2.34131,0.358717,1.545212,-1.103431,1.240616,-2.592725,-0.765813,1.654907,3.743151,-1.133336,0.857505,-1.21523,-1.500867,3.182475


In [54]:
interaction_matrix.shape

(1590, 5073)

In [59]:
# Save the cosine similarity matrix to pickle file cause is very big

user_matrix_df_file_path = f'./files/pickle/user_matrix_{type_}_{size}_{fillna_value}.pkl'
article_matrix_df_file_path = f'./files/pickle/article_matrix_{type_}_{size}_{fillna_value}.pkl'

user_matrix_df.to_pickle(user_matrix_df_file_path)
article_matrix_df.to_pickle(article_matrix_df_file_path)