In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [22]:
# Read interaction matrix pickle file
size = 'demo'
type_ = 'validation'
fillna_value = '0'
interaction_matrix_file_path = f'./files/pickle/interaction_matrix_{type_}_{size}_{fillna_value}.pkl'
interaction_matrix_df = pd.read_pickle(interaction_matrix_file_path)
print('Interaction matrix df shape:                      ',interaction_matrix_df.shape)

user_matrix_df_file_path = f'./files/pickle/user_matrix_{type_}_{size}_{fillna_value}.pkl'
article_matrix_df_file_path = f'./files/pickle/article_matrix_{type_}_{size}_{fillna_value}.pkl'

user_matrix_df = pd.read_pickle(user_matrix_df_file_path)
article_matrix_df = pd.read_pickle(article_matrix_df_file_path)
print('User embedding df shape:                         ',user_matrix_df.shape)
print('Article embedding df shape:                      ',article_matrix_df.shape)

Interaction matrix df shape:                       (1562, 4074)
User embedding df shape:                          (1562, 300)
Article embedding df shape:                       (4074, 300)


In [23]:
# Convert the dataframes to numpy arrays
user_vectors = user_matrix_df.values
article_vectors = article_matrix_df.values
interaction_matrix = interaction_matrix_df.values

In [24]:
# Normalize the embeddings (optional, depending on your use case)
scaler = StandardScaler()
user_vectors = scaler.fit_transform(user_vectors)
article_vectors = scaler.transform(article_vectors)

In [25]:
# Normalize the original interaction matrix
interaction_matrix_normalized = MinMaxScaler().fit_transform(interaction_matrix)


In [26]:
# Get the indices of the non-zero entries in the interaction matrix
user_idx, article_idx = np.where(interaction_matrix_normalized != 0)
read_times = interaction_matrix_normalized[user_idx, article_idx]

In [27]:

# Save to 
# Create the input features by concatenating user and article vectors
X = np.hstack((user_vectors[user_idx], article_vectors[article_idx]))
y = read_times


In [28]:
y

array([0.01532567, 0.00837629, 0.03861625, ..., 0.04746209, 0.00391134,
       0.0862069 ], dtype=float32)

In [29]:
X

array([[ 0.06792422,  0.4406689 , -0.13783272, ..., -0.38576984,
        -0.43765825, -0.02549115],
       [ 0.06792422,  0.4406689 , -0.13783272, ...,  0.03963336,
         0.48317254, -0.03066091],
       [ 0.06792422,  0.4406689 , -0.13783272, ...,  0.10853262,
         0.61233854, -0.03115961],
       ...,
       [ 0.07120946, -0.07871223, -0.0460769 , ..., -0.28479627,
        -0.19969712, -0.02624444],
       [ 0.07120946, -0.07871223, -0.0460769 , ...,  0.06475195,
         0.48334578, -0.024862  ],
       [ 0.07120946, -0.07871223, -0.0460769 , ...,  0.11443526,
         0.5861804 , -0.03105986]], dtype=float32)

In [30]:
# Save file
numpy_file_path = f'./files/numpy/X_{type_}_{size}'

np.save(numpy_file_path,X)

In [31]:
# Save file
numpy_file_path = f'./files/numpy/y_{type_}_{size}'

np.save(numpy_file_path,y)