In [188]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [189]:
# Load the precomputed article embeddings from parquet and report the table size.
embeddings_path = './files/parquet/google_bert_base_multilingual_cased/google_bert_base_multilingual_cased/bert_base_multilingual_cased.parquet'
embeddings_df = pd.read_parquet(embeddings_path)
print('embeddings df shape:          ', embeddings_df.shape)


embeddings df shape:           (125541, 2)


In [190]:
# Preview the raw embeddings table: an article_id column plus one list-valued embedding column.
embeddings_df.head()

Unnamed: 0,article_id,google-bert/bert-base-multilingual-cased
0,3000022,"[-0.35060593, 0.0034366532, -0.1756858, -0.088..."
1,3000063,"[-0.0034481985, 0.22765873, -0.19700234, 0.060..."
2,3000613,"[-0.03811903, -0.030120859, -0.6928362, 0.0801..."
3,3000700,"[-0.29374197, -0.07282147, -0.0926456, -0.0540..."
4,3000840,"[0.0019190352, -0.010633812, -0.49158585, 0.17..."


In [191]:
# Expand a list-valued column into one scalar column per list position.
def unpack_lists(df, column_name):
    """Return a new frame with the lists in ``column_name`` spread across extra columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column_name``; it is not modified.
    column_name : str
        Column whose cells are sequences (lists/arrays) to expand.

    Returns
    -------
    pandas.DataFrame
        The original columns (``column_name`` itself included) followed by
        ``<column_name>_0``, ``<column_name>_1``, ... — one per list position.
    """
    # Building the expanded frame on df.index keeps the new columns row-aligned with df.
    expanded = pd.DataFrame(
        df[column_name].tolist(), index=df.index
    ).add_prefix(column_name + '_')
    return pd.concat([df, expanded], axis=1)

# Embedding columns whose list values should be expanded into scalar columns.
columns_to_process = ['google-bert/bert-base-multilingual-cased']

# Expand each listed column; unpack_lists keeps the source column in the frame.
for col in columns_to_process:
    embeddings_df = unpack_lists(embeddings_df, col)

# Drop exactly the columns that were expanded. Driving the drop from
# columns_to_process (instead of repeating the column name) keeps the two in
# sync, and rebinding the name avoids an inplace mutation of the frame.
embeddings_df = embeddings_df.drop(columns=columns_to_process)

In [193]:
# Index the embeddings by article id so later steps can look rows up / join on it.
embeddings_df = embeddings_df.set_index('article_id')

In [194]:
# Preview the unpacked table: one float column per embedding position, indexed by article_id.
embeddings_df.head()

Unnamed: 0_level_0,google-bert/bert-base-multilingual-cased_0,google-bert/bert-base-multilingual-cased_1,google-bert/bert-base-multilingual-cased_2,google-bert/bert-base-multilingual-cased_3,google-bert/bert-base-multilingual-cased_4,google-bert/bert-base-multilingual-cased_5,google-bert/bert-base-multilingual-cased_6,google-bert/bert-base-multilingual-cased_7,google-bert/bert-base-multilingual-cased_8,google-bert/bert-base-multilingual-cased_9,...,google-bert/bert-base-multilingual-cased_758,google-bert/bert-base-multilingual-cased_759,google-bert/bert-base-multilingual-cased_760,google-bert/bert-base-multilingual-cased_761,google-bert/bert-base-multilingual-cased_762,google-bert/bert-base-multilingual-cased_763,google-bert/bert-base-multilingual-cased_764,google-bert/bert-base-multilingual-cased_765,google-bert/bert-base-multilingual-cased_766,google-bert/bert-base-multilingual-cased_767
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3000022,-0.350606,0.003437,-0.175686,-0.088904,0.036005,0.283181,-0.142266,0.065136,-0.005309,0.029205,...,-0.025447,0.048217,-0.384371,-0.170868,0.179432,0.086636,-0.086718,0.551138,-0.022239,0.001947
3000063,-0.003448,0.227659,-0.197002,0.060735,0.233433,0.057449,-0.187985,0.018404,-0.114541,0.28488,...,0.013489,-0.072608,-0.123327,-0.068688,0.135172,0.05657,-0.04301,0.087091,0.082697,-0.057015
3000613,-0.038119,-0.030121,-0.692836,0.080122,0.22101,0.33765,-0.158383,0.314184,-0.220334,0.284457,...,0.137553,0.013182,-0.082584,-0.179138,0.073336,-0.232434,-0.18157,0.648783,0.01095,0.044345
3000700,-0.293742,-0.072821,-0.092646,-0.054097,0.067578,0.117952,-0.169967,0.205277,-0.037502,0.227512,...,0.065375,-0.05441,-0.194016,-0.281653,0.253033,0.129826,-0.076136,0.247885,-0.014166,-0.042526
3000840,0.001919,-0.010634,-0.491586,0.171188,0.184841,0.22505,-0.023485,0.102213,0.003434,0.028717,...,0.072272,0.287424,-0.230716,-0.023045,0.002246,-0.002826,0.085828,0.453794,0.064482,0.024371


In [213]:
# Choose which precomputed matrices to load; the three values below are
# substituted into the pickle file names.
size = 'demo'
type_ = 'train'
fillna_value = '0'

interaction_matrix_file_path = f'./files/pickle/interaction_matrix_{type_}_{size}_{fillna_value}.pkl'
user_matrix_df_file_path = f'./files/pickle/user_matrix_{type_}_{size}_{fillna_value}.pkl'
article_matrix_df_file_path = f'./files/pickle/article_matrix_{type_}_{size}_{fillna_value}.pkl'

# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# that were produced by a trusted pipeline.
interaction_matrix_df = pd.read_pickle(interaction_matrix_file_path)
print('Interaction matrix df shape:                      ', interaction_matrix_df.shape)

user_matrix_df = pd.read_pickle(user_matrix_df_file_path)
article_matrix_df = pd.read_pickle(article_matrix_df_file_path)
print('User embedding df shape:                         ', user_matrix_df.shape)
print('Article embedding df shape:                      ', article_matrix_df.shape)

Interaction matrix df shape:                       (1590, 1114)
User embedding df shape:                          (1590, 300)
Article embedding df shape:                       (1114, 300)


In [214]:
# Extract the underlying NumPy arrays from the three frames in one step.
# Array positions follow each frame's row/column order.
user_vectors, article_vectors, interaction_matrix = (
    frame.values
    for frame in (user_matrix_df, article_matrix_df, interaction_matrix_df)
)

In [215]:
# Normalize the embeddings (optional, depending on your use case)
# scaler = StandardScaler()
# user_vectors = scaler.fit_transform(user_vectors)
# article_vectors = scaler.transform(article_vectors)

In [216]:
# # Normalize the original interaction matrix
# interaction_matrix_normalized = MinMaxScaler().fit_transform(interaction_matrix)


In [217]:
# Display the interaction matrix: one row per user_id, one column per article id.
# NOTE(review): this renders the whole frame (pandas truncates it); .head() would keep the stored output smaller.
interaction_matrix_df

merge_article_ids,9754160,9768308,9771187,9771351,9772343,9772453,9772869,9773768,9774404,9774430,...,9754265,9716285,9540082,9756788,9656793,9721105,9757226,9775988,9671008,9767989
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13538,11.0,6.0,18.0,15.0,25.0,16.0,28.0,12.0,23.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58608,0.0,0.0,0.0,0.0,76.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
106588,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
617963,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1122370,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1718049,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1178033,0.0,0.0,0.0,0.0,4.0,0.0,0.0,18.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
395912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [218]:
# Pick the (user, article) cells to use as samples: values that are non-zero AND at most 60.
# NOTE(review): 60 looks like an upper cap on the read time — confirm the unit and the intent.
selected_cells = (interaction_matrix != 0) & (interaction_matrix <= 60)
user_idx, article_idx = np.where(selected_cells)

# Targets are taken from the raw interaction matrix (not from a normalized copy).
read_times = interaction_matrix[user_idx, article_idx]

In [219]:
# Build one feature row per selected (user, article) pair: the user's vector
# followed by the article's vector, placed side by side.
user_features = user_vectors[user_idx]
article_features = article_vectors[article_idx]
X = np.hstack((user_features, article_features))

# Target: the interaction value of each selected pair.
y = read_times


In [220]:
# Positional column indices (0 .. n_articles - 1) of the interaction matrix.
# Derived from the frame's width rather than a hard-coded 1114, so the mapping
# stays correct when a different `size` / `type_` matrix is loaded.
map_list = list(range(interaction_matrix_df.shape[1]))

In [221]:
# Column labels of the interaction matrix, in column order, so list position i matches column index i.
column_article_list = list(interaction_matrix_df.columns)

In [222]:
# Lookup table: column position in the interaction matrix -> article id of that column.
mapped_dict = {
    position: article_id
    for position, article_id in zip(map_list, column_article_list)
}

In [223]:
# Translate every sample's column position (article_idx) into its article id.
final_list = []
# Positions with no mapping get unique negative placeholder ids: -1, -2, ...
counter = -1
for x in article_idx:
    # Test membership on the dict itself (O(1)) instead of building and
    # scanning a list of its keys for every sample.
    if x in mapped_dict:
        final_list.append(mapped_dict[x])
    else:
        final_list.append(counter)
        counter -= 1
# Displayed as the cell output: number of samples that were mapped.
len(final_list)

19818

In [224]:
# Wrap the feature matrix in a DataFrame whose index is each sample's article id (from final_list),
# so the article embeddings can be joined on that index.
X_mapped = pd.DataFrame(X,index=final_list)

In [225]:
# Left-join the embedding columns onto every sample, matching the sample's
# article id (X_mapped's index) against the embeddings' index.
sample_article_ids = X_mapped.index
merged_features = pd.merge(
    X_mapped,
    embeddings_df,
    left_on=sample_article_ids,
    right_on=embeddings_df.index,
    how='left',
)
# The join key comes back as a 'key_0' column; restore it as the index.
X_mapped = merged_features.set_index('key_0')

In [235]:
# Preview the merged feature table.
# NOTE(review): the stored output of this cell is a per-column Series of zeros rather than a
# head() preview, and its execution count is out of sequence — re-run the notebook to refresh it.
X_mapped.head()

0                                               0
google-bert/bert-base-multilingual-cased_316    0
google-bert/bert-base-multilingual-cased_315    0
google-bert/bert-base-multilingual-cased_314    0
google-bert/bert-base-multilingual-cased_313    0
                                               ..
452                                             0
451                                             0
450                                             0
458                                             0
google-bert/bert-base-multilingual-cased_767    0
Length: 1368, dtype: int64

In [227]:
# Rebind X to the merged matrix: the user+article features followed by the embedding columns.
# NOTE(review): X is reused for a wider matrix here, so re-running earlier cells out of order changes its meaning.
X = X_mapped.values

In [228]:
# Inspect the target vector (the interaction values selected as read_times).
y

array([11.,  6., 18., ..., 49.,  9.,  7.])

In [229]:
# Persist the feature matrix X; the file name encodes `type_` and `size`.
# The path carries no extension — per the NumPy docs, np.save appends ".npy" in that case.
numpy_file_path = f'./files/numpy/X_{type_}_{size}'

# NOTE(review): presumably ./files/numpy must already exist — confirm before a fresh run.
np.save(numpy_file_path,X)

In [230]:
# Persist the target vector y next to X, using the same `type_` / `size` naming.
# numpy_file_path is rebound here, so it no longer points at the X file after this cell.
numpy_file_path = f'./files/numpy/y_{type_}_{size}'

# The path carries no extension — per the NumPy docs, np.save appends ".npy" in that case.
np.save(numpy_file_path,y)