In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
# Read the pre-computed article embeddings from parquet.
# Alternative embedding source (uncomment it and comment out the word2vec path to switch):
# embeddings_path = './files/parquet/google_bert_base_multilingual_cased/bert_base_multilingual_cased.parquet'
embeddings_path = './files/parquet/Ekstra_Bladet_word2vec/document_vector.parquet'
embeddings_df = pd.read_parquet(embeddings_path)
# Shape as loaded, i.e. before the packed vector column is expanded
print('embeddings df shape:          ', embeddings_df.shape)

# Expand a column whose cells are lists into one column per list position
def unpack_lists(df, column_name):
    """Return ``df`` extended with one column per list element of ``column_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column_name``; it is not modified.
    column_name : str
        Name of the column whose cells are list-like.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every original column (including ``column_name``)
        followed by columns named ``<column_name>_<position>``. The row index
        of ``df`` is preserved.
    """
    # One row per input row, one column per list position, aligned on df's index
    expanded = pd.DataFrame(df[column_name].tolist(), index=df.index)
    # Positions 0, 1, ... become '<column_name>_0', '<column_name>_1', ...
    expanded = expanded.add_prefix(column_name + '_')
    return pd.concat([df, expanded], axis=1)

# The packed vector column is the one at position 1 of the loaded frame
columns_to_process = [embeddings_df.columns[1]]

# Append the per-element columns for every packed column
for col in columns_to_process:
    embeddings_df = unpack_lists(embeddings_df, col)

# The expanded columns were appended at the end, so the packed column is still at
# position 1: drop it, then key the frame by article id.
embeddings_df = (
    embeddings_df
    .drop(columns=embeddings_df.columns[1])
    .set_index('article_id')
)


embeddings df shape:           (125541, 2)


In [3]:
# Preview the expanded embeddings: indexed by article_id, one prefixed column per vector component
embeddings_df.head()

Unnamed: 0_level_0,document_vector_0,document_vector_1,document_vector_2,document_vector_3,document_vector_4,document_vector_5,document_vector_6,document_vector_7,document_vector_8,document_vector_9,...,document_vector_290,document_vector_291,document_vector_292,document_vector_293,document_vector_294,document_vector_295,document_vector_296,document_vector_297,document_vector_298,document_vector_299
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3000022,0.065424,-0.047425,0.063849,-0.001418,0.033669,-0.067176,0.026216,0.132957,0.065193,0.013772,...,0.038521,0.052188,-0.004786,0.040849,0.11221,0.027287,-0.021366,-0.117615,-0.014373,0.035706
3000063,0.028815,-0.000166,0.055057,0.043669,0.02064,-0.115931,0.035702,0.189682,0.049887,-0.008056,...,0.010698,0.040982,0.022549,-0.003128,0.08781,0.01297,0.015932,-0.043625,0.049085,0.027167
3000613,0.037971,0.033923,0.027297,0.017081,0.07016,-0.094198,0.046922,0.135971,0.03744,-0.031914,...,-5e-06,0.033847,0.017205,-0.021398,0.055585,-0.002338,-0.021583,-0.051525,0.036801,0.063961
3000700,0.046524,0.002913,0.062806,-0.005195,0.049315,-0.118057,0.053026,0.14237,0.047986,-0.025729,...,0.008539,0.075872,0.002314,0.023123,0.097166,0.013335,-0.016015,-0.033069,0.030009,0.023423
3000840,0.014737,0.024068,0.005187,0.041011,0.004975,-0.114962,-0.003801,0.215196,0.026104,-0.014294,...,0.051187,0.034234,0.011379,-0.058659,0.122571,0.021073,0.051048,-0.037618,0.046736,0.045991


In [16]:
# Selectors that determine which pickle files are loaded
size = 'demo'
type_ = 'validation'
fillna_value = '0'

# All three files share the "<type>_<size>_<fillna_value>" suffix.
# NOTE: unpickling can execute arbitrary code — only load pickle files from a trusted source.
interaction_matrix_file_path = f'./files/pickle/interaction_matrix_{type_}_{size}_{fillna_value}.pkl'
user_matrix_df_file_path = f'./files/pickle/user_matrix_{type_}_{size}_{fillna_value}.pkl'
article_matrix_df_file_path = f'./files/pickle/article_matrix_{type_}_{size}_{fillna_value}.pkl'

# Interaction matrix first, reporting its shape as soon as it is loaded
interaction_matrix_df = pd.read_pickle(interaction_matrix_file_path)
print('Interaction matrix df shape:                      ',interaction_matrix_df.shape)

# Then the user and article matrices, read in that order
user_matrix_df, article_matrix_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in (user_matrix_df_file_path, article_matrix_df_file_path)
)
print('User embedding df shape:                         ',user_matrix_df.shape)
print('Article embedding df shape:                      ',article_matrix_df.shape)

Interaction matrix df shape:                       (1539, 4545)
User embedding df shape:                          (1539, 300)
Article embedding df shape:                       (4545, 300)


In [17]:
# Preview the first three rows of the interaction matrix
interaction_matrix_df.head(3)

merged_article_ids,9749628,9762114,9763559,9764444,9764608,9764759,9765753,9765839,9765894,9765965,...,9781017,9791165,9623549,9673979,9494338,9523645,9791157,4721905,9436758,9526715
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2515364,102.0,13.0,52.0,23.0,27.0,87.0,150.0,6.0,53.0,293.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
299355,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
551337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Preview the first rows of the user matrix
user_matrix_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2515364,-0.049423,0.073484,-0.127462,-0.54457,0.004586,-0.028122,-0.050352,0.224211,-0.162141,0.017274,...,0.07162,-0.054645,-0.059238,0.103721,0.096251,0.147019,0.087606,-0.052229,-0.077832,-0.163039
299355,-1.018642,0.405334,-0.071454,-0.722963,-0.71935,1.140239,-2.25217,-0.569453,0.770422,-0.494271,...,0.109785,-0.759528,-0.200684,-0.120576,0.058308,0.23706,-0.094378,-0.227971,-0.347991,-0.851085
551337,0.037726,-0.122894,0.119298,-0.006571,-0.200531,-0.104649,0.249042,-0.259959,-0.145718,0.188788,...,-0.020023,-0.010143,-0.063857,-0.093313,-0.122832,0.179366,0.036103,-0.055746,-0.030446,-0.187345
1520051,0.225453,-0.185292,-0.220285,-0.237416,0.025387,0.001439,-0.109223,-0.387824,-0.032028,-0.176484,...,0.031164,-0.040664,-0.027413,0.123045,0.077083,0.100145,0.068078,-0.048027,-0.071068,-0.154989
251987,-0.572783,1.432055,2.221251,0.391055,-2.105272,-0.69367,-1.692668,-0.856396,0.793653,0.093899,...,0.37718,0.956866,-0.553277,0.041355,1.190839,0.533598,1.449188,-0.919891,-1.311291,-2.276968


In [19]:
# Preview the first rows of the article matrix
article_matrix_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
9749628,-3.307374,0.445233,-2.70933,-4.749975,-1.219494,3.722344,2.099001,3.437852,-1.987949,3.490878,...,-0.238463,-1.349837,-0.737692,0.424557,1.101546,0.394015,1.550014,-0.492118,-1.075111,-2.997566
9762114,-1.257639,-6.678197,-6.740248,-3.320006,0.725085,2.557267,-0.298304,2.272891,-1.395085,1.607784,...,1.240616,2.592725,-0.765813,-1.654907,3.743151,1.133336,0.857505,-1.21523,-1.500867,-3.182475
9763559,1.816733,0.484091,0.750125,-0.585623,0.371999,0.4844,-1.035033,0.780717,0.741289,0.863476,...,1.140433,-0.969744,-0.515896,1.920793,2.446307,1.961733,1.354937,-0.97961,-1.477878,-2.885757
9764444,-0.457827,-0.423705,-0.781436,0.419228,-1.724034,0.152102,0.263579,0.478027,0.328611,-0.212661,...,0.846382,-0.773448,-0.176449,0.740858,0.405413,1.185534,0.378821,-0.39484,-0.569151,-1.615188
9764608,-0.059312,0.219149,-0.081878,-0.773509,-0.422638,0.597317,0.265122,0.797211,-0.323674,-0.332679,...,-0.159418,-0.272804,-0.341716,0.386503,0.400671,0.144783,0.418068,1.020015,-0.870527,-0.986105


In [20]:
# Pull the underlying numpy arrays out of the three dataframes (row/column labels are dropped)
user_vectors, article_vectors, interaction_matrix = (
    frame.values
    for frame in (user_matrix_df, article_matrix_df, interaction_matrix_df)
)

In [18]:
# Optional normalisation of the embeddings — currently disabled, so nothing in this cell runs.
# NOTE(review): if re-enabled, the scaler fitted on user_vectors is also applied to
# article_vectors — confirm that sharing one scaler across both is intended.
# scaler = StandardScaler()
# user_vectors = scaler.fit_transform(user_vectors)
# article_vectors = scaler.transform(article_vectors)

# # Normalize the original interaction matrix (also disabled)
# interaction_matrix_normalized = MinMaxScaler().fit_transform(interaction_matrix)


In [21]:
# Largest interaction value kept as a training target; larger entries are discarded.
# NOTE(review): presumably a read time in seconds — confirm the unit.
MAX_READ_TIME = 120

# The vectors are looked up by matrix position below, so the interaction matrix must
# have one row per user vector and one column per article vector.
assert interaction_matrix.shape == (user_vectors.shape[0], article_vectors.shape[0]), (
    'interaction matrix does not line up with the user/article vector matrices'
)

# Row/column positions of the entries that are non-zero and not above the cap
keep_mask = (interaction_matrix != 0) & (interaction_matrix <= MAX_READ_TIME)
user_idx, article_idx = np.nonzero(keep_mask)
read_times = interaction_matrix[user_idx, article_idx]  # use a normalized interaction matrix here if one is computed

# One feature row per kept (user, article) pair: user vector followed by article vector
X = np.hstack((user_vectors[user_idx], article_vectors[article_idx]))
y = read_times

In [22]:
# Translate every selected column position into the article id labelling that column.
column_article_list = list(interaction_matrix_df.columns)
mapped_dict = dict(enumerate(column_article_list))  # position -> article id

# article_idx holds column positions of the interaction matrix, so each one can be
# looked up directly on the column index; no per-row membership scan is needed.
final_list = interaction_matrix_df.columns[article_idx].tolist()
len(final_list)

138462

In [23]:
# Label every feature row with its article id so the document embedding can be attached.
X_mapped = pd.DataFrame(X, index=final_list)

# A left join leaves NaN embedding columns for article ids that embeddings_df does not
# contain; report them instead of letting NaNs flow silently into the saved features.
n_without_embedding = int((~X_mapped.index.isin(embeddings_df.index)).sum())
if n_without_embedding:
    print(f'Warning: {n_without_embedding} rows have no matching article embedding (NaN features)')

# how='left' keeps the left rows in their original order. validate='many_to_one' raises
# if an article id occurs more than once in embeddings_df, which would otherwise
# duplicate rows and break the row-for-row correspondence with y.
X_mapped = pd.merge(
    X_mapped,
    embeddings_df,
    left_on=X_mapped.index,
    right_on=embeddings_df.index,
    how='left',
    validate='many_to_one',
).set_index('key_0')

In [24]:
# Preview the merged feature table (index: article id of each row)
X_mapped.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,document_vector_290,document_vector_291,document_vector_292,document_vector_293,document_vector_294,document_vector_295,document_vector_296,document_vector_297,document_vector_298,document_vector_299
key_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9749628,-0.049423,0.073484,-0.127462,-0.54457,0.004586,-0.028122,-0.050352,0.224211,-0.162141,0.017274,...,-0.01943,0.089555,0.041788,0.03452,0.058853,0.016004,0.001067,-0.095511,0.040745,0.008054
9762114,-0.049423,0.073484,-0.127462,-0.54457,0.004586,-0.028122,-0.050352,0.224211,-0.162141,0.017274,...,0.01444,0.086885,0.021871,0.009744,0.091162,0.044088,0.009685,-0.111595,0.049436,0.020631
9763559,-0.049423,0.073484,-0.127462,-0.54457,0.004586,-0.028122,-0.050352,0.224211,-0.162141,0.017274,...,0.006224,0.061204,0.010064,0.014875,0.125011,0.048549,0.015806,-0.116512,0.05714,-0.018233
9764444,-0.049423,0.073484,-0.127462,-0.54457,0.004586,-0.028122,-0.050352,0.224211,-0.162141,0.017274,...,-0.000341,0.077538,0.043465,-0.006887,0.10511,0.042128,-0.030038,-0.052471,0.041763,0.02807
9764608,-0.049423,0.073484,-0.127462,-0.54457,0.004586,-0.028122,-0.050352,0.224211,-0.162141,0.017274,...,0.024433,0.076143,0.019315,0.014449,0.114371,-0.017107,0.009104,-0.099205,0.034433,0.031622


In [25]:
# Final feature matrix: the merged table as a plain numpy array.
# NOTE: this rebinds X, which the cell that builds X_mapped reads as its input.
# Re-running that cell without first rebuilding X from the user/article vectors
# would merge the embedding columns in a second time.
X = X_mapped.values

In [26]:
# Display the target vector (the interaction-matrix values selected as read_times)
y

array([102.,  13.,  52., ...,  51., 110.,  44.])

In [27]:
# Persist the feature matrix and the targets. The paths carry no extension, so
# np.save appends '.npy' to each of them.
# NOTE(review): unlike the input pickles, these names do not include fillna_value —
# confirm that runs with different fill values are meant to share one output file.
X_file_path = f'./files/numpy/X_{type_}_{size}'
y_file_path = f'./files/numpy/y_{type_}_{size}'

# X and y are produced in different cells; refuse to write a feature/target pair
# whose rows no longer correspond one-to-one.
assert len(X) == len(y), f'X has {len(X)} rows but y has {len(y)} targets'

np.save(X_file_path, X)
np.save(y_file_path, y)