In [1]:
import datetime
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import TensorBoard
from tqdm.keras import TqdmCallback
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [2]:
# Step 1: Create User DataFrame (user_df)
# One row per user, one column per vector component, values uniform in [-1, 1).
num_users = 1590
user_vector_dim = 300
# A dedicated seeded generator makes the synthetic data identical on every
# Restart & Run All (the seed value itself is arbitrary).
user_rng = np.random.default_rng(0)
user_df = pd.DataFrame(
    user_rng.uniform(low=-1, high=1, size=(num_users, user_vector_dim)),
    index=np.arange(num_users),
    columns=np.arange(user_vector_dim),
)

In [3]:
# Preview the first two rows of the user frame as a quick sanity check.
user_df.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.049363,-0.303704,-0.480393,0.078527,-0.773868,0.450385,-0.519743,-0.228698,-0.662038,0.881668,...,0.49497,0.091419,0.750647,0.053988,0.519939,-0.033637,-0.013388,-0.777252,-0.347181,-0.50738
1,0.036395,0.486561,0.254477,-0.67474,-0.746515,-0.792668,0.359735,0.177707,-0.979102,0.74285,...,-0.691847,0.262373,-0.698496,-0.269646,-0.266737,0.078235,0.54072,0.840582,0.309625,0.147902


In [4]:

# Step 2: Create Article DataFrame (article_df)
# One row per article, one column per vector component, values uniform in [-1, 1).
num_articles = 4247
article_vector_dim = 300
# A dedicated seeded generator makes the synthetic data identical on every
# Restart & Run All (the seed value itself is arbitrary).
article_rng = np.random.default_rng(1)
article_df = pd.DataFrame(
    article_rng.uniform(low=-1, high=1, size=(num_articles, article_vector_dim)),
    index=np.arange(num_articles),
    columns=np.arange(article_vector_dim),
)


In [5]:
# Preview the first two rows of the article frame as a quick sanity check.
article_df.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.073355,0.664991,-0.137629,-0.730518,-0.741497,-0.020195,0.009307,0.65493,-0.492967,0.433702,...,0.254606,-0.238789,-0.831218,0.904384,0.120009,-0.204506,0.381561,-0.826842,0.191217,-0.737217
1,0.985746,-0.684073,-0.731512,-0.967079,0.161351,0.702799,-0.665756,-0.53187,0.230363,-0.277553,...,-0.370847,0.857662,-0.54192,-0.778803,-0.320718,0.082687,0.605517,0.527307,0.3912,-0.030112


In [6]:
# Step 3: Create Interaction Matrix DataFrame (interaction_matrix_df)
# Rows are users, columns are articles; each cell is a random integer in 0..100
# inclusive. The original note says "minutes", so the values presumably stand
# for read time in minutes -- TODO confirm.
# A dedicated seeded generator makes the synthetic data identical on every
# Restart & Run All (the seed value itself is arbitrary).
interaction_rng = np.random.default_rng(2)
interaction_matrix_df = pd.DataFrame(
    interaction_rng.integers(0, 101, size=(num_users, num_articles)),
    index=np.arange(num_users),
    columns=np.arange(num_articles),
)

In [7]:
# Preview the first two rows of the interaction frame as a quick sanity check.
interaction_matrix_df.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4237,4238,4239,4240,4241,4242,4243,4244,4245,4246
0,63,22,19,9,52,7,18,58,28,20,...,21,17,76,44,19,100,2,21,18,4
1,16,36,96,99,50,47,88,78,100,70,...,54,37,30,69,99,0,100,28,45,73


In [8]:

# Step 4: Pull the raw numpy arrays out of the three DataFrames
interaction_matrix = interaction_matrix_df.to_numpy()  # (1590, 4247): users x articles
article_vectors = article_df.to_numpy()  # (4247, 300): one row per article
user_vectors = user_df.to_numpy()  # (1590, 300): one row per user

In [9]:
# Normalize the original interaction matrix to [0, 1] using ONE global scale.
# A per-column scaler such as MinMaxScaler maps each article's smallest value
# to 0, so a genuine (non-zero) interaction that happens to be a column minimum
# would later be mistaken for "no interaction" by the non-zero filter.
# Dividing by the global maximum keeps zeros as zeros and nothing else.
interaction_max = interaction_matrix.max() if interaction_matrix.size else 0
if interaction_max > 0:
    interaction_matrix_normalized = interaction_matrix / interaction_max
else:
    # Empty or all-zero matrix: nothing to scale, and we must not divide by zero.
    interaction_matrix_normalized = interaction_matrix.astype(float)


In [10]:
# Step 5: Get indices of non-zero entries in the interaction matrix
user_idx, article_idx = np.nonzero(interaction_matrix_normalized)

# np.nonzero reports positions in row-major order, so the leading entries all
# belong to the first few users. Shuffle both index arrays with the SAME seeded
# permutation so that any leading slice (e.g. [:20000]) is a random, repeatable
# sample of (user, article) pairs and the two arrays stay aligned.
shuffle_order = np.random.default_rng(42).permutation(user_idx.size)
user_idx = user_idx[shuffle_order]
article_idx = article_idx[shuffle_order]

# Keep at most 20000 pairs as the working set.
# (Original note: "choose 20000 because the result is 60000" -- meaning unclear, TODO confirm.)
read_times = interaction_matrix_normalized[user_idx[:20000], article_idx[:20000]]

In [11]:
# Sanity check: both feature blocks must have the same number of rows
# before they are joined side by side.
print("user_vectors[user_idx] shape:", np.take(user_vectors, user_idx[:20000], axis=0).shape)
print("article_vectors[article_idx] shape:", np.take(article_vectors, article_idx[:20000], axis=0).shape)

user_vectors[user_idx] shape: (20000, 300)
article_vectors[article_idx] shape: (20000, 300)


In [12]:
# Step 6: Assemble the model inputs and targets (the train/test split happens later).
# Each feature row is a user vector followed by the matching article vector.
X = np.concatenate(
    [user_vectors[user_idx[:20000]], article_vectors[article_idx[:20000]]],
    axis=1,
)
y = read_times

In [13]:
# Show the feature-matrix shape and the target shape, one per line.
print(X.shape, y.shape, sep="\n")

(20000, 600)
(20000,)


In [14]:
# Keep only the leading `sample` rows so the training demo below runs quickly.
sample = 100
X, y = X[:sample], y[:sample]

In [15]:
# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of every partition.
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (80, 600)
X_test shape: (20, 600)
y_train shape: (80,)
y_test shape: (20,)


In [16]:
# y_train = y_train.reshape(-1, 1)
# y_test = y_test.reshape(-1, 1)


In [17]:
# print("y_train shape:", y_train.shape)
# print("y_test shape:", y_test.shape)

In [18]:
# Fix TensorFlow's global random seed so that seed-dependent operations
# (such as weight initialisation) are repeatable between runs.
tf.random.set_seed(42)

# Define the model: a small fully connected regressor over the concatenated
# user + article feature row (600 inputs), ending in a single linear output.
model = Sequential([
    tf.keras.Input(shape=(600,)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)
])

# Compile the model. MSE is both the training loss and the reported metric,
# which is why the log shows identical `loss` and `mse` values.
model.compile(optimizer='adam', loss='mse', metrics=['mse'])

# Prepare TensorBoard callback. Each run logs to its own timestamped directory
# so earlier runs are not overwritten.
log_dir = "files/logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

# Train the model.
# NOTE: the held-out test split is also passed as validation data, so the
# val_* numbers are not an independent estimate once they guide any tuning.
# verbose=2 prints one summary line per epoch; TqdmCallback adds progress bars.
num_epochs = 2
history = model.fit(X_train, y_train, epochs=num_epochs, batch_size=16,
                    validation_data=(X_test, y_test),
                    callbacks=[tensorboard_callback, TqdmCallback(verbose=1)],
                    verbose=2)



0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

Epoch 1/2
5/5 - 1s - loss: 0.3503 - mse: 0.3503 - val_loss: 0.1244 - val_mse: 0.1244
Epoch 2/2
5/5 - 0s - loss: 0.1266 - mse: 0.1266 - val_loss: 0.1452 - val_mse: 0.1452
