In [22]:
import pandas as pd

# Load the data
user_path = pd.read_csv('user_paths.csv')

# Expand the study_path and timestamp columns
expanded_data = []
for _, row in user_path.iterrows():
    study_path = eval(row['page_name'])
    timestamp = eval(row['time_spent'])
    for i in range(len(study_path) - 1):
        current_page = study_path[i]
        next_page = study_path[i + 1]
        time_spent = timestamp[i]
        expanded_data.append([current_page, next_page, time_spent])

# Create a DataFrame with the expanded data
expanded_df = pd.DataFrame(expanded_data, columns=['current_page', 'next_page', 'time_spent'])

# Normalize the time_spent to use as ratings
max_time_spent = expanded_df['time_spent'].max()
expanded_df['rating'] = expanded_df['time_spent'] / max_time_spent

# Prepare the dataset for training
interactions = expanded_df[['current_page', 'next_page', 'rating']].copy()

# Encode current_page and next_page as indices
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# Fit the label encoder on all unique pages
all_pages = pd.concat([interactions['current_page'], interactions['next_page']]).unique()
label_encoder.fit(all_pages)

# # Transform the pages to encoded values using .loc
# interactions.loc[:, 'current_page'] = label_encoder.transform(interactions['current_page'])
# interactions.loc[:, 'next_page'] = label_encoder.transform(interactions['next_page'])


In [23]:
from sklearn.model_selection import train_test_split

# Encode current_page and next_page
interactions['current_page_encoded'] = label_encoder.transform(interactions['current_page'])
interactions['next_page_encoded'] = label_encoder.transform(interactions['next_page'])

# Split data into train and test sets
train_data, test_data = train_test_split(interactions[['current_page_encoded', 'next_page_encoded', 'rating']],
                                         test_size=0.2,
                                         random_state=42)


In [24]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout

# Define dimensions
num_pages = len(all_pages)
embedding_dim = 50  # adjust as needed

# Input layers
input_current_page = Input(shape=(1,))
input_next_page = Input(shape=(1,))

# Embedding layers
embedding_layer = Embedding(num_pages, embedding_dim)

# Flatten embeddings
flattened_current_page = Flatten()(embedding_layer(input_current_page))
flattened_next_page = Flatten()(embedding_layer(input_next_page))

# Concatenate embeddings
concatenated = Concatenate()([flattened_current_page, flattened_next_page])

# Dense layers
dense_1 = Dense(128, activation='relu')(concatenated)
dropout = Dropout(0.5)(dense_1)
dense_2 = Dense(64, activation='relu')(dropout)
output_layer = Dense(1, activation='sigmoid')(dense_2)

# Model instantiation
ncf_model = Model(inputs=[input_current_page, input_next_page], outputs=output_layer)

# Compile the model
ncf_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [25]:
# Training the model
ncf_model.fit([train_data['current_page_encoded'], train_data['next_page_encoded']],
              train_data['rating'],
              batch_size=64,
              epochs=10,
              validation_data=([test_data['current_page_encoded'], test_data['next_page_encoded']],
                               test_data['rating']))


Epoch 1/10
[1m1003/1003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.0000e+00 - loss: 0.0856 - val_accuracy: 0.0000e+00 - val_loss: 0.0121
Epoch 2/10
[1m1003/1003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.0000e+00 - loss: 0.0099 - val_accuracy: 0.0000e+00 - val_loss: 0.0130
Epoch 3/10
[1m1003/1003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.0000e+00 - loss: 0.0090 - val_accuracy: 0.0000e+00 - val_loss: 0.0133
Epoch 4/10
[1m1003/1003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.0000e+00 - loss: 0.0093 - val_accuracy: 0.0000e+00 - val_loss: 0.0144
Epoch 5/10
[1m1003/1003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.0000e+00 - loss: 0.0094 - val_accuracy: 0.0000e+00 - val_loss: 0.0140
Epoch 6/10
[1m1003/1003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.0000e+00 - loss: 0.0084 - val_accuracy

<keras.src.callbacks.history.History at 0x1be00183fa0>

In [26]:
import numpy as np
# Example prediction
prompt_page = "Cardiac_muscle"
prompt_page_encoded = label_encoder.transform([prompt_page])[0]

# Generate predictions
predicted_pages = ncf_model.predict([prompt_page_encoded * np.ones(num_pages), np.arange(num_pages)])

# Sort predictions and get top paths
top_predicted_indices = np.argsort(predicted_pages.flatten())[::-1][:3]
predicted_paths = label_encoder.inverse_transform(top_predicted_indices)

print(f"Predicted paths for {prompt_page}:, {predicted_paths}")


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
Predicted paths for Cardiac_muscle:, ['Cecum' 'Loose_connective_tissue' 'Hypothalamus']


In [27]:
#optional testing for more diverse result

import numpy as np

# Example function to generate diverse paths using model predictions
def generate_diverse_paths_from_model(model, prompt_page, label_encoder, num_paths=5):
    prompt_page_encoded = label_encoder.transform([prompt_page])[0]

    # Get model predictions for the prompt page
    predicted_pages = model.predict([prompt_page_encoded * np.ones(len(label_encoder.classes_)), np.arange(len(label_encoder.classes_))])

    # Normalize predictions to probabilities
    predicted_probs = predicted_pages.flatten() / np.sum(predicted_pages.flatten())

    # Sample diverse paths based on model predictions
    diverse_paths = []

    for _ in range(num_paths):
        # Diversifying sampling using normalized probabilities
        sampled_indices = np.random.choice(len(label_encoder.classes_), size=4, replace=False, p=predicted_probs)
        sampled_pages = label_encoder.inverse_transform(sampled_indices)
        diverse_paths.append(sampled_pages)

    return diverse_paths

# Example usage
prompt_page = "Aorta"
diverse_predicted_paths = generate_diverse_paths_from_model(ncf_model, prompt_page, label_encoder, num_paths=5)

print("Diverse predicted paths:")
for i, path in enumerate(diverse_predicted_paths):
    print(f"Path {i+1}: {path}")


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Diverse predicted paths:
Path 1: ['Integumentary_system' 'Digestion' 'Spinal_cord' 'Thymus']
Path 2: ['Respiratory_System' 'Melanocyte' 'Lymphocyte'
 'Urinary_anti-infective_agent']
Path 3: ['Tonsil' 'Root_of_the_lung' 'Reproductive_success' 'Keratinocyte']
Path 4: ['Basement_membrane' 'Appendicular_skeleton' 'Programmed_cell_death'
 'The_Skeleton_Key']
Path 5: ['White_blood_cell' 'Lung' 'Skeleton_in_the_closet' 'Reproductive_success']
