In [19]:
import ast

import pandas as pd

# Keyword table searched by match_top_pages (which reads its 'Title' and 'Score1'..'Score5' columns)
keywords = pd.read_csv(filepath_or_buffer='keyword.csv')

# Example function for keyword matching with multiple results
def match_top_pages(user_prompt, keywords_df, top_n=5,
                    score_columns=('Score1', 'Score2', 'Score3', 'Score4', 'Score5'),
                    title_column='Title'):
    """Rank pages by how well their keyword rows match the words of a prompt.

    The prompt is lowercased and split on whitespace. A row of ``keywords_df``
    matches a word when any of its cells equals that word exactly (the table
    itself is not lowercased). Every match adds the row's summed score columns
    to the running total kept for the row's title, so a word repeated in the
    prompt counts once per occurrence.

    Args:
        user_prompt: Free-text query string.
        keywords_df: DataFrame holding ``title_column``, the ``score_columns``
            and the keyword cells to match against.
        top_n: Maximum number of pages to return; values <= 0 return ``[]``.
        score_columns: Names of the columns summed to score a matching row.
        title_column: Name of the column holding the page name.

    Returns:
        List of ``(page_name, relevance_score)`` tuples, highest score first.
        Pages with equal scores keep the order in which they were first
        matched. The list is empty when nothing matches.

    Raises:
        KeyError: if a row matches but ``title_column`` or one of the
            ``score_columns`` is missing from ``keywords_df``.
    """
    # Named `tokens` so the module-level `keywords` DataFrame is not shadowed.
    tokens = user_prompt.lower().split()
    score_columns = list(score_columns)

    relevance_scores = {}
    for token in tokens:
        # Whole-frame comparison instead of a Python-level apply over every row;
        # comparing a string with a numeric cell simply yields False.
        row_matches = keywords_df.eq(token).any(axis=1)
        if not row_matches.any():
            continue

        matched = keywords_df.loc[row_matches]
        row_totals = matched[score_columns].sum(axis=1)
        for page_name, row_total in zip(matched[title_column], row_totals):
            relevance_scores[page_name] = relevance_scores.get(page_name, 0) + row_total

    # sorted() is stable, so ties stay in first-matched order.
    ranked = sorted(relevance_scores.items(), key=lambda item: item[1], reverse=True)

    # Clamp so a negative top_n cannot turn into "everything but the last n".
    return ranked[:max(top_n, 0)]

# Example usage (left commented out):
# user_prompt = "I want to learn about blood vessels"
# top_n = 3  # Number of top pages to retrieve
# top_pages = match_top_pages(user_prompt, keywords, top_n=top_n)

# print(f"User prompt: '{user_prompt}'")
# if top_pages:
#     print(f"Top {top_n} associated page names:")
#     for i, (page_name, relevance_score) in enumerate(top_pages, 1):
#         print(f"{i}. {page_name} (Relevance Score: {relevance_score})")
# else:
#     print("No relevant pages found.")


In [20]:
import pandas as pd

# Recorded user sessions; the 'page_name' and 'time_spent' columns are consumed below
user_path = pd.read_csv(filepath_or_buffer='user_paths.csv')

# Expand every stored path into one row per page-to-page transition.
# Assumes each 'page_name' / 'time_spent' cell holds the text of a Python list
# literal (e.g. "['A', 'B']" / "[12, 30]") -- TODO confirm against the CSV.
expanded_data = []
for raw_path, raw_times in zip(user_path['page_name'], user_path['time_spent']):
    # ast.literal_eval parses literals only, so a crafted CSV cell cannot run
    # arbitrary code the way eval() would.
    study_path = ast.literal_eval(raw_path)
    timestamp = ast.literal_eval(raw_times)

    # Pair each page with its successor; the time recorded at position `step`
    # is attributed to the page being left. Indexing (rather than zipping the
    # times in) keeps the IndexError when a time list is shorter than its path.
    for step, (current_page, next_page) in enumerate(zip(study_path, study_path[1:])):
        expanded_data.append([current_page, next_page, timestamp[step]])

# Create a DataFrame with the expanded data
expanded_df = pd.DataFrame(expanded_data, columns=['current_page', 'next_page', 'time_spent'])

from sklearn.preprocessing import LabelEncoder

# Scale time_spent by its maximum so the result can serve as the rating
max_time_spent = expanded_df['time_spent'].max()
expanded_df['rating'] = expanded_df['time_spent'].div(max_time_spent)

# Training table: one row per transition, keeping only the columns the model needs
interaction_columns = ['current_page', 'next_page', 'rating']
interactions = expanded_df.loc[:, interaction_columns].copy()

# Every page that occurs at either end of a transition
all_pages = pd.concat([interactions[column] for column in ('current_page', 'next_page')]).unique()

# Fit the encoder that maps page names to integer indices
label_encoder = LabelEncoder()
label_encoder.fit(all_pages)

# # Transform the pages to encoded values using .loc
# interactions.loc[:, 'current_page'] = label_encoder.transform(interactions['current_page'])
# interactions.loc[:, 'next_page'] = label_encoder.transform(interactions['next_page'])


In [21]:
from sklearn.model_selection import train_test_split

# Add the integer id of each page next to its name
encoded_columns = {
    'current_page_encoded': 'current_page',
    'next_page_encoded': 'next_page',
}
for encoded_name, source_name in encoded_columns.items():
    interactions[encoded_name] = label_encoder.transform(interactions[source_name])

# Hold out 20% of the transitions as a test set (fixed random_state for repeatability)
model_columns = ['current_page_encoded', 'next_page_encoded', 'rating']
train_data, test_data = train_test_split(
    interactions[model_columns],
    test_size=0.2,
    random_state=42,
)


In [22]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout

# Seed TensorFlow's global RNG so weight initialisation is repeatable,
# in line with the fixed random_state used for the train/test split.
tf.random.set_seed(42)

# Define dimensions
num_pages = len(all_pages)  # one embedding row per distinct page
embedding_dim = 50  # adjust as needed

# Input layers: one encoded page index per sample for each side of a transition
input_current_page = Input(shape=(1,))
input_next_page = Input(shape=(1,))

# A single embedding table, shared by the current-page and next-page inputs
embedding_layer = Embedding(num_pages, embedding_dim)

# Flatten embeddings from (1, embedding_dim) to (embedding_dim,) per sample
flattened_current_page = Flatten()(embedding_layer(input_current_page))
flattened_next_page = Flatten()(embedding_layer(input_next_page))

# Concatenate embeddings
concatenated = Concatenate()([flattened_current_page, flattened_next_page])

# Dense layers; the sigmoid keeps the predicted rating between 0 and 1
dense_1 = Dense(128, activation='relu')(concatenated)
dropout = Dropout(0.5)(dense_1)
dense_2 = Dense(64, activation='relu')(dropout)
output_layer = Dense(1, activation='sigmoid')(dense_2)

# Model instantiation
ncf_model = Model(inputs=[input_current_page, input_next_page], outputs=output_layer)

# Compile the model.
# The target is time_spent / max(time_spent), a continuous fraction rather than
# a 0/1 class label. 'accuracy' checks thresholded predictions for equality
# with the label, which says nothing useful for such targets, so mean absolute
# error is reported instead. The loss is left unchanged.
ncf_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['mae'])


In [23]:
# Train on the training split and evaluate on the held-out split after each epoch
train_inputs = [train_data['current_page_encoded'], train_data['next_page_encoded']]
validation_inputs = [test_data['current_page_encoded'], test_data['next_page_encoded']]

ncf_model.fit(
    x=train_inputs,
    y=train_data['rating'],
    batch_size=64,
    epochs=10,
    validation_data=(validation_inputs, test_data['rating']),
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1ef591121c0>

In [24]:
# Example prediction
import numpy as np 
prompt_page = "Aorta"
(prompt_page_encoded,) = label_encoder.transform([prompt_page])

# Score every known page as a possible successor of the prompt page
candidate_pages = np.arange(num_pages)
repeated_prompt = np.full(num_pages, prompt_page_encoded, dtype=float)
predicted_pages = ncf_model.predict([repeated_prompt, candidate_pages])

# One confidence score per candidate page
confidence_scores = predicted_pages.ravel()

# Three highest-scoring candidates, best first
top_predicted_indices = np.argsort(confidence_scores)[-3:][::-1]
predicted_paths = label_encoder.inverse_transform(top_predicted_indices)

# Report the winning indices, their scores and the decoded page names
print("Top predicted indices:", top_predicted_indices)
print("Confidence scores:", confidence_scores[top_predicted_indices])

print(f"Predicted paths for {prompt_page}: {predicted_paths}")


Top predicted indices: [ 70 151  79]
Confidence scores: [0.5979775  0.5977445  0.59742916]
Predicted paths for Aorta: ['Loose_connective_tissue' 'Stratum_spinosum' 'Melanocyte']


In [25]:

# The input
user_prompt = "I want to learn about nose"
top_n = 3  # Number of top pages to retrieve

# Step 1: keyword lookup -- which pages does the prompt refer to?
top_pages = match_top_pages(user_prompt, keywords, top_n=top_n)


print(f"User prompt: '{user_prompt}'")
if top_pages:
    print(f"Top {top_n} associated page names:")
    for i, (page_name, relevance_score) in enumerate(top_pages, 1):
        print(f"{i}. {page_name} (Relevance Score: {round(relevance_score,3)})")
else:
    print("No relevant pages found.")

# Step 2: for each matched page, ask the model which pages are likely to follow.
# Titles come from the keyword table, while the encoder was fitted on the pages
# seen in the navigation data. label_encoder.transform raises ValueError for a
# title it never saw, so such pages are skipped rather than aborting the cell.
known_pages = set(label_encoder.classes_)
candidate_pages = np.arange(num_pages)  # every known page as a possible next page

for page_name, _ in top_pages:
    if page_name not in known_pages:
        print(f"No navigation data for {page_name}; skipping path prediction.\n")
        continue

    prompt_page_encoded = label_encoder.transform([page_name])[0]

    # Generate predictions: pair this page with every candidate next page
    predicted_pages = ncf_model.predict([prompt_page_encoded * np.ones(num_pages), candidate_pages])

    # Extract confidence scores (one model output per candidate page)
    confidence_scores = predicted_pages.flatten()

    # Sort predictions and keep the three highest-scoring candidates, best first
    top_predicted_indices = np.argsort(confidence_scores)[::-1][:3]
    predicted_paths = label_encoder.inverse_transform(top_predicted_indices)

    # Print top predicted indices and confidence scores
    print(f"Top predicted indices for {page_name}: {top_predicted_indices}")
    print(f"Confidence scores for {page_name}: {confidence_scores[top_predicted_indices]}")
    print(f"Predicted paths for {page_name}: {predicted_paths}\n")


User prompt: 'I want to learn about nose'
Top 3 associated page names:
1. Nasal_cavity (Relevance Score: 2.679)
2. Nostril (Relevance Score: 2.653)
3. Nose (Relevance Score: 2.218)
Top predicted indices for Nasal_cavity: [154  78 133]
Confidence scores for Nasal_cavity: [0.7194531  0.69009405 0.6655768 ]
Predicted paths for Nasal_cavity: ['Superior_vena_cava' 'Medulla_oblongata' 'Skeletal_animation']

Top predicted indices for Nostril: [161 157 143]
Confidence scores for Nostril: [0.6937589  0.67029685 0.65269613]
Predicted paths for Nostril: ['Thoracic_diaphragm' 'Sympathetic_nervous_system' 'Spinal_nerve']

Top predicted indices for Nose: [ 78 104 154]
Confidence scores for Nose: [0.7389342 0.6825729 0.6771903]
Predicted paths for Nose: ['Medulla_oblongata' 'Pharynx' 'Superior_vena_cava']

