In [9]:
#change using en_core_web_md
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load your keyword data
keywords_df = pd.read_csv('keyword.csv')

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Load the spaCy model for word vectors
nlp = spacy.load("en_core_web_md")

# Fit and transform the "text" column to TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(keywords_df['Text'].fillna(''))

# Function to calculate semantic similarity using spaCy
def calculate_similarity(prompt_vector, keyword):
    keyword_vector = nlp(keyword.lower())
    # Check if both vectors are valid (non-zero norm)
    if prompt_vector.vector_norm and keyword_vector.vector_norm:
        return prompt_vector.similarity(keyword_vector)
    else:
        # Return a default similarity score if vectors are empty
        return 0.0

# Function to match top pages
def match_top_pages(user_prompt, keywords_df, tfidf_matrix, top_n=5):
    # Process the user prompt using spaCy
    processed_prompt = nlp(user_prompt.lower())

    # Initialize a dictionary to store relevance scores
    relevance_scores = {}

    # Iterate over rows in the DataFrame
    for index, row in keywords_df.iterrows():
        page_name = row['Title']
        total_relevance_score = 0
        
        # Check similarity with each keyword
        for i in range(1, 6):
            keyword = row[f'Keyword{i}']
            score = row[f'Score{i}']
            similarity = calculate_similarity(processed_prompt, keyword)
            total_relevance_score += similarity * score
        
        # Store the relevance score
        if page_name in relevance_scores:
            relevance_scores[page_name] += total_relevance_score
        else:
            relevance_scores[page_name] = total_relevance_score

    # Check if no significant matches found
    if not any(relevance_scores.values()):
        # Transform the user prompt to the TF-IDF matrix
        user_tfidf = tfidf_vectorizer.transform([user_prompt])
        
        # Compute cosine similarity between the user prompt and the TF-IDF matrix
        cosine_similarities = cosine_similarity(user_tfidf, tfidf_matrix).flatten()
        
        # Get top N indices of the most similar documents
        top_indices = cosine_similarities.argsort()[-top_n:][::-1]
        
        for index in top_indices:
            page_name = keywords_df.iloc[index]['Title']
            relevance_score = cosine_similarities[index]
            relevance_scores[page_name] = relevance_score

    # Sort pages by relevance scores in descending order
    sorted_relevance = sorted(relevance_scores.items(), key=lambda x: x[1], reverse=True)
    
    # Return the top N page names
    top_pages = sorted_relevance[:top_n]
    
    return top_pages

# import numpy as np
# user_prompt = "what transfer oxygen"
# top_n = 3  # Number of top pages to retrieve
# path_length = 3  # Number of path suggest

# top_pages = match_top_pages(user_prompt, keywords_df,tfidf_matrix, top_n=top_n)
# print(top_pages)

In [10]:
import pandas as pd

# Load the data
user_path = pd.read_csv('user_paths.csv')

# Expand the study_path and timestamp columns
expanded_data = []
for _, row in user_path.iterrows():
    study_path = eval(row['page_name'])
    timestamp = eval(row['time_spent'])
    for i in range(len(study_path) - 1):
        current_page = study_path[i]
        next_page = study_path[i + 1]
        time_spent = timestamp[i]
        expanded_data.append([current_page, next_page, time_spent])

# Create a DataFrame with the expanded data
expanded_df = pd.DataFrame(expanded_data, columns=['current_page', 'next_page', 'time_spent'])

# Normalize the time_spent to use as ratings
max_time_spent = expanded_df['time_spent'].max()
expanded_df['rating'] = expanded_df['time_spent'] / max_time_spent

# Prepare the dataset for training
interactions = expanded_df[['current_page', 'next_page', 'rating']].copy()

# Encode current_page and next_page as indices
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# Fit the label encoder on all unique pages
all_pages = pd.concat([interactions['current_page'], interactions['next_page']]).unique()
label_encoder.fit(all_pages)

# # Transform the pages to encoded values using .loc
# interactions.loc[:, 'current_page'] = label_encoder.transform(interactions['current_page'])
# interactions.loc[:, 'next_page'] = label_encoder.transform(interactions['next_page'])


In [11]:
from sklearn.model_selection import train_test_split

# Encode current_page and next_page
interactions['current_page_encoded'] = label_encoder.transform(interactions['current_page'])
interactions['next_page_encoded'] = label_encoder.transform(interactions['next_page'])

# Split data into train and test sets
train_data, test_data = train_test_split(interactions[['current_page_encoded', 'next_page_encoded', 'rating']],
                                         test_size=0.2,
                                         random_state=42)


In [12]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout

# Define dimensions
num_pages = len(all_pages)
embedding_dim = 50 

# Input layers
input_current_page = Input(shape=(1,))
input_next_page = Input(shape=(1,))

# Embedding layers
embedding_layer = Embedding(num_pages, embedding_dim)

# Flatten embeddings
flattened_current_page = Flatten()(embedding_layer(input_current_page))
flattened_next_page = Flatten()(embedding_layer(input_next_page))

# Concatenate embeddings
concatenated = Concatenate()([flattened_current_page, flattened_next_page])

# Dense layers
dense_1 = Dense(128, activation='relu')(concatenated)
dropout = Dropout(0.5)(dense_1)
dense_2 = Dense(64, activation='relu')(dropout)
output_layer = Dense(1, activation='sigmoid')(dense_2)

# Model instantiation
ncf_model = Model(inputs=[input_current_page, input_next_page], outputs=output_layer)

# Compile the model
ncf_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [13]:
# Training the model
ncf_model.fit([train_data['current_page_encoded'], train_data['next_page_encoded']],
              train_data['rating'],
              batch_size=64,
              epochs=10,
              validation_data=([test_data['current_page_encoded'], test_data['next_page_encoded']],
                               test_data['rating']))


Epoch 1/10
[1m1003/1003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.0000e+00 - loss: 0.0817 - val_accuracy: 0.0000e+00 - val_loss: 0.0117
Epoch 2/10
[1m1003/1003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.0000e+00 - loss: 0.0114 - val_accuracy: 0.0000e+00 - val_loss: 0.0127
Epoch 3/10
[1m1003/1003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.0000e+00 - loss: 0.0094 - val_accuracy: 0.0000e+00 - val_loss: 0.0129
Epoch 4/10
[1m1003/1003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.0000e+00 - loss: 0.0108 - val_accuracy: 0.0000e+00 - val_loss: 0.0139
Epoch 5/10
[1m1003/1003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.0000e+00 - loss: 0.0102 - val_accuracy: 0.0000e+00 - val_loss: 0.0135
Epoch 6/10
[1m1003/1003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.0000e+00 - loss: 0.0088 - val_accuracy

<keras.src.callbacks.history.History at 0x2130b488340>

In [14]:
import numpy as np

prompt_page = "Blood"
prompt_page_encoded = label_encoder.transform([prompt_page])[0]

# Generate predictions
predicted_pages = ncf_model.predict([prompt_page_encoded * np.ones(num_pages), np.arange(num_pages)])

# Extract confidence scores (probabilities)
confidence_scores = predicted_pages.flatten()

# Sort predictions and get top paths
top_predicted_indices = np.argsort(confidence_scores)[::-1][:3]
predicted_paths = label_encoder.inverse_transform(top_predicted_indices)

# Print result
print("Top predicted indices:", top_predicted_indices)
print("Confidence scores:", confidence_scores[top_predicted_indices])

print(f"Predicted paths for {prompt_page}: {predicted_paths}")


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
Top predicted indices: [169 121  49]
Confidence scores: [0.00120573 0.00103697 0.00101869]
Predicted paths for Blood: ['T_cell' 'Protein' 'Gland']


In [15]:
import numpy as np
user_prompt = "what transfer oxygen"
top_n = 3  # Number of top pages to retrieve
path_length = 3  # Number of path suggest

top_pages = match_top_pages(user_prompt, keywords_df,tfidf_matrix, top_n=top_n)


print(f"User prompt: '{user_prompt}'")
if top_pages:
    print(f"Top {top_n} associated page names:")
    for i, (page_name, relevance_score) in enumerate(top_pages, 1):
        print(f"{i}. {page_name} (Relevance Score: {round(relevance_score,3)})")
else:
    print("No relevant pages found.")

print()

for page_name, _ in top_pages:
    prompt_page_encoded = label_encoder.transform([page_name])[0]

    # Generate predictions
    predicted_pages = ncf_model.predict([prompt_page_encoded * np.ones(num_pages), np.arange(num_pages)])

    # Extract confidence scores (probabilities)
    confidence_scores = predicted_pages.flatten()

    # Sort predictions and get top paths
    top_predicted_indices = np.argsort(confidence_scores)[::-1][:path_length]
    predicted_paths = label_encoder.inverse_transform(top_predicted_indices)

    # Print top predicted indices and confidence scores
    print(f"Top predicted indices for {page_name}: {top_predicted_indices}")
    print(f"Confidence scores for {page_name}: {confidence_scores[top_predicted_indices]}")
    print(f"Predicted paths for {page_name}: {predicted_paths}\n")


User prompt: 'what transfer oxygen'
Top 3 associated page names:
1. Blood_plasma (Relevance Score: 1.635)
2. Kidney (Relevance Score: 1.176)
3. Epithelium (Relevance Score: 1.014)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Top predicted indices for Blood_plasma: [169 121 185]
Confidence scores for Blood_plasma: [0.00067581 0.00059122 0.00056517]
Predicted paths for Blood_plasma: ['T_cell' 'Protein' 'Urinary_meatus']

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Top predicted indices for Kidney: [169  20 121]
Confidence scores for Kidney: [5.6164023e-05 5.2894789e-05 5.2703464e-05]
Predicted paths for Kidney: ['T_cell' 'Body_fluid' 'Protein']

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Top predicted indices for Epithelium: [132   5 177]
Confidence scores for Epithelium: [0.00314185 0.003095   0.00302661]
Predicted paths for Epithelium: ['Reproductive_rights' 'Apoptosis' 'Trachea']



In [8]:
# previous method, only match keyword

# import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity

# # Load your keyword data
# keywords_df = pd.read_csv('keyword.csv')

# # Initialize the TF-IDF Vectorizer
# tfidf_vectorizer = TfidfVectorizer()

# # Fit and transform the "text" column to TF-IDF matrix
# tfidf_matrix = tfidf_vectorizer.fit_transform(keywords_df['Text'].fillna(''))

# # Example function for keyword matching with multiple results
# def match_top_pages(user_prompt, keywords_df, tfidf_matrix, top_n=5):
#     # Example preprocessing (you may need to customize this based on your data)
#     processed_prompt = user_prompt.lower()  # Convert to lowercase
#     # Example keyword extraction (you may use more sophisticated methods here)
#     keywords = processed_prompt.split()  # Simple split by whitespace
    
#     # Initialize a dictionary to store relevance scores
#     relevance_scores = {}
    
#     # Iterate over keywords and match against keywords in the dataset
#     for keyword in keywords:
#         # Filter rows where keyword appears in any keyword column
#         matches = keywords_df[keywords_df.apply(lambda x: keyword in x.values, axis=1)]
        
#         # Calculate relevance scores based on your Score columns
#         for index, row in matches.iterrows():
#             # Example: sum up scores for simplicity
#             relevance_score = row[['Score1', 'Score2', 'Score3', 'Score4', 'Score5']].sum()
#             page_name = row['Title']  # Get the page name
#             if page_name in relevance_scores:
#                 relevance_scores[page_name] += relevance_score
#             else:
#                 relevance_scores[page_name] = relevance_score
    
#     # If no matches found in keywords, search in the "text" column using TF-IDF
#     if not relevance_scores:
#         # Transform the user prompt to the TF-IDF matrix
#         user_tfidf = tfidf_vectorizer.transform([user_prompt])
        
#         # Compute cosine similarity between the user prompt and the TF-IDF matrix
#         cosine_similarities = cosine_similarity(user_tfidf, tfidf_matrix).flatten()
        
#         # Get top N indices of the most similar documents
#         top_indices = cosine_similarities.argsort()[-top_n:][::-1]
        
#         for index in top_indices:
#             page_name = keywords_df.iloc[index]['Title']
#             relevance_score = cosine_similarities[index]
#             relevance_scores[page_name] = relevance_score
    
#     # Sort page names by relevance scores in descending order
#     sorted_pages = sorted(relevance_scores.items(), key=lambda x: x[1], reverse=True)
    
#     # Return the top N page names
#     top_pages = sorted_pages[:top_n]
    
#     return top_pages

# # # Example usage:
# # user_prompt = "what is core of human body"
# # top_n = 3  # Number of top pages to retrieve
# # top_pages = match_top_pages(user_prompt, keywords_df, tfidf_matrix, top_n=top_n)

# # print(f"User prompt: '{user_prompt}'")
# # if top_pages:
# #     print(f"Top {top_n} associated page names:")
# #     for i, (page_name, relevance_score) in enumerate(top_pages, 1):
# #         print(f"{i}. {page_name} (Relevance Score: {round(relevance_score,3)})")
# # else:
# #     print("No relevant pages found.")