In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the keyword table; it is expected to carry a 'Text' column
# holding the free text of each page.
keywords_df = pd.read_csv('keyword.csv')

# Missing page text is treated as an empty document so the vectorizer
# never receives NaN.
page_texts = keywords_df['Text'].fillna('')

# Learn the vocabulary/IDF weights and build one TF-IDF row per page.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(page_texts)


def match_top_pages(user_prompt, keywords_df, tfidf_matrix, top_n=5, vectorizer=None):
    """Rank page titles by relevance to a free-text prompt.

    The prompt is lower-cased and split on whitespace. Each resulting word is
    compared, case-insensitively and ignoring surrounding whitespace, with the
    cells of the keyword columns (columns whose name starts with "keyword",
    e.g. ``Keyword_1``). Every row with a hit contributes the sum of its
    ``Score1`` .. ``Score5`` columns to its ``Title``; a row hit by several
    prompt words contributes once per word.

    If no keyword matches at all, the prompt is compared with the page texts
    through TF-IDF cosine similarity instead. Only pages with a strictly
    positive similarity are reported, so an unrelated prompt yields an empty
    list rather than arbitrary zero-score pages.

    Args:
        user_prompt: Free-text query.
        keywords_df: DataFrame with a ``Title`` column, the five score
            columns and the keyword columns.
        tfidf_matrix: TF-IDF matrix with one row per row of ``keywords_df``.
            Only used by the fallback.
        top_n: Maximum number of pages to return. Values <= 0 return ``[]``.
        vectorizer: Fitted vectorizer that produced ``tfidf_matrix``. When
            omitted, the notebook-level ``tfidf_vectorizer`` is used.

    Returns:
        A list of ``(title, score)`` tuples sorted by descending score, at
        most ``top_n`` long.
    """
    if top_n <= 0:
        return []

    prompt_terms = user_prompt.lower().split()
    score_columns = ['Score1', 'Score2', 'Score3', 'Score4', 'Score5']
    keyword_columns = [
        column for column in keywords_df.columns
        if str(column).lower().startswith('keyword')
    ]

    relevance_scores = {}

    if prompt_terms and keyword_columns:
        # Normalise the keyword cells once. Non-string cells (NaN, numbers)
        # become None so they can never equal a prompt word.
        normalized_keywords = keywords_df[keyword_columns].apply(
            lambda column: column.map(
                lambda cell: cell.strip().lower() if isinstance(cell, str) else None
            )
        )
        # Positional arrays keep the lookups independent of the frame's index.
        titles = keywords_df['Title'].to_numpy()
        row_totals = keywords_df[score_columns].sum(axis=1).to_numpy()

        for term in prompt_terms:
            hits = normalized_keywords.eq(term).any(axis=1).to_numpy()
            for title, total in zip(titles[hits], row_totals[hits]):
                relevance_scores[title] = relevance_scores.get(title, 0) + total

    # Fallback: nothing matched a keyword, so compare against the page text.
    if not relevance_scores:
        if vectorizer is None:
            vectorizer = tfidf_vectorizer
        prompt_tfidf = vectorizer.transform([user_prompt])
        similarities = cosine_similarity(prompt_tfidf, tfidf_matrix).flatten()

        # Walk the documents from most to least similar.
        for position in similarities.argsort()[::-1][:top_n]:
            similarity = similarities[position]
            if similarity <= 0:
                # Everything after this point shares no vocabulary with the prompt.
                break
            title = keywords_df.iloc[position]['Title']
            # Keep the best score if several rows share one title.
            relevance_scores.setdefault(title, similarity)

    ranked_pages = sorted(relevance_scores.items(), key=lambda item: item[1], reverse=True)
    return ranked_pages[:top_n]

# Demo query: look up the best-matching pages for a single word.
user_prompt = "heart"
top_n = 3  # Number of top pages to retrieve
top_pages = match_top_pages(user_prompt, keywords_df, tfidf_matrix, top_n=top_n)

# Report the ranking, or say so when nothing was found.
print(f"User prompt: '{user_prompt}'")
if not top_pages:
    print("No relevant pages found.")
else:
    print(f"Top {top_n} associated page names:")
    for rank, (title, score) in enumerate(top_pages, start=1):
        print(f"{rank}. {title} (Relevance Score: {round(score, 3)})")

User prompt: 'heart'
Top 3 associated page names:
1. Heart (Relevance Score: 0.412)
2. Cardiac_muscle (Relevance Score: 0.227)
3. Circulatory_system (Relevance Score: 0.197)


In [14]:
import spacy
import pandas as pd

# The prompt whose relevance to each page will be scored.
user_prompt = "I want to learn about blood vessels"

# Load the spaCy pipeline used for all similarity computations in this cell.
nlp = spacy.load("en_core_web_md")

# Lower-case the prompt first, then run it through the pipeline once so the
# resulting Doc can be reused for every keyword comparison.
lowered_prompt = user_prompt.lower()
processed_prompt = nlp(lowered_prompt)
# Similarity helper used to score the sample keywords DataFrame built below
def calculate_similarity(prompt_vector, keyword):
    """Return the spaCy similarity between the prompt and one keyword.

    Args:
        prompt_vector: The processed prompt. Despite the name this is the
            object returned by ``nlp(...)`` (it must offer ``similarity`` and
            ``vector_norm``), not a raw vector.
        keyword: Keyword text; it is lower-cased before being processed with
            the notebook-level ``nlp`` pipeline.

    Returns:
        The value of ``prompt_vector.similarity(...)`` for the keyword, or
        ``0.0`` when either side has a zero-length vector.
    """
    keyword_doc = nlp(keyword.lower())

    # A zero vector norm means there is nothing to compare; return 0.0
    # directly instead of asking spaCy to compute a similarity from it.
    # NOTE(review): the stray warning line in this cell's output looks like
    # spaCy's empty-vector similarity warning — confirm this guard removes it.
    if not prompt_vector.vector_norm or not keyword_doc.vector_norm:
        return 0.0

    return prompt_vector.similarity(keyword_doc)

# Sample keywords DataFrame: three pages, each with five keywords and a
# weight (score) per keyword.
data = {
    'Title': ['Page1', 'Page2', 'Page3'],
    'Keyword_1': ['blood', 'vessels', 'heart'],
    'Score1': [0.8, 0.9, 0.7],
    'Keyword_2': ['circulation', 'veins', 'artery'],
    'Score2': [0.6, 0.8, 0.9],
    'Keyword_3': ['artery', 'capillaries', 'cardiovascular'],
    'Score3': [0.7, 0.6, 0.8],
    'Keyword_4': ['vein', 'blood flow', 'pulse'],
    'Score4': [0.5, 0.7, 0.6],
    'Keyword_5': ['capillary', 'blood vessel', 'blood pressure'],
    'Score5': [0.4, 0.5, 0.7]
}
keywords_df = pd.DataFrame(data)

# Page title -> accumulated relevance to the processed prompt.
relevance_scores = {}

for _, page_row in keywords_df.iterrows():
    page_name = page_row['Title']

    # Weight each keyword's similarity to the prompt by that keyword's score.
    # The running total is accumulated term by term, in column order.
    weighted_total = 0
    for slot in range(1, 6):
        slot_similarity = calculate_similarity(processed_prompt, page_row[f'Keyword_{slot}'])
        weighted_total += slot_similarity * page_row[f'Score{slot}']

    # Rows sharing a title add up into a single entry.
    relevance_scores[page_name] = relevance_scores.get(page_name, 0) + weighted_total

# Highest relevance first.
sorted_relevance = sorted(relevance_scores.items(), key=lambda entry: entry[1], reverse=True)

# One "title: score" line per page.
for page, score in sorted_relevance:
    print(f"{page}: {score}")



Page2: 0.8012697309925455
Page3: 0.7605744071250108
Page1: 0.5097436192149191


  return prompt_vector.similarity(keyword_vector)
