In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Vectorizer with default settings; it is fitted below and reused for queries.
tfidf_vectorizer = TfidfVectorizer()

# Page/keyword table read from the CSV sitting next to this notebook.
keywords_df = pd.read_csv('keyword.csv')

# Fit on the 'Text' column and keep the resulting document-term matrix.
# Missing cells are replaced with empty strings first so that every row
# still contributes one (possibly empty) document.
tfidf_matrix = tfidf_vectorizer.fit_transform(
    keywords_df.loc[:, 'Text'].fillna(value=''),
)

# Example function for keyword matching with multiple results
def match_top_pages(user_prompt, keywords_df, tfidf_matrix, top_n=5, vectorizer=None):
    """Rank page titles by relevance to a free-text prompt.

    The prompt is lowercased and split on whitespace. Every resulting word is
    compared (exact, whole-cell equality) against all cells of
    ``keywords_df``; each matching row contributes the sum of its
    ``Score1``..``Score5`` columns to the score of its ``Title``. A row that
    matches several prompt words (or a word repeated in the prompt) is
    counted once per word.

    Only if no word matches any cell does the function fall back to TF-IDF
    cosine similarity between the prompt and ``tfidf_matrix``.

    Args:
        user_prompt: Free-text query string.
        keywords_df: DataFrame with a ``Title`` column and ``Score1`` ..
            ``Score5`` columns. Rows are assumed to be in the same order as
            the rows of ``tfidf_matrix``.
        tfidf_matrix: Document-term matrix used by the fallback; unused when
            the keyword lookup finds something.
        top_n: Maximum number of pages to return.
        vectorizer: Fitted vectorizer that produced ``tfidf_matrix``. When
            omitted, the module-level ``tfidf_vectorizer`` is used.

    Returns:
        A list of ``(title, score)`` tuples, best first, at most ``top_n``
        long. The list is empty when nothing relevant is found.
    """
    score_columns = ['Score1', 'Score2', 'Score3', 'Score4', 'Score5']

    # NOTE(review): the prompt is lowercased but table cells are compared
    # as-is, so capitalised cells can never match — confirm the CSV keywords
    # are stored in lowercase.
    keywords = user_prompt.lower().split()

    relevance_scores = {}

    for keyword in keywords:
        # Vectorised "does any cell in this row equal the keyword" test;
        # avoids a Python-level row-wise apply for every keyword.
        row_mask = keywords_df.isin([keyword]).any(axis=1)
        if not row_mask.any():
            continue

        matches = keywords_df.loc[row_mask]
        row_totals = matches[score_columns].sum(axis=1)
        for page_name, row_total in zip(matches['Title'], row_totals):
            relevance_scores[page_name] = relevance_scores.get(page_name, 0) + row_total

    # Fallback: nothing matched as a keyword, so rank by TF-IDF similarity.
    if not relevance_scores:
        if vectorizer is None:
            vectorizer = tfidf_vectorizer

        query_vector = vectorizer.transform([user_prompt])
        similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

        # Walk documents from most to least similar.
        for row_position in similarities.argsort()[::-1]:
            similarity = similarities[row_position]
            # A zero similarity means no shared vocabulary; reporting such a
            # page as "relevant" would be misleading, so stop there.
            if similarity <= 0 or len(relevance_scores) >= top_n:
                break
            page_name = keywords_df.iloc[row_position]['Title']
            # Best score is seen first; do not let a later duplicate title
            # overwrite it with a lower one.
            relevance_scores.setdefault(page_name, similarity)

    sorted_pages = sorted(relevance_scores.items(), key=lambda item: item[1], reverse=True)

    return sorted_pages[:top_n]

# Demo run: ask for the best-matching pages for one sample question.
user_prompt = "what is core of human body"
top_n = 3  # upper bound on how many pages to request
top_pages = match_top_pages(user_prompt, keywords_df, tfidf_matrix, top_n=top_n)

print(f"User prompt: '{user_prompt}'")
if not top_pages:
    # The matcher returned an empty list.
    print("No relevant pages found.")
else:
    print(f"Top {top_n} associated page names:")
    # Ranks are 1-based; scores are shown rounded to three decimals.
    for i, (page_name, relevance_score) in enumerate(top_pages, start=1):
        rounded_score = round(relevance_score, 3)
        print(f"{i}. {page_name} (Relevance Score: {rounded_score})")

User prompt: 'what is core of human body'
Top 3 associated page names:
1. Brain (Relevance Score: 0.142)
2. Tissue_(biology) (Relevance Score: 0.139)
3. Blood (Relevance Score: 0.133)
