In [None]:
import json
import re

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the document records. JSON text is UTF-8, so decode it explicitly
# rather than relying on the platform's default encoding (which is not
# UTF-8 on every system and would corrupt or reject non-ASCII content).
with open("all_data.json", "r", encoding="utf-8") as f:
    all_data = json.load(f)

# Common base interface so that search strategies are interchangeable (currently disabled)
"""
class SearchStrategy:
    def search(self, query: str):
        raise NotImplementedError
"""

# Class for keyword search
class KeywordSearch:
    """TF-IDF keyword search over a fixed collection of documents.

    The documents are vectorised once at construction time. Each call to
    ``search`` ranks them by cosine similarity to the query and returns a
    short snippet of every top-ranked document together with its source.
    """

    def __init__(self, docs, doc_sources):
        """Fit the TF-IDF model on ``docs``.

        Args:
            docs: Document texts. Must be indexable, because ``search`` looks
                documents up by position.
            doc_sources: Sources parallel to ``docs`` (``doc_sources[i]``
                belongs to ``docs[i]``).
        """
        self.vectorizer = TfidfVectorizer()  # Create TF-IDF vectoriser
        self.docs = docs
        self.doc_sources = doc_sources
        # Transform the content into a (documents x vocabulary) TF-IDF matrix
        self.tfidf_matrix = self.vectorizer.fit_transform(docs)

    @staticmethod
    def _query_terms(query, stop_words):
        """Return the lowercase word tokens of ``query`` not in ``stop_words``.

        Tokens are runs of two or more word characters, which mirrors
        TfidfVectorizer's default token pattern. Splitting on whitespace
        instead would leave punctuation attached (e.g. ``"diary?"``) and such
        a term could never be located in a document.
        """
        return [
            term for term in re.findall(r"\w\w+", query.lower())
            if term not in stop_words
        ]

    @staticmethod
    def _make_snippet(doc, terms, snippet_length):
        """Return up to ``snippet_length`` characters of ``doc`` around a term.

        Terms are tried in order; the first one that occurs in ``doc`` as a
        whole word (case-insensitively) anchors the snippet. When no term
        occurs, the beginning of the document is returned instead.
        """
        match = None
        for term in terms:
            # Search the original text (not a lowercased copy) so the offsets
            # are valid for slicing ``doc``; \b prevents hits inside longer
            # words such as "art" in "start".
            match = re.search(rf"\b{re.escape(term)}\b", doc, re.IGNORECASE)
            if match is not None:
                break

        if match is None:  # No term found: fall back to the document start
            return doc[:snippet_length]

        # Centre the window on the match, but slide it back when the match is
        # close to the end so the snippet is not needlessly truncated.
        start = max(
            0,
            min(match.start() - snippet_length // 2, len(doc) - snippet_length),
        )
        end = min(len(doc), start + snippet_length)
        return doc[start:end].strip()

    def search(self, query, top_k=3, snippet_length=300):
        """Return snippets of the ``top_k`` documents most similar to ``query``.

        Args:
            query: Free-text query string.
            top_k: Maximum number of results; ``None`` returns every
                document, and values <= 0 return an empty list.
            snippet_length: Maximum number of characters per snippet.

        Returns:
            A list of ``(snippet, source)`` tuples ordered from most to least
            similar.
        """
        # A negative top_k would otherwise slice from the wrong end.
        if top_k is not None and top_k <= 0:
            return []

        query_vector = self.vectorizer.transform([query])  # Vectorise query
        # Compare the query vector with every document vector
        similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()
        # Order indices from most to least similar and keep the first top_k.
        # NOTE: documents with zero similarity are not filtered out, so a
        # query sharing no vocabulary with the corpus still yields results.
        top_indices = similarities.argsort()[::-1][:top_k]

        # The query terms do not depend on the document: compute them once.
        terms = self._query_terms(query, ENGLISH_STOP_WORDS)

        return [
            (
                self._make_snippet(self.docs[i], terms, snippet_length),
                self.doc_sources[i],
            )
            for i in top_indices
        ]

# Class for Semantic search
"""
Code for semantic search
"""

# Switching between search methods
"""
def get_search_strategy(strategy_name, **kwargs):
    if strategy_name == "keyword":
        return KeywordSearch(**kwargs)
    elif strategy_name == "semantic":
        return SemanticSearch(**kwargs)
"""

# Split the loaded records into two parallel lists: texts and their sources
docs = [record['content'] for record in all_data]
doc_sources = [record['source'] for record in all_data]

# Build the keyword searcher and run a sample query
searcher = KeywordSearch(docs, doc_sources)
results = searcher.search("What should I put in my technical diary?")

# Dump each hit, numbered from 1 (troubleshooting output only)
for idx, (snippet, source) in enumerate(results, 1):
    print(
        f"\nResult {idx}:",
        f"Source: {source}",
        "Snippet:",
        snippet,
        "-" * 50,
        sep="\n",
    )


Result 1:
Source: https://wehi-researchcomputing.github.io/faq#what-should-i-put-in-my-technical-diary
Snippet:
What should I put in my technical diary? Within your project channel in Sharepoint (ie. within the WEHI-wide student intern group in Teams > Files), you should have a Technical notes folder for your intake eg. Semester 2 2023 Technical notes. This is to help future students understand what you did i
--------------------------------------------------

Result 2:
Source: https://wehi-researchcomputing.github.io/faq#what-should-i-put-in-the-public-wiki
Snippet:
? The public wiki is to share information with future students the things you needed to understand at a high level to be able to start working on the technical side of the project. General knowledge to put into the wiki would include (among others): A diagram showing how the high-level parts of the
--------------------------------------------------

Result 3:
Source: https://wehi-researchcomputing.github.io/faq#how-should