In [1]:
!pip install faiss-gpu




In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import csv
import os
from datetime import datetime
from transformers import pipeline
from IPython.display import display, HTML, clear_output
import ipywidgets as widgets
from IPython.display import Markdown
import faiss
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [2]:
import re

def split_by_theme(text):
    pattern = re.compile(r'(theme_\w+)\b') 
    matches = list(pattern.finditer(text))
    theme_texts = {}

    for i, match in enumerate(matches):
        theme_name = match.group(1)
        start_pos = match.end()  
        if i + 1 < len(matches):
            end_pos = matches[i + 1].start()
        else:
            end_pos = len(text)
        
        content = text[start_pos:end_pos].strip()
        
        theme_texts[theme_name] = content
    
    return theme_texts


def add_theme_to_chunk(chunk, theme_name):
    return f'{theme_name}: {chunk}'




In [3]:
def process_themes(document):
    theme_texts = split_by_theme(document)
    all_chunks = []

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    i=0
    for theme, text in theme_texts.items():
        
        chunks = text_splitter.split_text(text)
        for chunk in chunks:
            all_chunks.append(add_theme_to_chunk(chunk, theme))
        

    return all_chunks

with open('domainknowledge.txt', 'r') as file:
    document = file.read()

chunks = process_themes(document)



#best working embedder so far is BAAI bge en

In [4]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import pickle
from transformers import BertModel, BertTokenizer
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('BAAI/bge-large-en-v1.5')
domainEmbeddings = model.encode(chunks, normalize_embeddings=True)
domainEmbeddings = model.encode(chunks, convert_to_numpy=True).astype(np.float32)

dimension = domainEmbeddings.shape[1]  
index = faiss.IndexFlatL2(dimension) 

index.add(domainEmbeddings)

faiss.write_index(index, 'faiss_index.index')

with open('embeddings.pkl', 'wb') as f:
    pickle.dump({'embeddings': domainEmbeddings, 'chunks': chunks}, f)

print("Index and embeddings saved successfully.")
print("Index and embeddings saved successfully.")













Index and embeddings saved successfully.
Index and embeddings saved successfully.


In [5]:
import faiss
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer

index = faiss.read_index('faiss_index.index')  
with open('embeddings.pkl', 'rb') as f:
    data = pickle.load(f)
    embeddings = data['embeddings']
    chunks = data['chunks']





In [22]:

from whoosh.fields import Schema, TEXT, NUMERIC
from whoosh.index import create_in
from whoosh.qparser import QueryParser
from whoosh import scoring
from whoosh.analysis import StemmingAnalyzer
from io import StringIO
import tempfile
import shutil
import os

def search_chunks(chunks, query_str):
    schema = Schema(idx=NUMERIC(stored=True), content=TEXT(stored=True, analyzer=StemmingAnalyzer()))
    tempdir = tempfile.mkdtemp()
    try:
        ix = create_in(tempdir, schema)
        writer = ix.writer()
        
        for i, chunk in enumerate(chunks):
            writer.add_document(idx=i, content=chunk)
        writer.commit()
        
        
        with ix.searcher(weighting=scoring.BM25F()) as searcher:
            parser = QueryParser("content", schema=ix.schema)
            query = parser.parse(query_str)
            results = searcher.search(query, limit=None)
            return [hit['idx'] for hit in results]
    finally:
        shutil.rmtree(tempdir)

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
query = " physiotherapist "
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([document])
feature_names = np.array(vectorizer.get_feature_names_out())
tfidf_scores = tfidf_matrix.toarray()[0]
threshold = np.percentile(tfidf_scores, 88)
rarest_words = set(feature_names[tfidf_scores <= threshold])
print(rarest_words)
query_words = query.strip().lower().split()
print(query_words)
rare_query_words = [w for w in query_words if w in rarest_words]
matching_indices = []
for word in rare_query_words:
    indices = search_chunks(chunks, word)
    matching_indices.extend(indices)
print("Rarest words in the query:", rare_query_words)
print("Matching indices:", matching_indices)




['physiotherapist']
Rarest words in the query: ['physiotherapist']
Matching indices: [132, 133, 130]


In [18]:
def remove_duplicates(lst):
    seen = set()
    result = []
    for num in lst:
        if num not in seen:
            result.append(num)
            seen.add(num)
    return result

In [19]:
query2 = "when was let's talk genai ran "

query_embedding = model.encode([query2], convert_to_numpy=True).astype(np.float32)


k = 5  
distances, indices = index.search(query_embedding, k)
retrieved_chunks = [chunks[i] for i in indices[0]]

print("___")
final_indices=matching_indices
for i in indices[0]:
    final_indices.append(i)
print(final_indices)

print("___")
final_indices2=remove_duplicates(final_indices)

print(final_indices2)
print("Query:", query)
print("\nTop 3 retrieved chunks:")

for i, chunk in enumerate(retrieved_chunks):

    relevance_score = 1 / (1 + distances[0][i])  
    print(f"- {chunk}")
    print(f"  Relevance Score: {relevance_score:.4f}")  
    print("____")

___
[125, 69, 0, 67, 10]
___
[125, 69, 0, 67, 10]
Query: What was the name of the professor of physiotherapy

Top 3 retrieved chunks:
- theme_five: to empathize with their perspectives and bridge the gap between us, the educators and them, the students. By actively listening to their experiences, we can continue to refine and improve our teaching and assessment to better serve the educational journey. In response to an appetite for professional development on generative AI, colleagues from UL Library, the Centre for Transformative Learning, Academic Integrity, and the Information Technology Division, designed and developed a five day online learning experience open to staff from all divisions across the university. Our course called Let's Talk Gen AI ran over a five day period in May 2024. Each day of the course was aligned with a different theme, Gen AI Work, Gen AI for teaching and assessment, Gen AI for research. The course was bookended with an introductory day at the start and a r