In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/trainjsonl/phrases_no_exclude_train.jsonl
/kaggle/input/textbooks/First_Aid_Step1.txt
/kaggle/input/textbooks/Neurology_Adams.txt
/kaggle/input/textbooks/Gynecology_Novak.txt
/kaggle/input/textbooks/InternalMed_Harrison.txt
/kaggle/input/textbooks/Immunology_Janeway.txt
/kaggle/input/textbooks/Biochemistry_Lippincott.txt
/kaggle/input/textbooks/Histology_Ross.txt
/kaggle/input/textbooks/Physiology_Levy.txt
/kaggle/input/textbooks/Cell_Biology_Alberts.txt
/kaggle/input/textbooks/First_Aid_Step2.txt
/kaggle/input/textbooks/Pathology_Robbins.txt
/kaggle/input/textbooks/Pediatrics_Nelson.txt
/kaggle/input/textbooks/Surgery_Schwartz.txt
/kaggle/input/textbooks/Pathoma_Husain.txt
/kaggle/input/textbooks/Obstentrics_Williams.txt
/kaggle/input/textbooks/Pharmacology_Katzung.txt
/kaggle/input/textbooks/Psichiatry_DSM-5.txt
/kaggle/input/textbooks/Anatomy_Gray.txt
/kaggle/input/testjsonl/phrases_no_exclude_test.jsonl


In [3]:
import os

# Directory where the textbooks are stored in .txt format
book_dir = '/kaggle/input/textbooks'

book_texts = []

# Read each textbook from the directory.
# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# book_texts (and therefore every downstream section index) stable across runs.
for book_file in sorted(os.listdir(book_dir)):
    if book_file.endswith('.txt'):
        with open(os.path.join(book_dir, book_file), 'r', encoding='utf-8') as file:
            book_texts.append(file.read())

# Checking if books are loaded properly
print(f"Loaded {len(book_texts)} books.")


Loaded 18 books.


In [4]:
def split_into_sections(text, section_length=1000, overlap=0):
    """Split ``text`` into consecutive fixed-size character windows.

    Args:
        text: String (or any sliceable sequence) to split.
        section_length: Maximum number of characters per section. Must be
            positive.
        overlap: Number of characters each section shares with the previous
            one. The default of 0 yields back-to-back, non-overlapping
            sections. Must satisfy ``0 <= overlap < section_length``.

    Returns:
        A list of sections in document order. The last section may be shorter
        than ``section_length``. An empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``section_length`` is not positive or ``overlap`` is
            outside ``[0, section_length)``.
    """
    if section_length <= 0:
        raise ValueError(f"section_length must be positive, got {section_length}")
    if not 0 <= overlap < section_length:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < section_length, got {overlap}"
        )
    if not text:
        return []

    step = section_length - overlap
    # Stop once a window has reached the end of the text, so that no trailing
    # section consists only of characters already covered by the previous one.
    # The max(..., 1) guarantees at least one section for short, non-empty text.
    stop = max(len(text) - overlap, 1)
    return [text[start:start + section_length] for start in range(0, stop, step)]

# Chunk every textbook and gather all chunks into one flat list, book by book
book_sections = [
    chunk
    for book_text in book_texts
    for chunk in split_into_sections(book_text)
]

# Check the number of sections
print(f"Total sections created: {len(book_sections)}")


Total sections created: 89142


In [5]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.2.0


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util

# Dense retriever: pre-trained sentence-embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# One embedding per textbook section
book_embeddings = model.encode(book_sections)

# Sparse retriever: TF-IDF weighting fitted on the same sections
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(book_sections)

print(f"Created embeddings of shape: {book_embeddings.shape}")


  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/2786 [00:00<?, ?it/s]

Created embeddings of shape: (89142, 384)


In [7]:
def hybrid_retrieve(query, alpha=0.7, num_sections=5):
    """Return the textbook sections that best match ``query``.

    Each section is scored by a weighted sum of a dense score (cosine
    similarity between the query embedding and ``book_embeddings``) and a
    sparse score (dot product of the query's TF-IDF vector with
    ``tfidf_matrix``).

    Args:
        query: Question or search string.
        alpha: Weight of the dense score; the sparse score gets ``1 - alpha``.
        num_sections: Maximum number of sections to return.

    Returns:
        Up to ``num_sections`` entries of ``book_sections``, best match first.
        An empty list when ``num_sections`` is zero or negative.
    """
    # Guard: with the slice-from-the-end idiom, num_sections == 0 would select
    # the whole corpus ([-0:] is a full slice) and negatives nearly all of it.
    if num_sections <= 0:
        return []

    query_embedding = model.encode(query)

    # Convert the similarity tensor to NumPy up front so both score vectors
    # are plain arrays before they are mixed.
    dense_scores = util.cos_sim(query_embedding, book_embeddings).flatten().cpu().numpy()
    sparse_scores = vectorizer.transform([query]).dot(tfidf_matrix.T).toarray().flatten()

    # Combining scores
    hybrid_scores = alpha * dense_scores + (1 - alpha) * sparse_scores

    # Indices sorted by descending score, truncated to the requested count
    top_indices = hybrid_scores.argsort()[::-1][:num_sections]
    return [book_sections[i] for i in top_indices]



In [13]:
from transformers import pipeline

# NOTE(review): no visible cell creates `qa_pipeline`, so on a fresh kernel the
# function below would fail with a NameError. Keep an existing pipeline if one
# is already defined; otherwise fall back to the library's default
# question-answering checkpoint. Confirm which model was originally intended.
try:
    qa_pipeline
except NameError:
    qa_pipeline = pipeline("question-answering")


def answer_question_with_bert(question, alpha=0.5, num_sections=5, max_length=512, stride=128):
    """Answer ``question`` by extractive QA over retrieved textbook sections.

    Args:
        question: Question text.
        alpha: Dense/sparse weighting forwarded to ``hybrid_retrieve``.
        num_sections: Number of sections to retrieve and join into the context.
        max_length: Maximum length of each window handed to the QA model,
            forwarded as the pipeline's ``max_seq_len``.
        stride: Overlap between consecutive windows when the context has to be
            split, forwarded as the pipeline's ``doc_stride``.

    Returns:
        The value stored under the 'answer' key of the pipeline result.
    """
    # Retrieving relevant sections from the hybrid retrieval system
    retrieved_sections = hybrid_retrieve(question, alpha, num_sections=num_sections)

    # Combining retrieved sections into a single context
    context = " ".join(retrieved_sections)

    # The question-answering pipeline names these options `max_seq_len` and
    # `doc_stride`; `max_length` / `stride` are not among its documented
    # parameters.
    answer = qa_pipeline(
        question=question,
        context=context,
        max_seq_len=max_length,
        doc_stride=stride,
    )

    return answer['answer']




In [27]:
# Smoke-test the retrieval + QA chain on a single example question
question = "Long bones are tubular or cuboidal?"

answer = answer_question_with_bert(question=question, alpha=0.7)
print(f"Answer: {answer}")


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Answer: tubular


In [15]:
import json

# Load the test JSONL file (one JSON object per line).
# The encoding is explicit so decoding does not depend on the platform default,
# and blank lines are skipped so they cannot crash json.loads.
with open('/kaggle/input/testjsonl/phrases_no_exclude_test.jsonl', 'r', encoding='utf-8') as file:
    test_questions = [json.loads(line) for line in file if line.strip()]

# Check the first question
print(test_questions[0])


{'question': 'A junior orthopaedic surgery resident is completing a carpal tunnel repair with the department chairman as the attending physician. During the case, the resident inadvertently cuts a flexor tendon. The tendon is repaired without complication. The attending tells the resident that the patient will do fine, and there is no need to report this minor complication that will not harm the patient, as he does not want to make the patient worry unnecessarily. He tells the resident to leave this complication out of the operative report. Which of the following is the correct next action for the resident to take?', 'answer': 'Tell the attending that he cannot fail to disclose this mistake', 'options': {'A': 'Disclose the error to the patient and put it in the operative report', 'B': 'Tell the attending that he cannot fail to disclose this mistake', 'C': 'Report the physician to the ethics committee', 'D': 'Refuse to dictate the operative report'}, 'meta_info': 'step1', 'answer_idx': 

In [19]:
def evaluate_model_on_test(test_questions):
    """Measure multiple-choice accuracy of the QA chain on ``test_questions``.

    For every entry the question is answered with
    ``answer_question_with_bert``, the generated answer is mapped to the most
    similar option with ``find_closest_answer``, and the chosen option key is
    compared with the entry's 'answer_idx'.

    Args:
        test_questions: Sequence of dicts providing the keys 'question',
            'options' and 'answer_idx'.

    Returns:
        Accuracy as a float in [0, 1]; 0.0 when ``test_questions`` is empty.
        The accuracy is also printed as a percentage.
    """
    total = len(test_questions)
    correct = 0

    for entry in test_questions:
        question = entry['question']
        options = entry['options']
        correct_answer_idx = entry['answer_idx']

        generated_answer = answer_question_with_bert(question)

        # Finding the closest match to the generated answer from the options
        best_choice = find_closest_answer(generated_answer, options)

        # Comparing with the correct answer
        if best_choice == correct_answer_idx:
            correct += 1

    # Avoid ZeroDivisionError when there is nothing to evaluate
    accuracy = correct / total if total else 0.0
    print(f"Model Accuracy: {accuracy * 100:.2f}%")
    return accuracy


In [20]:
from difflib import SequenceMatcher

def find_closest_answer(generated_answer, options):
    """Pick the option whose text is most similar to ``generated_answer``.

    Similarity is ``difflib.SequenceMatcher(...).ratio()`` computed on
    case-folded, whitespace-normalised strings, so differences in
    capitalisation or spacing between the generated answer and the option
    text do not prevent a match.

    Args:
        generated_answer: Free-text answer string.
        options: Mapping from option key to option text.

    Returns:
        The key of the best-scoring option. On ties the first option in
        iteration order wins. ``None`` if no option has a similarity greater
        than zero (including when ``options`` is empty).
    """
    def _normalize(value):
        # Collapse runs of whitespace and ignore case
        return " ".join(value.split()).casefold()

    target = _normalize(generated_answer)

    best_choice = None
    best_score = 0

    for choice, text in options.items():
        score = SequenceMatcher(None, target, _normalize(text)).ratio()
        if score > best_score:
            best_choice = choice
            best_score = score

    return best_choice
