In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere below the input directory
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os

# Directory where the textbooks are stored in .txt format
book_dir = '/kaggle/input/textbooks'

# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# book_texts (and every section index derived from it) reproducible across runs.
book_files = sorted(name for name in os.listdir(book_dir) if name.endswith('.txt'))

# Read each textbook into memory as one string per book
book_texts = []
for book_file in book_files:
    with open(os.path.join(book_dir, book_file), 'r', encoding='utf-8') as handle:
        book_texts.append(handle.read())

# Checking if books are loaded properly
print(f"Loaded {len(book_texts)} books.")


In [None]:
# Function to split text into sections
def split_into_sections(text, section_length=1000):
    """Cut ``text`` into consecutive, non-overlapping character chunks.

    Args:
        text: The string to split.
        section_length: Maximum number of characters per chunk.

    Returns:
        A list of substrings in document order. Every chunk holds
        ``section_length`` characters except possibly the last, which holds
        the remainder. An empty ``text`` yields an empty list.
    """
    sections = []
    for start in range(0, len(text), section_length):
        stop = start + section_length
        sections.append(text[start:stop])
    return sections

# Split every textbook and collect all chunks into one flat list, in book order
book_sections = [
    section
    for book in book_texts
    for section in split_into_sections(book)
]

# Check the number of sections
print(f"Total sections created: {len(book_sections)}")


In [None]:
!pip install sentence_transformers

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util

# Dense retriever: embed every section with a pre-trained sentence-embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')
book_embeddings = model.encode(book_sections)

# Sparse retriever: TF-IDF weighted term vectors fitted on the same sections
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(book_sections)

print(f"Created embeddings of shape: {book_embeddings.shape}")


In [None]:
def hybrid_retrieve(query, alpha=0.7, num_sections=5):
    """Return the sections that best match ``query`` under a dense + sparse score.

    The dense score is the cosine similarity between the query embedding and
    each entry of ``book_embeddings``; the sparse score is the dot product of
    the query's TF-IDF vector with each row of ``tfidf_matrix``. The two are
    blended as ``alpha * dense + (1 - alpha) * sparse``.

    Args:
        query: Question or search string.
        alpha: Weight given to the dense score; ``1 - alpha`` goes to the
            sparse score.
        num_sections: Maximum number of sections to return.

    Returns:
        A list of at most ``num_sections`` strings from ``book_sections``,
        ordered from highest to lowest hybrid score. Empty when
        ``num_sections`` is not positive.
    """
    # Guard: a slice such as [-0:] would select *every* section, not none.
    if num_sections <= 0:
        return []

    query_embedding = model.encode(query)

    # Convert the similarity tensor to NumPy up front so the blend below is
    # plain NumPy arithmetic instead of relying on implicit torch/NumPy mixing.
    dense_scores = (
        util.cos_sim(query_embedding, book_embeddings).flatten().detach().cpu().numpy()
    )
    sparse_scores = vectorizer.transform([query]).dot(tfidf_matrix.T).toarray().flatten()

    # Combining scores
    hybrid_scores = alpha * dense_scores + (1 - alpha) * sparse_scores

    # Highest score first, then keep the leading num_sections indices
    top_indices = hybrid_scores.argsort()[::-1][:num_sections]
    return [book_sections[i] for i in top_indices]



In [None]:
from transformers import pipeline

# Pre-trained BERT checkpoint used for extractive question answering
QA_MODEL_NAME = "bert-large-uncased-whole-word-masking-finetuned-squad"
qa_pipeline = pipeline("question-answering", model=QA_MODEL_NAME)

# Sanity check: answer a question against a short hand-written context
question = "What is the treatment for hypertension?"
context = "Hypertension treatment includes lifestyle changes, such as diet modification, and medications to lower blood pressure."

result = qa_pipeline(question=question, context=context)
print(f"Answer: {result['answer']}")


In [None]:
from transformers import pipeline

def answer_question_with_bert(question, alpha=0.5, num_sections=5, max_length=512, stride=128):
    """Answer ``question`` by retrieving sections and running extractive QA on them.

    Args:
        question: The question to answer.
        alpha: Dense-vs-sparse weight forwarded to ``hybrid_retrieve``.
        num_sections: Number of sections to retrieve and use as context.
        max_length: Maximum tokens per chunk (question + context) fed to the
            model; forwarded to the pipeline as ``max_seq_len``.
        stride: Token overlap between consecutive context chunks; forwarded
            to the pipeline as ``doc_stride``.

    Returns:
        The answer text extracted by ``qa_pipeline``.
    """
    # Retrieving relevant sections from the hybrid retrieval system
    retrieved_sections = hybrid_retrieve(question, alpha, num_sections=num_sections)

    # Combining retrieved sections into a single context
    context = " ".join(retrieved_sections)

    # The question-answering pipeline names these options max_seq_len and
    # doc_stride; passing them as max_length/stride had no effect on chunking.
    answer = qa_pipeline(
        question=question,
        context=context,
        max_seq_len=max_length,
        doc_stride=stride,
    )

    return answer['answer']




In [None]:
# Smoke-test the full retrieve-then-read pipeline on a single sample question
question = "What is the treatment for hypertension?"

answer = answer_question_with_bert(question=question, alpha=0.7)
print(f"Answer: {answer}")


In [None]:
import json

# Load the test JSONL file (one JSON object per line)
test_questions = []
# Explicit UTF-8 keeps decoding independent of the platform's default encoding,
# matching how the textbooks are read.
with open('/kaggle/input/testjsonl/phrases_no_exclude_test.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) are not valid JSON; skip them
            continue
        test_questions.append(json.loads(line))

# Check the first question (guarded so an empty file does not raise IndexError)
if test_questions:
    print(test_questions[0])
else:
    print("No test questions were loaded.")


In [None]:
def evaluate_model_on_test(test_questions):
    """Measure multiple-choice accuracy of the retrieval + BERT QA pipeline.

    For each entry an answer is generated with ``answer_question_with_bert``,
    mapped to the most similar option by ``find_closest_answer``, and that
    option's key is compared with the entry's ``answer_idx``.

    Args:
        test_questions: Sequence of dicts, each providing the keys
            ``'question'``, ``'options'`` and ``'answer_idx'``.

    Returns:
        The fraction of entries answered correctly (between 0 and 1).
        Returns 0.0 when ``test_questions`` is empty.
    """
    total = len(test_questions)
    if total == 0:
        # Avoid dividing by zero when there is nothing to evaluate
        print("No test questions provided; accuracy is undefined.")
        return 0.0

    correct = 0
    for entry in test_questions:
        question = entry['question']
        options = entry['options']
        correct_answer_idx = entry['answer_idx']

        # answer_question_with_bert is the QA entry point defined in this
        # notebook; the previously called answer_question does not exist.
        generated_answer = answer_question_with_bert(question)

        # Finding the closest match to the generated answer from the options
        best_choice = find_closest_answer(generated_answer, options)

        # Comparing with the correct answer
        if best_choice == correct_answer_idx:
            correct += 1

    accuracy = correct / total
    print(f"Model Accuracy: {accuracy * 100:.2f}%")
    return accuracy


In [None]:
from difflib import SequenceMatcher

def find_closest_answer(generated_answer, options):
    """Pick the option whose text is most similar to ``generated_answer``.

    Similarity is ``difflib.SequenceMatcher(...).ratio()`` between the
    generated answer and each option's text.

    Args:
        generated_answer: Answer string produced by the model.
        options: Mapping from option key to option text.

    Returns:
        The key of the highest-scoring option; on ties the option that comes
        first in ``options`` wins. ``None`` when ``options`` is empty or no
        option has a similarity above zero.
    """
    def similarity(option_text):
        return SequenceMatcher(None, generated_answer, option_text).ratio()

    scored = [(similarity(text), key) for key, text in options.items()]
    if not scored:
        return None

    # max() keeps the first of several equal maxima, matching a strict ">" scan
    top_score, top_key = max(scored, key=lambda pair: pair[0])
    return top_key if top_score > 0 else None


In [None]:
# Score the pipeline on the loaded test questions; the trailing semicolon keeps
# the cell from echoing the call's return value
evaluate_model_on_test(test_questions);
