In [2]:
import os
import pandas as pd
from langchain.schema import HumanMessage

In [3]:

def load_data_from_files(folder_path):
    """
    Load every CSV file in a folder into one DataFrame with incomplete rows removed.

    Parameters:
    - folder_path: Directory scanned (non-recursively) for file names ending in '.csv'.

    Returns:
    - A DataFrame holding the rows of all CSV files, minus any row that contains a
      missing value, indexed 0..n-1. An empty DataFrame when the folder holds no
      CSV files.
    """
    # os.listdir returns names in arbitrary order; sort so the row order is reproducible.
    csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
    if not csv_files:
        # pd.concat raises on an empty list, so return the empty frame directly.
        return pd.DataFrame()

    # Read every file first and concatenate once: growing a frame inside the loop
    # re-copies all previously loaded rows on each iteration.
    frames = [pd.read_csv(os.path.join(folder_path, name)) for name in csv_files]
    all_data = pd.concat(frames, ignore_index=True)

    # Drop rows with any missing value, then renumber so the index has no gaps.
    return all_data.dropna().reset_index(drop=True)
# Load all data from input_data folder
data_folder = 'input_data'
data = load_data_from_files(data_folder)

# Draw five Biology questions. The fixed random_state makes the draw repeatable
# across kernel restarts, so the MCQs generated from it can be reproduced.
biology_questions = data[data['Subject'] == 'Biology'].sample(n=5, random_state=42)


In [4]:
def generate_related_mcq(llm, question, vector_store, k=1):
    """
    Generates a new MCQ from the text retrieved by a semantic search.

    The input question is used only to retrieve related text; the prompt sent to
    the model contains the retrieved text, not the question itself.

    Parameters:
    - llm: The language model instance (e.g., ChatGroq); must provide `invoke`.
    - question: The input question used as the semantic search query.
    - vector_store: The vector store for semantic search; must provide
      `similarity_search(query, k=...)`.
    - k: Number of relevant chunks to retrieve.

    Returns:
    - The `content` of the model's response, i.e. the new MCQ as text. The prompt
      asks for a JSON object, but the text is returned unparsed and unvalidated.
    """
    # Perform semantic search to get relevant chunks
    search_results = vector_store.similarity_search(question, k=k)

    # Join the content of all retrieved chunks, one per line
    response_chunk = "\n".join([doc.page_content for doc in search_results])

    print('=' * 10)  # For debugging
    print(response_chunk)  # Print the retrieved chunk
    print('=' * 10)

    # Formulate the prompt to generate a new MCQ.
    # The JSON template is kept free of comments: JSON has no comment syntax, and a
    # model that copies a '#' comment into its answer produces text that
    # json.loads cannot parse. The guidance on "answer" is given as prose instead.
    prompt = f"""
    Based on the following information: {response_chunk}, generate a new multiple-choice question related to the topic in the following JSON format:
    
    {{
        "question": "Your new question here",
        "options": [
            "Option A",
            "Option B",
            "Option C",
            "Option D"
        ],
        "answer": "A"
    }}
    The "answer" value must be the letter of the correct option: one of A, B, C, or D.
    Respond with the JSON object only, without comments or any extra text.
    Ensure the question and options are relevant to the topic provided.
    """

    # Use the LLM to generate the new MCQ
    new_mcq_response = llm.invoke([HumanMessage(content=prompt)])

    return new_mcq_response.content

In [5]:
import os
import glob
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma  # Or use FAISS
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq
from dotenv import load_dotenv
from langchain.embeddings import HuggingFaceEmbeddings

# Read the .env file, then fetch the Groq key from the process environment
load_dotenv()
groq_api_key = os.environ.get('GROQ_API_KEY')
if groq_api_key is None or groq_api_key == "":
    raise ValueError("GROQ API key not found in .env file")

# Write the key back into the process environment under the same name
os.environ.update({"GROQ_API_KEY": groq_api_key})

# The OCR'd text files live in the "ocrbooks" folder under the current working directory
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, "ocrbooks")

# Load all text files from the folder
def load_files_from_directory(directory):
    """
    Read every '.txt' file directly inside a directory and return the contents.

    Parameters:
    - directory: Folder searched (non-recursively) for '*.txt' files.

    Returns:
    - A list with one string per file, ordered by file path. Empty when the
      directory has no matching files.
    """
    # glob.glob yields paths in arbitrary order; sorting keeps the document order
    # (and everything built from it) stable between runs.
    text_files = sorted(glob.glob(os.path.join(directory, "*.txt")))
    docs = []
    for file in text_files:
        try:
            with open(file, "r", encoding="utf-8") as f:
                text = f.read()
        except UnicodeDecodeError:
            # Fallback to ISO-8859-1: it maps every byte to a character, so this
            # read cannot raise a decode error (the text may still be mis-decoded).
            with open(file, "r", encoding="ISO-8859-1") as f:
                text = f.read()
        docs.append(text)
    return docs


# Read the raw text of every file in the ocrbooks folder
documents = load_files_from_directory(DATA_DIR)

# Sentence-transformers model used to embed the text chunks
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Recursive splitter; the overlap carries some context from one chunk into the next
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)

# Flatten the per-document chunk lists into a single list
chunks = []
for doc in documents:
    chunks += text_splitter.split_text(doc)


# Directory in which the Chroma vector store is persisted
persist_directory = "bio_embeddings"

# Reopen the persisted store when its directory is present;
# otherwise embed the chunks into a new store at that location.
if os.path.exists(persist_directory):
    vector_store = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)
else:
    vector_store = Chroma.from_texts(chunks, embedding_model, persist_directory=persist_directory)

# Chat model used by the RetrievalQA chain below and passed to generate_related_mcq
llm = ChatGroq(model="llama-3.1-70b-versatile", temperature=0.7)

# RetrievalQA chain ("stuff" chain type) over the vector store's retriever
retrieval_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(),
)

# Generate one new MCQ for every sampled biology question
new_mcqs = []
for index, row in biology_questions.iterrows():
    question = row["Question"]
    new_mcq = generate_related_mcq(llm=llm, question=question, vector_store=vector_store)
    new_mcqs += [new_mcq]

# Keep the raw model responses in a single-column DataFrame
new_mcqs_df = pd.DataFrame(data=new_mcqs, columns=["New MCQ"])



  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from tqdm.autonotebook import tqdm, trange
  vector_store = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)


These are called X - and - Y linked genes. These are also called pseudoautosomal\ngenes because their pattern of inheritance is like autosomal genes.\n\n22. Variation And Genetics eLearn.Punjab\n\nSex - Linkage in Humans\n\nHumans have many X-linked traits of which some like haemophilia and colour blindness\nare recessive while others like hypophosphatemic or vitamin D resistant rickets are\ndominant. X - linked dominant is a trait which is determined by an X linked dominant\ngene, while X - linked recessive is a trait that is determined by an X - linked recessive\ngene. Their patterns of inheritance are very different from each other.\n\nMarriage line\n\n1 2\nX\"Y (Father)\nSibship line\n\n(Mother) ¥ x\"\nMarriage line\n\n' Children\n\n1 2\n(Daughter's husband) (Carrier daughter)\nX\"y xx:\n\nGrandchildren\n1 2 3 4\n\nx x® xx\" xy xty tl\na Normal male [ | Affected male\n\nGrand Grand Grand Grand\ndaughter d t\nughter daughter aire sem @ Normal female (e) Carrier female\n\nFig 22.27
A

In [9]:
# Show the raw response strings collected from the model, one per sampled question
new_mcqs

['{\n    "question": "What type of inheritance pattern is observed in X-linked dominant and X-linked recessive genes?",\n    "options": [\n        "Autosomal dominant",\n        "Autosomal recessive",\n        "Similar to autosomal genes, but with pseudoautosomal inheritance",\n        "Mitochondrial inheritance"\n    ],\n    "answer": "C"\n}',
 '{\n    "question": "What is the primary difference between competitive and non-competitive inhibitors?",\n    "options": [\n        "Competitive inhibitors bind to the active site, while non-competitive inhibitors bind to a different site on the enzyme.",\n        "Competitive inhibitors change the enzyme\'s structure, while non-competitive inhibitors do not.",\n        "Competitive inhibitors prevent the formation of enzyme-substrate complexes, while non-competitive inhibitors allow them to form but prevent catalysis.",\n        "Competitive inhibitors are reversible, while non-competitive inhibitors are irreversible."\n    ],\n    "answer": 

In [8]:
# print() renders the stored string's newlines, so the first MCQ shows as formatted JSON
print(new_mcqs_df.iat[0, 0])

{
    "question": "What type of inheritance pattern is observed in X-linked dominant and X-linked recessive genes?",
    "options": [
        "Autosomal dominant",
        "Autosomal recessive",
        "Similar to autosomal genes, but with pseudoautosomal inheritance",
        "Mitochondrial inheritance"
    ],
    "answer": "C"
}


In [10]:
import json

# Parse each generated MCQ from its JSON string into a dictionary.
# Iterating over the list itself, rather than a fixed range(5), keeps this cell
# working when the number of generated MCQs differs from five.
for raw_mcq in new_mcqs:
    new_mcq = json.loads(raw_mcq)
    print(new_mcq)

{'question': 'What type of inheritance pattern is observed in X-linked dominant and X-linked recessive genes?', 'options': ['Autosomal dominant', 'Autosomal recessive', 'Similar to autosomal genes, but with pseudoautosomal inheritance', 'Mitochondrial inheritance'], 'answer': 'C'}
{'question': 'What is the primary difference between competitive and non-competitive inhibitors?', 'options': ['Competitive inhibitors bind to the active site, while non-competitive inhibitors bind to a different site on the enzyme.', "Competitive inhibitors change the enzyme's structure, while non-competitive inhibitors do not.", 'Competitive inhibitors prevent the formation of enzyme-substrate complexes, while non-competitive inhibitors allow them to form but prevent catalysis.', 'Competitive inhibitors are reversible, while non-competitive inhibitors are irreversible.'], 'answer': 'A'}
{'question': 'What is the main advantage of in vitro fertilization?', 'options': ['It increases the risk of physical abnor