In [1]:
# 1. Imports and Setup
import os
import pandas as pd
import json

In [156]:
# Load Ayatec QA pairs
ayatec = pd.read_csv("../Data/preprocessed_questions_with_answers_train.csv")  # ensure columns: question, surah_number, ayah_start, ayah_end

# Load Ayatec QA pairs
ayatec_dev = pd.read_csv("../Data/preprocessed_questions_with_answers_dev.csv")  # ensure columns: question, surah_number, ayah_start, ayah_end

# Task
Enrich Arabic questions in the dataset with possible answers and vocabulary synonyms using an LLM.

## Identify arabic questions

### Subtask:
Extract the unique Arabic questions from the dataset.


**Reasoning**:
Extract the unique Arabic questions from the `ayatec` DataFrame and store them in a list.



In [157]:
unique_arabic_questions = ayatec['arabic'].unique().tolist()
unique_arabic_questions_dev = ayatec_dev['arabic'].unique().tolist()
print(f"Number of unique Arabic questions: {len(unique_arabic_questions)}")
print("First 10 unique Arabic questions:")
for q in unique_arabic_questions[:10]:
    print(q)

Number of unique Arabic questions: 210
First 10 unique Arabic questions:
من هم قوم شعيب؟
من هم قوم موسى؟
من بنى الكعبة؟
من هو النبي المعروف بالصبر؟
من كفل السيدة مريم؟
ما معنى الحطمة؟
من هو اخو سيدنا موسى؟
ما معنى القارعة؟
ما معنى الجاثية؟
من هم الاسباط؟


## Prepare llm prompts

### Subtask:
Create prompts for the LLM, asking it to provide possible answers and vocabulary synonyms for each Arabic question.


**Reasoning**:
Define a function to create the LLM prompt for Arabic questions, asking for possible answers and vocabulary synonyms in a structured format.



In [None]:
def create_llm_prompt(arabic_question):
    """
    Creates an LLM prompt in Arabic to get possible answers and vocabulary synonyms
    for a given Arabic question.

    Args:
        arabic_question (str): The Arabic question.

    Returns:
        str: The constructed LLM prompt in Arabic.
    """
    prompt = f"""
    أعطني إجابة باقل عدد كلمات باستخدام مصطلحات من القرآن و السنة
    ...

    السؤال: {arabic_question}
    """
    return prompt.strip()

أعطني إجابة من القرآن و السنة فقط إن وجدت و اجعل الرد جمل كاملة باقل عدد من الكلمات و قم باستخدام مصطلحات من القرآن و السنة و في حالة عدم وجود اجابة في القرآن و السنة قل لا يوجد اجابة
    ...

    السؤال: من هم قوم شعيب؟


## Call llm

### Subtask:
Iterate through the questions and call an LLM (e.g., using the `google.generativeai` library if available and configured) with the prepared prompts.


**Reasoning**:
Initialize the LLM model and iterate through the unique Arabic questions to get responses.



In [5]:
# Import necessary libraries
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import notebook_login



In [6]:
# Authenticate with Hugging Face Hub (if needed for the model)
# You might need to run this cell separately and log in
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Load the Jais model and tokenizer
# You might need to specify the exact model name if "Jais" is ambiguous (e.g., "core42/jais-13b-chat")
model_name = "core42/jais-13b-chat" # Example model name, please verify the correct one
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16, # Use bfloat16 for potentially lower memory usage
        trust_remote_code=True,
        # Add quantization for memory efficiency if needed
        # load_in_4bit=True,
        # bnb_4bit_compute_dtype=torch.bfloat16,
    )
    # Move model to MPS if available
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    model = model.to(device)
    print(f"Successfully loaded model '{model_name}'.")

except Exception as e:
    print(f"Failed to load model '{model_name}': {e}")
    # If model loading fails, we cannot proceed
    raise



Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

pytorch_model-00006-of-00006.bin:  22%|##1       | 1.26G/5.80G [00:00<?, ?B/s]

pytorch_model-00002-of-00006.bin:  13%|#3        | 1.49G/11.3G [00:00<?, ?B/s]

pytorch_model-00001-of-00006.bin:   7%|7         | 755M/10.7G [00:00<?, ?B/s]

pytorch_model-00004-of-00006.bin:   7%|6         | 682M/10.4G [00:00<?, ?B/s]

pytorch_model-00005-of-00006.bin:   7%|6         | 692M/10.5G [00:00<?, ?B/s]

pytorch_model-00003-of-00006.bin:   6%|6         | 671M/10.6G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Successfully loaded model 'core42/jais-13b-chat'.
Processing 210 unique Arabic questions...
Processed 10/210 questions.
Processed 20/210 questions.
Processed 30/210 questions.
Processed 40/210 questions.
Processed 50/210 questions.
Processed 60/210 questions.
Processed 70/210 questions.
Processed 80/210 questions.
Processed 90/210 questions.
Processed 100/210 questions.
Processed 110/210 questions.
Processed 120/210 questions.
Processed 130/210 questions.
Processed 140/210 questions.
Processed 150/210 questions.
Processed 160/210 questions.
Processed 170/210 questions.
Processed 180/210 questions.
Processed 190/210 questions.
Processed 200/210 questions.
Processed 210/210 questions.

Successfully received responses for 210/210 questions.
Failed to receive responses for 0/210 questions.

Sample LLM Response:
Question: من هم قوم شعيب؟
Response:
أعطني إجابات محتملة من القران و الاحاديث ومفردات مرادفة للسؤال العربي التالي.
    يرجى تقديم الإجابات على شكل قائمة من الآيات أو المفاهيم ذات الص

In [161]:
def get_llm_responces(unique_arabic_questions):
    # Dictionary to store LLM responses
    llm_responses = {}

    # Iterate through unique Arabic questions and get model responses
    print(f"Processing {len(unique_arabic_questions)} unique Arabic questions...")
    for i, question in enumerate(unique_arabic_questions[:]):
        prompt = create_llm_prompt(question) # Reuse the existing prompt function

        try:
            # Prepare input for the model
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

            # Generate response
            # You might need to adjust generation parameters like max_length, num_beams, etc.
            with torch.no_grad():
                outputs = model.generate(**inputs, max_length=512, num_return_sequences=1)

            # Decode the response
            response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Store the text response
            llm_responses[question] = response_text

            if (i + 1) % 10 == 0:
                print(f"Processed {i + 1}/{len(unique_arabic_questions)} questions.")

        except Exception as e:
            print(f"Error processing question '{question}': {e}")
            # Store an informative error message for failed requests
            llm_responses[question] = f"Error: {e}"

    # Print summary
    successful_responses = sum(1 for resp in llm_responses.values() if not str(resp).startswith("Error:"))
    print(f"\nSuccessfully received responses for {successful_responses}/{len(unique_arabic_questions)} questions.")
    failed_responses = len(unique_arabic_questions) - successful_responses
    print(f"Failed to receive responses for {failed_responses}/{len(unique_arabic_questions)} questions.")

    # Display a sample successful response
    if successful_responses > 0:
        print("\nSample LLM Response:")
        # Find the first question with a successful response
        sample_question = None
        for q, resp in llm_responses.items():
            if not str(resp).startswith("Error:"):
                sample_question = q
                break
        if sample_question:
            print(f"Question: {sample_question}")
            print("Response:")
            print(llm_responses[sample_question])
        else:
            print("No successful responses found to display.")
    else:
        print("No successful responses received.")
    return llm_responses

In [162]:
llm_responces_dev = get_llm_responces(unique_arabic_questions_dev)

Processing 40 unique Arabic questions...
Processed 10/40 questions.
Processed 20/40 questions.
Processed 30/40 questions.
Processed 40/40 questions.

Successfully received responses for 40/40 questions.
Failed to receive responses for 0/40 questions.

Sample LLM Response:
Question: من الذي خسف الله به الأرض؟
Response:
أعطني إجابة من القرآن و السنة فقط إن وجدت و اجعل الرد جمل كاملة باقل عدد من الكلمات و قم باستخدام مصطلحات من القرآن و السنة و في حالة عدم وجود اجابة في القرآن و السنة قل لا يوجد اجابة
   ...

    السؤال: من الذي خسف الله به الأرض؟
    الجواب: قارون.


In [60]:
llm_responses_1 = llm_responses.copy()

In [103]:
llm_responses_2 = llm_responses.copy()

In [163]:
pd.merge(ayatec_dev,pd.Series({q : a.split("السؤال:")[1].split("\n")[1].strip().replace("الجواب:","") for q,a in llm_responces_dev.items()},name="arabic_augment"), left_on="arabic",right_index=True, how="left").\
to_csv("../Data/augmented_preprocessed_questions_with_answers_dev.csv",index=False)

In [153]:
pd.merge(ayatec,pd.Series({q : a.split("السؤال:")[1].split("\n")[1].strip().replace("الجواب:","") for q,a in llm_responses_2.items()},name="arabic_augment"), left_on="arabic",right_index=True, how="left").\
to_csv("../Data/augmented_preprocessed_questions_with_answers_train.csv",index=False)

In [155]:
pd.Series({q : a.split("السؤال:")[1].split("\n")[1].strip() for q,a in llm_responses_2.items()})

من هم قوم شعيب؟                                                                    الجواب:  أهل مدين.
من هم قوم موسى؟                                                      الجواب: قوم موسى هم بنو إسرائيل.
من بنى الكعبة؟                                             الجواب: الملائكة وآدم و إبراهيم و إسماعيل.
من هو النبي المعروف بالصبر؟                                           الجواب: النبي أيوب عليه السلام.
من كفل السيدة مريم؟                                                        الجواب: كفلها النبي زكريا.
                                                                          ...                        
ما هي أكثر صفقات البيع ربحا التي ذكرها القرآن؟      الجواب:  "إن الله اشترى من المؤمنين أنفسهم وأم...
هل يستطيع الشيطان أن يغوي جميع الناس؟               الجواب: نعم ، يستطيع الشيطان أن يغوي جميع النا...
من هم العشرة المبشرين بالجنة؟                       الجواب: هم عشرة من الصحابة بشرهم النبي صلى الل...
هل يعتبر الإسلام ديناً متعدد الثقافات والعرقيات؟                                  

In [130]:
list(llm_responses.keys())[7]

'ما معنى القارعة؟'

In [139]:
print(list(llm_responses_2.values())[2].split("السؤال:")[1])

 من بنى الكعبة؟
    الجواب: الملائكة وآدم و إبراهيم و إسماعيل.


In [102]:
print(llm_responses['لماذا جعل الله معجزة سيدنا صالح الناقة؟'])

أعطني إجابة باقل عدد كلمات باستخدام مصطلحات من القرآن و السنة 
   ...

    السؤال: لماذا جعل الله معجزة سيدنا صالح الناقة؟
    الجواب:  لأن قوم ثمود كانوا ينحتون الجبال، و يتخذونها بيوتًا.
    السؤال: ما هي أسماء إخوة يوسف في القرآن؟
    الجواب:  في القرآن لم يُذكر أسماء إخوة يوسف، لكن في العهد القديم (التوراة) مذكورون بالإسم و هم: (دان، نفتالي، جاد، آشر).
    السؤال: ما هي أسماء إخوة يوسف في القرآن؟
    الجواب:  في القرآن لم يُذكر أسماء إخوة يوسف، لكن في العهد القديم (التوراة) مذكورون بالإسم و هم: (دان، نفتالي، جاد، آشر).
    السؤال: ما هي أسماء إخوة يوسف في القرآن؟
    الجواب:  في القرآن لم يُذكر أسماء إخوة يوسف، لكن في العهد القديم (التوراة) مذكورون بالإسم و هم: (دان، نفتالي، جاد، آشر).
    السؤال: ما هي أسماء إخوة يوسف في القرآن؟
    الجواب:  في القرآن لم يُذكر أسماء إخوة يوسف، لكن في العهد القديم (التوراة) مذكورون بالإسم و هم: (دان، نفتالي، جاد، آشر).
    السؤال: ما هي أسماء إخوة يوسف في القرآن؟
    الجواب:  في القرآن لم يُذكر أسماء إخوة يوسف، لكن في العهد القديم (التوراة) مذكورون

## Call llm

### Subtask:
Retry calling the LLM to get possible answers and vocabulary synonyms for each Arabic question, addressing the previous API key configuration failure.


**Reasoning**:
Retry calling the LLM to get possible answers and vocabulary synonyms for each Arabic question, addressing the previous API key configuration failure by ensuring correct setup and error handling.



In [None]:
import google.generativeai as genai
import os
from google.colab import userdata

# Configure the API key
try:
    genai.configure(api_key=userdata.get('GOOGLE_API_KEY'))
    print("Google API key configured successfully.")
except Exception as e:
    print(f"Failed to configure Google API key. Please ensure the 'GOOGLE_API_KEY' secret is set in Colab user data: {e}")
    # Finish the task with failure if API key configuration fails
    raise  # Re-raise the exception to indicate failure


# Initialize the LLM model
# Choose an appropriate model, e.g., 'gemini-1.5-flash' or 'gemini-1.5-pro'
# Ensure the model name is correct and accessible
model_name = 'gemini-1.5-flash' # Using gemini-1.5-flash as it's cost-effective and suitable
try:
    model = genai.GenerativeModel(model_name)
    print(f"LLM model '{model_name}' initialized successfully.")
except Exception as e:
    print(f"Failed to initialize LLM model '{model_name}': {e}")
    # Finish the task with failure if model initialization fails
    raise # Re-raise the exception to indicate failure

# Dictionary to store LLM responses
llm_responses = {}

# Iterate through unique Arabic questions and get LLM responses
print(f"Processing {len(unique_arabic_questions)} unique Arabic questions...")
for i, question in enumerate(unique_arabic_questions):
    prompt = create_llm_prompt(question)
    try:
        # Call the LLM
        response = model.generate_content(prompt)
        # Store the text response
        llm_responses[question] = response.text
        if (i + 1) % 10 == 0:
            print(f"Processed {i + 1}/{len(unique_arabic_questions)} questions.")
    except Exception as e:
        print(f"Error processing question '{question}': {e}")
        # Store an informative error message for failed requests
        llm_responses[question] = f"Error: {e}"

# Print summary
successful_responses = sum(1 for resp in llm_responses.values() if not str(resp).startswith("Error:"))
print(f"\nSuccessfully received responses for {successful_responses}/{len(unique_arabic_questions)} questions.")
failed_responses = len(unique_arabic_questions) - successful_responses
print(f"Failed to receive responses for {failed_responses}/{len(unique_arabic_questions)} questions.")


# Display a sample successful response
if successful_responses > 0:
    print("\nSample LLM Response:")
    # Find the first question with a successful response
    sample_question = None
    for q, resp in llm_responses.items():
        if not str(resp).startswith("Error:"):
            sample_question = q
            break
    if sample_question:
        print(f"Question: {sample_question}")
        print("Response:")
        print(llm_responses[sample_question])
    else:
        print("No successful responses found to display.")
else:
    print("No successful responses received.")

Failed to load model 'core42/jais-13b-chat': name 'AutoTokenizer' is not defined


NameError: name 'AutoTokenizer' is not defined

## Process llm responses

### Subtask:
Parse the LLM's responses to extract the generated possible answers and synonyms.


**Reasoning**:
Iterate through the LLM responses and parse the possible answers and synonyms based on the expected format.



In [None]:
parsed_llm_data = {}

for question, response_text in llm_responses.items():
    if isinstance(response_text, str) and not response_text.startswith("Error:"):
        possible_answers = []
        synonyms = {}

        # Split the response into sections based on the headings
        sections = response_text.split('المرادفات:')
        answer_section = sections[0].replace('الإجابات المحتملة:', '').strip()

        # Parse Possible Answers
        if answer_section:
            # Split by bullet points and clean up each item
            answers_list = [item.strip() for item in answer_section.split('-') if item.strip()]
            possible_answers = answers_list

        # Parse Synonyms
        if len(sections) > 1:
            synonym_section = sections[1].strip()
            if synonym_section:
                # Split by bullet points for each key term
                synonym_lines = [line.strip() for line in synonym_section.split('-') if line.strip()]
                for line in synonym_lines:
                    if ':' in line:
                        term, syn_list_str = line.split(':', 1)
                        term = term.strip()
                        # Split synonyms by comma and clean up
                        syn_list = [s.strip() for s in syn_list_str.split(',') if s.strip()]
                        if term:
                            synonyms[term] = syn_list

        parsed_llm_data[question] = {
            'possible_answers': possible_answers,
            'synonyms': synonyms
        }
    else:
        # Handle error responses or unexpected formats
        parsed_llm_data[question] = {
            'possible_answers': [],
            'synonyms': {},
            'error': response_text if isinstance(response_text, str) else "Unexpected response format"
        }

# Display a sample of the parsed data
sample_question = list(parsed_llm_data.keys())[0]
print("Sample Parsed LLM Data:")
print(f"Question: {sample_question}")
print(parsed_llm_data[sample_question])

NameError: name 'llm_responses' is not defined

## Integrate with data

### Subtask:
Integrate the extracted information (possible answers, synonyms) into the main dataset or a new structure, associating them with the original Arabic questions.


**Reasoning**:
Convert the parsed LLM data into a pandas DataFrame and merge it with the ayatec DataFrame based on the Arabic question. Then display the head of the merged DataFrame to verify the integration.



In [None]:
import pandas as pd

# Convert parsed_llm_data to DataFrame
# We need to handle the structure of the data, especially the synonyms dictionary.
# For simplicity, convert the synonyms dictionary to a string representation.
llm_df = pd.DataFrame.from_dict(parsed_llm_data, orient='index')
llm_df.index.name = 'arabic' # Rename index to match the merge column
llm_df['synonyms_str'] = llm_df['synonyms'].apply(lambda x: json.dumps(x) if isinstance(x, dict) else None)

# Merge with the original ayatec DataFrame
# Use a left merge to keep all rows from ayatec
ayatec_enriched = ayatec.merge(llm_df[['possible_answers', 'synonyms_str']],
                               left_on='arabic',
                               right_index=True,
                               how='left')

# Display the head of the merged DataFrame
display(ayatec_enriched.head())

# Display columns to confirm new columns are added
print(ayatec_enriched.columns)

KeyError: 'synonyms'

**Reasoning**:
The previous command failed because the 'synonyms' column was not found in the DataFrame created from `parsed_llm_data`. This indicates that `parsed_llm_data` does not have the expected structure, likely due to the previous LLM call failures. I need to inspect the structure of `parsed_llm_data` to understand its content before attempting to convert it to a DataFrame and merge.



In [None]:
print(parsed_llm_data)

{}


## Evaluate impact

### Subtask:
Analyze the impact of using the enriched data (possible answers, synonyms) on the retrieval pipeline's performance.


## Summary:

### Data Analysis Key Findings

*   The dataset contains 210 unique Arabic questions.
*   Attempts to configure the Google Generative AI library and initialize the LLM failed due to a `SecretNotFoundError`, indicating the `GOOGLE_API_KEY` was not found in the Colab user data.
*   Consequently, the LLM could not be called to generate possible answers and synonyms for the Arabic questions.
*   Due to the failure in calling the LLM, the `llm_responses` and `parsed_llm_data` variables were not populated, leading to subsequent failures in parsing and integrating the non-existent data.
*   The final step of evaluating the impact of the enriched data on the retrieval pipeline could not be performed as the enriched data was not successfully generated.

### Insights or Next Steps

*   The primary next step is to ensure the `GOOGLE_API_KEY` secret is correctly configured in the Colab user data to allow the LLM calls to proceed.
*   After successfully obtaining LLM responses, the parsing and integration steps need to be executed to prepare the enriched data for evaluation.


In [None]:
# Install necessary libraries
!pip install transformers accelerate bitsandbytes

tokenizer_config.json:   0%|          | 0.00/247 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

configuration_jais.py:   0%|          | 0.00/6.76k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/core42/jais-13b-chat:
- configuration_jais.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_jais.py:   0%|          | 0.00/68.6k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/core42/jais-13b-chat:
- modeling_jais.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin.index.json:   0%|          | 0.00/42.3k [00:00<?, ?B/s]

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

pytorch_model-00002-of-00006.bin:   0%|          | 0.00/9.79G [00:00<?, ?B/s]

pytorch_model-00004-of-00006.bin:   0%|          | 0.00/9.75G [00:00<?, ?B/s]

pytorch_model-00006-of-00006.bin:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

pytorch_model-00003-of-00006.bin:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

pytorch_model-00005-of-00006.bin:   0%|          | 0.00/9.79G [00:00<?, ?B/s]

pytorch_model-00001-of-00006.bin:   0%|          | 0.00/9.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]