In [None]:
!pip install llama-index
!pip install llama-index-embeddings-huggingface
!pip install peft
!pip install auto-gptq
!pip install optimum
!pip install bitsandbytes
# if not running on Colab ensure transformers is installed too



In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core import VectorStoreIndex, Document

In [None]:
# Global LlamaIndex configuration used by every later cell.
#
# Embedding model: any embedding model on the Hugging Face hub can be used.
# Alternative models (left commented out):
# Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
# Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5")
Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

# The LLM is loaded separately further down, so LlamaIndex's own LLM is
# disabled here (LlamaIndex reports that it falls back to MockLLM).
Settings.llm = None # we won't use LlamaIndex to set up LLM
# Chunking parameters used when documents are split for indexing.
# NOTE(review): presumably measured in tokens — confirm against the LlamaIndex docs.
Settings.chunk_size = 256
Settings.chunk_overlap = 25

LLM is explicitly disabled. Using MockLLM.


In [None]:
# Read the source books from disk; later cells treat `documents` as an
# indexable sequence of objects exposing a `.text` attribute.
# NOTE(review): "/content/books" is an absolute, Colab-style path — adjust it
# when running the notebook anywhere else.
documents = SimpleDirectoryReader("/content/books").load_data()

In [None]:
# Sanity check: show the type of the first loaded item.
type(documents[0])

In [None]:
import re

def clean_text_content(raw_text: str) -> list:
    """
    Cleans one page of extracted text and splits it into typed segments.

    The page is scanned for metadata, stripped of extraction noise, and then
    divided into bullet items and narrative sentences.

    Pipeline:
      1. Metadata (ISBN, author) is detected on the *raw* text, because the
         author heuristic relies on line breaks that whitespace collapsing
         would destroy.
      2. The ISBN span is removed so its digits and hyphens are not later
         mistaken for page numbers or bullets.
      3. Curly apostrophes are normalised to ``'`` so contractions such as
         "We’re" survive the special-character filter.
      4. Noise is removed: Arabic script, ``*.indb`` print footers, URLs,
         clock times (with an optional AM/PM suffix), ``d/m/y`` dates,
         standalone 1-2 digit numbers and remaining special characters.
         Times and dates are removed *before* standalone numbers; otherwise
         their digits would be stripped first and only ``:`` / ``//`` residue
         would remain. ``?`` and the bullet glyphs are kept because later
         steps depend on them.
      5. Runs of a repeated word ("the the the") are collapsed to one.
      6. The text is split at bullet markers: ``•``, ``\\x81``, or a hyphen
         that starts a token. Hyphens inside words ("well-known") are not
         bullets. Text before the first marker is the narrative; each piece
         after a marker is a list item.
      7. The narrative is split into sentences after ``.``, ``!`` or ``?``.
      8. Leading stray punctuation is stripped from every segment and
         segments that end up empty are dropped.

    Args:
        raw_text (str): Raw text input from a page.
    Returns:
        list: Dicts of the form ``{"type": ..., "content": ...}`` where type
        is ``"metadata"``, ``"list_item"`` or ``"narrative"``, in that order.
        Empty or whitespace-only input yields an empty list.
    """
    segments = []
    if not raw_text:
        return segments

    isbn_pattern = r'ISBN[\s:-]*([\d-]+)'

    # Step 1: metadata, detected while line breaks are still present.
    isbn_match = re.search(isbn_pattern, raw_text, re.IGNORECASE)
    if isbn_match:
        segments.append({"type": "metadata", "content": f"ISBN: {isbn_match.group(1)}"})

    # Heuristic: the text following "Center" up to the end of that line.
    author_match = re.search(r'Center[\s\n]*(.*?)\n', raw_text, re.IGNORECASE)
    author = author_match.group(1).strip() if author_match else ""
    if author:
        segments.append({"type": "metadata", "content": f"Author: {author}"})

    # Step 2: drop the ISBN span; it is already captured as metadata.
    text = re.sub(isbn_pattern, ' ', raw_text, flags=re.IGNORECASE)

    # Step 3: normalise curly apostrophes, then collapse whitespace.
    text = text.replace('\u2018', "'").replace('\u2019', "'")
    text = re.sub(r'\s+', ' ', text).strip()

    # Step 4: remove noise (order matters, see docstring).
    text = re.sub(r'[\u0600-\u06FF]+', '', text)                  # Arabic script
    text = re.sub(r"WC\d?[_\w\d]+\.indb", "", text)               # print-footer file names
    text = re.sub(r"www\.[^\s]+|http[^\s]+", "", text)            # URLs
    # Clock times; AM/PM is consumed only as a suffix of a time, never from
    # inside ordinary words.
    text = re.sub(r"\b\d{1,2}:\d{2}(?:\s*(?:AM|PM)(?![A-Za-z]))?", "", text)
    text = re.sub(r"\b\d{1,2}/\d{1,2}/\d{2,4}\b", "", text)       # dates such as 5/4/22
    text = re.sub(r"\b\d{1,2}\b", "", text)                       # standalone numbers like 1, 16
    text = re.sub(r'[^\w\s.,:!?/\'\x81•-]', '', text)             # other special characters
    text = re.sub(r'\s+', ' ', text).strip()                      # tidy gaps left by removals

    # Step 5: collapse runs of a repeated word.
    text = re.sub(r'\b(\w+)(?:\s+\1\b)+', r'\1', text)

    def tidy(fragment: str) -> str:
        # Strip leading stray punctuation/whitespace; may return "".
        return re.sub(r"^[\s.,!?'-]+", '', fragment).strip()

    # Step 6: split at bullet markers; the first piece is the narrative.
    narrative, *bullets = re.split(r'[\x81•]|(?<!\S)-', text)
    for bullet in bullets:
        item = tidy(bullet)
        if item:
            segments.append({"type": "list_item", "content": item})

    # Steps 7-8: sentence-split the narrative and drop empty leftovers.
    narrative = narrative.strip()
    if narrative:
        for sentence in re.split(r'(?<=[.!?])\s+', narrative):
            sentence = tidy(sentence)
            if sentence:
                segments.append({"type": "narrative", "content": sentence})

    return segments


def process_documents(documents: list) -> list:
    """
    Build a cleaned copy of every Document in ``documents``.

    Each document's ``text`` is passed through ``clean_text_content``; the
    contents of the resulting segments are joined with newlines and wrapped
    in a new Document. The output has one entry per input, in the same order.

    Args:
        documents (list): A list of Document objects.

    Returns:
        list: A list of cleaned Document objects ready for indexing.
    """
    accepted_types = ('metadata', 'list_item', 'narrative')

    def rebuild(source_doc):
        # Keep only the segment kinds that belong in the indexed text.
        kept_contents = [
            part['content']
            for part in clean_text_content(source_doc.text)
            if part['type'] in accepted_types
        ]
        return Document(text="\n".join(kept_contents))

    return [rebuild(source_doc) for source_doc in documents]

In [None]:
# Clean and process the loaded documents.
# `cleaned_documents` holds one cleaned Document per entry of `documents`,
# in the same order, so the two lists can be compared index by index.
cleaned_documents = process_documents(documents)

In [None]:
# Spot-check one page: print the raw text, a separator line, then the cleaned text.
# NOTE(review): 300 is a hard-coded sample position; it raises IndexError
# if fewer than 301 documents were loaded.
idx = 300
print(documents[idx].text)
print("*"*20, "\n", cleaned_documents[idx].text)



78
1
Can ostriches run?  YES / NO 
Can they fly? YES / NO
Can kangaroos fly? YES / NO 
Can they jump? YES / NO
Can polar bears swim? YES / NO 
Can they speak? YES / NO
Ostriches can run but they can‘t fly!
Kangaroos _____________ but ____________________
Polar bears ____________ but ____________________
Rhythms and Listening
Listen, chant and match.
Which animal is it?Which animal is it?
Its jaws are huge.Its jaws are huge.
They crush and chew.They crush and chew.  
It can walk and swimIt can walk and swim
It can hunt for food!It can hunt for food!
Their skin is green,Their skin is green,
Their eyes pop out. Their eyes pop out. 
They can jump and dive,They can jump and dive,
They are loud and fun.They are loud and fun.
CD2 49
CD2 50
2 Listen, point, and circle YES or NO. Practice in pairs.
Birds of a feather flock together.
Advice TimeAdvice Time
WC2_2022_COMBO.indb   78WC2_2022_COMBO.indb   78 5/4/22   11:36 PM5/4/22   11:36 PM
رابط الدر
www.ien.edu.sa
******************** 
 Can ostri

In [None]:
# Scratch check of an anchored punctuation regex: `^` anchors to the start of
# the string, so only the single leading "." is removed and the trailing
# space is returned.
re.sub(r'^[.,!?\'-]', '', ". ")


' '

In [None]:
# Create the index with the cleaned documents.
# NOTE(review): the "Some nodes are missing content" message printed by this
# cell presumably comes from pages whose cleaned text is empty — confirm.
index = VectorStoreIndex.from_documents(cleaned_documents)

Some nodes are missing content, skipping them...


In [None]:
# Number of chunks to retrieve per query; also used later when the retrieved
# context string is assembled.
top_k = 3

# Configure the retriever over the index built above.
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=top_k,
)

In [None]:
# Assemble the query engine: retrieved nodes pass through a similarity
# post-processor configured with a cutoff of 0.3.
# NOTE(review): presumably this discards nodes scoring below the cutoff, so a
# query can return fewer than top_k nodes — confirm against the LlamaIndex docs.
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.3)],
)

In [None]:
query = "how the family was travelling to Aqaba?"
response = query_engine.query(query)

# Reformat the response into one context string for the prompt.
# Fewer than top_k nodes may come back, so slice the list rather than
# indexing up to top_k. The slice replaces a bare `except`, which would also
# have hidden unrelated errors.
context = "Context:\n" + "".join(
    node.text + "\n\n" for node in response.source_nodes[:top_k]
)

print(context)

Context:
The family is travelling to Aqaba by car / plane.
There is / isnt a waterfall at Hammamat Main.
The water is warm / cold.
Kareem and Samira are swimming in the water / reading about the waterfall.
Kareem and Samira are looking at / sending a photo.
Are you playing the piano Are you writing an email No, Im not.
Yes, I am!
Are you playing the piano  Are you writing an email  No, Im not.
Yes, I am!
Read and choose  Dear Laila and Ali, Were going to Aqaba.
Dad is driving.
We are driving through the mountains.
We are visiting Hammamat Main.
There is a very big waterfall.
The water is very warm.
The sky is blue.
We are swimming and playing in the water in Hammamat Main.
Here is a photo.
Samira and Kareem The family is travelling to Aqaba by car.

Listen, order and say  Ask and answer  b a c d  drove  bought  went  pray  found  had  sent  wrote What did you do at the weekend I went to the mosque with my father.
We prayed.
They went to Madaba.

2020 Dear Grandma and Grandpa, Were in A

In [None]:
# Load the fine-tuned model from the hub: first the base model, then the
# PEFT adapter that wraps it.
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# Base model checkpoint; device_map="auto" leaves device placement to the library.
model_name = "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             device_map="auto",
                                             trust_remote_code=False,
                                             revision="main")

# Wrap the base model with the fine-tuned adapter weights.
# NOTE(review): `config` is not referenced by any cell visible in this notebook.
config = PeftConfig.from_pretrained("shawhin/shawgpt-ft")
model = PeftModel.from_pretrained(model, "shawhin/shawgpt-ft")

# Load the tokenizer that matches the base model.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

  @custom_fwd
  @custom_bwd
  @custom_fwd(cast_inputs=torch.float16)


model.safetensors:   0%|          | 0.00/4.16G [00:00<?, ?B/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Some weights of the model checkpoint at TheBloke/Mistral-7B-Instruct-v0.2-GPTQ were not used when initializing MistralForCausalLM: ['model.layers.0.mlp.down_proj.bias', 'model.layers.0.mlp.gate_proj.bias', 'model.layers.0.mlp.up_proj.bias', 'model.layers.0.self_attn.k_proj.bias', 'model.layers.0.self_attn.o_proj.bias', 'model.layers.0.self_attn.q_proj.bias', 'model.layers.0.self_attn.v_proj.bias', 'model.layers.1.mlp.down_proj.bias', 'model.layers.1.mlp.gate_proj.bias', 'model.layers.1.mlp.up_proj.bias', 'model.layers.1.self_attn.k_proj.bias', 'model.layers.1.self_attn.o_proj.bias', 'model.layers.1.self_attn.q_proj.bias', 'model.layers.1.self_attn.v_proj.bias', 'model.layers.10.mlp.down_proj.bias', 'model.layers.10.mlp.gate_proj.bias', 'model.layers.10.mlp.up_proj.bias', 'model.layers.10.self_attn.k_proj.bias', 'model.layers.10.self_attn.o_proj.bias', 'model.layers.10.self_attn.q_pr

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/8.40M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [None]:
# prompt (with context)
def prompt_template_w_context(context, comment):
    """Build the instruction prompt sent to the model.

    The text is wrapped in a matching ``[INST] ... [/INST]`` pair; previously
    the closing tag had no opening tag.

    Args:
        context: Retrieved passages, already formatted as a "Context:" block.
        comment: The student's question.
    Returns:
        str: The full prompt string.
    """
    return f"""[INST] ClasseraGPT, a chatbot that answers students' questions based on their grade and the \
relevant books.communicates in clear, easy language, answer is short and brief.

{context}
Please respond to the following comment. Use the context above if it is helpful.

{comment}
[/INST]
"""

comment = "what was Samira told when she entered the library?"

# NOTE(review): `context` was retrieved earlier for a different question than
# `comment`; re-run retrieval with this comment if matching context is intended.
prompt = prompt_template_w_context(context, comment)

# Move the whole encoding (input_ids and attention_mask) to the model's
# device instead of hard-coding "cuda".
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# Pass the attention mask and an explicit pad token id; generate() warns that
# results may be unreliable without them.
outputs = model.generate(
    **inputs,
    max_new_tokens=280,
    pad_token_id=tokenizer.eos_token_id,
)

print(tokenizer.batch_decode(outputs)[0])


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> ClasseraGPT, a chatbot that answers students' questions based on their grade and the relevant books.communicates in clear, easy language, answer is short and brief.

Context:
The family is travelling to Aqaba by car / plane.
There is / isnt a waterfall at Hammamat Main.
The water is warm / cold.
Kareem and Samira are swimming in the water / reading about the waterfall.
Kareem and Samira are looking at / sending a photo.
Are you playing the piano Are you writing an email No, Im not.
Yes, I am!
Are you playing the piano  Are you writing an email  No, Im not.
Yes, I am!
Read and choose  Dear Laila and Ali, Were going to Aqaba.
Dad is driving.
We are driving through the mountains.
We are visiting Hammamat Main.
There is a very big waterfall.
The water is very warm.
The sky is blue.
We are swimming and playing in the water in Hammamat Main.
Here is a photo.
Samira and Kareem The family is travelling to Aqaba by car.

Listen, order and say  Ask and answer  b a c d  drove  bought  went  p