# RAG system

## Packages

In [None]:
from huggingface_hub import notebook_login

In [None]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
%%capture
!pip install accelerate

In [None]:
%%capture
!pip install -U bitsandbytes

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Load the tokenizer and the causal-LM weights for the same checkpoint,
# "google/gemma-2-2b-it". Both calls fetch the files from the Hugging Face Hub
# on first use (see the download progress output below this cell).
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-2b-it",
    device_map="auto",  # device placement is left to the library rather than chosen here
    torch_dtype=torch.bfloat16,  # load the weights as bfloat16
)

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

## Import data

### For GitHub, run this!

In [None]:
%%capture
!pip install transformers sentence-transformers PyPDF2 faiss-cpu

In [None]:
def read_text_file(text_file_path, encoding="utf-8"):
    """Return the entire contents of a text file as one string.

    Args:
        text_file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's default encoding.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in the given encoding.
    """
    with open(text_file_path, "r", encoding=encoding) as file:
        return file.read()

# Define the path to the text file
text_file_path = "/content/extracted_text.txt"

# Read the content from the text file
cleaned_text = read_text_file(text_file_path)

In [None]:
# Chunks
def chunk_text(text, chunk_size=300):
    """Split text into consecutive chunks of whitespace-separated words.

    Words are obtained with ``str.split()``, so any run of whitespace
    (including newlines) counts as one separator and is replaced by a single
    space inside each chunk.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk. Must be positive.

    Returns:
        A list of strings; every chunk holds ``chunk_size`` words except
        possibly the last. Empty or whitespace-only text gives an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A negative step would make range() yield nothing and silently drop the
    # whole text; a zero step raises an unrelated-looking range() error.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

chunks = chunk_text(cleaned_text)

In [None]:
len(chunks) # Checking, should be 630

### For local implementation

In [None]:
'''
import PyPDF2

# Function to extract text from the PDF
def extract_text_from_pdf(pdf_file_path):
    with open(pdf_file_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            text += page.extract_text()
    return text
'''

In [None]:
'''
pdf_file_path = "/content/combinepdf.pdf"
pdf_text = extract_text_from_pdf(pdf_file_path)
'''

In [None]:
'''
import re

# Define the pattern using regular expressions
# '.*?' is a non-greedy match for any character (except newlines), between "Release notes" and "Style Manual"
pattern = r"Release notes.*?Style Manual"

# Use re.sub() to replace the matched pattern with an empty string
cleaned_text = re.sub(pattern, '', pdf_text, flags=re.DOTALL)

# print("Cleaned Text:")
# print(cleaned_text)
'''

## Sentence Embeddings for Retrieval

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load a sentence transformer model for creating embeddings
embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Generate embeddings for all text chunks
# (the result is indexed as a 2-D array below: .shape[1] is the embedding width)
chunk_embeddings = embedding_model.encode(chunks)

# Use FAISS to create an index for efficient similarity search
# The index is sized to the embedding width and compares vectors by L2 distance.
# Vectors are added in chunk order, so a position returned by index.search()
# can be used directly as an index into `chunks`.
index = faiss.IndexFlatL2(chunk_embeddings.shape[1])
index.add(np.array(chunk_embeddings))

# Function to retrieve the most relevant chunk based on the query
def retrieve_relevant_chunk(query, top_k=1):
    """Return the text chunks whose embeddings are closest to the query.

    The query is embedded with the module-level ``embedding_model`` and looked
    up in the module-level FAISS ``index``; result positions are mapped back to
    entries of the module-level ``chunks`` list.

    Args:
        query: The search text.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of at most ``top_k`` chunk strings, in the order returned by
        the index search. The list is shorter than ``top_k`` when the index
        holds fewer vectors than requested.
    """
    query_embedding = embedding_model.encode([query])
    _distances, indices = index.search(np.array(query_embedding), top_k)
    # FAISS fills missing result slots with -1 when fewer than top_k vectors
    # are available; left unfiltered, -1 would silently select chunks[-1].
    return [chunks[idx] for idx in indices[0] if idx >= 0]

## Gemma2 2B

In [None]:
# Define the RAG function that generates an answer
def generate_answer(user_input):
    """Rewrite user_input with the language model, using one retrieved chunk as context.

    A fixed rewrite instruction is prepended to ``user_input``; the combined
    text is used both as the retrieval query and as the "User:" turn of the
    prompt. The prompt is the retrieved chunk, a blank line, the "User:" turn,
    and a final "Answer:" cue.

    Args:
        user_input: The text the user wants rewritten.

    Returns:
        The text generated by the model after the prompt, with leading and
        trailing whitespace removed.
    """
    # Define a specific prompt to be added to the user's input
    # NOTE(review): "Manuel" looks like a typo for "Manual"; left untouched
    # here because the string is part of the prompt sent to the model.
    custom_prompt = ''' Please rewrite the following text according to the Australian Government writing style, according to the Style Manuel, including the tone, word choice, spelling, inclusive language:\n'''

    # Combine the custom prompt with the user writing
    query = custom_prompt + user_input

    # Retrieve the most relevant chunk; fall back to no context if nothing
    # is returned instead of failing with an IndexError.
    retrieved = retrieve_relevant_chunk(query)
    relevant_chunk = retrieved[0] if retrieved else ""

    # Combine the retrieved text with the query
    input_text = relevant_chunk + "\n\nUser: " + query + "\nAnswer:"

    # Tokenize and move the tensors to the device the model actually lives on.
    # The model is loaded with device_map="auto", so its device is read from
    # the model itself rather than from a separately computed global.
    inputs = tokenizer(
        input_text, return_tensors="pt", max_length=1024, truncation=True
    ).to(model.device)

    # Generate the answer using the model
    outputs = model.generate(**inputs, max_length=2048, num_return_sequences=1)

    # The returned sequence starts with the prompt tokens. Decode only what
    # follows them: searching the decoded text for "Answer:" picks the wrong
    # spot if the chunk or the user's text contains that marker, and finds
    # nothing if the marker was cut off by the 1024-token truncation.
    prompt_length = inputs["input_ids"].shape[1]
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

In [None]:
# Check if GPU is available, otherwise default to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# model = model.to(device)  # disabled; the model is loaded with device_map="auto" above

In [None]:
## Testing
# Example query for inclusive language
# Note: generate_answer() prepends its own rewrite instruction to whatever it
# is given, so the model receives that instruction followed by the request below.
query = '''According to the Style Manuel about the use of Inclusive language, please correct the following writing to inclusive language, provide the reference section from the provided document.
writing:
ATSI is a minority ethics group in the Australian community.
'''
answer = generate_answer(query)
print(answer)

#### JSON conversion

In [None]:
import json
from google.colab import files

In [None]:
# Step 2: Save the cleaned text as a JSON file
# cleaned_text is a str (the return value of read_text_file above), not a
# dictionary, so the file ends up holding a single JSON string.
json_filename = "StyleManuel.json"
with open(json_filename, 'w') as json_file:
    json.dump(cleaned_text, json_file, indent=4)

# Step 3: Download the JSON file
# (files comes from google.colab, so this step only works inside Colab)
files.download(json_filename)

# UI Implementation

## Gradio

In [None]:
%%capture
!pip install gradio

In [None]:
import gradio as gr

# Define the Gradio interface function
def process_query(user_query):
    """Pass user_query through generate_answer and return the resulting text."""
    reply = generate_answer(user_query)
    return reply


In [None]:
# Define the Gradio interface function
def process_user_writing(user_writing):
    """Return the rewritten version of user_writing.

    The rewrite instruction is not added here; generate_answer prepends it.
    """
    rewritten = generate_answer(user_writing)
    return rewritten

In [None]:
# Create the Gradio interface
# Text in, text out: each submission is handed to process_user_writing and
# whatever it returns is shown as the result.
iface = gr.Interface(
    fn=process_user_writing,  # Function to process user writing
    inputs="text",            # User inputs their writing in plain text
    outputs="text",           # The system outputs the rewritten text
    title="Australian Government Style Rewriter",
    description="Enter your text in plain English, and the system will rewrite it according to the Australian Government writing style."
)

# Launch the Gradio interface
iface.launch(debug=False)
