# RAG

In [None]:
import pandas as pd
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import DataFrameLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_core.prompts import PromptTemplate
from langchain_community.llms import HuggingFaceEndpoint
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain

from sklearn.metrics import accuracy_score

load_dotenv()

True

In [2]:
# Read the processed language identification dataset (columns: text, language).
# NOTE(review): the rendered frame shows an "Unnamed: 0" column, so the CSV was
# presumably saved together with its index - confirm before relying on it.
open_lid_data = pd.read_csv("data/open_lid_10_samples.csv")
# Bare expression so the notebook renders the frame as the cell output.
open_lid_data

Unnamed: 0,text,language
0,ary soraty eo amin'ny tolam-baravaran'ny trano...,Plateau Malgasy
1,Rehefa namatotra azy tamin'ny kofehy hoditra i...,Plateau Malgasy
2,Io menaka manitra io dia afaka namidy mihoatry...,Plateau Malgasy
3,"Ireo anjely, ireo manam-pahefana, ary ny hery ...",Plateau Malgasy
4,Nefa Estera tsy mbola nilaza ny fireneny na ny...,Plateau Malgasy
...,...,...
1351,{{Infobox writer | name = فرانسوا ویون | embed...,South Azerbaijani
1352,آدری مدوز (اینگیلیسجه: Audrey Meadows) آمریکال...,South Azerbaijani
1353,قارغیدالی (بیتکی) (اینگیلیسجه: Maize ، (فارسجا...,South Azerbaijani
1354,339 ایلینه قده¬ر بالچیک Balchic توْرپاقلارینا ...,South Azerbaijani


In [3]:
# Modifying the text for RAG
# The language label is appended to each text, formatted as follows:
# {text} -> Text: {text} Language Identified: {language}
# DataFrame.copy() makes a deep copy by default, so open_lid_data itself is
# left untouched by the column rewrite below.
open_lid_data_rag = open_lid_data.copy()
open_lid_data_rag['text'] = open_lid_data_rag.apply(lambda x: f"Text:\n {x['text']} \n Language Identified:\n {x['language']}", axis=1)
# Bare expression so the notebook renders the frame as the cell output.
open_lid_data_rag

Unnamed: 0,text,language
0,Text:\n ary soraty eo amin'ny tolam-baravaran'...,Plateau Malgasy
1,Text:\n Rehefa namatotra azy tamin'ny kofehy h...,Plateau Malgasy
2,Text:\n Io menaka manitra io dia afaka namidy ...,Plateau Malgasy
3,"Text:\n Ireo anjely, ireo manam-pahefana, ary ...",Plateau Malgasy
4,Text:\n Nefa Estera tsy mbola nilaza ny firene...,Plateau Malgasy
...,...,...
1351,Text:\n {{Infobox writer | name = فرانسوا ویون...,South Azerbaijani
1352,Text:\n آدری مدوز (اینگیلیسجه: Audrey Meadows)...,South Azerbaijani
1353,Text:\n قارغیدالی (بیتکی) (اینگیلیسجه: Maize ،...,South Azerbaijani
1354,Text:\n 339 ایلینه قده¬ر بالچیک Balchic توْرپا...,South Azerbaijani


In [4]:
# Keep a fixed 100-row random sample; the seed makes the draw repeatable.
open_lid_data = open_lid_data.sample(random_state=42, n=100)
# Pull the texts and their true language labels out as plain lists
open_lid_data_labels = open_lid_data['language'].tolist()
open_lid_data_text = open_lid_data['text'].tolist()

In [72]:
def set_model(model: str) -> None:
    """Rebind the module-level ``llm`` to a Hugging Face endpoint for ``model``.

    Args:
        model: Hugging Face repository id of the model to query.

    Raises:
        KeyError: If ``HUGGINGFACEHUB_API_TOKEN`` is missing from the environment.
    """
    global llm
    api_token = os.environ['HUGGINGFACEHUB_API_TOKEN']
    llm = HuggingFaceEndpoint(
        huggingfacehub_api_token=api_token,
        temperature=0.1,
        repo_id=model,
    )

def calculate_accuracy(labels: list, predictions: list) -> float:
    """Return the fraction of labels that are matched by their prediction.

    A prediction counts as correct when it is a non-empty substring of its
    label, so a one-word answer such as "Arabic" matches "Najdi Arabic".
    Pairs are formed positionally. The denominator is always ``len(labels)``,
    so labels that have no prediction are counted as misses.

    Args:
        labels: True language names.
        predictions: Predicted language names, aligned with ``labels``.

    Returns:
        Accuracy between 0.0 and 1.0; 0.0 when ``labels`` is empty.
    """
    total = len(labels)
    if total == 0:
        return 0.0
    # "" is a substring of every string, so an empty prediction has to be
    # rejected explicitly; otherwise it would be scored as a hit.
    correct_predictions = sum(
        1
        for prediction, label in zip(predictions, labels)
        if prediction and prediction in label
    )
    return correct_predictions / total

def format_result(result: str) -> str:
    """Reduce a raw model answer to its first word.

    Surrounding whitespace is trimmed, then newlines, periods, colons and the
    words "Answer", "Human" and "Text" are removed (in that order) before the
    first whitespace-separated token is taken.

    Args:
        result: Raw answer text.

    Returns:
        The first remaining token, or an empty string if nothing is left.
    """
    cleaned = result.strip()
    # Removal order matters: deleting one fragment can join text into another.
    for fragment in ("\n", ".", "Answer", "Human", "Text", ":"):
        cleaned = cleaned.replace(fragment, "")
    cleaned = cleaned.strip()
    if not cleaned:
        return cleaned
    return cleaned.split()[0]

def get_rag_query_results(data: list, retrieval_chain) -> list:
    """Run every text through the retrieval chain and collect the cleaned answers.

    Each text and its cleaned answer are printed as they are processed. The
    loop stops at the first exception, so the returned list can be shorter
    than ``data``.

    Args:
        data: Texts to send to the chain, one query each.
        retrieval_chain: Object whose ``invoke`` returns a mapping with an
            ``'answer'`` entry.

    Returns:
        Cleaned answers for the texts processed before any failure.
    """
    answers = []
    for text in data:
        print(f"{text=}")
        try:
            response = retrieval_chain.invoke(input={"input": text})
            result = format_result(response['answer'])
            answers.append(result)
            print(f"{result=}")
        except Exception as exc:
            # Report the failure and give up on the remaining texts.
            print(f"Encountered exception {exc=}")
            break
    return answers

def ingest_into_vectorstore(df: pd.DataFrame, column: str) -> None:
    """Split, embed and upload the rows of ``df`` to the Pinecone index.

    Args:
        df: Frame whose rows are loaded as documents.
        column: Name of the column in ``df`` used as each document's page content.

    Raises:
        KeyError: If the ``INDEX_NAME`` environment variable is not set.
    """
    # Load documents from the frame and column passed in, not a notebook global
    loader = DataFrameLoader(data_frame=df, page_content_column=column)
    documents = loader.load()
    print("Splitting...")
    # Split into chunks
    text_splitter = CharacterTextSplitter()
    texts = text_splitter.split_documents(documents)
    print(f"Created {len(texts)} chunks.")
    # Create embeddings object
    embeddings = HuggingFaceEmbeddings()
    print("Ingesting...")
    # Ingest into the vector store
    PineconeVectorStore.from_documents(
        documents=texts,
        embedding=embeddings,
        index_name=os.environ['INDEX_NAME']
    )

def get_retrieval_chain(vectorstore, retrieval_prompt, search_kwargs=None):
    """Build a retrieval chain around ``vectorstore`` and the current global ``llm``.

    Args:
        vectorstore: Vector store exposing ``as_retriever``.
        retrieval_prompt: Prompt handed to the stuff-documents chain.
        search_kwargs: Optional retriever search settings (e.g. ``{"k": 6}``);
            when empty or ``None`` the retriever is created with its defaults.

    Returns:
        The chain produced by ``create_retrieval_chain``.
    """
    # The global llm is read here, so the chain must be rebuilt after set_model.
    docs_chain = create_stuff_documents_chain(llm=llm, prompt=retrieval_prompt)
    # Forward search_kwargs only when the caller supplied a non-empty value
    retriever_options = {"search_kwargs": search_kwargs} if search_kwargs else {}
    retriever = vectorstore.as_retriever(**retriever_options)
    # Retrieve from the vector store, then run the stuff-documents chain
    return create_retrieval_chain(
        combine_docs_chain=docs_chain,
        retriever=retriever,
    )

In [24]:
# Embedding model, created with the HuggingFaceEmbeddings defaults
embeddings = HuggingFaceEmbeddings()
# Vector store backed by the Pinecone index named in the INDEX_NAME variable
vectorstore = PineconeVectorStore(
    embedding=embeddings,
    index_name=os.environ['INDEX_NAME'],
)
# Create the prompt to be used with the retrieved documents.
# {context} is the slot for the retrieved examples and {input} is the text to
# classify (passed as "input" when the chain is invoked).
# NOTE: the template is used verbatim, including its leading indentation.
retrieval_prompt = PromptTemplate.from_template(template="""
                    You are an AI language detection assistant.
                    Here are some examples of Text and its language in one word:

                    {context}

                    Provide the language of the given text. Answer ONLY with one word. Answer ONLY in the Latin script.
                    If you do not know the language, just say "I'm not sure". Don't try to make up an answer.
                    Text:

                    {input}
                    
                    Language Identified:
                    """
)



## RAG with default settings

### Mixtral-8x7B-Instruct-v0.1

In [None]:
# Select Mixtral: set_model rebinds the global `llm` that get_retrieval_chain reads.
set_model(model="mistralai/Mixtral-8x7B-Instruct-v0.1")

  warn_deprecated(


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/saad/.cache/huggingface/token
Login successful


In [21]:
# Build the chain with the retriever's default search settings (no search_kwargs)
# and query it once per sampled text.
retrieval_chain = get_retrieval_chain(vectorstore=vectorstore, retrieval_prompt=retrieval_prompt)
mixtral_rag_results = get_rag_query_results(data=open_lid_data_text, retrieval_chain=retrieval_chain)

text='Од 10-ти до 24-ми мај во Софија и други бугарски градови ќе се одржи Европскиот музички фестивал, во рамките на кој ќе се одржат концерти на класична, џез и етно музика, како и конференција насловена како „Културата во ЕУ: Меѓуповрзување и идентитет“.'
result='Bulgarian'
text='و هو عنده سنه جاله مرض فى عينه اسمه الرمد الحبيبى و طبعا عشان عيلته فقيره استعملت امه وصفات بلدى فى علاج عينه فأتعمى بسبب الجهل و الفقر اللى كان عيشين فيه عيلته'
result='Arabic'
text='في سؤال مهم أستاذ زهير المادة 3 من الدستور الجديد المعدل الحالي في سوريا تنص على أن دين رئيس الجمهورية هو الإسلام أه أنتم في وثيقة العهد تقدمتم على هذه المادة وقلتم يحق لكل مواطن دون استثناء الوصول لأعلى مراتب نشير هنا كما أشار التوجه أستاذ توفيق إلى قضية ورئاسة الدولة تحق لكل مواطن سوري في سوريا المستقبل'
result='Arabic'
text='ہر کوئی یقین دہانی میں مصروف ہے کہ کیرالہ کے لوگوں کی مصیبت کم سے کم کی جا سکے ، ان کی تکلیف کو ہم بانٹیں ۔'
result='Urdu'
text='ໂກເຣ ດາ ຖານ ແລະ ອາບີຣາມ ອິດສາ ໂມເຊ ຈົນ ຕໍ ່ ຕ ້ ານ ລາວ.'
result='Lao'
tex

In [22]:
# Share of sampled texts whose Mixtral prediction is contained in the true label.
calculate_accuracy(labels=open_lid_data_labels, predictions=mixtral_rag_results)

0.56

In [23]:
# Error analysis: print every label/prediction pair where the prediction is
# not contained in the true label.
for true_label, predicted in zip(open_lid_data_labels, mixtral_rag_results):
    if predicted in true_label:
        continue
    print(f"True: {true_label} and Predicted: {predicted}")

True: Macedonian and Predicted: Bulgarian
True: Meitei and Predicted: Bengali
True: Tswana and Predicted: Sesotho
True: Somali and Predicted: Swedish
True: Georgian and Predicted: srd_Latn
True: Luba-Kasai and Predicted: Tumbuka
True: Kikongo and Predicted: Ilocano
True: Banjar and Predicted: Indonesian
True: Najdi Arabic and Predicted: Gulf
True: Banjar and Predicted: Javanese
True: Luxembourgish and Predicted: Lithuanian
True: Nepali and Predicted: Hindi
True: Eastern Panjabi and Predicted: Punjabi
True: Nyanja and Predicted: Chichewa
True: Guarani and Predicted: Nahuatl
True: Magahi and Predicted: Hindi
True: Southern Pasto and Predicted: Pashto
True: Bosnian and Predicted: Serbian
True: English and Predicted: Spanish
True: Magahi and Predicted: Hindi
True: Kashmiri and Predicted: Pashto
True: Crimean Tatar and Predicted: Uzbek
True: Luo and Predicted: Dinka
True: Banjar and Predicted: Malay
True: Tamasheq and Predicted: Berber
True: Kimbundu and Predicted: I'm
True: Tswana and Pred

#### Observations

* We see an improvement in accuracy: 44% (without RAG) -> 56%

Some of the observations without RAG still hold:

* In some cases the output is close but didn't match the labels:
    * Eastern Panjabi vs Punjabi (Incorrect spelling in the dataset labels)
    * Southern Pasto vs Pashto (Incorrect spelling in the dataset labels)

* In other cases, the predicted language is a closely related language from the same family (e.g. Macedonian vs Bulgarian, Bosnian vs Serbian)

### Meta-Llama-3-8B-Instruct

In [28]:
# Switch to Llama 3: set_model rebinds the global `llm` that get_retrieval_chain reads.
set_model(model="meta-llama/Meta-Llama-3-8B-Instruct")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/saad/.cache/huggingface/token
Login successful


In [33]:
# Rebuild the chain so it picks up the current global `llm` (read when
# get_retrieval_chain is called), then query it once per sampled text.
retrieval_chain = get_retrieval_chain(vectorstore=vectorstore, retrieval_prompt=retrieval_prompt)
llama_rag_results = get_rag_query_results(data=open_lid_data_text, retrieval_chain=retrieval_chain)

text='Од 10-ти до 24-ми мај во Софија и други бугарски градови ќе се одржи Европскиот музички фестивал, во рамките на кој ќе се одржат концерти на класична, џез и етно музика, како и конференција насловена како „Културата во ЕУ: Меѓуповрзување и идентитет“.'
result='Macedonian'
text='و هو عنده سنه جاله مرض فى عينه اسمه الرمد الحبيبى و طبعا عشان عيلته فقيره استعملت امه وصفات بلدى فى علاج عينه فأتعمى بسبب الجهل و الفقر اللى كان عيشين فيه عيلته'
result='Arabic'
text='في سؤال مهم أستاذ زهير المادة 3 من الدستور الجديد المعدل الحالي في سوريا تنص على أن دين رئيس الجمهورية هو الإسلام أه أنتم في وثيقة العهد تقدمتم على هذه المادة وقلتم يحق لكل مواطن دون استثناء الوصول لأعلى مراتب نشير هنا كما أشار التوجه أستاذ توفيق إلى قضية ورئاسة الدولة تحق لكل مواطن سوري في سوريا المستقبل'
result='South'
text='ہر کوئی یقین دہانی میں مصروف ہے کہ کیرالہ کے لوگوں کی مصیبت کم سے کم کی جا سکے ، ان کی تکلیف کو ہم بانٹیں ۔'
result='Urdu'
text='ໂກເຣ ດາ ຖານ ແລະ ອາບີຣາມ ອິດສາ ໂມເຊ ຈົນ ຕໍ ່ ຕ ້ ານ ລາວ.'
result="I'm"
tex

In [35]:
# Share of sampled texts whose Llama prediction is contained in the true label.
calculate_accuracy(labels=open_lid_data_labels, predictions=llama_rag_results)

0.51

In [36]:
# Error analysis for the Llama run. This must iterate llama_rag_results:
# mixtral_rag_results still holds the predictions of the earlier Mixtral run.
for i, j in zip(open_lid_data_labels, llama_rag_results):
    # A prediction is a miss when it is not contained in the true label.
    if j not in i:
        print(f"True: {i} and Predicted: {j}")

True: Najdi Arabic and Predicted: South
True: Lao and Predicted: I'm
True: Meitei and Predicted: Bengali
True: Sanskrit and Predicted: Hindi
True: Tswana and Predicted: (One
True: Assamese and Predicted: Bengali
True: Somali and Predicted: Swedish
True: Georgian and Predicted: Please
True: Luba-Kasai and Predicted: Kinyarwanda
True: Kikongo and Predicted: I'm
True: Banjar and Predicted: Malayalam
True: Najdi Arabic and Predicted: North
True: Banjar and Predicted: I'm
True: Slovenian and Predicted: Bosnian
True: Sindhi and Predicted: Urdu
True: Wolof and Predicted: I'm
True: Eastern Panjabi and Predicted: Kannada
True: Nyanja and Predicted: Chichewa
True: Guarani and Predicted: Malagasy
True: Magahi and Predicted: Hindi
True: Southern Pasto and Predicted: Pashto
True: Bosnian and Predicted: Croatian
True: Magahi and Predicted: (Note
True: Luo and Predicted: (Write
True: Banjar and Predicted: I'm
True: Kimbundu and Predicted: ?
True: Sindhi and Predicted: Urdu
True: Tswana and Predicted:

#### Observations

* The Llama model starts working with RAG.
* Accuracy: 0 (without RAG) -> 51 %
* It's clear that the Mixtral model is better suited for this task

## RAG with more documents retrieved

In [37]:
# Switch back to Mixtral: set_model rebinds the global `llm` that get_retrieval_chain reads.
set_model(model="mistralai/Mixtral-8x7B-Instruct-v0.1")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/saad/.cache/huggingface/token
Login successful


### 6 Documents

In [44]:
# Ask the retriever for 6 documents per query via search_kwargs.
# NOTE: this overwrites mixtral_rag_results from the default-settings run.
retrieval_chain = get_retrieval_chain(vectorstore=vectorstore, retrieval_prompt=retrieval_prompt, search_kwargs={"k": 6})
mixtral_rag_results = get_rag_query_results(data=open_lid_data_text, retrieval_chain=retrieval_chain)

text='Од 10-ти до 24-ми мај во Софија и други бугарски градови ќе се одржи Европскиот музички фестивал, во рамките на кој ќе се одржат концерти на класична, џез и етно музика, како и конференција насловена како „Културата во ЕУ: Меѓуповрзување и идентитет“.'
result='Macedonian'
text='و هو عنده سنه جاله مرض فى عينه اسمه الرمد الحبيبى و طبعا عشان عيلته فقيره استعملت امه وصفات بلدى فى علاج عينه فأتعمى بسبب الجهل و الفقر اللى كان عيشين فيه عيلته'
result='Egyptian'
text='في سؤال مهم أستاذ زهير المادة 3 من الدستور الجديد المعدل الحالي في سوريا تنص على أن دين رئيس الجمهورية هو الإسلام أه أنتم في وثيقة العهد تقدمتم على هذه المادة وقلتم يحق لكل مواطن دون استثناء الوصول لأعلى مراتب نشير هنا كما أشار التوجه أستاذ توفيق إلى قضية ورئاسة الدولة تحق لكل مواطن سوري في سوريا المستقبل'
result='Arabic'
text='ہر کوئی یقین دہانی میں مصروف ہے کہ کیرالہ کے لوگوں کی مصیبت کم سے کم کی جا سکے ، ان کی تکلیف کو ہم بانٹیں ۔'
result='Malayalam'
text='ໂກເຣ ດາ ຖານ ແລະ ອາບີຣາມ ອິດສາ ໂມເຊ ຈົນ ຕໍ ່ ຕ ້ ານ ລາວ.'
result='

In [45]:
# Share of sampled texts whose prediction (k=6 run) is contained in the true label.
calculate_accuracy(labels=open_lid_data_labels, predictions=mixtral_rag_results)

0.61

In [46]:
# Error analysis: print every label/prediction pair where the prediction is
# not contained in the true label.
for true_label, predicted in zip(open_lid_data_labels, mixtral_rag_results):
    if predicted in true_label:
        continue
    print(f"True: {true_label} and Predicted: {predicted}")

True: Urdu and Predicted: Malayalam
True: Meitei and Predicted: I'm
True: Tswana and Predicted: Southern
True: Somali and Predicted: Swedish
True: Georgian and Predicted: I'm
True: Luba-Kasai and Predicted: Chichewa
True: Kikongo and Predicted: Ilocano
True: Banjar and Predicted: Sundanese
True: Luxembourgish and Predicted: Lithuanian
True: Nepali and Predicted: Hindi
True: Eastern Panjabi and Predicted: Punjabi
True: Nyanja and Predicted: Chichewa
True: Guarani and Predicted: Nahuatl
True: Magahi and Predicted: Hindi
True: Southern Pasto and Predicted: Pashto
True: Bosnian and Predicted: Croatian
True: English and Predicted: Spanish
True: Magahi and Predicted: Hindi
True: Kashmiri and Predicted: Pashto
True: Crimean Tatar and Predicted: Uzbek
True: Banjar and Predicted: Sudanese
True: Kimbundu and Predicted: I'm
True: Friulian and Predicted: Emilian-Romagnol
True: Shan and Predicted: Burmese
True: Papiamento and Predicted: Esperanto
True: Kimbundu and Predicted: Umbundu
True: Kamba an

#### Observations

* Accuracy improvement with more documents retrieved: 56% (4 docs) -> 61% (6 docs)
* As expected, providing more examples in the context helps

### 10 Documents

In [47]:
# Ask the retriever for 10 documents per query via search_kwargs.
# NOTE: this overwrites mixtral_rag_results from the previous run.
retrieval_chain = get_retrieval_chain(vectorstore=vectorstore, retrieval_prompt=retrieval_prompt, search_kwargs={"k": 10})
mixtral_rag_results = get_rag_query_results(data=open_lid_data_text, retrieval_chain=retrieval_chain)

text='Од 10-ти до 24-ми мај во Софија и други бугарски градови ќе се одржи Европскиот музички фестивал, во рамките на кој ќе се одржат концерти на класична, џез и етно музика, како и конференција насловена како „Културата во ЕУ: Меѓуповрзување и идентитет“.'
result='Bulgarian'
text='و هو عنده سنه جاله مرض فى عينه اسمه الرمد الحبيبى و طبعا عشان عيلته فقيره استعملت امه وصفات بلدى فى علاج عينه فأتعمى بسبب الجهل و الفقر اللى كان عيشين فيه عيلته'
result='Arabic'
text='في سؤال مهم أستاذ زهير المادة 3 من الدستور الجديد المعدل الحالي في سوريا تنص على أن دين رئيس الجمهورية هو الإسلام أه أنتم في وثيقة العهد تقدمتم على هذه المادة وقلتم يحق لكل مواطن دون استثناء الوصول لأعلى مراتب نشير هنا كما أشار التوجه أستاذ توفيق إلى قضية ورئاسة الدولة تحق لكل مواطن سوري في سوريا المستقبل'
result='Arabic'
text='ہر کوئی یقین دہانی میں مصروف ہے کہ کیرالہ کے لوگوں کی مصیبت کم سے کم کی جا سکے ، ان کی تکلیف کو ہم بانٹیں ۔'
result='Urdu'
text='ໂກເຣ ດາ ຖານ ແລະ ອາບີຣາມ ອິດສາ ໂມເຊ ຈົນ ຕໍ ່ ຕ ້ ານ ລາວ.'
result='Lao'
tex

In [50]:
# get_rag_query_results stops at the first exception, so there may be fewer
# predictions than labels. Score only the labels that have a prediction,
# using the actual count rather than a hard-coded one.
calculate_accuracy(labels=open_lid_data_labels[:len(mixtral_rag_results)], predictions=mixtral_rag_results)

0.6354166666666666

In [51]:
# Error analysis. zip stops at the shorter sequence, so labels without a
# prediction are skipped without hard-coding how many predictions exist.
for i, j in zip(open_lid_data_labels, mixtral_rag_results):
    # A prediction is a miss when it is not contained in the true label.
    if j not in i:
        print(f"True: {i} and Predicted: {j}")

True: Macedonian and Predicted: Bulgarian
True: Meitei and Predicted: I'm
True: Tswana and Predicted: Southern
True: Somali and Predicted: Swedish
True: Georgian and Predicted: I'm
True: Kikongo and Predicted: Ilocano
True: Najdi Arabic and Predicted: Saudi
True: Slovenian and Predicted: Croatian
True: Luxembourgish and Predicted: Lithuanian
True: Nepali and Predicted: Hindi
True: Eastern Panjabi and Predicted: Punjabi
True: Nyanja and Predicted: Chichewa
True: Guarani and Predicted: Guaraní
True: Magahi and Predicted: Hindi
True: Southern Pasto and Predicted: Pashto
True: English and Predicted: Spanish
True: Magahi and Predicted: Hindi
True: Kashmiri and Predicted: Urdu
True: Crimean Tatar and Predicted: Uzbek
True: Luo and Predicted: Dinka
True: Banjar and Predicted: Sundanese
True: Kimbundu and Predicted: I'm
True: Friulian and Predicted: Emilian-Romagnol
True: Shan and Predicted: Burmese
True: Kimbundu and Predicted: Umbundu
True: Kamba and Predicted: Kikuyu
True: Tamasheq and Pred

#### Observations

* Accuracy improves even further: 61% (6 documents) -> 63.5% (10 documents, measured on the first 96 samples only)