# Retrieval Augmented Generation (RAG)
##### TODO:
- Preprocess corpus and queries:
    - Tokenize
    - Lowercase
    - Remove punctuation
    - Remove stopwords
    - Stemming
- Implement more similarity measures
- Try other LLMs

In [42]:
from IPython.display import display, Math, Latex
import requests
import json

In [43]:
# Toy knowledge base: each string is one retrievable document.
# The two entries state different rules (general case vs. dead tree), so the
# document picked by the retrieval step determines the final answer.
corpus = [
    "It's illegal to cut a tree in your backyard.",
    "It's legal to cut a tree in your backyard only if it's dead.",
]

## Jaccard Similarity

In [44]:
display(Math(r'\text{JS}(A, B) = \frac{\vert A \cap B \vert}{\vert A \cup B \vert} = \frac{\vert A \cap B \vert}{\vert A \vert + \vert B \vert - \vert A \cap B \vert}'))

<IPython.core.display.Math object>

In [45]:
def jaccard_similarity(query, document):
    """Return the Jaccard similarity between two whitespace-tokenized strings.

    Both inputs are lowercased and split on whitespace. The score is the
    size of the intersection of the two token sets divided by the size of
    their union. Punctuation is not stripped, so "backyard?" and
    "backyard." are different tokens.

    Args:
        query: Text of the user query.
        document: Text of the candidate document.

    Returns:
        A float in [0, 1]; 0.0 when neither input contains any token.
    """
    # split() with no argument collapses runs of whitespace (spaces, tabs,
    # newlines), so repeated spaces do not produce an empty-string token.
    query_tokens = set(query.lower().split())
    document_tokens = set(document.lower().split())

    union = query_tokens | document_tokens
    if not union:
        # Two empty/blank inputs share nothing; avoid dividing by zero.
        return 0.0
    return len(query_tokens & document_tokens) / len(union)

## Sørensen-Dice Similarity

In [46]:
display(Math(r'\text{Sørensen-Dice}(A, B) = \frac{2 \times \vert A \cap B \vert}{\vert A \vert + \vert B \vert}'))

<IPython.core.display.Math object>

In [47]:
def sorense_dice_similarity(query, document):
    """Return the Sorensen-Dice coefficient of two whitespace-tokenized strings.

    Both inputs are lowercased and split on whitespace. The score is twice
    the size of the intersection of the two token sets divided by the sum of
    the two set sizes. Punctuation is not stripped.

    Args:
        query: Text of the user query.
        document: Text of the candidate document.

    Returns:
        A float in [0, 1]; 0.0 when neither input contains any token.
    """
    query_tokens = set(query.lower().split())
    document_tokens = set(document.lower().split())

    # The denominator must use the set sizes, not the raw token counts:
    # counting repeated words would push the score below 1.0 even for
    # texts with identical vocabularies.
    total = len(query_tokens) + len(document_tokens)
    if total == 0:
        return 0.0
    return 2 * len(query_tokens & document_tokens) / total

##### Testing Similarity Measures

In [48]:
def get_most_likely_document(query, corpus, similarity_fn=None):
    """Return the document in ``corpus`` that scores highest against ``query``.

    The similarity of every document is printed, one value per line, in
    corpus order. When several documents share the highest score, the first
    one in corpus order is returned.

    Args:
        query: Text of the user query.
        corpus: Non-empty sequence of document strings.
        similarity_fn: Callable ``(query, document) -> number`` used to score
            each document. Defaults to ``jaccard_similarity``.

    Returns:
        The highest-scoring document from ``corpus``.

    Raises:
        ValueError: If ``corpus`` is empty.
    """
    if similarity_fn is None:
        # Looked up at call time so the default measure only needs to exist
        # when the function actually runs.
        similarity_fn = jaccard_similarity
    if not corpus:
        raise ValueError("corpus must contain at least one document")

    best_doc = None
    best_score = None
    for doc in corpus:
        score = similarity_fn(query, doc)
        print(score)
        # Strict comparison keeps the earliest document on ties.
        if best_score is None or score > best_score:
            best_doc, best_score = doc, score
    return best_doc

In [49]:
get_most_likely_document("Can i cut a tree in my backyard?", corpus)

0.3076923076923077
0.25


"It's illegal to cut a tree in your backyard."

In [50]:
# NOTE: the recorded output below picks the "illegal" document even though the
# query is about a dead tree. Tokens keep their punctuation and are matched
# exactly, so "dead" != "dead." and "legale" != "legal"; the shorter document
# then wins because its union with the query is smaller.
get_most_likely_document("I found a dead tree in my garden, is it legale to cut it?", corpus)

0.2777777777777778
0.23809523809523808


"It's illegal to cut a tree in your backyard."

## Adding an LLM (Llama 2) to generate answers from the retrieved document

In [51]:
def get_llama_response(query, corpus, *, model="llama2",
                       url="http://localhost:11434/api/generate",
                       timeout=(10, 300)):
    """Answer ``query`` with an LLM, grounded on the best-matching document.

    The most similar document is selected with ``get_most_likely_document``,
    inserted into a fixed prompt together with the query, and POSTed as JSON
    to ``url``. The reply is read as a stream of newline-delimited JSON
    objects whose ``'response'`` fields are concatenated.

    Args:
        query: Text of the user question.
        corpus: Non-empty sequence of document strings to retrieve from.
        model: Model name sent in the request payload.
        url: Generation endpoint. The default looks like a local Ollama
            server -- TODO confirm against the deployment.
        timeout: ``requests`` timeout, as ``(connect, read)`` seconds. The
            read value is generous because the first streamed chunk can be
            slow to arrive.

    Returns:
        The generated answer as a single string.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
        RuntimeError: If a streamed chunk carries an ``'error'`` field.
    """
    # Compute similarity to find the most useful document; it is the only
    # context the model is given.
    doc = get_most_likely_document(query, corpus)

    prompt = """
                You are a bot that answer simple legal questions. 
                You answer in very short sentence and do not include extra information.
                This is the info you have to use to answer: {relevant_document}
                The user input is: {user_input}
                Compile an answer to the user based on the info and the user input.
             """

    payload = {
        "model": model,
        "prompt": prompt.format(user_input=query, relevant_document=doc),
    }
    headers = {'Content-Type': 'application/json'}

    pieces = []
    # The context manager releases the connection even if parsing fails
    # part-way through the stream.
    with requests.post(url, data=json.dumps(payload), headers=headers,
                       stream=True, timeout=timeout) as response:
        # Fail on HTTP errors here rather than with a KeyError further down.
        response.raise_for_status()
        for line in response.iter_lines():
            if not line:
                continue
            chunk = json.loads(line.decode('utf-8'))
            if 'error' in chunk:
                raise RuntimeError(f"generation failed: {chunk['error']}")
            # Chunks without generated text contribute nothing.
            pieces.append(chunk.get('response', ''))

    return ''.join(pieces)

In [52]:
get_llama_response("Can i cut a tree in my backyard?", corpus)

0.3076923076923077
0.25


' No, you cannot cut a tree in your backyard as it is illegal.'

In [53]:
# NOTE: the recorded answer below says cutting is illegal, which contradicts the
# corpus entry about dead trees. The first printed score (the "illegal"
# document) is the higher one, so that document is what the model was given.
get_llama_response("I found a dead tree in my garden, can i cut it?", corpus)

0.25
0.21052631578947367


' No, you cannot cut the dead tree in your garden. It is illegal to cut a tree in your backyard.'