# Retrieval Augmented Generation (RAG) for Legal Questions
##### TODO:
- Try other LLMs
- Plot the ratio of correct answers for each question with a rag model and with a standard LLM

In [95]:
import requests
import json
import regex as re
from IPython.display import display, Math
from nltk.stem import WordNetLemmatizer
from nltk import PorterStemmer
from nltk.corpus import wordnet
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity

# Stop Words

In [96]:
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 
			  'you', 'you\'re', 'you\'ve', 'you\'ll', 'you\'d', 'your', 'yours', 
			  'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 
			  'she\'s', 'her', 'hers', 'herself', 'it', 'it\'s', 'its', 'itself', 
			  'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 
			  'who', 'whom', 'this', 'that', 'that\'ll', 'these', 'those', 'am', 
			  'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 
			  'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 
			  'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 
			  'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 
			  'through', 'during', 'before', 'after', 'above', 'below', 'to', 
			  'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 
			  'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 
			  'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 
			  'other', 'some', 'such', 'no', 'nor', 'only', 'own', 'same', 
			  'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 
			  'should', 'should\'ve', 'now', 'd', 'll', 'm', 'o', 're', 
			  've', 'y', 'ain', 'aren', 'aren\'t', 'couldn', 'couldn\'t', 'didn', 'didn\'t', 
			  'doesn', 'doesn\'t', 'hadn', 'hadn\'t', 'hasn', 'hasn\'t', 'haven', 'haven\'t', 
			  'isn', 'isn\'t', 'ma', 'mightn', 'mightn\'t', 'mustn', 'mustn\'t', 'needn', 
			  'needn\'t', 'shan', 'shan\'t', 'shouldn', 'shouldn\'t', 'wasn', 'wasn\'t', 
			  'weren', 'weren\'t', 'won', 'won\'t', 'wouldn', 'wouldn\'t', '....', '...', '..', 'i am'
              , 'of', ]

punctuation = [",", "?", "!", ".", ";", ":", "/", "(", ")", "&", "_", "+", "=", "<", ">", "\n"]

# Corpus

In [97]:
doc1 = open("corpus\doc1.txt", "r")
doc2 = open("corpus\doc2.txt", "r")
doc3 = open("corpus\doc3.txt", "r")

corpus = [
    doc1.read(),
    doc2.read(),
    doc3.read()
]

print(corpus)

['Any processing of personal data of should be lawful and fair.\nIt should be transparent to natural persons that personal data concerning them are collected, used, consulted or otherwise processed \nand to what extent the personal data are or will be processed. The principle of transparency requires that any information and \ncommunication relating to the processing of those personal data be easily accessible and easy to understand, and that clear and \nplain language be used. That principle concerns, in particular, information to the data subjects on the identity of the controller \nand the purposes of the processing and further information to ensure fair and transparent processing in respect of the natural \npersons concerned and their right to obtain confirmation and communication of personal data concerning them which are being processed. \nNatural persons should be made aware of risks, rules, safeguards and rights in relation to the processing of personal data and how \nto exerci

# NLP Pipeline

In [98]:
def remove_punctuation(tweet):
	return re.sub(r'[^\w\s]', " ", tweet) 

def tokenize(text):
	return text.strip().split(" ")

def lemming(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

def stemming(tokens):
    p = PorterStemmer()
    [p.stem(word) for word in tokens]
    return tokens

def remove_stopwords(words):
    return [word.lower().replace('\n', '') for word in words if word.lower().strip() and word.lower() not in stop_words]

def nlp_pipeline(text):
    return remove_stopwords(stemming(tokenize(remove_punctuation(text))))

In [99]:
doc1_pr = nlp_pipeline(corpus[0])
doc2_pr = nlp_pipeline(corpus[1])

print(doc1_pr)
print(doc2_pr)

['processing', 'personal', 'data', 'lawful', 'fair', 'it', 'transparent', 'natural', 'persons', 'personal', 'data', 'concerning', 'collected', 'used', 'consulted', 'otherwise', 'processed', 'and', 'extent', 'personal', 'data', 'processed', 'principle', 'transparency', 'requires', 'information', 'communication', 'relating', 'processing', 'personal', 'data', 'easily', 'accessible', 'easy', 'understand', 'clear', 'plain', 'language', 'used', 'principle', 'concerns', 'particular', 'information', 'data', 'subjects', 'identity', 'controller', 'and', 'purposes', 'processing', 'information', 'ensure', 'fair', 'transparent', 'processing', 'respect', 'natural', 'persons', 'concerned', 'right', 'obtain', 'confirmation', 'communication', 'personal', 'data', 'concerning', 'processed', 'natural', 'persons', 'made', 'aware', 'risks', 'rules', 'safeguards', 'rights', 'relation', 'processing', 'personal', 'data', 'to', 'exercise', 'rights', 'relation', 'processing', 'particular', 'specific', 'purposes'

# Bag of Words Approach 

## Jaccard Similarity

In [100]:
display(Math(r'\text{JS}(A, B) = \frac{\vert A \cap B \vert}{\vert A \cup B \vert} = \frac{\vert A \cap B \vert}{\vert A \vert + \vert B \vert - \vert A \cap B \vert}'))

<IPython.core.display.Math object>

In [101]:
def jaccard_similarity(query, document):
    intersection = set(query).intersection(set(document))
    union = set(query).union(set(document))
    return len(intersection)/len(union)

## Sørensen-Dice Similarity 

In [102]:
display(Math(r'\text{SD}(A, B) = \frac{2 \times \vert A \cap B \vert}{\vert A \vert + \vert B \vert}'))

<IPython.core.display.Math object>

In [103]:
def sorensen_dice_similarity(query, document):
    intersection = set(query).intersection(set(document))
    return 2*len(intersection)/(len(query) + len(document))

## Wu-Palmer Similarity

In [104]:
display(Math(r'\text{WP}(A, B) = \frac{2 \times \text{depth}(\text{lcs}(A, B))}{\text{depth}(A) + \text{depth}(B)}'))

<IPython.core.display.Math object>

In [105]:
def wu_palmer_similarity(word1, word2):
    w1 = wordnet.synsets(word1)
    w2 = wordnet.synsets(word2)
    if len(w1) == 0 or len(w2) == 0:
        return 0
    return wordnet.wup_similarity(w1[0], w2[0])

wu_palmer_similarity("dog", "cat")

0.8571428571428571

# Word Embeddings Approach

## Cosine Similarity
Using Open-AI word embeddings to calculate the cosine similarity between two words

In [106]:
display(Math(r'\text{CS}(A, B) = \cos(\theta) = \frac{A \cdot B}{\|A\| \times \|B\|}'))

<IPython.core.display.Math object>

In [107]:
def get_embedding(word, model="text-embedding-3-small"):
   client = OpenAI()
   return [client.embeddings.create(input = [word], model=model).data[0].embedding]

In [None]:
word1 = get_embedding("dog")
word2 = get_embedding("dogs")
print(cosine_similarity(word1, word2))

[[0.84316793]]


# Getting Most Similarity Document

In [None]:
def get_most_likely_document_bag_of_words(query, corpus):
    similarities = []
    query = nlp_pipeline(query)
    for doc in corpus:
        document = nlp_pipeline(doc)
        similarity = jaccard_similarity(query, document)
        #similarity = sorensen_dice_similarity(query, doc)
        print(similarity)
        similarities.append(similarity)
    return corpus[similarities.index(max(similarities))]

In [112]:
def get_most_likely_document_word_embedding(query, corpus):
    similarities = []
    query = get_embedding(query)
    for doc in corpus:
        document = get_embedding(doc)
        similarity = cosine_similarity(query, document)
        print(similarity)
        similarities.append(similarity)
    return corpus[similarities.index(max(similarities))]

In [120]:
get_most_likely_document_bag_of_words("Can I store personal data for a forbidden purposes?", corpus)

0.02912621359223301
0.03125
0.0


"An operator is required to obtain verifiable parental consent before any collection, use, or disclosure of personal information \nfrom children , including consent to any material change in the collection, use, or disclosure practices to which the parent has \npreviously consented. An operator must give the parent the option to consent to the collection and use of the child's personal \ninformation without consenting to disclosure of his or her personal information to third parties."

In [124]:
get_most_likely_document_word_embedding("Can I store personal data for a forbidden purposes?", corpus)

[[0.47168479]]
[[0.39319085]]
[[0.22585583]]


'Any processing of personal data of should be lawful and fair.\nIt should be transparent to natural persons that personal data concerning them are collected, used, consulted or otherwise processed \nand to what extent the personal data are or will be processed. The principle of transparency requires that any information and \ncommunication relating to the processing of those personal data be easily accessible and easy to understand, and that clear and \nplain language be used. That principle concerns, in particular, information to the data subjects on the identity of the controller \nand the purposes of the processing and further information to ensure fair and transparent processing in respect of the natural \npersons concerned and their right to obtain confirmation and communication of personal data concerning them which are being processed. \nNatural persons should be made aware of risks, rules, safeguards and rights in relation to the processing of personal data and how \nto exercis

# LLMs 

## LLAMA2

In [None]:
def get_llama_response(query, corpus):
    doc = get_most_likely_document_bag_of_words(query, corpus) 
    
    prompt = """
                You are a bot that answer simple legal questions. 
                This is the info you have to use to answer: {relevant_document}
                The user input is: {user_input}
                Compile an answer to the user based on the relevant document.
                Answer in 15 words or less.
             """
    
    url = 'http://localhost:11434/api/generate'
    
    data = {
        "model": "llama2",
        "prompt": prompt.format(user_input=query, relevant_document=doc)
    }
    
    headers = {'Content-Type': 'application/json'}
    response = requests.post(url, data=json.dumps(data), headers=headers, stream=True)
        
    full_response = []
    try:
        for line in response.iter_lines():
            if line:
                decoded_line = json.loads(line.decode('utf-8')) 
                full_response.append(decoded_line['response'])
    finally:
        response.close()
        
    return ''.join(full_response)

In [None]:
query = "Can I record children and post the video on a social media if the parents give me the legal consent?"
answer = get_llama_response(query, corpus)
print(answer)

0.0
0.08571428571428572
0.0


No, you cannot record children and post the video on social media without proper legal consent. Parental consent is required before collecting, using, or disclosing personal information from children.


In [None]:
get_llama_response("Can I store personal data for a forbidden purposes", corpus)

0.02912621359223301
0.03125
0.0


' No, you cannot store personal data for forbidden purposes without proper consent and transparency.'

In [None]:
get_llama_response("Can I store personal data for a lecit purposes", corpus)

0.02912621359223301
0.03125
0.0


'Yes, you can store personal data for legitimate purposes with proper parental consent.'

## GPT-4-Turbo

In [None]:
def get_openai_response(query, corpus):
    client = OpenAI()
    doc = get_most_likely_document_bag_of_words(query, corpus)
    context = doc + "\n" + f"User Query: {query}"  
    response = client.chat.completions.create(
        model="gpt-4-turbo",
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": """ 
                                                You are a bot that answers simple legal questions. 
                                                Compile an answer based only on the context i gave you.
                                                Don't say that information I'm giving are wrong cause it's a simulation.
                                                Cite also the line number where you found the answer.
                                                Answer in 30 words or less.
                                                Your output should be a JSON."""},
            {"role": "user", "content": context} 
        ]
    )
    return response.choices[0].message.content

In [None]:
def get_openai_response_virgin(query):
    client = OpenAI()

    response = client.chat.completions.create(
    model="gpt-4-turbo",
    response_format={ "type": "json_object" },
    messages = [
        {"role": "system", "content": """ 
                                            You are a bot that answer simple legal questions. 
                                            Compile an answer to the user based on the relevant document.
                                            Answer in 30 words or less.
                                            Your output should be a JSON."""},
        {"role": "user", "content": query}
    ]
    )
    return response.choices[0].message.content

In [None]:
get_openai_response("Can I record children and post the video on a social media if the parents give me the legal consent?", corpus)

0.0
0.08571428571428572
0.0


'{\n    "answer": "Yes, if you have verifiable parental consent specifically allowing the posting of the video. (Lines 2-5)"\n}'

In [None]:
get_openai_response("Should I keep transparency in clients data?", corpus)

0.019417475728155338
0.0
0.0


'{\n  "answer": "Yes, transparency in processing client\'s personal data is required for lawfulness and fairness. Data subjects must be informed of data collection and use.",\n  "line": 1\n}'

In [None]:
get_openai_response("Is it legal to buy dogs in alabama?", corpus)

0.0
0.0
0.16666666666666666


'{\n    "answer": "No, it is not legal to buy dogs in Alabama as dogs are banished from the USA. (line 1)"\n}'

In [None]:
get_openai_response_virgin("Can I record children and post the video on a social media if the parents give me the legal consent?")

'{\n  "answer": "Yes, you can legally record children and post the video on social media if you have consent from the parents."\n}'