In [1]:
import pandas as pd

In [34]:
data = pd.read_csv('dataset/conversation.csv')

In [35]:
data.head()

Unnamed: 0,conversation,conv_id
0,Are you a fan of Google or Microsoft?\nBoth ar...,1
1,do you like dance?\nYes I do. Did you know Br...,2
2,Hey what's up do use Google very often?I reall...,3
3,Hi! do you like to dance?\nI love to dance a ...,4
4,do you like dance?\nI love it. Did you know Br...,5


In [36]:
len(data)

100

In [4]:
import transformers
import torch
from transformers import AutoTokenizer

In [5]:
model_name = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name)
pipeline = transformers.pipeline(
    "text-generation",
    model=model_name,
    device_map="auto",
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    max_length=4000,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


# Question generation using Llama 2 with few short prompting

In [12]:
from collections import defaultdict
from tqdm import tqdm
import ipdb

q = defaultdict(list)
a = defaultdict(list)
for i in tqdm(range(1,50)):
    prompt = f"""<s>[INST] Generates only 3 question and answer pairs bases on the converstation provided in triple back ticks ```{data.conversation[0]}``` \
use <question> and <ans> tags to indicate them:

<question> Do you like cats?
<ans> The cat is referred as domestic cat and wild cat. They make our world very clean from rats!
<question> Why do you think cats spend a lot of their time sleeping?
<ans> Cats hear the sounds too faint or too high frequency human ears can hear.
<question> What are some of the services provided by Google?
<ans> Google provides online related services and products, which includes online ads, search engine and cloud computing.
[/INST]</s>

<s>[INST] Generates only 3 question and answer pairs bases on the converstation provided in triple back ticks ```{data.conversation[i]}``` \
use <question> and <ans> tags to indicate them:

"""
    seq = pipeline(prompt) 
    x = seq[0]['generated_text'].split(prompt)
    for elm in x[1].split("\n"):
        if elm.find("<question>")!=-1:
            q[i+1].append(elm)
        elif elm.find("<ans>")!=-1:
            a[i+1].append(elm)
    # print(q,a)
    # ipdb.set_trace()

 84%|███████████████████████▍    | 41/49 [06:44<01:18,  9.85s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 420.00 MiB (GPU 0; 10.75 GiB total capacity; 7.91 GiB already allocated; 357.62 MiB free; 8.68 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [15]:
q.keys()

dict_keys([2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42])

In [16]:
ques,anss,conv_id = [],[],[]
for k in q.keys():
    for que,ans in zip(q[k],a[k]):
        if ans is not None:
            ques.append(que)
            anss.append(ans)
            conv_id.append(k)

pd.DataFrame({"Question":ques,"Answer":anss,"Conv_id":conv_id}).to_csv("dataset/gen_qa_pair.csv",index=False)

In [None]:
out = pd.read_csv("dataset/gen_qa_pair.csv")

# Overlap

In [None]:
1

chunk: 1
m1
m2
m3
m4
m5
m6

chunk: 2
m7
m8
--
--


chunk : k
mn

In [52]:
docs = []
cov_ids = []

for _,x in out.iterrows():
    
    lines = x.conversation.split("\n")
    for i in range(0,len(lines),6):
        docs.append("\n".join(lines[i:i+6]).strip("\n"))
        cov_ids.append(str(x.conv_id))


In [53]:
len(docs),len(cov_ids)

(409, 409)

In [54]:
import chromadb
chroma_client = chromadb.Client()
collection = chroma_client.create_collection(name="my_collection")
collection.add(
    documents=docs,
    ids=cov_ids
)

Using embedded DuckDB without persistence: data will be transient
No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction


# Few short prompting

In [70]:
def rag_pipeline(question):
    results = collection.query(query_texts=[question],n_results=2)
    
    prompt = f"<s> [INST] Aswer the following question in few words \n \
    Question: {question} \n Basesd on the Context:\
    {results['documents'][0][0]} \n {results['documents'][0][1]}  [/INST]\n Answer: "
    
    seq = pipeline(prompt) 
    x = seq[0]['generated_text'].split(prompt)
    
    return (x[1],",".join(results['ids'][0]).strip(','))

In [68]:
qa_pair = pd.read_csv("dataset/gen_qa_pair.csv")

In [71]:
gen_ans = []
relevent_cov = []

for _,r in tqdm(qa_pair.iterrows()):
    q = r.Question.strip("<question> ")
    ans,rel_conv = rag_pipeline(q)
    gen_ans.append(ans)
    relevent_cov.append(rel_conv)

qa_pair['gen_ans'] = gen_ans
qa_pair['retrived_cov_ids'] = relevent_cov

qa_pair.to_csv("dataset/gen_qa_ans.csv",index=False)

97it [04:45,  2.94s/it]


## Evalution

In [2]:
from evaluate import load

rouge = load('rouge')
bleu = load('google_bleu')
bertscore = load('bertscore')

In [3]:
df = pd.read_csv("dataset/gen_qa_ans.csv")

In [4]:
df = df.drop(columns=["Unnamed: 0"])
df.head()

Unnamed: 0,Question,Answer,Conv_id,gen_ans,retrived_cov_ids
0,<question> Do you like dance?,"<ans> Yes, I do.",2,"Yes, I like dance. Bruce Lee was a dancer too...",561
1,<question> Did you know Bruce Lee was a cha ch...,"<ans> Yes, he even won a hardcore cha cha cham...",2,Bruce Lee was a cha cha dancer and won a cham...,264
2,<question> Did you know Tupac was a ballet dan...,"<ans> Yes, and he even was in the production o...",2,"Tupac was a ballet dancer? No, I didn't know ...",642
3,<question> Do ballet dancers go through a lot ...,"<ans> Yes, they go through 4 pairs of shoes a ...",2,"Yes, ballet dancers go through many shoes, of...",24
4,<question> Did you know babies are good at dan...,"<ans> Yes, and they smile more when they hit t...",2,"Yes, babies are good at dancing. They spontan...",652


In [5]:
from tqdm import tqdm

In [10]:
def evaluate_QA():
    
    # Evaluate retrieval 
    
    # Evaluate Generation
    rouge_score,bleu_score,bert_score = 0,0,0
    for _,x in tqdm(df.iterrows()):
        ref = [x.Answer.strip("<ans> ")]
        pred = [x.gen_ans]
        rouge_score +=rouge.compute(references=ref,predictions=pred)['rougeL']
        # bleu_score += bleu.compute(references=ref,predictions=pred)['google_bleu']
        out = bertscore.compute(references=ref,predictions=pred, lang="en")
        bert_score += out['f1'][0]
    print(f"The evalutation of are as follows :\n*RougeL: {rouge_score/len(df)} \n*Bleu: {bleu_score/len(df)} \n*BertScore: {bert_score/len(df)}")
evaluate_QA()

97it [00:22,  4.38it/s]

The evalutation of are as follows :
*RougeL: 0.2287766904620272 
*Bleu: 0.09985928717320809 
*BertScore: 0.8748656597334085





In [None]:
Qestion Gen
Cov 2 ---> QA


Cov 5,61,2 ---> A

# Try different chunikng whole context , Overlap

In [None]:
RougeL : 50+ human level 
BertScore: 

# Future Works