In [None]:
# !pip3 install transformers

In [None]:
# !pip3 install tqdm pandas

In [55]:
import torch
import pandas as pd
from tqdm import tqdm
import numpy as np
import regex as re

from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

import mlflow.data
from mlflow.data.pandas_dataset import PandasDataset

# Load the LLM

In [2]:

# Get the type
compute_dtype = getattr(torch, "float16")

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype
)

hf_model_repo = "meta-llama/Llama-2-7b-chat-hf"

# Get the tokenizer
tokenizer = AutoTokenizer.from_pretrained(hf_model_repo)

# Load the model
model = AutoModelForCausalLM.from_pretrained(hf_model_repo,
                                             quantization_config=bnb_config,
                                             device_map="auto")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [7]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRM

In [59]:
all_csvs = [
    "mlops_project/text_data/text_train_2307.08621.csv"
]

arxivv = "2307.08621"

In [22]:
all_responsez_file = open("mlops_project/qna_data/generated_questions_and_answers.txt", 'w')
all_responsez = {"text": [], "response": []}

In [41]:
csvf = all_csvs[0]
text_chunks = pd.read_csv(csvf)

# indcies here need editing for new papers
# the whole text chunk of the paper
paper_text_chunks = text_chunks['text'][:94]
# references text chunks of the paper
refs_text_chunks = text_chunks['text'][95:]

# paper_text_chunks = paper_text_chunks[-5:]

print("paper_text_chunks: ", len(paper_text_chunks))


paper_text_chunks:  51


# Start generating Questions & Answeers

In [24]:


print("csvf = ", csvf)

ixcounter = -1


max_generate_count = 5 # only for testing

for txtch in tqdm(paper_text_chunks, f"processing_{csvf}"):
    
    ixcounter += 1
    if ixcounter > max_generate_count:
        break
    
    txtch = txtch.replace('-\n','')

    prompt = \
    f"""
    Instruction: Generate three question and answer pairs for the below text. All questions must start with "Question". All answers must start wtih "Answer"\n
    Input:\n
    {txtch}\n
    Output:\n
    """

    # Generate response
    input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids
    outputs = model.generate(input_ids=input_ids,
                            max_new_tokens=200,
                            temperature=0.6)

    # gen_tokens; exclude input tokens from the final decoded output
    gen_tokens = outputs[:, input_ids.shape[1]:]
    response = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0]

    all_responsez["text"].append(txtch)
    all_responsez["response"].append(response)
    
    all_responsez_file.write(f"{response}\n++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
    all_responsez_file.flush()

    # (running locally on RTX 2060) cool donw GPU before next run
    time.sleep(6)


csvf =  mlops_project/text_data/text_train_2307.08621.csv


processing_mlops_project/text_data/text_train_2307.08621.csv:  12%| | 6/51 [01:03<07:55, 1


In [26]:
qna_df_responsez = pd.DataFrame(all_responsez)
qna_df_responsez.to_csv("mlops_project/qna_data/generated_questions_and_answers.csv", index=False)

In [27]:
print(qna_df_responsez.head())

                                                text  \
0  Retentive Network: A Successor to Transformer\...   
1  ence, which improves decoding throughput, late...   
2  Figure 1: Retentive network (RetNet) achieves ...   
3  els [BMR+20], which was initially proposed\nto...   
4  “impossible triangle” as shown in Figure 2.\nT...   

                                            response  
0  Question 1: What is the main goal of the propo...  
1  Question 1: What does the chart in the output ...  
2  Question 1: What is the main advantage of RetN...  
3  Question 1: What was initially proposed to ove...  
4  Question 1: What is the main idea of the text?...  


In [29]:
print(qna_df_responsez.iloc[0]['response'])

Question 1: What is the main goal of the proposed architecture, RETNET?
Answer 1: The main goal of the proposed architecture, RETNET, is to achieve training parallelism, low-cost inference, and good performance simultaneously.

     Question 2: What is the connection between recurrence and attention, according to the authors?
Answer 2: The authors theoretically derive the connection between recurrence and attention, showing that recurrence can be viewed as a form of attention.

     Question 3: What is the retention mechanism proposed in the work, and what are its three computation paradigms?
Answer 3: The retention mechanism is a sequence modeling approach that supports three computation paradigms: parallel, recurrent, and chunkwise recurrent. These paradigms allow for training parallelism, low-cost inference, and effiient sequence modeling, respectively.


# Automatic data filtering

In [31]:
def get_QA_pairs(qa_df):
    pairs = {"Question": [], "Answer": []}
    
    pattern = re.compile(r"Question\s*\d*:\s*(.*?)\s*Answer\s*\d*:\s*(.*?)(?=\n\s*\d*\.\s*Question|\n\s*Question|\Z)", re.DOTALL)

    for qa_response in qa_df['response']:

        matches = pattern.findall(qa_response)

        for question, answer in matches:
            # print(f"Question: {question.strip()}")
            if answer:
                # print(f"Answer: {answer.strip()}")
                pairs["Question"].append(question)
                pairs["Answer"].append(answer)
            else:
                print("skipped: ", question)
    return pairs

In [32]:
qa_pairs = get_QA_pairs(qna_df_responsez)

In [46]:
list(zip(qa_pairs['Question'], qa_pairs['Answer']))

[('What is the main goal of the proposed architecture, RETNET?',
  'The main goal of the proposed architecture, RETNET, is to achieve training parallelism, low-cost inference, and good performance simultaneously.'),
 ('What is the connection between recurrence and attention, according to the authors?',
  'The authors theoretically derive the connection between recurrence and attention, showing that recurrence can be viewed as a form of attention.'),
 ('What is the retention mechanism proposed in the work, and what are its three computation paradigms?',
  'The retention mechanism is a sequence modeling approach that supports three computation paradigms: parallel, recurrent, and chunkwise recurrent. These paradigms allow for training parallelism, low-cost inference, and effiient sequence modeling, respectively.'),
 ('What does the chart in the output show?',
  'The chart in the output shows the scaling curves of RetNet and Transformer in terms of GPU memory, throughput, and latency.'),
 

In [47]:
len(qa_pairs['Question']), len(qa_pairs['Answer'])

(18, 18)

# Save datset locally

In [48]:
qna_dfb = pd.DataFrame(qa_pairs)
qna_dfb.to_csv("mlops_project/qna_data/qna_train.csv", index=False)

In [49]:
print(qna_dfb.head())

                                            Question  \
0  What is the main goal of the proposed architec...   
1  What is the connection between recurrence and ...   
2  What is the retention mechanism proposed in th...   
3            What does the chart in the output show?   
4  What is the main advantage of RetNet over Tran...   

                                              Answer  
0  The main goal of the proposed architecture, RE...  
1  The authors theoretically derive the connectio...  
2  The retention mechanism is a sequence modeling...  
3  The chart in the output shows the scaling curv...  
4  The main advantage of RetNet over Transformer ...  


# Paper References

In [50]:
refs_qa_pairs = {"Question":[], "Answer":[]}

for ix, ref in enumerate(refs_text_chunks.values):
    q = f"What is reference [{ix+1}]?"
    refs_qa_pairs["Question"].append(q)
    refs_qa_pairs["Answer"].append(ref)

ref_qna_df = pd.DataFrame(refs_qa_pairs)
ref_qna_df.to_csv("mlops_project/qna_data/qna_train_Ref.csv", index=False)

In [51]:
import pandas as pd

df = pd.read_csv("mlops_project/qna_data/qna_train.csv")

df['text_question'] = 'Question:\n' + df['Question']
df['text_answer'] = 'Answer:\n' + df['Answer']

In [52]:
print(df['text_answer'].iloc[0])

Answer:
The main goal of the proposed architecture, RETNET, is to achieve training parallelism, low-cost inference, and good performance simultaneously.


# Version dataset with MLflow

In [57]:
csv_data_path = "mlops_project/qna_data/qna_train.csv"
dataset: PandasDataset = mlflow.data.from_pandas(df, source=csv_data_path)


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(


In [60]:

with mlflow.start_run():
    # Log the dataset to the MLflow Run. Specify the "training" context to indicate that the
    # dataset is used for model training
    mlflow.log_input(dataset, context="qna_training", tags={f'arxiv': arxivv})


# Retrieve the run, including dataset information


In [61]:
run_id = mlflow.last_active_run().info.run_id
run_id

'8fb8341b0e0d4e37ac48481d94b33021'

In [62]:
run = mlflow.get_run(mlflow.last_active_run().info.run_id)
dataset_info = run.inputs.dataset_inputs[0].dataset
print(f"Dataset name: {dataset_info.name}")
print(f"Dataset digest: {dataset_info.digest}")
print(f"Dataset profile: {dataset_info.profile}")
print(f"Dataset schema: {dataset_info.schema}")


Dataset name: dataset
Dataset digest: a6fee9ff
Dataset profile: {"num_rows": 18, "num_elements": 72}
Dataset schema: {"mlflow_colspec": [{"type": "string", "name": "Question", "required": true}, {"type": "string", "name": "Answer", "required": true}, {"type": "string", "name": "text_question", "required": true}, {"type": "string", "name": "text_answer", "required": true}]}


In [63]:
run.inputs.dataset_inputs[0].dataset

<Dataset: digest='a6fee9ff', name='dataset', profile='{"num_rows": 18, "num_elements": 72}', schema=('{"mlflow_colspec": [{"type": "string", "name": "Question", "required": '
 'true}, {"type": "string", "name": "Answer", "required": true}, {"type": '
 '"string", "name": "text_question", "required": true}, {"type": "string", '
 '"name": "text_answer", "required": true}]}'), source='{"uri": "mlops_project/qna_data/qna_train.csv"}', source_type='local'>

In [64]:
dataset_source = mlflow.data.get_source(dataset_info)
dataset_source.to_dict()

{'uri': 'mlops_project/qna_data/qna_train.csv'}

# Load data from the run_id

In [65]:
versioned_df = pd.read_csv(dataset_source.uri)

In [66]:
print(versioned_df.head())

                                            Question  \
0  What is the main goal of the proposed architec...   
1  What is the connection between recurrence and ...   
2  What is the retention mechanism proposed in th...   
3            What does the chart in the output show?   
4  What is the main advantage of RetNet over Tran...   

                                              Answer  
0  The main goal of the proposed architecture, RE...  
1  The authors theoretically derive the connectio...  
2  The retention mechanism is a sequence modeling...  
3  The chart in the output shows the scaling curv...  
4  The main advantage of RetNet over Transformer ...  
