# 1. GitHub Clone

In [1]:
!git clone https://github.com/Falgun1/NLP-Corpus
%cd NLP-Corpus/Pipeline

Cloning into 'NLP-Corpus'...
remote: Enumerating objects: 1421, done.[K
remote: Counting objects: 100% (520/520), done.[K
remote: Compressing objects: 100% (309/309), done.[K
remote: Total 1421 (delta 417), reused 284 (delta 211), pack-reused 901[K
Receiving objects: 100% (1421/1421), 2.88 MiB | 27.85 MiB/s, done.
Resolving deltas: 100% (693/693), done.
/kaggle/working/NLP-Corpus/Pipeline


# 2. Library

In [2]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
assert device == torch.device('cuda'), "Not using CUDA. Set: Runtime > Change runtime type > Hardware Accelerator: GPU"

In [6]:
%%capture
!pip install -q bitsandbytes
!pip install -q transformers
!pip install -q nltk
!pip install -q datasets
!pip install -q textstat
!pip install -q rouge_score
major_version, minor_version = torch.cuda.get_device_capability()
if major_version >= 8:
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

In [7]:
import torch,os, json, re, random  
import bitsandbytes as bnb
import torch.nn as nn
import pandas as pd
from pprint import pprint
from datasets import Dataset, DatasetDict, load_dataset
from transformers import (AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, Trainer, TrainingArguments, DataCollatorForLanguageModeling)
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig
from huggingface_hub import login
from trl import SFTTrainer
from keywords_manager import KeywordsManager
from wiki import WikiArticleFetcher, FilteredWikiArticleFetcher
from file_utils import ZipExtractor
from generator import QuestionGenerator, print_qa
from question_generator import QuestionAnswerGenerator

# 3.Web Scraping

In [8]:
def data_collector():
    wscraping = FilteredWikiArticleFetcher(keywords_manager=KeywordsManager(),file_limit=5,filtered_names = ['wiki_CNSC'] )
    wscraping.fetch_and_save_articles()  
if __name__ == "__main__":
    data_collector()

Articles found: 5
Article limit reached. Stopping the process.

Total articles found and added to ZIP: 5
Total time taken: 102.96 seconds


In [9]:
def zip_extractor():
    extractor = ZipExtractor(zip_path = 'filtered_articles.zip', extract_to = 'Articles')
    extractor.extract()
if __name__ == "__main__":
    zip_extractor()  
def list_files_in_directory(directory):
    if not os.path.exists(directory):
        print(f"The directory {directory} does not exist.")
        return []
    files = [f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))]
    return files
def print_files():
    directory = 'Articles'
    files = list_files_in_directory(directory)
    if files:
        print(f"Files in '{directory}' directory:")
        for file in files:
            print(file)
    else:
        print("No files found.")
if __name__ == "__main__":
    print_files()

Extracted filtered_articles.zip to Articles
Files in 'Articles' directory:
wiki_CNSC_Bruce_Nuclear_Generating_Station.txt
wiki_CNSC_Chalk_River_Laboratories.txt
wiki_CNSC_National_Research_Universal_reactor.txt
wiki_CNSC_Whiteshell_Laboratories.txt
wiki_CNSC_Canadian_Nuclear_Safety_Commission.txt


# 4.Q&A Generator

In [10]:
def main():  
    qag = QuestionAnswerGenerator(articles_folder = "Articles" , num_questions = 20, answer_style = 'all')
    qag.generate_questions()
if __name__ == "__main__":
    main()

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Generating questions...





Evaluating QA pairs...

1) Q: What was the amount of water released from the NRU reactor?
   A: In its formal report to the CNSC, filed on December 9, 2008 (when the volume of leakage was determined to meet the requirement for such a report) AECL mentioned that 47 litres (10 imp gal; 12 US gal) of heavy water were released from the reactor, about 10% of which evaporated and the rest contained, but affirmed that the spill was not serious and did not present a threat to public health.

2) Q: What was the first time that a medical isotope was produced in nature?
   A: With the construction of the earlier NRX reactor, it was possible for the first time to commercially manufacture isotopes that were not commonly found in nature.

3) Q: How many workers were exposed to radiation during the refurbishment?
   A: In January 2010, up to 217 workers were potentially exposed to radiation during the refurbishment.

4) Q: What was the government's response to the shutdown of the NRU reactor?
   A: O

In [12]:
# def read_json_file(file_path):
#     with open(file_path, 'r', encoding='utf-8') as file:
#         data = json.load(file)
#     return data

# def print_json_content(file_path):
#     data = read_json_file(file_path)
#     print(f"Content of {file_path}:")
#     print(json.dumps(data, indent=4))  # Pretty-print the JSON data

# train_file_path = 'train.json'
# test_file_path = 'test.json'

# print_json_content(train_file_path)
# print_json_content(test_file_path)

# import json
# def count_records(file_path):
#     with open(file_path, 'r') as file:
#         data = json.load(file)
#     return len(data)
# def main():
#     train_file_path = 'train.json'
#     test_file_path = 'test.json' 
#     train_count = count_records(train_file_path)
#     test_count = count_records(test_file_path)  
#     print(f"Number of records in train.json: {train_count}")
#     print(f"Number of records in test.json: {test_count}")
# if __name__ == "__main__":
#     main()

Number of records in train.json: 16
Number of records in test.json: 4


# 5. Model Loader

In [6]:
HF_TOKEN = "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"
model_name = "meta-llama/Meta-Llama-3-8B"
max_seq_length = 2048

def load_model_and_tokenizer():
    """Load the model and tokenizer with configurations."""
    try:
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=HF_TOKEN)
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "right"

        special_tokens = tokenizer.special_tokens_map_extended
        eos_token = tokenizer.eos_token
        eos_token_id = tokenizer.eos_token_id
        
        print("EOS Token:", eos_token)
        print("EOS Token ID:", eos_token_id)
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
        print("Loading model...")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=quantization_config,
            device_map='auto',  
            low_cpu_mem_usage=True,
            use_auth_token=HF_TOKEN
        )
        print("Applying gradient checkpointing and preparing for k-bit training...")
        model.gradient_checkpointing_enable()
        model = prepare_model_for_kbit_training(model)
        print("Model and tokenizer loaded and configured successfully.")
        return model, tokenizer
    except Exception as e:
        print("An error occurred:", e)
        return None, None

def apply_lora_config(model):
    """Apply LoRA configuration to the model."""
    try:
        print("Applying LoRA configuration...")
        lora_config = LoraConfig(
            r=16,
            lora_alpha=16,
            target_modules=["q_proj", "v_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM"
        )
        model = get_peft_model(model, lora_config)
        print("LoRA configuration applied successfully.")
        return model
    except Exception as e:
        print("An error occurred while applying LoRA configuration:", e)
        return model
model, tokenizer = load_model_and_tokenizer()

if model and tokenizer:
    model = apply_lora_config(model)

Loading tokenizer...




tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


EOS Token: <|end_of_text|>
EOS Token ID: 128001
Loading model...




config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

Applying gradient checkpointing and preparing for k-bit training...
Model and tokenizer loaded and configured successfully.
Applying LoRA configuration...
LoRA configuration applied successfully.


# 6. Dataset Converting to Appropriate Format for Huggingface Transformers

In [7]:
def add_questions_key(input_file, output_file):
    # Read the JSON file
    with open(input_file, 'r') as infile:
        data = json.load(infile)        
    # Format the data by adding the 'questions' key
    formatted_data = {
        "questions": data
    }

    # Write the formatted data to a new file
    with open(output_file, 'w') as outfile:
        json.dump(formatted_data, outfile, indent=4)
# File paths for JSON datasets
train_input_file = 'train.json'
train_output_file = 'train_dataset.json'

test_input_file = 'test.json'
test_output_file = 'test_dataset.json'

# Convert training and test datasets to the appropriate format
add_questions_key(train_input_file, train_output_file)
add_questions_key(test_input_file, test_output_file)

print("JSON files have been formatted and saved successfully.")

with open("test_dataset.json") as json_file:
    test = json.load(json_file)    
with open("train_dataset.json") as json_file:
    train = json.load(json_file)
pd.DataFrame(train["questions"]).head()
pd.DataFrame(test["questions"]).head()
pprint(train["questions"][0], sort_dicts=False)
pprint(test["questions"][0], sort_dicts=False)

JSON files have been formatted and saved successfully.
{'question': 'What was the reason for the shutdown?',
 'answer': 'Although the leakage was not a concern to the CNSC from a health, '
           'safety or environmental perspective, AECL made plans for a repair '
           'to reduce the current leakage rate for operational reasons.'}
{'question': 'What are the main themes of the comments on the draft '
             'environmental impact statement?',
 'answer': 'CNSC Disposition Table of Public and Indigenous Groups’ Comments '
           'on the Draft Environmental Impact Statement-WR-1> The main themes '
           'of these comments are Public and Aboriginal Consultation, CNSC '
           'Impartiality, and Decommissioning Waste Policies.'}


In [8]:
# Function to check data format
def check_data_format(data):
    if "questions" not in data or not isinstance(data["questions"], list):
        raise ValueError("The data does not contain the 'questions' key or it is not a list.")

check_data_format(train)
check_data_format(test)

# Define the prompt format
prompt = """Below is a question paired with an answer. Please write a response that appropriately completes the request.

### Question:
{}

### Answer:
{}"""

# Get special tokens and EOS token from the tokenizer
special_tokens = tokenizer.special_tokens_map_extended
eos_token = tokenizer.eos_token
eos_token_id = tokenizer.eos_token_id

# Function to format prompts
def formatting_prompts_func(examples):
    questions = examples["question"]
    answers = examples["answer"]
    texts = []
    for question, answer in zip(questions, answers):
        # Format the text according to the prompt and append eos_token
        text = prompt.format(question, answer) + eos_token
        texts.append(text)
    return {"text": texts}

# Function to convert data into dataset format
def create_and_format_dataset(data):
    dataset_dict = {
        "question": [item["question"] for item in data["questions"]],
        "answer": [item["answer"] for item in data["questions"]],
    }
    dataset = Dataset.from_dict(dataset_dict)
    # Apply the formatting prompts function and remove 'text' column
    dataset = dataset.map(formatting_prompts_func, batched=True)
    dataset = dataset.remove_columns(["text"])
    return dataset

# Create and format training and test datasets
train_dataset = create_and_format_dataset(train)
test_dataset = create_and_format_dataset(test)

# Create a DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})
# Check the formatted dataset
print(dataset_dict)

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 16
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 4
    })
})


In [9]:
def preprocess_function(examples):
    # Tokenize the input texts
    inputs = tokenizer(examples['question'], padding='max_length', truncation=True, max_length=max_seq_length, return_tensors='pt')
    labels = tokenizer(examples['answer'], padding='max_length', truncation=True, max_length=max_seq_length, return_tensors='pt')
    
    # Add labels to inputs
    inputs['labels'] = labels['input_ids']
    
    # Create attention masks for the inputs
    inputs['attention_mask'] = inputs['attention_mask']
    
    return inputs

# Apply preprocessing
train_dataset = train_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)
# print(train_dataset[0])
# print(train_dataset)

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

# 6. Training

In [10]:
login(token="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")
OUTPUT_DIR = "experiments"

training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=True,
    save_total_limit=3,
    logging_steps=10,
    output_dir=OUTPUT_DIR,
    max_steps=5,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    report_to="tensorboard",
    evaluation_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=10
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

model.eval()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)
model.config.use_cache = False
trainer.train()

max_steps is given, it will override any value given in num_train_epochs


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful




Step,Training Loss,Validation Loss


TrainOutput(global_step=5, training_loss=3.9242427825927733, metrics={'train_runtime': 244.7403, 'train_samples_per_second': 0.082, 'train_steps_per_second': 0.02, 'total_flos': 1846085324636160.0, 'train_loss': 3.9242427825927733, 'epoch': 1.25})

# 7. Model Save and Load

In [None]:
# def save_model_and_tokenizer(output_dir, model, tokenizer):
#     model.save_pretrained(output_dir)
#     tokenizer.save_pretrained(output_dir)
#     print(f"Model and tokenizer saved to {output_dir}")

# # Model and tokenizer save
# save_model_and_tokenizer(OUTPUT_DIR, model, tokenizer)

# def load_model_and_tokenizer(output_dir):
#     model = AutoModelForCausalLM.from_pretrained(output_dir)
#     tokenizer = AutoTokenizer.from_pretrained(output_dir)
#     print(f"Model and tokenizer loaded from {output_dir}")
#     return model, tokenizer
# loaded_model, loaded_tokenizer = load_model_and_tokenizer(OUTPUT_DIR)
# # Model evaluation mode
# loaded_model.eval()

In [13]:
trainer.save_model()

In [14]:
# Define the prompt format
prompt = """Below is a question paired with an answer. Please write a response that appropriately completes the request.

### Question:
{}

### Answer:
{}"""

def generate_answer(question):
    # Format the prompt with the question
    formatted_prompt = prompt.format(question, "")

    # Tokenize the formatted prompt
    inputs = tokenizer(formatted_prompt, return_tensors="pt")
    
    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            max_length=2048,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id
        )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer = answer.split('### Answer:')[-1].strip()
    return answer

In [15]:
test_questions = [
    "What is the CNSC"
]

for question in test_questions:
    print(f"Question: {question}")
    print(f"Answer: {generate_answer(question)}\n")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Question: What is the CNSC




Answer: The CNSC is the Canadian Nuclear Safety Commission. The CNSC is an independent federal government agency responsible for regulating the use of nuclear energy and materials to protect the health, safety and security of Canadians and the environment.



In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import load_dataset, load_metric
from textstat.textstat import textstatistics

# Load metric
rouge = load_metric("rouge")

# Load model and tokenizer
model_name = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained("experiments")  

# Evaluate ROUGE scores
def evaluate_rouge(predictions, references):
    results = rouge.compute(predictions=predictions, references=references)
    return results

# Calculate readability complexity
def calculate_readability(text):
    complexity = textstatistics().flesch_reading_ease(text)
    return complexity

# Load TrueQA dataset
truthfulqa = load_dataset("truthfulqa")

# Evaluation function for TrueQA
def evaluate_truthfulqa(model, tokenizer, dataset):
    scores = []
    for item in dataset:
        question = item["question"]
        reference_answer = item["answer"]
        generated_answer = generate_answer(question, model, tokenizer)
        
        # Evaluate using ROUGE
        rouge_result = evaluate_rouge([generated_answer], [reference_answer])
        scores.append(rouge_result)
    return scores

# Evaluate on TrueQA validation dataset
truthfulqa_scores = evaluate_truthfulqa(model, tokenizer, truthfulqa["validation"])
print(truthfulqa_scores)

# MLM pipeline (Optional: If model supports fill-mask)
mlm_pipeline = pipeline("fill-mask", model=model, tokenizer=tokenizer)
masked_sentence = "The capital of [MASK] is Paris."
results = mlm_pipeline(masked_sentence)
print(results)


The repository for rouge contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/rouge.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]