In [58]:
# Validation with Base Model 
#Load the Fine-Tuned Model and Tokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the model and tokenizer from the saved directory
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

# Example usage for a question and answer task
def answer_question(question):
    inputs = tokenizer(question, return_tensors="pt")
    outputs = model.generate(inputs.input_ids)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Example question
question = "What is a correlated anomaly?"
print(answer_question(question))

a symbiotic relationship


In [8]:
from datasets import load_dataset , DatasetDict, Dataset 
from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM, DataCollatorWithPadding, TrainingArguments, Trainer, 
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig 
import evaluate
import torch
import numpy as np 
from transformers import DistilBertModel, DistilBertTokenizer

In [None]:
# Step 1: Load the CSV Data

In [11]:
import pandas as pd

# Load the custom data from a CSV file
data = pd.read_csv("./FineTuning/output/qa_pairs.csv")

# Check the first few rows of the dataset
print(data.head())

                                            question  \
0  What is the topic of section 'Anomalies' in th...   
1  What are the sub-topics covered under 'Anomali...   
2  What are the different properties that can be ...   
3  How many lines does the text contain about ano...   
4    What are the main topics discussed in the text?   

                                              answer  
0  The topic of section 'Anomalies' in the Cisco ...  
1  The 'Anomalies' section in the Cisco Nexus Das...  
2  Anomaly properties include filters, global rul...  
3     The text contains 2 lines about anomaly rules.  
4  The text discusses Anomaly rules, guidelines a...  


In [None]:
#Step 2: Create a Dataset

In [28]:
# Convert the DataFrame to a Dataset object
dataset = Dataset.from_pandas(data)
# Example function to preprocess data
def preprocess_data(examples):
    return {
        "input_text": ["Question: " + question for question in examples["question"]],
        "target_text": ["Answer: " + answer for answer in examples["answer"]]
    }

# Apply the preprocessing function to the dataset
dataset = dataset.map(preprocess_data, batched=True, remove_columns=["question", "answer"])

dataset

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Dataset({
    features: ['input_text', 'target_text'],
    num_rows: 86
})

In [29]:
#Step 3: Fine-Tune the Model

In [30]:
# Load tokenizer and model
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [35]:
# Tokenize the dataset
def tokenize_function(examples):
    model_inputs = tokenizer(examples["input_text"], max_length=512, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["target_text"], max_length=512, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    model_inputs["input_ids"] = model_inputs["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["input_text", "target_text"])
tokenized_dataset

Map:   0%|          | 0/86 [00:00<?, ? examples/s]



Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 86
})

In [48]:
# Data collator to handle padding
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=100,
    predict_with_generate=True,
    remove_unused_columns=False,
    gradient_accumulation_steps=4,
    no_cuda=True  # Force training on CPU
)

# Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()


  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
0,No log,1.876881
1,No log,1.826055
2,No log,1.77704
3,No log,1.728743
4,No log,1.681937
5,No log,1.639316
6,No log,1.598396
7,No log,1.557857
8,No log,1.520058
9,No log,1.48286


TrainOutput(global_step=1000, training_loss=5.2360244140625, metrics={'train_runtime': 8048.9961, 'train_samples_per_second': 1.068, 'train_steps_per_second': 0.124, 'total_flos': 250912747837440.0, 'train_loss': 5.2360244140625, 'epoch': 99.93023255813954})

In [49]:
#Save the model and tokenizer to a local directory
model.save_pretrained("./FineTune/flan-t5-base-NDI/")
tokenizer.save_pretrained("./FineTune/flan-t5-base-NDI/")


('./FineTune/flan-t5-base-NDI/tokenizer_config.json',
 './FineTune/flan-t5-base-NDI/special_tokens_map.json',
 './FineTune/flan-t5-base-NDI/tokenizer.json')

In [50]:
#Load the Fine-Tuned Model and Tokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the model and tokenizer from the saved directory
model = AutoModelForSeq2SeqLM.from_pretrained("./FineTune/flan-t5-base-NDI/")
tokenizer = AutoTokenizer.from_pretrained("./FineTune/flan-t5-base-NDI/")

# Example usage for a question and answer task
def answer_question(question):
    inputs = tokenizer(question, return_tensors="pt")
    outputs = model.generate(inputs.input_ids)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Example question
question = "What is a correlated anomaly?"
print(answer_question(question))

A correlated anomaly is an anomaly that occurred because of another anomaly.


In [56]:
from huggingface_hub import interpreter_login
interpreter_login()
model.push_to_hub('flanT5base-NDI')


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|



Enter your token (input will not be visible):  ········
Add token as git credential? (Y/n)  Y


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/AnanthMeka/flanT5base-NDI/commit/c620d0f53f517032effdbc8ed2e2ccba5cd1e55d', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='c620d0f53f517032effdbc8ed2e2ccba5cd1e55d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/AnanthMeka/flanT5base-NDI', endpoint='https://huggingface.co', repo_type='model', repo_id='AnanthMeka/flanT5base-NDI'), pr_revision=None, pr_num=None)

a symbiotic relationship
