# Importing Libraries

In [1]:
import os
import torch
import warnings

import pandas as pd

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

warnings.filterwarnings('ignore')

KeyboardInterrupt: 

In [None]:
OUTPUT_DIR = "results"
LOG_DIR = "logs"

TRAIN_FILE = "dataset/train.txt"  # Path to your training data file
VAL_FILE = "dataset/val.txt"  # Path to your validation data file
TEST_FILE = "dataset/test.txt"  # Path to your training data file

# Loading Pre-Trained Model

In [None]:
# Load model directly
from transformers import AutoProcessor, AutoModelForImageTextToText

processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
model = AutoModelForImageTextToText.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")

In [None]:
# Load FLAN-T5
MODEL_NAME = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

model.gradient_checkpointing_enable()

model.config.resid_pdrop = 0.1  # Add dropout
model.config.embd_pdrop = 0.1
model.config.attn_pdrop = 0.1

model.resize_token_embeddings(len(tokenizer))

Embedding(32100, 768)

In [None]:
torch.mps.empty_cache()

In [None]:
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32100, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [None]:
def count_tokens(file_path, tokenizer):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    tokens = tokenizer.tokenize(text)
    return len(tokens)

train_token_count = count_tokens("dataset/train.txt", tokenizer)
eval_token_count = count_tokens("dataset/val.txt", tokenizer)

print(train_token_count)
print(eval_token_count)

Token indices sequence length is longer than the specified maximum sequence length for this model (1193 > 512). Running this sequence through the model will result in indexing errors


1193
246


# Training the model


## Setting up training arguments

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,  # Reduce batch size if necessary
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    logging_dir="logs",

    fp16=False,  # Ensure mixed precision is disabled
    bf16=False,  # Disable bf16 as well
    half_precision_backend="cpu"  # Force precision handling on CPU (not MPS)
)



## Loading the dataset

In [None]:
def load_chatbot_data(file_path):
    """Load and preprocess chatbot data from the given text file."""
    conversations = []
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
        user_input, bot_response = None, None
        for line in lines:
            if line.startswith("User:"):
                user_input = line.replace("User:", "").strip()
            elif line.startswith("Bot:"):
                bot_response = line.replace("Bot:", "").strip()
                if user_input and bot_response:
                    conversations.append({"input": user_input, "output": bot_response})
                    user_input, bot_response = None, None
    return pd.DataFrame(conversations)

# Prepare datasets
df_train = load_chatbot_data(TRAIN_FILE)
df_val = load_chatbot_data(VAL_FILE)

dataset_train = Dataset.from_pandas(df_train)
dataset_val = Dataset.from_pandas(df_val)

In [None]:
print(f"Length of train_dataset: {len(dataset_train)}")
print(f"Length of eval_dataset: {len(dataset_val)}")

Length of train_dataset: 16
Length of eval_dataset: 4


In [None]:
def tokenize_function(examples):
    model_inputs = tokenizer(examples["input"], max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(examples["output"], max_length=512, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    
    model_inputs = {key: torch.tensor(val).to(device) for key, val in model_inputs.items()}  

    return model_inputs

# Apply tokenization
tokenized_train = dataset_train.map(tokenize_function, batched=True)
tokenized_val = dataset_val.map(tokenize_function, batched=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

## Initialising the trainer

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
torch.mps.empty_cache()

In [None]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Seq2SeqTrainer(


In [None]:
# Train the model
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss
1,No log,35.789917
2,No log,33.581078
3,No log,32.79348


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


TrainOutput(global_step=12, training_loss=34.244160970052086, metrics={'train_runtime': 38.6151, 'train_samples_per_second': 1.243, 'train_steps_per_second': 0.311, 'total_flos': 32865182023680.0, 'train_loss': 34.244160970052086, 'epoch': 3.0})

## Saving the model

In [None]:
model.save_pretrained("results/flan_t5_chatbot")
tokenizer.save_pretrained("results/flan_t5_chatbot")

('results/flan_t5_chatbot/tokenizer_config.json',
 'results/flan_t5_chatbot/special_tokens_map.json',
 'results/flan_t5_chatbot/tokenizer.json')

# Evaluation

In [None]:
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
model.to(device)  # Move model to MPS

def chat_with_model(prompt, max_length=100):
    """Generate a response from the trained chatbot model."""
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=1024)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    output = model.generate(**inputs, max_length=max_length, num_return_sequences=1)
    response = tokenizer.decode(output[0].cpu(), skip_special_tokens=True)
    
    return response

if __name__ == "__main__":
    print("Chatbot is ready! Type 'exit' to stop.")
    while True:
        user_input = input("User: ")
        if user_input.lower() == "exit":
            break
        bot_response = chat_with_model(user_input)
        print(f"User: {user_input}")
        print(f"Bot: {bot_response}")

Chatbot is ready! Type 'exit' to stop.
User: how do i cure depression?
Bot: eat a lot of fruits and vegetables.


In [None]:
def chatbot_response(prompt, max_length=100):
    # Tokenize input prompt
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Generate response
    outputs = model.generate(**inputs, max_length=max_length)

    # Decode the generated text
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response


In [None]:
user_input = "how do i cure depression?"
response = chatbot_response(user_input)
print("Chatbot:", response)

Chatbot: eat a lot of fruits and vegetables.


# Model for translation

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")