# Importing libraries

In [2]:
import torch
import os
import warnings
import platform

import pandas as pd
import numpy as np

from bert_score import score
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, pipeline
from datasets import Dataset
from evaluate import load

# Defining file paths

In [3]:
OUTPUT_DIR = "results"
LOG_DIR = "logs"
TRAIN_FILE = "dataset/train.txt"  # Path to your training data file
VAL_FILE = "dataset/validation.txt"  # Path to your validation data file
TEST_FILE = "dataset/test.txt"  # Path to your test data file

# Loading Pre-trained Model

In [4]:
# tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-360M-Instruct")
# model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-360M-Instruct")

In [18]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

In [19]:
original_vocab_size = model.config.vocab_size
print(f"Original vocab size: {original_vocab_size}")

Original vocab size: 151936


In [20]:
print(f"Model max length: {model.config.max_position_embeddings}")

Model max length: 32768


In [21]:
print(f"Max sequence length: {tokenizer.model_max_length}")

Max sequence length: 131072


In [22]:
model.gradient_checkpointing_enable()
model.resize_token_embeddings(len(tokenizer))

Embedding(151665, 896)

# Setting the device

In [10]:
torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(device)

cuda


In [23]:
def count_tokens(file_path, tokenizer):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    tokens = tokenizer.tokenize(text)
    return len(tokens)

# Count tokens in training and validation files
train_token_count = count_tokens(TRAIN_FILE, tokenizer)
eval_token_count = count_tokens(VAL_FILE, tokenizer)
print(f"Train Tokens: {train_token_count}")
print(f"Validation Tokens: {eval_token_count}")

Train Tokens: 22870
Validation Tokens: 6054


# Training the model

## Setting up training arguments

In [25]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    logging_strategy="epoch",     # Logs loss at intervals
    learning_rate=1e-5,
    per_device_train_batch_size=1,  # Reduced batch size for limited GPU memory
    per_device_eval_batch_size=1,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir=LOG_DIR,
    gradient_accumulation_steps=4,  # Accumulate gradients to simulate larger batch sizes
    fp16=True,
    bf16=False,
    optim="adamw_torch"
)

## Loading dataset

In [26]:
def load_chatbot_data(file_path):
    """Load and preprocess chatbot data from the given text file."""
    conversations = []
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
        user_input, bot_response = None, None
        for line in lines:
            if line.startswith("user:"):
                user_input = line.replace("user:", "").strip()
            elif line.startswith("bot:"):
                bot_response = line.replace("bot:", "").strip()
                if user_input and bot_response:
                    conversations.append({"input": user_input, "output": bot_response})
                    user_input, bot_response = None, None
    return pd.DataFrame(conversations)

In [27]:
df_train = load_chatbot_data(TRAIN_FILE)
df_val = load_chatbot_data(VAL_FILE)
dataset_train = Dataset.from_pandas(df_train)
dataset_val = Dataset.from_pandas(df_val)

print(f"Length of training dataset: {len(df_train)}")
print(f"Length of validation dataset: {len(df_val)}")

Length of training dataset: 282
Length of validation dataset: 71


## Tokenizing dataset

In [28]:
def tokenize_function(examples):
    inputs = [f"{inp} {out}" for inp, out in zip(examples["input"], examples["output"])]
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    model_inputs["labels"] = model_inputs["input_ids"].clone() 
    # model_inputs["labels"] = torch.tensor(model_inputs["input_ids"]).to("cpu")
    return model_inputs

# Apply tokenization
tokenized_train = dataset_train.map(tokenize_function, batched=True)
tokenized_val = dataset_val.map(tokenize_function, batched=True)

Map:   0%|          | 0/282 [00:00<?, ? examples/s]

Map:   0%|          | 0/71 [00:00<?, ? examples/s]

## Data collator 

In [29]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Masked language modeling is not used for causal LM
)

## Compute Metrics

In [None]:
# tokenizer = AutoTokenizer.from_pretrained("roberta-large")
# accuracy_metric = load("accuracy")

# def compute_bertscore(references, candidates, model_type="roberta-large"):
#     """Compute BERTScore for chatbot responses."""
#     P, R, F1 = score(candidates, references, model_type=model_type, lang="en", rescale_with_baseline=True)
#     return {
#         "Precision": P.mean().item(),
#         "Recall": R.mean().item(),
#         "F1 Score": F1.mean().item()
#     }

# def compute_metrics(eval_pred):
#     """Compute accuracy, BERTScore, for evaluation."""
#     predictions, labels = eval_pred

#     # Ensure predictions and labels are in the correct format
#     predictions = np.argmax(predictions, axis=-1).tolist()
#     labels = labels.tolist()

#     # Decode model outputs
#     decoded_preds = [tokenizer.decode(pred, skip_special_tokens=True) for pred in predictions]
#     decoded_labels = [tokenizer.decode(label, skip_special_tokens=True) for label in labels]

#     # Compute accuracy
#     accuracy = accuracy_metric.compute(predictions=predictions, references=labels)

#     # Compute BERTScore (similarity of response to ground truth)
#     bert_scores = compute_bertscore(decoded_labels, decoded_preds)

#     # Combine metrics
#     final_metrics = {
#         "Accuracy": accuracy["accuracy"],
#         **bert_scores,
#     }

#     print("\nEvaluation Metrics:", final_metrics)
#     return final_metrics

In [30]:
from transformers import TrainerCallback

class LogTrainingLossCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        logs = state.log_history
        train_loss = None

        # Extract latest training loss
        for log in reversed(logs):
            if "loss" in log:
                train_loss = log["loss"]
                break  # Get the last recorded loss

        if train_loss is not None:
            print(f"Epoch {state.epoch}: Training Loss = {train_loss}")

## Initialising the model

In [31]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_metrics,
    callbacks=[LogTrainingLossCallback()],  # Attach custom callback
)

trainer.train()

  trainer = Trainer(
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss
1,1.7675,1.595497
2,0.9245,1.651783
3,0.4681,1.862923


Epoch 2.0: Training Loss = 1.7675
Epoch 3.0: Training Loss = 0.9245


KeyboardInterrupt: 

## Saving Trained Model

In [None]:
model.save_pretrained("results")
tokenizer.save_pretrained("results")

# Evaluating the model

## Load trained model

In [None]:
model = AutoModelForCausalLM.from_pretrained("results", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("results", device_map="auto")

model.to(device)

In [None]:
# def chatbot_response(user_input, chatbot):
#     sentiment = get_sentiment(user_input)  # Get sentiment from BERT

#     # Define chatbot responses based on sentiment
#     responses = {
#         "positive": "I'm happy to hear that! Keep up the great energy. 😊",
#         "neutral": "That sounds interesting. Tell me more, I'm here to listen. 🧡",
#         "negative": "I'm sorry you're feeling this way. You're not alone—would you like some self-care tips? 💙"
#     }

#     # Modify chatbot input based on sentiment
#     modified_input = f"{user_input} [Sentiment: {sentiment.capitalize()}]"

#     # Get chatbot response
#     bot_reply = chatbot.generate_response(modified_input)  # Assuming chatbot has a generate_response method

#     # Modify response based on sentiment if needed
#     if sentiment == "negative":
#         bot_reply += " Remember, you're not alone. Would you like some coping tips? 💙"

#     return bot_reply

In [None]:
pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-en-zh")
sentiment_classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

In [None]:
def get_sentiment(text):
    """Improved sentiment analysis with keyword-based distress detection."""
    crisis_keywords = ["end my life", "suicide", "don't want to live", "kill myself", "worthless", "no reason to live"]

    # Check if crisis words are in the input
    if any(phrase in text.lower() for phrase in crisis_keywords):
        return "crisis"  # Override sentiment if crisis words are detected

    # Otherwise, use DistilBERT-based sentiment analysis
    result = sentiment_classifier(text)[0]
    label = result['label']

    # Convert to sentiment categories based on DistilBERT outputs
    if label == "NEGATIVE":
        return "negative"
    elif label == "POSITIVE":
        return "positive"
    else:
        return "neutral"

In [None]:
def chatbot_response(prompt, max_length=200):
    system_prompt = "You are a helpful and supportive chatbot. Answer the user's question in a clear and concise way without repeating their words exactly."
    full_prompt = f"{system_prompt}\nUser: {prompt}\nBot:"

    sentiment_results = get_sentiment(prompt)

    inputs = tokenizer(full_prompt, return_tensors="pt")
    inputs = {key: val.to(device) for key, val in inputs.items()}

    outputs = model.generate(
        **inputs, 
        max_length=max_length,
        repetition_penalty=1.3,  # Penalize repeated words/phrases
        no_repeat_ngram_size=3,  # Prevent 3-word repetition
        temperature=0.7,  # Lower randomness to keep responses meaningful
        top_p=0.9,  # Diverse sampling to avoid looping responses
        top_k=50  # Limits token choices to avoid repetition
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Ensure the response does not include the system prompt
    response = response.replace(system_prompt, "").strip()
    
    # Remove any leftover system prompt instructions
    if "Bot:" in response:
        response = response.split("Bot:")[-1].strip()

    # Translate response to Chinese
    translated = pipe(response)[0]['translation_text']

    return response, translated, sentiment_results

In [None]:
if __name__ == "__main__":
    print("Chatbot is ready! Type 'exit' to stop.")
    while True:
        user_input = input("User: ")
        if user_input.lower() == "exit":
            break
        response, translated, sentiment_results = chatbot_response(user_input)
        print(f"User: {user_input}")
        print(f"Bot: {response}")
        print(f"Translated Text: {translated}")
        print(f"Sentiment Results: {sentiment_results}")