# Importing libraries

In [21]:
import torch
import os
import warnings
import platform

import pandas as pd
import numpy as np

from bert_score import score
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, pipeline
from datasets import Dataset
from evaluate import load

# Defining file paths

In [22]:
# Destinations for the fine-tuned checkpoint and the trainer's logs.
OUTPUT_DIR = "results2"
LOG_DIR = "logs"

# Plain-text dialogue splits; all three sit in the same dataset folder.
_DATASET_DIR = "dataset2"
TRAIN_FILE = f"{_DATASET_DIR}/train.txt"
VAL_FILE = f"{_DATASET_DIR}/validation.txt"
TEST_FILE = f"{_DATASET_DIR}/test.txt"

# Loading Pre-trained Model

In [23]:
# Base checkpoint to fine-tune; tokenizer and weights are loaded from the same id.
_BASE_CHECKPOINT = "HuggingFaceTB/SmolLM2-360M-Instruct"
tokenizer = AutoTokenizer.from_pretrained(_BASE_CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(_BASE_CHECKPOINT)

In [None]:
# tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

In [24]:
# Record the checkpoint's vocabulary size before the embedding table is resized.
original_vocab_size = model.config.vocab_size
print("Original vocab size: " + str(original_vocab_size))

Original vocab size: 49152


In [25]:
# Number of positions the model config supports.
print("Model max length: {}".format(model.config.max_position_embeddings))

Model max length: 8192


In [26]:
# Longest sequence the tokenizer reports it will accept.
print("Max sequence length: {}".format(tokenizer.model_max_length))

Max sequence length: 8192


In [27]:
# NOTE(review): gradient checkpointing is presumably enabled to fit training into
# limited GPU memory (the training arguments also use batch size 1) — confirm.
model.gradient_checkpointing_enable()
# Keep the embedding table the same size as the tokenizer's vocabulary. The call is
# the cell's last expression, so the resulting embedding layer is displayed; in the
# recorded run it still has 49152 rows, i.e. the same as the original vocab size.
model.resize_token_embeddings(len(tokenizer))

Embedding(49152, 960, padding_idx=2)

# Setting the device

In [28]:
# Drop cached CUDA allocations, then pick the GPU when one is visible (CPU otherwise).
torch.cuda.empty_cache()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

print(device)

cuda


In [34]:
def count_tokens(file_path, tokenizer):
    """Return how many tokens ``tokenizer.tokenize`` yields for a whole file.

    Args:
        file_path: Path of a UTF-8 text file; it is read in one go.
        tokenizer: Any object exposing ``tokenize(text)`` that returns a sized
            collection of tokens.

    Returns:
        int: Length of the token sequence produced for the file's full text.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return len(tokenizer.tokenize(contents))

# Token totals for the training and validation files
train_token_count = count_tokens(TRAIN_FILE, tokenizer)
eval_token_count = count_tokens(VAL_FILE, tokenizer)
print(
    f"Train Tokens: {train_token_count}",
    f"Validation Tokens: {eval_token_count}",
    sep="\n",
)

Train Tokens: 8285
Validation Tokens: 3615


# Training the model

## Setting up training arguments

In [36]:
# Hyper-parameters for fine-tuning. Evaluation and loss logging both happen once per epoch.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    logging_strategy="epoch",     # Log the loss once per epoch
    learning_rate=5e-6,
    per_device_train_batch_size=1,  # Reduced batch size for limited GPU memory
    per_device_eval_batch_size=1,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir=LOG_DIR,
    gradient_accumulation_steps=4,  # Accumulate gradients to simulate larger batch sizes
    # fp16 mixed precision needs a CUDA device. The device cell falls back to CPU
    # when no GPU is visible, so only request fp16 when CUDA is actually available
    # instead of failing at argument validation on CPU-only machines.
    fp16=torch.cuda.is_available(),
    bf16=False,
    optim="adamw_torch"
)

## Loading dataset

In [37]:
def load_chatbot_data(file_path):
    """Load user/bot exchanges from a plain-text dialogue file.

    The file is read line by line. A line starting with ``user:`` stores the
    pending user message; a following line starting with ``bot:`` completes
    the pair. Pairs with an empty user message or empty bot reply are skipped,
    as are bot lines that have no pending user message. Any other line is
    ignored.

    Only the leading ``user:`` / ``bot:`` marker is removed, so a message that
    itself contains the text "user:" or "bot:" is kept intact.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        pandas.DataFrame: Columns ``input`` (user message) and ``output``
        (bot reply), one row per exchange. The two columns are present even
        when no exchange was found.
    """
    user_prefix, bot_prefix = "user:", "bot:"
    conversations = []
    user_input = None
    with open(file_path, "r", encoding="utf-8") as f:
        # Iterate the file lazily instead of materialising every line first.
        for line in f:
            if line.startswith(user_prefix):
                # Slice off the marker only; str.replace would also delete
                # any later occurrence of "user:" inside the message.
                user_input = line[len(user_prefix):].strip()
            elif line.startswith(bot_prefix):
                bot_response = line[len(bot_prefix):].strip()
                if user_input and bot_response:
                    conversations.append({"input": user_input, "output": bot_response})
                    # A user message is consumed by exactly one bot reply.
                    user_input = None
    # Fixing the columns keeps the schema stable for an empty parse.
    return pd.DataFrame(conversations, columns=["input", "output"])

In [38]:
# Parse both splits into frames, then wrap each frame as a Hugging Face Dataset.
df_train, df_val = (load_chatbot_data(path) for path in (TRAIN_FILE, VAL_FILE))
dataset_train, dataset_val = (Dataset.from_pandas(frame) for frame in (df_train, df_val))

print(
    f"Length of training dataset: {len(df_train)}",
    f"Length of validation dataset: {len(df_val)}",
    sep="\n",
)

Length of training dataset: 100
Length of validation dataset: 43


## Tokenizing dataset

In [39]:
def tokenize_function(examples):
    """Tokenize a batch of exchanges into fixed-length causal-LM training examples.

    Each example is the user message and the bot reply joined by a single
    space, truncated or padded to 512 tokens.

    Args:
        examples: Batch mapping with parallel ``"input"`` and ``"output"``
            lists of strings (as passed by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output with an added ``"labels"`` entry: a copy of
        ``input_ids`` in which padding positions are set to -100 so they are
        ignored by the loss.
    """
    inputs = [f"{inp} {out}" for inp, out in zip(examples["input"], examples["output"])]
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    labels = model_inputs["input_ids"].clone()
    # Mask padding by attention mask rather than by token id: every example is
    # padded to 512 tokens, and un-masked pad positions would otherwise count
    # as training targets. Masking by id would also hide genuine tokens when
    # the pad token doubles as another special token.
    # NOTE(review): DataCollatorForLanguageModeling(mlm=False) is documented to
    # rebuild labels itself; this masking keeps the dataset correct with any
    # other collator too.
    labels[model_inputs["attention_mask"] == 0] = -100
    model_inputs["labels"] = labels
    return model_inputs

# Tokenize both splits in batches
tokenized_train, tokenized_val = (
    split.map(tokenize_function, batched=True)
    for split in (dataset_train, dataset_val)
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/43 [00:00<?, ? examples/s]

## Data collator 

In [40]:
# Collator that batches the tokenized examples for the Trainer.
# NOTE(review): per the transformers docs, with mlm=False this collator rebuilds
# "labels" from "input_ids" and sets pad-token positions to -100 — confirm for
# the installed version.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Masked language modeling is not used for causal LM
)

## Compute Metrics

In [41]:
def compute_bertscore(eval_pred):
    """Compute the mean BERTScore F1 between predicted and reference text.

    Intended for use as the Trainer's ``compute_metrics`` callback.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` may be raw
            logits of shape (batch, seq, vocab), a tuple whose first element
            is those logits, or token ids of shape (batch, seq). ``labels``
            are token ids in which ignored positions are marked with -100.

    Returns:
        dict: ``{"bert_score_f1": <mean F1 as a float>}``.
    """
    predictions, labels = eval_pred

    # Some models return extra outputs alongside the logits; keep the logits.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    if predictions.ndim == 3:
        # Logits -> token ids. The logits at position i predict token i + 1,
        # so drop the last prediction and the first label to align them.
        # NOTE: full logits for the 43-example, 512-token validation set over a
        # 49152-entry vocabulary are several GB; consider reducing them with the
        # Trainer's `preprocess_logits_for_metrics` hook before enabling this.
        predictions = predictions.argmax(axis=-1)
        predictions, labels = predictions[:, :-1], labels[:, 1:]

    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    # -100 marks positions excluded from the loss; it is not a valid token id
    # and cannot be decoded. Replace it with the pad token, which
    # skip_special_tokens removes again, and blank the same positions in the
    # predictions so padding is not scored.
    ignored = labels < 0
    if predictions.shape == labels.shape:
        predictions = np.where(ignored, pad_id, predictions)
    predictions = np.where(predictions < 0, pad_id, predictions)
    labels = np.where(ignored, pad_id, labels)

    # Decode token ids into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute BERTScore (precision and recall are returned but not reported)
    _, _, F1 = score(decoded_preds, decoded_labels, lang="en", rescale_with_baseline=True)

    return {"bert_score_f1": F1.mean().item()}


## Initialising the Trainer and running training

In [43]:
# Wire the model, hyper-parameters, tokenized splits and collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    # `tokenizer=` is deprecated on Trainer (it triggers the FutureWarning seen
    # in this cell's output); `processing_class` is the replacement keyword.
    # NOTE(review): requires a transformers release that has `processing_class`.
    processing_class=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_bertscore,
)

# Last expression of the cell, so the resulting TrainOutput is displayed.
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.8582,1.947174
2,1.6788,1.888138
3,1.5738,1.861037
4,1.5239,1.848047
5,1.4811,1.84362


TrainOutput(global_step=125, training_loss=1.6231461486816405, metrics={'train_runtime': 2607.1216, 'train_samples_per_second': 0.192, 'train_steps_per_second': 0.048, 'total_flos': 483279667200000.0, 'train_loss': 1.6231461486816405, 'epoch': 5.0})

## Saving Trained Model

In [None]:
# Save the fine-tuned weights and the tokenizer into the configured output
# directory (OUTPUT_DIR) rather than repeating its value as a literal, so the
# save location cannot drift from the one the Trainer uses.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Evaluating the model

## Load trained model

In [None]:
# Reload the fine-tuned checkpoint from the configured output directory.
# Placement is done once, explicitly, with `.to(device)`: the previous
# `device_map="auto"` both dispatched the model and was followed by a second
# manual move, and it has no meaning for a tokenizer.
model = AutoModelForCausalLM.from_pretrained(OUTPUT_DIR)
tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)

model.to(device)

In [None]:
# def chatbot_response(user_input, chatbot):
#     sentiment = get_sentiment(user_input)  # Get sentiment from BERT

#     # Define chatbot responses based on sentiment
#     responses = {
#         "positive": "I'm happy to hear that! Keep up the great energy. 😊",
#         "neutral": "That sounds interesting. Tell me more, I'm here to listen. 🧡",
#         "negative": "I'm sorry you're feeling this way. You're not alone—would you like some self-care tips? 💙"
#     }

#     # Modify chatbot input based on sentiment
#     modified_input = f"{user_input} [Sentiment: {sentiment.capitalize()}]"

#     # Get chatbot response
#     bot_reply = chatbot.generate_response(modified_input)  # Assuming chatbot has a generate_response method

#     # Modify response based on sentiment if needed
#     if sentiment == "negative":
#         bot_reply += " Remember, you're not alone. Would you like some coping tips? 💙"

#     return bot_reply

In [None]:
# English -> Chinese translator applied to the chatbot's replies.
pipe = pipeline(task="translation", model="Helsinki-NLP/opus-mt-en-zh")
# Sentiment classifier used by get_sentiment().
sentiment_classifier = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

In [None]:
def get_sentiment(text):
    """Classify a message as "crisis", "negative", "positive" or "neutral".

    A keyword check for crisis phrases runs first and overrides the model.
    Otherwise the DistilBERT sentiment classifier decides.

    Args:
        text: The user's message. ``None`` or blank text is treated as
            carrying no sentiment.

    Returns:
        str: ``"crisis"`` if any crisis phrase occurs in the message
        (case-insensitive, typographic apostrophes accepted); ``"neutral"``
        for blank input or an unrecognised classifier label; otherwise
        ``"negative"`` or ``"positive"`` from the classifier.
    """
    crisis_keywords = (
        "end my life",
        "suicide",
        "don't want to live",
        "kill myself",
        "worthless",
        "no reason to live",
    )

    # Lower-case once, and fold curly apostrophes (common on phone keyboards)
    # to the ASCII one so "don’t want to live" still trips the safety check.
    normalized = (text or "").lower().replace("\u2019", "'").replace("\u2018", "'")

    # Crisis phrases override whatever the classifier would say.
    if any(phrase in normalized for phrase in crisis_keywords):
        return "crisis"

    # Nothing to classify: do not ask the model to guess on an empty string.
    if not normalized.strip():
        return "neutral"

    # Otherwise, use DistilBERT-based sentiment analysis.
    result = sentiment_classifier(text)[0]
    label_to_sentiment = {"NEGATIVE": "negative", "POSITIVE": "positive"}
    return label_to_sentiment.get(result["label"], "neutral")

In [None]:
def chatbot_response(prompt, max_length=200):
    """Generate a reply to ``prompt``, translate it to Chinese and tag the sentiment.

    Args:
        prompt: The user's message.
        max_length: Maximum number of tokens to generate for the reply. The
            budget applies to new tokens only, so a long user message no
            longer eats into (or exhausts) the space left for the answer.

    Returns:
        tuple: ``(response, translated, sentiment_results)`` where
        ``response`` is the English reply, ``translated`` its Chinese
        translation (empty string when the reply is empty) and
        ``sentiment_results`` the label returned by ``get_sentiment(prompt)``.
    """
    system_prompt = "You are a helpful and supportive chatbot. Answer the user's question in a clear and concise way without repeating their words exactly."
    full_prompt = f"{system_prompt}\nUser: {prompt}\nBot:"

    # NOTE(review): the sentiment label (including "crisis") is only reported
    # back to the caller; it does not alter the generated reply.
    sentiment_results = get_sentiment(prompt)

    inputs = tokenizer(full_prompt, return_tensors="pt")
    inputs = {key: val.to(device) for key, val in inputs.items()}
    prompt_token_count = inputs["input_ids"].shape[1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_length,
        # Without do_sample=True, generate() decodes greedily and the
        # temperature / top_p / top_k settings below are ignored.
        do_sample=True,
        repetition_penalty=1.3,  # Penalize repeated words/phrases
        no_repeat_ngram_size=3,  # Prevent 3-word repetition
        temperature=0.7,  # Lower randomness to keep responses meaningful
        top_p=0.9,  # Nucleus sampling: keep the smallest token set covering 90% probability
        top_k=50,  # Sample only from the 50 most likely tokens
    )

    # Decode only the newly generated tokens. Stripping the echoed prompt with
    # string operations breaks as soon as the user's message itself contains
    # "Bot:" or the decoded prompt differs slightly from the original text.
    generated_tokens = outputs[0][prompt_token_count:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    # If the model keeps going and invents the next user turn, keep the first reply only.
    response = response.split("\nUser:")[0].strip()

    # Translate response to Chinese (nothing to translate for an empty reply)
    translated = pipe(response)[0]['translation_text'] if response else ""

    return response, translated, sentiment_results

In [None]:
# Interactive console loop: read a message, answer it, show translation and sentiment.
if __name__ == "__main__":
    print("Chatbot is ready! Type 'exit' to stop.")
    while True:
        try:
            # Strip so that " exit " or a trailing newline still ends the session.
            user_input = input("User: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel ends the chat without a traceback.
            break
        if not user_input:
            # Do not run generation, translation and sentiment on a blank message.
            continue
        if user_input.lower() == "exit":
            break
        response, translated, sentiment_results = chatbot_response(user_input)
        print(f"User: {user_input}")
        print(f"Bot: {response}")
        print(f"Translated Text: {translated}")
        print(f"Sentiment Results: {sentiment_results}")