# Importing libraries

In [24]:
import torch
import os
import psycopg2
import evaluate

import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, pipeline, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer
from datasets import Dataset
from dotenv import load_dotenv

# Defining file paths

In [2]:
OUTPUT_DIR = "results"
LOG_DIR = "logs"
TRAIN_FILE = "dataset/train.txt"  # Path to your training data file
VAL_FILE = "dataset/validation.txt"  # Path to your validation data file
TEST_FILE = "dataset/test.txt"  # Path to your test data file

# Loading Pre-trained Model

In [3]:
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-360M-Instruct")
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-360M-Instruct")

In [4]:
# tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
# model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")

In [5]:
original_vocab_size = model.config.vocab_size
print(f"Original vocab size: {original_vocab_size}")

Original vocab size: 49152


In [6]:
print(f"Model max length: {model.config.max_position_embeddings}")

Model max length: 8192


In [7]:
print(f"Max sequence length: {tokenizer.model_max_length}")

Max sequence length: 8192


In [8]:
model.gradient_checkpointing_enable()
model.resize_token_embeddings(len(tokenizer))

Embedding(49152, 960, padding_idx=2)

In [9]:
# model.config.use_cache = False

# Setting the device

In [10]:
torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(device)

cuda


In [11]:
def count_tokens(file_path, tokenizer):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    tokens = tokenizer.tokenize(text)
    return len(tokens)

# Count tokens in training and validation files
train_token_count = count_tokens(TRAIN_FILE, tokenizer)
eval_token_count = count_tokens(VAL_FILE, tokenizer)
print(f"Train Tokens: {train_token_count}")
print(f"Validation Tokens: {eval_token_count}")

Token indices sequence length is longer than the specified maximum sequence length for this model (23489 > 8192). Running this sequence through the model will result in indexing errors


Train Tokens: 23489
Validation Tokens: 6160


# Training the model

## Setting up training arguments

In [12]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    logging_strategy="epoch",     # Logs loss at intervals
    learning_rate=1e-5,
    per_device_train_batch_size=2,  # Reduced batch size for limited GPU memory
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir=LOG_DIR,
    gradient_accumulation_steps=4,  # Accumulate gradients to simulate larger batch sizes
    fp16=True,
    bf16=False,
    optim="adamw_torch",
)

## Loading dataset

In [13]:
def load_chatbot_data(file_path):
    """Load and preprocess chatbot data from the given text file."""
    conversations = []
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
        user_input, bot_response = None, None
        for line in lines:
            if line.startswith("user:"):
                user_input = line.replace("user:", "").strip()
            elif line.startswith("bot:"):
                bot_response = line.replace("bot:", "").strip()
                if user_input and bot_response:
                    conversations.append({"input": user_input, "output": bot_response})
                    user_input, bot_response = None, None
    return pd.DataFrame(conversations)

In [14]:
df_train = load_chatbot_data(TRAIN_FILE)
df_val = load_chatbot_data(VAL_FILE)
dataset_train = Dataset.from_pandas(df_train)
dataset_val = Dataset.from_pandas(df_val)

print(f"Length of training dataset: {len(df_train)}")
print(f"Length of validation dataset: {len(df_val)}")

Length of training dataset: 278
Length of validation dataset: 70


## Tokenizing dataset

In [15]:
def find_max_interaction_length(dataset):
    length_distribution = []

    for user_msg, bot_msg in zip(dataset["input"], dataset["output"]):
        # Combine user message and bot response
        text_pair = f"User: {user_msg} Bot: {bot_msg}"

        # Tokenize
        tokenized = tokenizer(text_pair, truncation=False, padding=False)

        # Get the token length of this interaction
        num_tokens = len(tokenized["input_ids"])
        length_distribution.append(num_tokens)

    # Find the maximum token length
    max_length = max(length_distribution)

    return max_length, length_distribution

dataset = load_chatbot_data("dataset/dataset.txt")
max_len, token_lengths = find_max_interaction_length(dataset)

print(f"Max token length in dataset: {max_len}")
print(f"Token length distribution (first 10 samples): {token_lengths[:10]}")

Max token length in dataset: 146
Token length distribution (first 10 samples): [99, 97, 72, 82, 48, 75, 67, 70, 63, 57]


In [16]:
# def tokenize_function(examples):
#     inputs = [f"{inp} {out}" for inp, out in zip(examples["input"], examples["output"])]
#     model_inputs = tokenizer(
#         inputs,
#         max_length=1024,
#         truncation=True,
#         padding="max_length",
#         return_tensors="pt",
#     )
#     model_inputs["labels"] = model_inputs["input_ids"].clone()

#     num_tokens = len(model_inputs["input_ids"][0])
#     print(f"Number of tokens: {num_tokens}")
#     return model_inputs

# # Apply tokenization
# tokenized_train = dataset_train.map(tokenize_function, batched=True)
# tokenized_val = dataset_val.map(tokenize_function, batched=True) 

In [17]:
def tokenize_function(examples):
    model_inputs = {
        "input_ids": [],
        "attention_mask": [],
        "labels": [],
    }

    for user_msg, bot_msg in zip(examples["input"], examples["output"]):
        text_pair = f"User: {user_msg} Bot: {bot_msg}"
        tokenized = tokenizer(
            text_pair,
            max_length=200,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        # Store tokenized outputs
        model_inputs["input_ids"].append(tokenized["input_ids"].squeeze(0))
        model_inputs["attention_mask"].append(tokenized["attention_mask"].squeeze(0))

        # Labels for training: Shift left for causal language modeling
        labels = tokenized["input_ids"].clone()
        labels[labels == tokenizer.pad_token_id] = -100  # Ignore padding tokens in loss
        model_inputs["labels"].append(labels.squeeze(0))

    return model_inputs

# Apply tokenization
tokenized_train = dataset_train.map(tokenize_function, batched=True)
tokenized_val = dataset_val.map(tokenize_function, batched=True)

Map:   0%|          | 0/278 [00:00<?, ? examples/s]

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

In [None]:
print(f"Total training samples: {len(tokenized_train)}")
print(f"Total validation samples: {len(tokenized_val)}")

Total training samples: 278
Total validation samples: 70


## Data collator 

In [19]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Masked language modeling is not used for causal LM
)

## Compute Metrics

## Initialising the model

In [20]:
# Trainer.processing_class = tokenizer.__class__

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

  trainer = Trainer(
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss
0,1.8426,1.652153
1,1.5345,1.59175
2,1.4175,1.569976
3,1.3429,1.562569
4,1.2696,1.560466


TrainOutput(global_step=170, training_loss=1.4814085118910845, metrics={'train_runtime': 2163.666, 'train_samples_per_second': 0.642, 'train_steps_per_second': 0.079, 'total_flos': 519525642240000.0, 'train_loss': 1.4814085118910845, 'epoch': 4.9784172661870505})

## Saving Trained Model

In [21]:
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR) 

('results\\tokenizer_config.json',
 'results\\special_tokens_map.json',
 'results\\vocab.json',
 'results\\merges.txt',
 'results\\added_tokens.json',
 'results\\tokenizer.json')

# Evaluating the model

In [25]:
file_path = TEST_FILE
with open(file_path, "r", encoding="utf-8") as file:
    chat_data = file.readlines()

# Extract user inputs and reference responses
user_inputs = []
reference_responses = []

for i in range(len(chat_data) - 1):
    if chat_data[i].startswith("user:"):
        user_text = chat_data[i].replace("user:", "").strip()
        ref_text = chat_data[i + 1].replace("bot:", "").strip() if chat_data[i + 1].startswith("bot:") else None
        
        if ref_text:  # Ensure reference answer exists
            user_inputs.append(user_text)
            reference_responses.append(ref_text)

# Convert to DataFrame
df = pd.DataFrame({"input": user_inputs, "reference_response": reference_responses})

model = AutoModelForCausalLM.from_pretrained("results", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("results", device_map="auto")

model.to(device)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 960, padding_idx=2)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=960, out_features=960, bias=False)
          (k_proj): Linear(in_features=960, out_features=320, bias=False)
          (v_proj): Linear(in_features=960, out_features=320, bias=False)
          (o_proj): Linear(in_features=960, out_features=960, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=960, out_features=2560, bias=False)
          (up_proj): Linear(in_features=960, out_features=2560, bias=False)
          (down_proj): Linear(in_features=2560, out_features=960, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((960,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((960,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((960,), eps=1e-05)
    (rotary_emb)

In [32]:
print(device)

cuda


In [None]:
def chatbot_response(prompt):
    system_prompt = "You are a helpful and supportive chatbot. Answer the user's question with empathy, and in a clear and concise way without repeating their words exactly."
    full_prompt = f"{system_prompt}\nUser: {prompt}\nBot:"

    inputs = tokenizer(full_prompt, return_tensors="pt")
    inputs = {key: val.to(device) for key, val in inputs.items()}
    
    outputs = model.generate(
        **inputs, 
        max_length=300,
        repetition_penalty=1.3,
        no_repeat_ngram_size=3,  
        temperature=0.8,  
        top_p=0.9,  #
        top_k=50  
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Ensure the response does not include the system prompt
    response = response.replace(system_prompt, "").strip()
    
    # Remove any leftover system prompt instructions
    if "Bot:" in response:
        response = response.split("Bot:")[-1].strip()

    return response

df["model_response"] = df["input"].apply(chatbot_response)




In [47]:
df

Unnamed: 0,input,reference_response,model_response,BERTScore_Model,BERTScore_Reference,BERTScore_Difference,Empathy_Score
0,Where can I learn more about caregiving skills?,Many organizations offer classes on caregiving...,You could start by looking up online courses o...,1.000000,0.632784,0.367216,0.000001
1,I'm anxious about my upcoming oral examination...,"Oral exams can be nerve-wracking, especially i...",Preparing your mind is just as important as pr...,1.000000,0.630308,0.369692,0.000034
2,I'm questioning my sexual orientation and I'm ...,Questioning your sexual orientation can be a c...,"It takes courage to acknowledge your doubts, b...",0.843764,0.648327,0.195437,0.000006
3,What kind of intervention and emotional suppor...,"They offer individual therapy (CBT, DBT, Emoti...","I'm sorry, but as an AI assistant designed to ...",0.590156,0.606884,0.016728,0.000002
4,I'm worried about my future career prospects i...,It's natural to feel concerned about your futu...,As someone who is concerned with your potentia...,1.000000,0.690351,0.309649,0.000001
...,...,...,...,...,...,...,...
56,How do I handle rejection without feeling wort...,"Rejection is painful, but it doesn't define yo...","Rejection can be really tough, but remember th...",1.000000,0.570181,0.429819,0.000003
57,I'm feeling overwhelmed by climate anxiety and...,Climate anxiety is a very real and valid conce...,Climate change is indeed an overwhelming topic...,0.841579,0.584615,0.256964,0.000001
58,How do I heal from past emotional wounds?,"Healing takes time, and it's okay to move at y...","Healing emotionally can take time, but it is p...",1.000000,0.591279,0.408720,0.000001
59,What are some of the specific symptoms someone...,"Schizophrenia can manifest in various ways, in...",Common signs include hallucinations (hearing o...,1.000000,0.598671,0.401329,0.000049


In [48]:
# **🔹 STEP 4: Load BERTScore Metric**
bertscore = evaluate.load("bertscore")

def compute_bertscore(predictions, references):
    results = bertscore.compute(predictions=predictions, references=references, model_type="microsoft/deberta-xlarge-mnli")
    return results["f1"]  # Extract F1 BERTScore

# Compute BERTScores
df["BERTScore_Model"] = compute_bertscore(df["model_response"].tolist(), df["input"].tolist())  # Model's response vs Input
df["BERTScore_Reference"] = compute_bertscore(df["reference_response"].tolist(), df["input"].tolist())  # Reference vs Input

# **🔹 STEP 6: Compute BERTScore Difference**
df["BERTScore_Difference"] = abs(df["BERTScore_Model"] - df["BERTScore_Reference"])  # Absolute Deviation

In [49]:
# **🔹 STEP 7: Load Empathy Model (RoBERTa-based classifier)**
empathy_model_name = "unitary/unbiased-toxic-roberta"
empathy_tokenizer = AutoTokenizer.from_pretrained(empathy_model_name)
empathy_model = AutoModelForSequenceClassification.from_pretrained(empathy_model_name)
empathy_pipeline = pipeline("text-classification", model=empathy_model, tokenizer=empathy_tokenizer, return_all_scores=True)

# **🔹 STEP 8: Compute Empathy Score**
def compute_empathy(text):
    scores = empathy_pipeline(text)[0]  # Get model confidence scores
    empathetic_score = scores[1]['score'] if len(scores) > 1 else 0  # Index 1 corresponds to "non-toxic/empathy"
    return empathetic_score

df["Empathy_Score"] = df["model_response"].apply(compute_empathy)

Device set to use cuda:0


In [51]:
df.to_csv("metrics.csv", index=False)

In [52]:
avg_bertscore_model = df["BERTScore_Model"].mean()
avg_bertscore_reference = df["BERTScore_Reference"].mean()
avg_bertscore_difference = df["BERTScore_Difference"].mean()
avg_empathy_score = df["Empathy_Score"].mean()

In [53]:
print(f"Average BERTScore Model: {avg_bertscore_model}")
print(f"Average BERTScore Reference: {avg_bertscore_reference}")
print(f"Average BERTScore Difference: {avg_bertscore_difference}")
print(f"Average Empathy Score: {avg_empathy_score}")

Average BERTScore Model: 0.554422102502135
Average BERTScore Reference: 0.6004213055626291
Average BERTScore Difference: 0.04904351322377314
Average Empathy Score: 2.3390437680988174e-05


In [54]:
import ace_tools as tools
tools.display_dataframe_to_user(name="BERTScore Deviation + Empathy Analysis", dataframe=df)

ModuleNotFoundError: No module named 'ace_tools'

# Testing the model

## Connection to Database

In [None]:
load_dotenv()

USER = os.getenv("user")
PASSWORD = os.getenv("password")
HOST = os.getenv("host")
PORT = os.getenv("port")
DBNAME = os.getenv("dbname")

try:
    connection = psycopg2.connect(
        user=USER,
        password=PASSWORD,
        host=HOST,
        port=PORT,
        dbname=DBNAME
    )
    print("Connection successful!")
    
    # Create a cursor to execute SQL queries
    cursor = connection.cursor()

except Exception as e:
    print(f"Failed to connect: {e}")

## Retrieval Augmented Generation

In [None]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def store_conversation(user_message, bot_response, sentiment):
    """Stores a conversation in Supabase PostgreSQL."""
    embedding = embedding_model.encode([user_message])[0].tolist()
    try:
        cursor.execute(
            "INSERT INTO conversations (user_message, bot_response, sentiment, embedding) VALUES (%s, %s, %s, %s)",
            (user_message, bot_response, sentiment, embedding)
        )
        connection.commit()
        print("Stored conversation")
    except Exception as e:
        print(e)

In [None]:
def retrieve_past_conversations(query):
    query_embedding = embedding_model.encode([query])[0] 

    if isinstance(query_embedding, np.ndarray): 
        query_embedding = query_embedding.tolist()

    cursor.execute(
        "SELECT timestamp, user_message, bot_response FROM conversations "
        "ORDER BY embedding <-> %s::vector LIMIT 5",
        (query_embedding,)  # Ensure it's passed as a tuple
    )

    results = cursor.fetchall()

    if results:
        context = "\n".join([f"[{r[0]}] User: {r[1]}\nBot: {r[2]}" for r in results])
        return context
    else:
        return ""

## Load trained model

In [None]:
model = AutoModelForCausalLM.from_pretrained("results", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("results", device_map="auto")

model.to(device)

## Models for translation and sentiment analysis

In [None]:
pipe = pipeline("text2text-generation", model="Varine/opus-mt-zh-en-model")
sentiment_classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

In [None]:
def get_sentiment(text):
    """Improved sentiment analysis with keyword-based distress detection."""
    crisis_keywords = ["end my life", "suicide", "don't want to live", "kill myself", "worthless", "no reason to live"]

    # Check if crisis words are in the input
    if any(phrase in text.lower() for phrase in crisis_keywords):
        return "crisis"  # Override sentiment if crisis words are detected

    # Otherwise, use DistilBERT-based sentiment analysis
    result = sentiment_classifier(text)[0]
    label = result['label']

    # Convert to sentiment categories based on DistilBERT outputs
    if label == "NEGATIVE":
        return "negative"
    elif label == "POSITIVE":
        return "positive"
    else:
        return "neutral"

## Chatbot response

In [None]:
def chatbot_response(prompt):
    retrieved_context = retrieve_past_conversations(prompt)
    translated_prompt = pipe(prompt)[0].get("generated_text", "Translation failed")  

    print(translated_prompt)

    system_prompt = "You are a helpful and supportive chatbot. Answer the user's question in a clear and concise way without repeating their words exactly."
    full_prompt = f"{system_prompt}\n{retrieved_context}\nUser: {prompt}\nBot:"

    sentiment_results = get_sentiment(prompt)

    inputs = tokenizer(full_prompt, return_tensors="pt")
    inputs = {key: val.to(device) for key, val in inputs.items()}

    outputs = model.generate(
        **inputs, 
        max_new_tokens=650,
        repetition_penalty=1.3,
        no_repeat_ngram_size=3,  
        temperature=0.8,  
        top_p=0.9,  #
        top_k=50  
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Ensure the response does not include the system prompt
    response = response.replace(system_prompt, "").strip()
    
    # Remove any leftover system prompt instructions
    if "Bot:" in response:
        response = response.split("Bot:")[-1].strip()

    # Translate response to Chinese
    # translated = pipe(response)[0]['translation_text']
    translated_text = pipe(response)[0].get("generated_text", "Translation failed")  # Use .get() to avoid KeyError

    return response, translated_text, sentiment_results

In [None]:
if __name__ == "__main__":
    print("Chatbot is ready! Type 'exit' to stop.")
    while True:        
        user_input = input("User: ")
        if user_input.lower() == "exit":
            cursor.close()
            connection.close()
            print("DB connection ended")
            break
        response, translated, sentiment_results = chatbot_response(user_input)
        store_conversation(user_input, response, sentiment_results)
        print(f"User: {user_input}")
        print(f"Bot: {response}")
        print(f"Translated Text: {translated}")
        print(f"Sentiment Results: {sentiment_results}")