# Importing libraries

In [1]:
import torch
import os
import psycopg2

import pandas as pd
import numpy as np

from bert_score import score
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, pipeline
from sentence_transformers import SentenceTransformer
from datasets import Dataset
from dotenv import load_dotenv

ImportError: dlopen(/Users/alvinwong/anaconda3/envs/python3.9/lib/python3.9/site-packages/psycopg2/_psycopg.cpython-39-darwin.so, 0x0002): symbol not found in flat namespace '_PQbackendPID'

# Defining file paths

In [21]:
OUTPUT_DIR = "results"
LOG_DIR = "logs"
TRAIN_FILE = "dataset/train.txt"  # Path to your training data file
VAL_FILE = "dataset/validation.txt"  # Path to your validation data file
TEST_FILE = "dataset/test.txt"  # Path to your test data file

# Loading Pre-trained Model

In [22]:
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-360M-Instruct")
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-360M-Instruct")

In [None]:
# tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

In [23]:
original_vocab_size = model.config.vocab_size
print(f"Original vocab size: {original_vocab_size}")

Original vocab size: 49152


In [24]:
print(f"Model max length: {model.config.max_position_embeddings}")

Model max length: 8192


In [25]:
print(f"Max sequence length: {tokenizer.model_max_length}")

Max sequence length: 8192


In [26]:
model.gradient_checkpointing_enable()
model.resize_token_embeddings(len(tokenizer))

Embedding(49152, 960, padding_idx=2)

# Setting the device

In [27]:
torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(device)

cuda


In [36]:
def count_tokens(file_path, tokenizer):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    tokens = tokenizer.tokenize(text)
    return len(tokens)

# Count tokens in training and validation files
train_token_count = count_tokens(TRAIN_FILE, tokenizer)
eval_token_count = count_tokens(VAL_FILE, tokenizer)
print(f"Train Tokens: {train_token_count}")
print(f"Validation Tokens: {eval_token_count}")

Train Tokens: 27546
Validation Tokens: 7033


# Training the model

## Setting up training arguments

In [33]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    logging_strategy="epoch",     # Logs loss at intervals
    learning_rate=1e-5,
    per_device_train_batch_size=1,  # Reduced batch size for limited GPU memory
    per_device_eval_batch_size=1,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir=LOG_DIR,
    gradient_accumulation_steps=4,  # Accumulate gradients to simulate larger batch sizes
    fp16=True,
    bf16=False,
    optim="adamw_torch"
)

## Loading dataset

In [34]:
def load_chatbot_data(file_path):
    """Load and preprocess chatbot data from the given text file."""
    conversations = []
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
        user_input, bot_response = None, None
        for line in lines:
            if line.startswith("user:"):
                user_input = line.replace("user:", "").strip()
            elif line.startswith("bot:"):
                bot_response = line.replace("bot:", "").strip()
                if user_input and bot_response:
                    conversations.append({"input": user_input, "output": bot_response})
                    user_input, bot_response = None, None
    return pd.DataFrame(conversations)

In [37]:
df_train = load_chatbot_data(TRAIN_FILE)
df_val = load_chatbot_data(VAL_FILE)
dataset_train = Dataset.from_pandas(df_train)
dataset_val = Dataset.from_pandas(df_val)

print(f"Length of training dataset: {len(df_train)}")
print(f"Length of validation dataset: {len(df_val)}")

Length of training dataset: 327
Length of validation dataset: 82


## Tokenizing dataset

In [38]:
def tokenize_function(examples):
    """
    Tokenizes user-bot conversation pairs efficiently while handling long sequences
    by chunking messages and preventing excessive padding.
    """
    model_inputs = {
        "input_ids": [],
        "attention_mask": [],
        "labels": [],
    }

    for user_msg, bot_msg in zip(examples["input"], examples["output"]):
        # Combine each user-bot pair
        text_pair = f"User: {user_msg} Bot: {bot_msg}"

        # Tokenize with truncation and padding
        tokenized = tokenizer(
            text_pair,
            max_length=8192,  # Change if using a different model with a smaller/larger limit
            truncation=True,  # Truncate if longer than max_length
            padding="longest",  # Use "longest" instead of "max_length" to save space
            return_tensors="pt",
        )

        # Store tokenized outputs
        model_inputs["input_ids"].append(tokenized["input_ids"].squeeze(0))
        model_inputs["attention_mask"].append(tokenized["attention_mask"].squeeze(0))

        # Labels for training: Shift left for causal language modeling
        labels = tokenized["input_ids"].clone()
        labels[labels == tokenizer.pad_token_id] = -100  # Ignore padding tokens in loss
        model_inputs["labels"].append(labels.squeeze(0))

    return model_inputs

# Apply tokenization
tokenized_train = dataset_train.map(tokenize_function, batched=True)
tokenized_val = dataset_val.map(tokenize_function, batched=True)

Map:   0%|          | 0/327 [00:00<?, ? examples/s]

Map:   0%|          | 0/82 [00:00<?, ? examples/s]

## Data collator 

In [39]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Masked language modeling is not used for causal LM
)

## Compute Metrics

In [40]:
def compute_bertscore(eval_pred):
    predictions, labels = eval_pred
    
    # Decode tokenized predictions & labels into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute BERTScore
    P, R, F1 = score(decoded_preds, decoded_labels, lang="en", rescale_with_baseline=True)
    
    return {"bert_score_f1": F1.mean().item()}


## Initialising the model

In [41]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_bertscore,
)

trainer.train()

  trainer = Trainer(
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss
1,1.6979,1.612151
2,1.3959,1.559951
3,1.2312,1.547998
4,1.105,1.556723
5,0.9885,1.587398
6,0.8933,1.611524
7,0.8192,1.64492
8,0.7545,1.670334
9,0.6872,1.696878


TrainOutput(global_step=810, training_loss=1.032991122022087, metrics={'train_runtime': 3639.7207, 'train_samples_per_second': 0.898, 'train_steps_per_second': 0.223, 'total_flos': 501429084076800.0, 'train_loss': 1.032991122022087, 'epoch': 9.880733944954128})

## Saving Trained Model

In [42]:
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

('results\\tokenizer_config.json',
 'results\\special_tokens_map.json',
 'results\\vocab.json',
 'results\\merges.txt',
 'results\\added_tokens.json',
 'results\\tokenizer.json')

# Evaluating the model

## Connection to Database

In [1]:
load_dotenv()

USER = os.getenv("user")
PASSWORD = os.getenv("password")
HOST = os.getenv("host")
PORT = os.getenv("port")
DBNAME = os.getenv("dbname")

try:
    connection = psycopg2.connect(
        user=USER,
        password=PASSWORD,
        host=HOST,
        port=PORT,
        dbname=DBNAME
    )
    print("Connection successful!")
    
    # Create a cursor to execute SQL queries
    cursor = connection.cursor()

except Exception as e:
    print(f"Failed to connect: {e}")

NameError: name 'load_dotenv' is not defined

## Retrieval Augmented Generation

In [44]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def store_conversation(user_message, bot_response, sentiment):
    """Stores a conversation in Supabase PostgreSQL."""
    embedding = embedding_model.encode([user_message])[0]  # Generate 1024-dim vector
    try:
        cursor.execute(
            "INSERT INTO conversations (user_message, bot_response, sentiment, embedding) VALUES (%s, %s, %s, %s)",
            (user_message, bot_response, sentiment, embedding)
        )
        connection.commit()
        print("Stored conversation")
    except Exception as e:
        print(e)

In [None]:
def retrieve_past_conversations(query):
    """Retrieves the most relevant past conversations using vector similarity."""
    query_embedding = embedding_model.encode([query])[0]  # Generates a NumPy array

    # 🔹 Convert NumPy array to a Python list
    query_embedding = query_embedding.tolist()

    cursor.execute(
        "SELECT timestamp, user_message, bot_response FROM conversations ORDER BY embedding <-> %s LIMIT 5",
        (query_embedding,)  # 🔹 Ensure it's inside a tuple (trailing comma is needed)
    )

    results = cursor.fetchall()

    if results:
        context = "\n".join([f"[{r[0]}] User: {r[1]}\nBot: {r[2]}" for r in results])
        return context
    else:
        return "No past conversations found."


## Load trained model

In [47]:
model = AutoModelForCausalLM.from_pretrained("results", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("results", device_map="auto")

model.to(device)



LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 960, padding_idx=2)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=960, out_features=960, bias=False)
          (k_proj): Linear(in_features=960, out_features=320, bias=False)
          (v_proj): Linear(in_features=960, out_features=320, bias=False)
          (o_proj): Linear(in_features=960, out_features=960, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=960, out_features=2560, bias=False)
          (up_proj): Linear(in_features=960, out_features=2560, bias=False)
          (down_proj): Linear(in_features=2560, out_features=960, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((960,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((960,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((960,), eps=1e-05)
    (rotary_emb)

## Models for translation and sentiment analysis

In [48]:
pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-en-zh")
sentiment_classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

Device set to use cuda:0
Device set to use cuda:0


In [49]:
def get_sentiment(text):
    """Improved sentiment analysis with keyword-based distress detection."""
    crisis_keywords = ["end my life", "suicide", "don't want to live", "kill myself", "worthless", "no reason to live"]

    # Check if crisis words are in the input
    if any(phrase in text.lower() for phrase in crisis_keywords):
        return "crisis"  # Override sentiment if crisis words are detected

    # Otherwise, use DistilBERT-based sentiment analysis
    result = sentiment_classifier(text)[0]
    label = result['label']

    # Convert to sentiment categories based on DistilBERT outputs
    if label == "NEGATIVE":
        return "negative"
    elif label == "POSITIVE":
        return "positive"
    else:
        return "neutral"

## Chatbot response

In [50]:
def chatbot_response(prompt):
    retrieved_context = retrieve_past_conversations(prompt)

    system_prompt = "You are a helpful and supportive chatbot. Answer the user's question in a clear and concise way without repeating their words exactly."
    full_prompt = f"{system_prompt}\n{retrieved_context}\nUser: {prompt}\nBot:"

    sentiment_results = get_sentiment(prompt)

    inputs = tokenizer(full_prompt, return_tensors="pt")
    inputs = {key: val.to(device) for key, val in inputs.items()}

    outputs = model.generate(
        **inputs, 
        repetition_penalty=1.3,
        no_repeat_ngram_size=3,  
        temperature=0.8,  
        top_p=0.9,  #
        top_k=50  
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Ensure the response does not include the system prompt
    response = response.replace(system_prompt, "").strip()
    
    # Remove any leftover system prompt instructions
    if "Bot:" in response:
        response = response.split("Bot:")[-1].strip()

    # Translate response to Chinese
    translated = pipe(response)[0]['translation_text']

    return response, translated, sentiment_results

In [51]:
if __name__ == "__main__":
    print("Chatbot is ready! Type 'exit' to stop.")
    while True:        
        user_input = input("User: ")
        if user_input.lower() == "exit":
            cursor.close()
            connection.close()
            print("DB connection ended")
            break
        response, translated, sentiment_results = chatbot_response(user_input)
        store_conversation(user_input, response, sentiment_results)
        print(f"User: {user_input}")
        print(f"Bot: {response}")
        print(f"Translated Text: {translated}")
        print(f"Sentiment Results: {sentiment_results}")

Chatbot is ready! Type 'exit' to stop.


ProgrammingError: can't adapt type 'numpy.float32'