In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch
from datasets import Dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-360M-Instruct")
# for multiple GPUs install accelerate and do `model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")`
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-360M-Instruct")

# Enable gradient checkpointing (optional for large models)
model.gradient_checkpointing_enable()

# Resize token embeddings
model.resize_token_embeddings(len(tokenizer))

model.to(device)

# Load and preprocess the data
def load_chatbot_data(file_path):
    """Load and preprocess chatbot data from the given text file."""
    conversations = []
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
        user_input, bot_response = None, None
        for line in lines:
            if line.startswith("User:"):
                user_input = line.replace("User:", "").strip()
            elif line.startswith("Bot:"):
                bot_response = line.replace("Bot:", "").strip()
                if user_input and bot_response:
                    conversations.append({"input": user_input, "output": bot_response})
                    user_input, bot_response = None, None
    return pd.DataFrame(conversations)

# Prepare datasets
TRAIN_FILE = "dataset/train.txt"
VAL_FILE = "dataset/val.txt"
df_train = load_chatbot_data(TRAIN_FILE)
df_val = load_chatbot_data(VAL_FILE)

print(df_train.isnull().sum())
print(df_val.isnull().sum())

dataset_train = Dataset.from_pandas(df_train)
dataset_val = Dataset.from_pandas(df_val)

# Tokenization function
def tokenize_function(examples):
    model_inputs = tokenizer(examples["input"], max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(examples["output"], max_length=512, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    
    model_inputs = {key: torch.tensor(val).to(device) for key, val in model_inputs.items()}  

    return model_inputs

# Apply tokenization
tokenized_train = dataset_train.map(tokenize_function, batched=True)
tokenized_val = dataset_val.map(tokenize_function, batched=True)

# ✅ Use the correct data collator for a causal model
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Important: Disable MLM for autoregressive models
)

# Training arguments
training_args = TrainingArguments(
    output_dir="results",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=1,  # Adjust based on GPU memory
    per_device_eval_batch_size=1,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="logs",
    max_grad_norm=1.0, # Clips gradients to prevent instability
    gradient_accumulation_steps=2,  # Accumulates gradients over 2 steps
    fp16=True,  # Enable mixed precision for better GPU performance
    bf16=False,  # Disable bf16
)

# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,  # ✅ Correct collator for causal models
)

# Train the model
trainer.train()

# Save the trained model
model.save_pretrained("results/test1_cb") 
tokenizer.save_pretrained("results/test1_cb")


  from .autonotebook import tqdm as notebook_tqdm


input     0
output    0
dtype: int64
input     0
output    0
dtype: int64


Map: 100%|██████████| 96/96 [00:00<00:00, 1884.38 examples/s]
Map: 100%|██████████| 24/24 [00:00<00:00, 2460.72 examples/s]
  trainer = Trainer(
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss
1,No log,
2,No log,1.88994
3,No log,
4,No log,2.013973
5,No log,2.093243
6,No log,2.122142
7,No log,2.143192
8,No log,2.156403
9,No log,2.155811
10,No log,2.155906


('results/test1_cb\\tokenizer_config.json',
 'results/test1_cb\\special_tokens_map.json',
 'results/test1_cb\\vocab.json',
 'results/test1_cb\\merges.txt',
 'results/test1_cb\\added_tokens.json',
 'results/test1_cb\\tokenizer.json')

In [3]:
model = AutoModelForCausalLM.from_pretrained("results/test1_cb", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("results/test1_cb", device_map="auto")

In [4]:
def chatbot_response(prompt, max_length=100):
    # Tokenize input prompt
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Generate response
    outputs = model.generate(**inputs, max_length=max_length)

    # Decode the generated text
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

In [None]:
if __name__ == "__main__":
    print("Chatbot is ready! Type 'exit' to stop.")
    while True:
        user_input = input("User: ")
        if user_input.lower() == "exit":
            break
        response = chatbot_response(user_input)
        print(f"User: {user_input}")
        print(f"Bot: {response}")

Chatbot is ready! Type 'exit' to stop.
User: tell me more about depression
Bot: tell me more about depression."

By asking open-ended questions, you can encourage the person to share more about their feelings and experiences. Listen attentively to their responses, and respond with empathy and understanding. Avoid giving unsolicited advice or trying to "fix" the person's problems. Instead, focus on providing a supportive and non-judgmental space for them to express themselves.

For example, you could say, "I'm here for you, and I want to support you
