<a href="https://colab.research.google.com/github/Zalvessa/Mood-model/blob/main/Final_full_EDVersion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

# Directory (relative to the notebook's working directory) that receives the
# generated project files.
project_dir = "ed_project"
os.makedirs(project_dir, exist_ok=True)

# === requirements.txt ===
# The core ML stack is pinned to exact versions; the remaining packages are
# left unpinned.
requirements = """\
numpy==1.26.4
scipy==1.11.4
torch==2.1.2
torchvision==0.16.2
torchaudio==2.1.2
transformers==4.35.2
peft==0.7.1
accelerate==0.24.1
scikit-learn
textblob
vaderSentiment
pandas
datasets
"""

# Explicit UTF-8 so the output does not depend on the platform's locale.
with open(os.path.join(project_dir, "requirements.txt"), "w", encoding="utf-8") as f:
    f.write(requirements.strip())

# === train_ed_model.py ===
# Source of the training script, written verbatim to disk below. It fine-tunes
# DistilBERT as a multi-label classifier over the "journal" text column.
training_script = '''\
import os
import json
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset, Features, Value


# Paths
# The input files are expected next to this script, so the script works from
# whatever directory the project folder was generated in.
PROJECT_DIR = os.path.dirname(os.path.abspath(__file__))
MODEL_OUTPUT_DIR = "/content/ed_classifier"

# === LOAD DATA ===
df = pd.read_csv(os.path.join(PROJECT_DIR, "synthetic_journal_data.csv"))

# === LOAD LABEL SCHEMA ===
with open(os.path.join(PROJECT_DIR, "label_schema.json"), "r", encoding="utf-8") as f:
    label_schema = json.load(f)

# The key order of the schema defines the position of each label in a vector.
label_names = list(label_schema.keys())
num_labels = len(label_names)
print("✅ Loaded label schema with", num_labels, "groups")
print("✅ Label names:", label_names)

# ✅ Parse labels from a single JSON-encoded column to a list of ints
df["labels"] = df["labels"].apply(lambda x: json.loads(x) if isinstance(x, str) else x)
print("✅ First label vector:", df["labels"].iloc[0])
print("✅ Label vector length:", len(df["labels"].iloc[0]))

# Fail early if the data and the schema disagree on the number of labels;
# otherwise the mismatch only surfaces later as an opaque shape error.
wrong_len = df["labels"].apply(len) != num_labels
if wrong_len.any():
    raise ValueError(
        f"{int(wrong_len.sum())} row(s) have a label vector whose length "
        f"is not {num_labels} (the number of labels in label_schema.json)"
    )

# ✅ Inspect label distribution across all classes
all_labels = np.array(df["labels"].to_list())
label_counts = all_labels.sum(axis=0)
print("✅ Label counts per class:")
for name, count in zip(label_names, label_counts):
    print(f"  {name}: {int(count)}")

# ✅ Shuffle the rows (frac=1.0 keeps everything; lower it to subsample)
df = df.sample(frac=1.0, random_state=42).reset_index(drop=True)

# ✅ Train/validation split
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# ✅ Keep only necessary columns and reset index
train_df = train_df[["journal", "labels"]].copy().reset_index(drop=True)
val_df = val_df[["journal", "labels"]].copy().reset_index(drop=True)

# ✅ Cast labels to float32 (matches the feature spec below; the multi-label
# loss needs float targets)
train_df["labels"] = train_df["labels"].apply(lambda x: [np.float32(i) for i in x])
val_df["labels"] = val_df["labels"].apply(lambda x: [np.float32(i) for i in x])

# ✅ Dataset features
# A list-typed feature is declared with exactly one example of its inner
# type, regardless of how many items each row holds.
features = Features({
    "journal": Value("string"),
    "labels": [Value("float32")],
})

# ✅ Convert to HuggingFace Datasets
train_dataset = Dataset.from_pandas(train_df, features=features)
val_dataset = Dataset.from_pandas(val_df, features=features)

# ✅ Tokenizer and model
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    problem_type="multi_label_classification"
)


# ✅ Tokenize function
def tokenize(batch):
    """Tokenize a batch of journal entries, truncating over-long ones.

    Padding is deliberately left to the data collator so each training batch
    is padded only to its own longest entry.
    """
    return tokenizer(batch["journal"], truncation=True)


train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# ✅ Data collator (pads each batch dynamically)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)

# ✅ Training arguments
training_args = TrainingArguments(
    output_dir=MODEL_OUTPUT_DIR,  # Save model here
    evaluation_strategy="epoch",
    save_strategy="no",   # disables checkpoint saving during training
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir=os.path.join(MODEL_OUTPUT_DIR, "logs"),  # Save logs here too
    load_best_model_at_end=False,  # must be False if no checkpoints saved
    metric_for_best_model="f1"
)


# ✅ Metrics function
def compute_metrics(pred):
    """Return micro-averaged F1 and accuracy for one evaluation pass.

    ``pred`` unpacks into ``(logits, labels)``. A label counts as predicted
    when its sigmoid probability exceeds 0.5. ``accuracy_score`` is applied
    to whole label vectors, so a row only counts as correct when every label
    matches.
    """
    logits, labels = pred
    preds = (torch.sigmoid(torch.tensor(logits)) > 0.5).int().numpy()
    labels = labels.astype(int)
    f1 = f1_score(labels, preds, average="micro")
    acc = accuracy_score(labels, preds)
    return {"f1": f1, "accuracy": acc}


# ✅ Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# ✅ Train
trainer.train()

# ✅ Save everything before running any diagnostics, so a failure in the
# sanity check below cannot cost the trained weights.
trainer.save_model(MODEL_OUTPUT_DIR)
tokenizer.save_pretrained(MODEL_OUTPUT_DIR)
print("✅ Model saved.")

# Save label names into the model directory (predict.py reads this file)
with open(os.path.join(MODEL_OUTPUT_DIR, "label_names.json"), "w", encoding="utf-8") as f:
    json.dump(label_names, f)

# ✅ Test a forward pass to see raw probabilities
model.eval()
with torch.no_grad():
    sample = train_dataset[0]
    # The model may have been moved to an accelerator during training, so
    # the inputs have to follow it to the same device.
    input_ids = sample["input_ids"].unsqueeze(0).to(model.device)
    attention_mask = sample["attention_mask"].unsqueeze(0).to(model.device)
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    probs = torch.sigmoid(outputs.logits)
    # squeeze(0) drops only the batch axis, keeping a list for any label count
    print("🔍 Sample output probs:", probs.squeeze(0).tolist())

print("✅ Model trained and saved.")

'''

# Explicit UTF-8: the script contains non-ASCII characters in its messages.
with open(os.path.join(project_dir, "train_ed_model.py"), "w", encoding="utf-8") as f:
    f.write(training_script)

print(f"✅ setup_files.py completed: Files generated in '{project_dir}/'")


✅ setup_files.py completed: Files generated in 'ed_project/'


In [None]:
# Remove any already-installed copies of the pinned packages, then install
# everything listed in ed_project/requirements.txt.
# NOTE(review): if any of these packages were already imported in this kernel,
# a runtime restart is presumably needed for the new versions to load — confirm.
!pip uninstall -y transformers torch torchvision torchaudio numpy scipy peft accelerate
!pip install -r ed_project/requirements.txt

Found existing installation: transformers 4.35.2
Uninstalling transformers-4.35.2:
  Successfully uninstalled transformers-4.35.2
Found existing installation: torch 2.1.2
Uninstalling torch-2.1.2:
  Successfully uninstalled torch-2.1.2
Found existing installation: torchvision 0.16.2
Uninstalling torchvision-0.16.2:
  Successfully uninstalled torchvision-0.16.2
Found existing installation: torchaudio 2.1.2
Uninstalling torchaudio-2.1.2:
  Successfully uninstalled torchaudio-2.1.2
Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: scipy 1.11.4
Uninstalling scipy-1.11.4:
  Successfully uninstalled scipy-1.11.4
Found existing installation: peft 0.7.1
Uninstalling peft-0.7.1:
  Successfully uninstalled peft-0.7.1
Found existing installation: accelerate 0.24.1
Uninstalling accelerate-0.24.1:
  Successfully uninstalled accelerate-0.24.1
Collecting numpy==1.26.4 (from -r ed_project/requirements.txt (line 1))

In [None]:
import shutil

# The training script loads both files from the project directory, so they
# are copied there rather than into the current working directory.
# NOTE(review): the /mnt/data source location is kept as-is — confirm the
# files are actually uploaded there in this runtime.
for data_file in ("synthetic_journal_data.csv", "label_schema.json"):
    shutil.copy(f"/mnt/data/{data_file}", f"ed_project/{data_file}")


In [None]:
# Run the generated training script in a separate Python process.
!python ed_project/train_ed_model.py

2025-08-07 13:48:26.788944: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754574506.853775  119500 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754574506.873142  119500 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1754574506.961333  119500 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1754574506.961410  119500 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1754574506.961509  119500 computation_placer.cc:177] computation placer alr

In [None]:
import os

# === Colab cell: write predict.py to ed_project ===
# Source of the interactive prediction script, written verbatim to disk below.
# It loads the fine-tuned classifier, reads one journal entry from standard
# input and prints the highest-scoring labels.
predict_script = """
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import json
import os

# === Paths ===
# Resolved against the current working directory.
MODEL_DIR = "ed_classifier"
LABEL_FILE = os.path.join(MODEL_DIR, "label_names.json")

# Stop with a clear message when the model has not been trained yet (or the
# script is started from a different directory) instead of letting the
# loader fail on a path that does not exist.
if not os.path.isdir(MODEL_DIR):
    raise SystemExit(
        f"Model directory '{MODEL_DIR}' not found under {os.getcwd()}. "
        "Train the model first, or run this script from the directory that contains it."
    )

# === Define emotion and behavior groups ===
EMOTION_LABELS = {
    "guilt", "anxiety", "shame", "sadness", "fear", "hopelessness",
    "self_criticism", "embarrassment", "hope", "confidence", "calm",
    "self_acceptance", "gratitude", "relief", "pride", "motivation"
}

# Everything else = behavior (fallback)
def get_label_type(label):
    # Returns "Emotion" for labels in EMOTION_LABELS, "Behavior" otherwise.
    return "Emotion" if label in EMOTION_LABELS else "Behavior"

# === Load model and tokenizer ===
print("🔁 Loading model and tokenizer...")
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_DIR)
model = DistilBertForSequenceClassification.from_pretrained(MODEL_DIR)
model.eval()

# === Load label names ===
with open(LABEL_FILE, "r", encoding="utf-8") as f:
    label_names = json.load(f)

# The label list is indexed by model output position, so the sizes must agree.
if len(label_names) != model.config.num_labels:
    raise SystemExit(
        f"{LABEL_FILE} lists {len(label_names)} labels but the model outputs "
        f"{model.config.num_labels}; retrain or restore the matching label file."
    )

# === Prompt ===
print("\\n📝 Describe how your day went in terms of eating, emotions, and any strategies you used to cope.\\n")
journal_input = input().strip()
if not journal_input:
    # Scoring an empty entry would only produce meaningless numbers.
    raise SystemExit("No journal text entered; nothing to classify.")

# === Tokenize input ===
inputs = tokenizer(journal_input, return_tensors="pt", truncation=True, padding=True)

# === Predict ===
with torch.no_grad():
    outputs = model(**inputs)
    # squeeze(0) removes only the batch axis, so the result stays a list even
    # for a model with a single label.
    probs = torch.sigmoid(outputs.logits).squeeze(0).tolist()

# === Get top predictions (at most 5) ===
top_n = min(5, len(probs))
top_indices = sorted(range(len(probs)), key=probs.__getitem__, reverse=True)[:top_n]

print(f"\\n📊 Top {top_n} Predicted Labels:")
print("-" * 30)
for i in top_indices:
    label = label_names[i]
    label_type = get_label_type(label)
    prob = probs[i]
    print(f"{label} ({label_type}): {prob:.3f}")
"""

# Make sure directory exists
os.makedirs("ed_project", exist_ok=True)

# Write to file (explicit UTF-8: the script contains non-ASCII characters)
with open("ed_project/predict.py", "w", encoding="utf-8") as f:
    f.write(predict_script)

print("✅ predict.py created at ed_project/predict.py")


✅ predict.py created at ed_project/predict.py


In [None]:
# === Colab cell: run predict.py ===
# The script waits for one journal entry on standard input, then prints the
# top-scoring labels with their probabilities.
!python ed_project/predict.py


🔁 Loading model and tokenizer...

📝 Describe how your day went in terms of eating, emotions, and any strategies you used to cope.

I've been eating three regular meals every day this week. I’m motivated to keep working on recovery

📊 Top 5 Predicted Labels:
------------------------------
tracking_progress (Behavior): 0.165
consistent_eating (Behavior): 0.151
pride (Emotion): 0.104
motivation (Emotion): 0.083
hopelessness (Emotion): 0.060
