In [2]:
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier


In [3]:
df = pd.read_csv("ptsd_secondary_dataset.csv")

print("Dataset shape:", df.shape)
print("\nSeverity class distribution:")
print(df["severity_class"].value_counts())


Dataset shape: (18908, 36)

Severity class distribution:
severity_class
High        6415
Low         6349
Moderate    6144
Name: count, dtype: int64


Define TEXT-ONLY Meta Features (CRITICAL)

  permanently remove social media features here

In [4]:
TEXT_META_COLS = [
    "sentiment", "trauma_count",
    "empath_fear", "empath_violence", "empath_sadness",
    "empath_anger", "empath_aggression",
    "empath_negative_emotion", "empath_suffering", "empath_death",
    "nrc_anger", "nrc_fear", "nrc_sadness", "nrc_disgust",
    "nrc_joy", "nrc_trust", "nrc_anticipation", "nrc_surprise",
    "flesch", "smog", "dale_chall",
    "word_count", "sentence_count", "toxicity"
]

print("Number of text-only meta features:", len(TEXT_META_COLS))


Number of text-only meta features: 24


oad SBERT Embeddings

In [5]:
embeddings = np.load("sbert_embeddings.npy")

print("SBERT embeddings shape:", embeddings.shape)


SBERT embeddings shape: (18908, 384)


Build Feature Matrix (X) and Labels (y)

In [6]:
X_meta = df[TEXT_META_COLS].values
y = df["label"].values

print("Meta features shape:", X_meta.shape)
print("Labels shape:", y.shape)


Meta features shape: (18908, 24)
Labels shape: (18908,)


â€” Scale Meta Features

In [7]:
scaler = StandardScaler()
X_meta_scaled = scaler.fit_transform(X_meta)

print("Scaled meta features shape:", X_meta_scaled.shape)


Scaled meta features shape: (18908, 24)


Final Feature Vector

In [8]:
X = np.hstack([embeddings, X_meta_scaled])

print("Final feature matrix X shape:", X.shape)


Final feature matrix X shape: (18908, 408)


In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (15126, 408)
Test shape: (3782, 408)


Logistic Regression (Baseline)

In [10]:
log_reg = LogisticRegression(
    max_iter=2000,
    multi_class="multinomial",
    n_jobs=-1
)

log_reg.fit(X_train, y_train)

y_pred_lr = log_reg.predict(X_test)

print("=== Logistic Regression ===")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("F1 Macro:", f1_score(y_test, y_pred_lr, average="macro"))
print(classification_report(y_test, y_pred_lr, target_names=["Low","Moderate","High"]))




=== Logistic Regression ===
Accuracy: 0.49338974087784243
F1 Macro: 0.48296253373731796
              precision    recall  f1-score   support

         Low       0.54      0.59      0.56      1270
    Moderate       0.41      0.30      0.35      1229
        High       0.50      0.59      0.54      1283

    accuracy                           0.49      3782
   macro avg       0.48      0.49      0.48      3782
weighted avg       0.48      0.49      0.48      3782



Linear SVM

In [11]:
svm = LinearSVC(C=1.0)

svm.fit(X_train, y_train)

y_pred_svm = svm.predict(X_test)

print("=== Linear SVM ===")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("F1 Macro:", f1_score(y_test, y_pred_svm, average="macro"))
print(classification_report(y_test, y_pred_svm, target_names=["Low","Moderate","High"]))


=== Linear SVM ===
Accuracy: 0.48757271285034376
F1 Macro: 0.4695406837703166
              precision    recall  f1-score   support

         Low       0.52      0.60      0.56      1270
    Moderate       0.41      0.24      0.31      1229
        High       0.49      0.61      0.54      1283

    accuracy                           0.49      3782
   macro avg       0.47      0.48      0.47      3782
weighted avg       0.48      0.49      0.47      3782



Random Forest (Non-linear)

In [12]:
rf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

print("=== Random Forest ===")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("F1 Macro:", f1_score(y_test, y_pred_rf, average="macro"))
print(classification_report(y_test, y_pred_rf, target_names=["Low","Moderate","High"]))


=== Random Forest ===
Accuracy: 0.4936541512427287
F1 Macro: 0.473430294648818
              precision    recall  f1-score   support

         Low       0.51      0.64      0.57      1270
    Moderate       0.41      0.24      0.30      1229
        High       0.51      0.59      0.55      1283

    accuracy                           0.49      3782
   macro avg       0.48      0.49      0.47      3782
weighted avg       0.48      0.49      0.48      3782



In [13]:
results = pd.DataFrame({
    "Model": ["Logistic Regression", "Linear SVM", "Random Forest"],
    "Accuracy": [
        accuracy_score(y_test, y_pred_lr),
        accuracy_score(y_test, y_pred_svm),
        accuracy_score(y_test, y_pred_rf)
    ],
    "F1 Macro": [
        f1_score(y_test, y_pred_lr, average="macro"),
        f1_score(y_test, y_pred_svm, average="macro"),
        f1_score(y_test, y_pred_rf, average="macro")
    ]
})

results.sort_values(by="F1 Macro", ascending=False)


Unnamed: 0,Model,Accuracy,F1 Macro
0,Logistic Regression,0.49339,0.482963
2,Random Forest,0.493654,0.47343
1,Linear SVM,0.487573,0.469541


Select Best Model

In [14]:
best_model = log_reg
model_name = "Logistic regreation"


Save FINAL TEXT-ONLY MODEL

In [15]:
final_model_package = {
    "model": best_model,
    "scaler": scaler,
    "meta_cols": TEXT_META_COLS,
    "meta_means": df[TEXT_META_COLS].mean().to_dict(),
    "label_map": {0: "Low", 1: "Moderate", 2: "High"},
    "embedding_dim": embeddings.shape[1],
    "model_name": model_name
}

joblib.dump(final_model_package, "final_ptsd_textonly_model.pkl")

print("Saved model:", model_name)


Saved model: Logistic regreation
