In [None]:
# Cell 0 — Environment setup (run once at the start of the notebook)
# Comments:
# - Installs required libraries for feature extraction and embeddings.
# - Downloads minimal NLTK corpora needed by NRCLex, Empath, VADER, etc.
# - We intentionally avoid textblob heavy installs and Detoxify to keep things robust.

# Install packages (silent mode)
!pip install -q sentence-transformers vaderSentiment nltk empath nrclex textstat joblib

# Imports and NLTK downloads
import nltk, sys, subprocess

# Download small set of NLTK resources we will use
nltk.download('punkt')                       # tokenizer
nltk.download('punkt_tab')                   # some libs expect this variant
nltk.download('vader_lexicon')               # VADER sentiment
nltk.download('averaged_perceptron_tagger')  # POS tagger used by some lexicons
nltk.download('wordnet')
nltk.download('omw-1.4')

# Print quick confirmation
print("Cell 0 done — packages installed and NLTK corpora downloaded.")


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/57.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m396.4/396.4 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.4/176.4 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for empath (setup.py) ... [?25l[?25hdone
  Building wheel for nrclex (setup.py) ... [?25l[?25hdone


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Cell 0 done — packages installed and NLTK corpora downloaded.


[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# Cell 1 — Load CSV robustly and create combined text_raw
# Comments:
# - Tries multiple encodings to avoid UnicodeDecodeError (common with social media exports).
# - Creates `text_raw` = message + description (keeps both if present).
# - Ensures description/message and the reaction columns exist (missing ones are created).
# - Ensures reaction columns are numeric integers (unparseable values become 0).

import os
import pandas as pd
import numpy as np

FNAME = "PTSD data_20k.csv"
assert os.path.exists(FNAME), f"File not found in working dir: {FNAME}"

# Try several encodings, strictest first.
# latin1 maps every byte value to a character, so decoding with it never fails;
# it therefore has to be the LAST fallback, otherwise cp1252 is never tried.
last_err = None
for enc in ("utf-8", "cp1252", "latin1"):
    try:
        df = pd.read_csv(FNAME, encoding=enc, low_memory=False)
    except UnicodeDecodeError as e:
        # Only decoding problems justify retrying with another encoding;
        # any other error (e.g. a malformed CSV) propagates immediately.
        last_err = e
    else:
        print("Loaded using encoding:", enc)
        break
else:
    raise last_err

print("Initial shape:", df.shape)
print("Columns:", df.columns.tolist())

# Ensure description/message exist.
# (df.get(col, "") would return a plain str for a missing column, and a str
# has no .fillna — so create the column explicitly before filling.)
for col in ('description', 'message'):
    if col not in df.columns:
        df[col] = ""
    df[col] = df[col].fillna("")

# Combined raw text (message + description)
df['text_raw'] = (df['message'].astype(str).str.strip() + " " + df['description'].astype(str).str.strip()).str.strip()
# Rows where both parts were empty carry no text: mark them as missing
df.loc[df['text_raw'].str.len() == 0, 'text_raw'] = np.nan

# Ensure reaction columns exist and are numeric
reaction_cols = ['react_angry','react_haha','react_like','react_love','react_sad','react_wow','shares']
for c in reaction_cols:
    if c not in df.columns:
        df[c] = 0
    # Non-numeric values are coerced to NaN and then counted as 0
    df[c] = pd.to_numeric(df[c], errors='coerce').fillna(0).astype(int)

# Quick sanity outputs to paste back:
print("\n--- PLEASE PASTE THESE 3 OUTPUTS BACK ---")
print("1) df.shape ->", df.shape)
non_null_text = df['text_raw'].dropna()
first_text = non_null_text.iloc[0] if len(non_null_text) > 0 else ""
print("2) First non-null text_raw ->", repr(first_text[:300]))  # show up to 300 chars
print("3) Reaction sums ->", df[reaction_cols].sum().to_dict())

# keep df in memory for next cells


Loaded using encoding: latin1
Initial shape: (19850, 14)
Columns: ['created_time', 'description', 'link', 'message', 'page_id', 'post_id', 'react_angry', 'react_haha', 'react_like', 'react_love', 'react_sad', 'react_wow', 'scrape_time', 'shares']

--- PLEASE PASTE THESE 3 OUTPUTS BACK ---
1) df.shape -> (19850, 15)
2) First non-null text_raw -> "We are #LIVE outside the National Rifle Association of America's headquarters for a demonstration organised by the Women's March. The BBC's Hetal Gandhi is speaking to NRA supporters and activists from the Women's March about the recent verdict in #PhilandoCastile's death and a controversial NRA com"
3) Reaction sums -> {'react_angry': 3672555, 'react_haha': 3420877, 'react_like': 21433486, 'react_love': 1967878, 'react_sad': 1664556, 'react_wow': 1631093, 'shares': 7327573}


In [None]:
# Cell 2 — Clean text, remove empty/short/duplicate rows, normalize
# Comments:
# - Builds df['text'] from df['text_raw']: lowercased, URLs removed, characters outside
#   the allowed set replaced by spaces, whitespace collapsed.
# - Drops rows whose cleaned text is shorter than MIN_TEXT_LEN characters, then duplicate texts.

import re
import pandas as pd
import numpy as np

MIN_TEXT_LEN = 5   # cleaned texts shorter than this many characters are dropped

print("Starting shape (before cleaning):", df.shape)

# Remove rows with no text_raw.
# .copy() gives an independent frame, so the column assignments below never
# write into a view of the previous df.
df = df.dropna(subset=['text_raw']).copy()
print("After dropping NaN text_raw:", df.shape)

# Normalize whitespace and lowercase.
# Lowercasing happens BEFORE URL removal: the URL pattern is case-sensitive, so
# links typed as "Http://..." or "WWW...." would otherwise be left in the text.
df['text'] = df['text_raw'].str.strip().str.lower()

# Remove URLs
df['text'] = df['text'].str.replace(r"http\S+|www\.\S+", "", regex=True)

# Remove unnecessary characters (keep letters, numbers, punctuation)
df['text'] = df['text'].str.replace(r"[^a-zA-Z0-9\s\.,!?'\-#]", " ", regex=True)

# Remove extra spaces
df['text'] = df['text'].str.replace(r"\s+", " ", regex=True).str.strip()

# Drop short texts (< MIN_TEXT_LEN characters)
before = df.shape[0]
df = df[df['text'].str.len() >= MIN_TEXT_LEN]
print(f"Dropped {before - df.shape[0]} rows (too short). New shape:", df.shape)

# Drop duplicate texts (the first occurrence is kept)
before = df.shape[0]
df = df.drop_duplicates(subset=['text'])
print(f"Dropped {before - df.shape[0]} duplicate texts. New shape:", df.shape)

# Reset index so rows are numbered 0..N-1 again
df = df.reset_index(drop=True)

print("\n--- CLEANING DONE ---")
print("Final cleaned shape:", df.shape)
print("\nSample cleaned text:\n", df['text'].iloc[0])


Starting shape (before cleaning): (19850, 15)
After dropping NaN text_raw: (19781, 15)
Dropped 177 rows (too short). New shape: (19604, 16)
Dropped 696 duplicate texts. New shape: (18908, 16)

--- CLEANING DONE ---
Final cleaned shape: (18908, 16)

Sample cleaned text:
 we are #live outside the national rifle association of america's headquarters for a demonstration organised by the women's march. the bbc's hetal gandhi is speaking to nra supporters and activists from the women's march about the recent verdict in #philandocastile's death and a controversial nra commercial - that sparked this event. share your questions below.


Now we add severity score and three-class labels (Low / Moderate / High).
This uses the reaction-based formula we defined earlier:

$$\text{severity\_score} = \text{angry} + \text{sad} + \text{wow} + \text{haha}$$

Then we convert raw severity scores into three quantile-based classes:

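Low = scores at or below the 33rd percentile

Moderate = scores above the 33rd percentile and at or below the 66th percentile

High = scores above the 66th percentile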

This keeps the three classes roughly balanced (ties at a cut point can make them slightly unequal), unlike fixed ranges such as 0–30 / 31–50, which don’t generalize well to real data.

In [None]:
# Cell 3 — Severity score + Balanced class labels (Low / Moderate / High)
# Comments:
# - severity_score is the per-row total of four reaction counts.
# - The 0.33 / 0.66 quantiles of that score split the rows into three classes.
# - `label` is the integer encoding of `severity_class` (Low=0, Moderate=1, High=2).

import numpy as np
import pandas as pd

# Reaction-based severity score: angry + sad + wow + haha
df['severity_score'] = (
    df['react_angry']
    .add(df['react_sad'])
    .add(df['react_wow'])
    .add(df['react_haha'])
)

print("Severity score statistics:")
print(df['severity_score'].describe())

# Quantile thresholds used as class boundaries
q1, q2 = (df['severity_score'].quantile(p) for p in (0.33, 0.66))

print("\nQuantile thresholds:")
print("33% =", q1)
print("66% =", q2)


def assign_label(score):
    """Return "Low", "Moderate" or "High" for `score` using the q1 / q2 thresholds.

    Scores up to and including q1 are "Low", scores up to and including q2 are
    "Moderate", everything else is "High".
    """
    if score <= q1:
        return "Low"
    if score <= q2:
        return "Moderate"
    return "High"


# Class name per row
df['severity_class'] = df['severity_score'].map(assign_label)

# Numeric labels for modeling
label_map = {"Low": 0, "Moderate": 1, "High": 2}
df['label'] = df['severity_class'].map(label_map)

print("\nClass distribution:")
print(df['severity_class'].value_counts())

# Peek at first 5 rows
print("\nPreview:")
print(df[['text', 'severity_score', 'severity_class']].head())


Severity score statistics:
count    18908.000000
mean       536.261688
std       1853.946439
min          0.000000
25%         10.000000
50%         62.000000
75%        306.250000
max      65667.000000
Name: severity_score, dtype: float64

Quantile thresholds:
33% = 20.0
66% = 164.0

Class distribution:
severity_class
High        6415
Low         6349
Moderate    6144
Name: count, dtype: int64

Preview:
                                                text  severity_score  \
0  we are #live outside the national rifle associ...             114   
1  update -2 ukrainian tourists killed in stabbin...            1227   
2  proms come with us on a tour of the royal albe...              68   
3  thousands say their final goodbyes to bradley ...            2006   

  severity_class  
0       Moderate  
1           High  
2       Moderate  
3           High  
4           High  


Cell 4 — Sentiment + Trauma Lexicon Feature Extraction

In [None]:
# Cell 4 — Sentiment (VADER) + Trauma Lexicon counts
# Comments:
# - df['sentiment'] is VADER's 'compound' score for each cleaned text.
# - df['trauma_count'] counts whole words from the trauma lexicon (plus simple
#   inflections), not raw substrings.

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re

# Initialize VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Compute sentiment polarity (-1 to +1)
df['sentiment'] = df['text'].apply(lambda t: analyzer.polarity_scores(t)['compound'])

print("Sentiment stats:")
print(df['sentiment'].describe())

# Trauma-related keyword lexicon
trauma_words = [
    "trauma", "ptsd", "abuse", "assault", "violence", "death", "kill", "killed", "murder",
    "shoot", "shot", "attack", "attacked", "bomb", "war", "injury", "suicide", "pain",
    "fear", "panic", "anxiety", "suffering", "hurt", "bleeding", "threat", "crisis"
]

# Whole-word matcher for the lexicon.
# Plain substring counting (str.count) produced false positives such as
# "campaign" -> "pain" or "toward" -> "war", and counted "killed" twice
# (once for "kill", once for "killed"). The pattern below:
#   - anchors on word boundaries, so a term never matches inside another word;
#   - tries longer terms first, so "killed" is consumed as a single match;
#   - allows a few simple suffixes, so "kills", "bombing", "abused" still count.
TRAUMA_PATTERN = re.compile(
    r"\b(?:"
    + "|".join(re.escape(w) for w in sorted(set(trauma_words), key=len, reverse=True))
    + r")(?:s|es|d|ed|ing)?\b"
)


def count_trauma_terms(text):
    """Return the number of words in `text` that match the trauma lexicon.

    `text` is expected to be lowercase already (the lexicon is lowercase and
    the match is case-sensitive). Each word is counted at most once.
    """
    return len(TRAUMA_PATTERN.findall(text))


# Count trauma lexicon occurrences
df['trauma_count'] = df['text'].apply(count_trauma_terms)

print("\nTrauma lexicon stats:")
print(df['trauma_count'].describe())

# Preview
print("\nPreview:")
print(df[['text', 'sentiment', 'trauma_count', 'severity_score', 'severity_class']].head())


Sentiment stats:
count    18908.000000
mean         0.004508
std          0.522251
min         -0.995200
25%         -0.421500
50%          0.000000
75%          0.440400
max          0.999600
Name: sentiment, dtype: float64

Trauma lexicon stats:
count    18908.000000
mean         0.244711
std          0.653106
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max         10.000000
Name: trauma_count, dtype: float64

Preview:
                                                text  sentiment  trauma_count  \
0  we are #live outside the national rifle associ...     0.1027             1   
1  update -2 ukrainian tourists killed in stabbin...    -0.8689             2   
2  proms come with us on a tour of the royal albe...     0.4939             0   
3  thousands say their final goodbyes to bradley ...    -0.4871             0   

   severity_score severity_class  
0             114       Moderate  
1            1227           High  
2              68   

Cell 5 — Lexical Features (Empath + NRC + Readability + Toxicity Proxy)

This cell:

✔ Adds Empath categories:

fear

violence

sadness

anger

aggression

negative_emotion

suffering

death

✔ Adds NRC emotion categories:

anger

fear

sadness

disgust

joy

trust

anticipation

surprise

✔ Adds lightweight readability features:

Flesch reading ease

SMOG index

Dale-Chall difficulty

word count

sentence count

✔ Adds toxicity proxy (simple lexicon-based score)

In [None]:
# Cell 5 — Empath + NRC + Readability + Toxicity proxy
# Comments:
# - Adds one `empath_<category>` column per Empath category listed below.
# - Adds one `nrc_<emotion>` column per NRC emotion listed below.
# - Adds readability columns computed with textstat, plus a whitespace word count.

from empath import Empath
from nrclex import NRCLex
import textstat
import numpy as np
import re
import tqdm

lex = Empath()

# Empath categories
empath_categories = [
    'fear','violence','sadness','anger','aggression',
    'negative_emotion','suffering','death'
]


def _empath_row(text):
    """Return the Empath scores of `text`, ordered like `empath_categories`."""
    category_scores = lex.analyze(text, categories=empath_categories)
    return [category_scores[name] for name in empath_categories]


print("Adding Empath features...")
# One row per text, one column per category
empath_matrix = np.array(
    [_empath_row(text) for text in tqdm.tqdm(df['text'], desc="Empath")]
)
for position, category in enumerate(empath_categories):
    df[f"empath_{category}"] = empath_matrix[:, position]


# NRC Emotion Lexicon
nrc_emotions = ['anger','fear','sadness','disgust','joy','trust','anticipation','surprise']


def _nrc_row(text):
    """Return the NRC raw emotion scores of `text`, ordered like `nrc_emotions`.

    Emotions missing from the NRCLex result are reported as 0.
    """
    raw_scores = NRCLex(text).raw_emotion_scores
    return [raw_scores.get(name, 0) for name in nrc_emotions]


print("Adding NRC emotion features...")
# One row per text, one column per emotion
nrc_matrix = np.array(
    [_nrc_row(text) for text in tqdm.tqdm(df['text'], desc="NRC")]
)
for position, emotion in enumerate(nrc_emotions):
    df[f"nrc_{emotion}"] = nrc_matrix[:, position]


# Readability metrics
print("Adding readability features...")
for column_name, metric in (
    ('flesch', textstat.flesch_reading_ease),
    ('smog', textstat.smog_index),
    ('dale_chall', textstat.dale_chall_readability_score),
):
    df[column_name] = df['text'].apply(metric)
# Word count = number of whitespace-separated tokens
df['word_count'] = df['text'].apply(lambda text: len(text.split()))
df['sentence_count'] = df['text'].apply(textstat.sentence_count)

# Toxicity proxy (simple)
print("Adding toxicity proxy...")
# Lowercase lexicon of violence/threat words used by toxicity_score()
tox_words = {
    "threat","threatened","attack","attacked","kill","killed","murder","bomb",
    "explode","shoot","shot","gun","violence","abuse","assault"
}

def toxicity_score(text):
    """Return the fraction of word tokens in `text` that appear in `tox_words`.

    Tokens are the `\\w+` runs of `text`. Matching is case-insensitive, so the
    score does not depend on the caller having lowercased the text first.

    Returns 0.0 when `text` contains no word tokens; otherwise a float in
    [0, 1] equal to (matching tokens) / (all tokens).
    """
    words = re.findall(r"\w+", text)
    if not words:
        return 0.0
    # Lowercase token by token so the tokenization itself is left untouched
    hits = sum(1 for w in words if w.lower() in tox_words)
    return hits / len(words)

# Score every cleaned text with toxicity_score (one float per row)
df['toxicity'] = df['text'].map(toxicity_score)


# Closing report for the lexical feature cell: banner, frame shape, first row
print("\n--- Lexical Feature Extraction Complete ---")
print("New df shape:", df.shape)
print(df.iloc[:1])


Adding Empath features...


Empath: 100%|██████████| 18908/18908 [00:03<00:00, 5738.95it/s]


Adding NRC emotion features...


NRC: 100%|██████████| 18908/18908 [00:09<00:00, 1938.15it/s]


Adding readability features...
Adding toxicity proxy...

--- Lexical Feature Extraction Complete ---
New df shape: (18908, 43)
               created_time description  \
0  2017-07-14T14:30:59+0000               

                                                link  \
0  https://www.facebook.com/bbcnews/videos/101548...   

                                             message       page_id  \
0  We are #LIVE outside the National Rifle Associ...  2.290000e+11   

                          post_id  react_angry  react_haha  react_like  \
0  228735667216_10154890879532217           54          24         993   

   react_love  ...  nrc_joy  nrc_trust nrc_anticipation  nrc_surprise  \
0         144  ...        1          2                2             1   

      flesch       smog  dale_chall word_count  sentence_count  toxicity  
0  36.598333  14.554593   11.839485         55               3       0.0  

[1 rows x 43 columns]


This cell will generate `sbert_embeddings.npy`.

In [None]:
# Cell 6 — Load existing SBERT embeddings or compute them in safe batches
# Comments:
# - Reuses sbert_embeddings.npy if present AND it has one row per row of df (fast).
# - Otherwise computes embeddings in batches and saves a .npy file.
# - Attaches df['embedding'] as a list of vectors for downstream use.
# - NOTE: the cache check only compares row counts. If the cleaning steps change
#   without changing the number of rows, delete sbert_embeddings.npy manually.

import os, traceback
import numpy as np
from sentence_transformers import SentenceTransformer
import tqdm

EMB_FILE = "sbert_embeddings.npy"
N = len(df)
batch_size = 256   # lower (e.g., 64) if your runtime OOMs; increase if you have lots of RAM

try:
    emb = None
    cache_exists = os.path.exists(EMB_FILE)

    if cache_exists:
        print("Found existing embeddings file:", EMB_FILE)
        cached = np.load(EMB_FILE)
        print("Loaded embeddings shape:", cached.shape)
        # A cache built from a different version of df cannot be attached row by
        # row: a different length would fail below, and is a sign of stale data.
        if cached.ndim == 2 and cached.shape[0] == N:
            emb = cached
        else:
            print("Cached embeddings do not have one row per df row (expected {} rows). Recomputing...".format(N))

    if emb is None:
        if not cache_exists:
            print("No embeddings file found. Computing SBERT embeddings now (this may take several minutes)...")
        model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        # detect embedding dim by encoding a single dummy sentence
        probe = model.encode(["probe"], show_progress_bar=False)
        emb_dim = probe.shape[1]
        print("Embedding dimension detected:", emb_dim)
        # preallocate array, then fill it one batch at a time
        emb = np.zeros((N, emb_dim), dtype=np.float32)
        texts = df['text'].tolist()
        for i in tqdm.tqdm(range(0, N, batch_size), desc="Embedding batches"):
            batch_texts = texts[i:i+batch_size]
            batch_emb = model.encode(batch_texts, show_progress_bar=False)
            # the last batch may be shorter than batch_size
            emb[i:i+len(batch_emb), :] = batch_emb
        print("Computed embeddings shape:", emb.shape)
        np.save(EMB_FILE, emb)
        print("Saved embeddings to:", EMB_FILE)

    # attach to df (one vector per row)
    df['embedding'] = list(emb)
    print("Attached embeddings to df['embedding']. First vector length:", len(df['embedding'].iloc[0]))

except Exception:
    # Print a clearly labelled traceback, then let the error propagate
    print("ERROR during SBERT embedding step:")
    traceback.print_exc()
    raise


No embeddings file found. Computing SBERT embeddings now (this may take several minutes)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding dimension detected: 384


Embedding batches: 100%|██████████| 74/74 [09:27<00:00,  7.67s/it]

Computed embeddings shape: (18908, 384)
Saved embeddings to: sbert_embeddings.npy
Attached embeddings to df['embedding']. First vector length: 384





Build the final secondary dataset. This cell combines the SBERT embeddings with the numeric lexical/meta features (`severity_score` is excluded from the features so that the label is not leaked) and saves:

ptsd_secondary_dataset.npz → compressed X (features) and y (labels) for modeling

ptsd_secondary_dataset.csv → human-readable CSV with text, labels and meta features (keeps severity_score for inspection, but it won’t be used for modeling)

In [None]:
# Cell 7 — Build final secondary dataset and save (.npz + .csv)
# Comments:
# - Excludes text fields and severity_score from feature matrix X to avoid leakage.
# - Also excludes the reaction/share counts and page_id from X: the label is computed
#   from react_angry + react_sad + react_wow + react_haha, so leaving those columns in X
#   would leak the label.
# - Includes embeddings (df['embedding']) + numeric meta columns (sentiment, trauma_count, empath_*, nrc_*, readability, toxicity).
# - Saves compressed NPZ for modeling and a CSV for inspection.

import numpy as np
import pandas as pd
import os

# Engagement counts. Four of them define severity_score (and therefore the label);
# none of them is a text-derived feature.
engagement_cols = ['react_angry','react_haha','react_like','react_love','react_sad','react_wow','shares']
# Identifier columns: numeric in the CSV, but not a property of the text
id_cols = ['page_id']

# columns to exclude from meta features
exclude = set([
    'text','text_raw','description','message','embedding',
    'severity_score','created_time','scrape_time','post_id'
]) | set(engagement_cols) | set(id_cols)

# build meta column list (numeric only)
meta_cols = [c for c in df.columns if c not in exclude and c not in ('label',)]
# ensure sentiment & trauma_count come first if present
preferred = ['sentiment','trauma_count']
meta_cols = [c for c in preferred if c in meta_cols] + [c for c in meta_cols if c not in preferred]
# keep only numeric meta cols
meta_cols = [c for c in meta_cols if pd.api.types.is_numeric_dtype(df[c])]

print("Meta columns to include (count={}):".format(len(meta_cols)))
print(meta_cols)

# build matrices
emb_matrix = np.vstack(df['embedding'].values)          # (N, D_emb)
meta_matrix = df[meta_cols].fillna(0).values            # (N, K_meta)
X = np.hstack([emb_matrix, meta_matrix])                # (N, D_emb + K_meta)
y = df['label'].values

print("\nEmbedding matrix shape:", emb_matrix.shape)
print("Meta matrix shape:", meta_matrix.shape)
print("Final X shape:", X.shape)
print("y shape:", y.shape)
print("Label distribution:", dict(pd.Series(y).value_counts().sort_index()))

# Save to disk.
# meta_cols is stored too: the last len(meta_cols) columns of X follow this order
# (the columns before them are the embedding dimensions).
np.savez_compressed("ptsd_secondary_dataset.npz", X=X, y=y, meta_cols=np.array(meta_cols))
print("\nSaved ptsd_secondary_dataset.npz")

# Save CSV for inspection (includes severity_score and classes).
# The engagement and id columns are kept here for inspection only; they are NOT part of X.
inspection_cols = [c for c in id_cols + engagement_cols if c in df.columns]
out_csv_cols = ['text','severity_score','severity_class','label'] + inspection_cols + meta_cols
df[out_csv_cols].to_csv("ptsd_secondary_dataset.csv", index=False)
print("Saved ptsd_secondary_dataset.csv (for inspection).")
print("\nFiles saved:")
print(" -", os.path.abspath("ptsd_secondary_dataset.npz"))
print(" -", os.path.abspath("ptsd_secondary_dataset.csv"))


Meta columns to include (count=32):
['sentiment', 'trauma_count', 'page_id', 'react_angry', 'react_haha', 'react_like', 'react_love', 'react_sad', 'react_wow', 'shares', 'empath_fear', 'empath_violence', 'empath_sadness', 'empath_anger', 'empath_aggression', 'empath_negative_emotion', 'empath_suffering', 'empath_death', 'nrc_anger', 'nrc_fear', 'nrc_sadness', 'nrc_disgust', 'nrc_joy', 'nrc_trust', 'nrc_anticipation', 'nrc_surprise', 'flesch', 'smog', 'dale_chall', 'word_count', 'sentence_count', 'toxicity']

Embedding matrix shape: (18908, 384)
Meta matrix shape: (18908, 32)
Final X shape: (18908, 416)
y shape: (18908,)
Label distribution: {0: np.int64(6349), 1: np.int64(6144), 2: np.int64(6415)}

Saved ptsd_secondary_dataset.npz
Saved ptsd_secondary_dataset.csv (for inspection).

Files saved:
 - /content/ptsd_secondary_dataset.npz
 - /content/ptsd_secondary_dataset.csv
