In [30]:
# Cell 1 — Load the CSV and print a quick overview of its contents
from pathlib import Path
import pandas as pd

# Relative to the notebook's working directory; edit this if your CSV has a
# different name or lives somewhere else.
DATA_PATH = Path("dreaddit_StressAnalysis - Sheet1.csv")

# Read the whole file into a DataFrame (reused by the following cells as `df`).
df = pd.read_csv(DATA_PATH)

# Numbered summary lines, meant to be copied back for review.
print("1) dataset path:", DATA_PATH.resolve())
print("2) shape (rows, cols):", df.shape)

print("3) first 40 column names:")
print(df.columns[:40].tolist())

# dropna=False so that rows with a missing label show up in the tally.
print("4) label value counts (include NaNs):")
label_tally = df['label'].value_counts(dropna=False)
print(label_tally)

print("5) first 5 rows (text column and label):")
preview = df.loc[:, ['text', 'label']].iloc[:5]
print(preview.to_string(index=False))


1) dataset path: C:\Users\AYUSH SINGH\Documents\GitHub\NervSightX\Machine learning\dreaddit_StressAnalysis - Sheet1.csv
2) shape (rows, cols): (715, 116)
3) first 40 column names:
['id', 'subreddit', 'post_id', 'sentence_range', 'text', 'label', 'confidence', 'social_timestamp', 'social_karma', 'syntax_ari', 'lex_liwc_WC', 'lex_liwc_Analytic', 'lex_liwc_Clout', 'lex_liwc_Authentic', 'lex_liwc_Tone', 'lex_liwc_WPS', 'lex_liwc_Sixltr', 'lex_liwc_Dic', 'lex_liwc_function', 'lex_liwc_pronoun', 'lex_liwc_ppron', 'lex_liwc_i', 'lex_liwc_we', 'lex_liwc_you', 'lex_liwc_shehe', 'lex_liwc_they', 'lex_liwc_ipron', 'lex_liwc_article', 'lex_liwc_prep', 'lex_liwc_auxverb', 'lex_liwc_adverb', 'lex_liwc_conj', 'lex_liwc_negate', 'lex_liwc_verb', 'lex_liwc_adj', 'lex_liwc_compare', 'lex_liwc_interrog', 'lex_liwc_number', 'lex_liwc_quant', 'lex_liwc_affect']
4) label value counts (include NaNs):
label
1    369
0    346
Name: count, dtype: int64
5) first 5 rows (text column and label):
                  

In [31]:
# Cell 2 — Inspect numeric/lexical columns & missingness
#
# Requires `df` from Cell 1 (run that cell first). This cell:
#   * groups feature columns by name prefix (LIWC / DAL / syntax / social),
#   * adds `char_len` and `token_len` columns to `df` when they are missing,
#   * builds `numeric_candidates`, the ordered list of numeric feature columns,
#   * prints variance and missing-value summaries.
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
import nltk

# word_tokenize needs the Punkt tokenizer data. Older NLTK releases look for
# 'punkt'; NLTK 3.9+ looks for 'punkt_tab' instead, so request both.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

print("1) total columns:", len(df.columns))
print("2) sample columns (first 80):")
print(df.columns.tolist()[:80])

# Identify LIWC, DAL, syntax, social columns purely by their name prefix.
liwc_cols = [c for c in df.columns if c.startswith("lex_liwc_")]
dal_cols = [c for c in df.columns if c.startswith("lex_dal_")]
syntax_cols = [c for c in df.columns if c.startswith("syntax_")]
social_cols = [c for c in df.columns if c.startswith("social_")]

print(f"3) counts -> LIWC: {len(liwc_cols)}, DAL: {len(dal_cols)}, SYNTAX: {len(syntax_cols)}, SOCIAL: {len(social_cols)}")

# Ensure token_len / char_len are present. If either one is missing, both are
# (re)computed from the text column. NOTE: this adds columns to `df` in place,
# so `df` is wider after this cell than right after loading.
if 'token_len' not in df.columns or 'char_len' not in df.columns:
    text_as_str = df['text'].astype(str)
    df['char_len'] = text_as_str.str.len()
    df['token_len'] = text_as_str.apply(lambda t: len(word_tokenize(t)))

# Candidate numeric features (keep sentiment only if it exists and is numeric).
candidates = liwc_cols + dal_cols + syntax_cols + social_cols + ['token_len', 'char_len']
if 'sentiment' in df.columns and pd.api.types.is_numeric_dtype(df['sentiment']):
    candidates.append('sentiment')

# De-duplicate while keeping first-seen order, then keep only columns that are
# present and numeric. dict.fromkeys is used instead of pd.unique because
# passing a plain Python list to pd.unique is deprecated and emits a warning.
unique_candidates = list(dict.fromkeys(candidates))
numeric_candidates = [
    c for c in unique_candidates
    if c in df.columns and pd.api.types.is_numeric_dtype(df[c])
]
print("4) numeric candidate count:", len(numeric_candidates))
print("5) numeric candidate sample (first 60):")
print(numeric_candidates[:60])

# Show top 10 LIWC features by variance (skipped when there are no LIWC columns).
if liwc_cols:
    liwc_var = df[liwc_cols].var().sort_values(ascending=False)
    print("\n6) Top 10 LIWC features by variance:")
    print(liwc_var.head(10).to_string())

# Missingness summary for numeric candidates: per-column NaN counts, largest first.
print("\n7) Missing values (numeric candidates):")
missing = df[numeric_candidates].isna().sum().sort_values(ascending=False)
print(missing[missing > 0].to_string() if missing.sum() > 0 else "No missing values in numeric candidates")

# Basic stats shapes
print("\n8) Final numeric matrix shape (rows, cols):", (df.shape[0], len(numeric_candidates)))
print("9) label distribution again:")
print(df['label'].value_counts(dropna=False))


1) total columns: 116
2) sample columns (first 80):
['id', 'subreddit', 'post_id', 'sentence_range', 'text', 'label', 'confidence', 'social_timestamp', 'social_karma', 'syntax_ari', 'lex_liwc_WC', 'lex_liwc_Analytic', 'lex_liwc_Clout', 'lex_liwc_Authentic', 'lex_liwc_Tone', 'lex_liwc_WPS', 'lex_liwc_Sixltr', 'lex_liwc_Dic', 'lex_liwc_function', 'lex_liwc_pronoun', 'lex_liwc_ppron', 'lex_liwc_i', 'lex_liwc_we', 'lex_liwc_you', 'lex_liwc_shehe', 'lex_liwc_they', 'lex_liwc_ipron', 'lex_liwc_article', 'lex_liwc_prep', 'lex_liwc_auxverb', 'lex_liwc_adverb', 'lex_liwc_conj', 'lex_liwc_negate', 'lex_liwc_verb', 'lex_liwc_adj', 'lex_liwc_compare', 'lex_liwc_interrog', 'lex_liwc_number', 'lex_liwc_quant', 'lex_liwc_affect', 'lex_liwc_posemo', 'lex_liwc_negemo', 'lex_liwc_anx', 'lex_liwc_anger', 'lex_liwc_sad', 'lex_liwc_social', 'lex_liwc_family', 'lex_liwc_friend', 'lex_liwc_female', 'lex_liwc_male', 'lex_liwc_cogproc', 'lex_liwc_insight', 'lex_liwc_cause', 'lex_liwc_discrep', 'lex_liwc_tentat

  numeric_candidates = [c for c in pd.unique(candidates) if c in df.columns and pd.api.types.is_numeric_dtype(df[c])]


In [32]:
# Cell — Stratified 80/20 hold-out split; writes the raw train/test CSVs
from pathlib import Path
from sklearn.model_selection import train_test_split
import pandas as pd

RANDOM_STATE = 42
TEST_SIZE = 0.20

OUT_DIR = Path("dreaddit_cv_raw_splits")
OUT_DIR.mkdir(exist_ok=True, parents=True)

# Depends on `df` and `numeric_candidates` already being defined in the kernel.
# Feature columns are taken in df's own column order.
selected_cols = [name for name in df.columns if name in numeric_candidates]
X = df[selected_cols].copy()  # numeric matrix (raw)
y = df['label'].copy()

# Stratify on the label so the split keeps the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)


def _attach_orig_index(features, labels):
    """Combine features and labels into one frame, keeping the source row index.

    The index of ``features`` is moved into an ``orig_index`` column for
    traceability; ``labels`` gets a fresh positional index so that it lines up
    row-for-row with the reset feature frame.
    """
    indexed = features.reset_index().rename(columns={'index': 'orig_index'})
    return pd.concat([indexed, labels.reset_index(drop=True)], axis=1)


# Raw train set and the frozen test set, each with orig_index + features + label.
train_raw = _attach_orig_index(X_train, y_train)
test_raw = _attach_orig_index(X_test, y_test)

train_path = OUT_DIR / "train_raw.csv"
test_path = OUT_DIR / "test_frozen_raw.csv"
train_raw.to_csv(train_path, index=False)
test_raw.to_csv(test_path, index=False)

# Everything below is console output intended to be pasted back.
print("Saved train_raw ->", train_path.resolve())
print("Saved test_frozen_raw ->", test_path.resolve())

print("\nShapes:")
for caption, frame in (
    ("Full dataset:", df),
    ("X (numeric) shape:", X),
    ("Train shape (X_train):", X_train),
    ("Test shape  (X_test):", X_test),
):
    print(caption, frame.shape)

for header, labels in (
    ("\nClass counts (original):", y),
    ("\nClass counts (train):", y_train),
    ("\nClass counts (test):", y_test),
):
    print(header)
    print(labels.value_counts())

print("\nFirst 6 rows of the frozen test (showing orig_index, label):")
test_preview = test_raw[['orig_index', 'label']].head(6)
print(test_preview.to_string(index=False))


Saved train_raw -> C:\Users\AYUSH SINGH\Documents\GitHub\NervSightX\Machine learning\dreaddit_cv_raw_splits\train_raw.csv
Saved test_frozen_raw -> C:\Users\AYUSH SINGH\Documents\GitHub\NervSightX\Machine learning\dreaddit_cv_raw_splits\test_frozen_raw.csv

Shapes:
Full dataset: (715, 118)
X (numeric) shape: (715, 111)
Train shape (X_train): (572, 111)
Test shape  (X_test): (143, 111)

Class counts (original):
label
1    369
0    346
Name: count, dtype: int64

Class counts (train):
label
1    295
0    277
Name: count, dtype: int64

Class counts (test):
label
1    74
0    69
Name: count, dtype: int64

First 6 rows of the frozen test (showing orig_index, label):
 orig_index  label
        685      0
        532      1
        268      1
        507      1
        465      1
        519      1


In [36]:
# Cell — Stratified 5-fold CV on training set, saving RAW fold-wise CSVs (run this now)
#
# Reads train_raw.csv from OUT_DIR, splits it into N_SPLITS stratified folds and
# writes one train/val CSV pair per fold plus a manifest CSV describing them.
from pathlib import Path
import pandas as pd
from sklearn.model_selection import StratifiedKFold

OUT_DIR = Path("dreaddit_cv_raw_splits")
FOLDS_DIR = OUT_DIR / "folds_raw"
FOLDS_DIR.mkdir(parents=True, exist_ok=True)

RANDOM_STATE = 42
N_SPLITS = 5

# Load the train_raw you created (it includes orig_index and label)
train_path = OUT_DIR / "train_raw.csv"
if not train_path.exists():
    raise FileNotFoundError(f"Expected train_raw.csv at: {train_path.resolve()} — run the previous split cell first.")

train_raw = pd.read_csv(train_path)
print("Loaded train_raw:", train_path.resolve())
print("train_raw shape:", train_raw.shape)
print("Columns sample:", train_raw.columns.tolist()[:10])

# Prepare X (features) and y (label)
if 'label' not in train_raw.columns:
    raise KeyError("'label' column not found in train_raw.csv")

# Keep all columns except 'label' and 'orig_index' as features (raw).
# Consequence: the fold CSVs written below do not carry orig_index.
feature_cols = [c for c in train_raw.columns if c not in ('label', 'orig_index')]

# NOTE: this rebinds X_train / y_train to the CSV-loaded, 0..n-1 indexed data;
# any X_train / y_train left in the kernel by the split cell are replaced.
X_train = train_raw[feature_cols].reset_index(drop=True)
y_train = train_raw['label'].reset_index(drop=True)

# Stratified K-fold (shuffled with a fixed seed so fold membership is repeatable)
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

fold_summaries = []
written_files = []  # files produced by this run, in write order
for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), start=1):
    X_tr_raw = X_train.iloc[train_idx].reset_index(drop=True)
    X_val_raw = X_train.iloc[val_idx].reset_index(drop=True)
    y_tr_raw = y_train.iloc[train_idx].reset_index(drop=True)
    y_val_raw = y_train.iloc[val_idx].reset_index(drop=True)

    train_out = pd.concat([X_tr_raw, y_tr_raw.rename('label')], axis=1)
    val_out = pd.concat([X_val_raw, y_val_raw.rename('label')], axis=1)

    train_file = FOLDS_DIR / f"fold_{fold_idx:02d}_train_raw.csv"
    val_file = FOLDS_DIR / f"fold_{fold_idx:02d}_val_raw.csv"

    train_out.to_csv(train_file, index=False)
    val_out.to_csv(val_file, index=False)
    written_files.extend([train_file, val_file])

    # .to_dict() yields plain Python ints, so the manifest CSV stores
    # "{1: 236, 0: 221}" rather than NumPy scalar reprs, and matches the
    # label distributions printed below.
    train_counts = y_tr_raw.value_counts().to_dict()
    val_counts = y_val_raw.value_counts().to_dict()

    # summary info
    fold_summaries.append({
        "fold": fold_idx,
        "train_rows": train_out.shape[0],
        "val_rows": val_out.shape[0],
        "train_path": str(train_file),
        "val_path": str(val_file),
        "train_label_counts": train_counts,
        "val_label_counts": val_counts,
    })

    print(f"Saved fold {fold_idx:02d}: train {train_out.shape}, val {val_out.shape} ->")
    print(f"  {train_file.name}, {val_file.name}")
    print(f"  train label dist: {train_counts}")
    print(f"  val   label dist: {val_counts}")
    print("-" * 60)

# Save manifest
manifest_path = OUT_DIR / "fold_manifest_raw.csv"
manifest_df = pd.DataFrame(fold_summaries)
manifest_df.to_csv(manifest_path, index=False)

print("\nManifest saved to:", manifest_path.resolve())
print("All fold files saved in:", FOLDS_DIR.resolve())

# Print head of saved files to verify. Only files written by this run are
# read back, so stale CSVs left in FOLDS_DIR by an earlier run (for example
# with a larger N_SPLITS) are not mistaken for current output.
for p in written_files:
    print("\nFile:", p.name)
    df_tmp = pd.read_csv(p)
    print(" shape:", df_tmp.shape)
    print(df_tmp.head(2).to_string(index=False))


Loaded train_raw: C:\Users\AYUSH SINGH\Documents\GitHub\NervSightX\Machine learning\dreaddit_cv_raw_splits\train_raw.csv
train_raw shape: (572, 113)
Columns sample: ['orig_index', 'social_timestamp', 'social_karma', 'syntax_ari', 'lex_liwc_WC', 'lex_liwc_Analytic', 'lex_liwc_Clout', 'lex_liwc_Authentic', 'lex_liwc_Tone', 'lex_liwc_WPS']
Saved fold 01: train (457, 112), val (115, 112) ->
  fold_01_train_raw.csv, fold_01_val_raw.csv
  train label dist: {1: 236, 0: 221}
  val   label dist: {1: 59, 0: 56}
------------------------------------------------------------
Saved fold 02: train (457, 112), val (115, 112) ->
  fold_02_train_raw.csv, fold_02_val_raw.csv
  train label dist: {1: 236, 0: 221}
  val   label dist: {1: 59, 0: 56}
------------------------------------------------------------
Saved fold 03: train (458, 112), val (114, 112) ->
  fold_03_train_raw.csv, fold_03_val_raw.csv
  train label dist: {1: 236, 0: 222}
  val   label dist: {1: 59, 0: 55}
-----------------------------------