In [3]:
# Fixed Cell — create clean_text and merge into saved train/test splits
import re
from pathlib import Path
import pandas as pd

DATA_PATH = Path("dreaddit_StressAnalysis - Sheet1.csv")
OUT_DIR = Path("dreaddit_cv_raw_splits")
TRAIN_PATH = OUT_DIR / "train_raw.csv"
TEST_PATH  = OUT_DIR / "test_frozen_raw.csv"

def clean_text_func(t):
    if pd.isna(t):
        return ""
    text = str(t).lower()

    # remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)

    # remove emails
    text = re.sub(r'\S+@\S+\.\S+', ' ', text)

    # remove markdown links [text](url)
    text = re.sub(r'\[.*?\]\(.*?\)', ' ', text)

    # remove user mentions @name or u/name
    text = re.sub(r'@[A-Za-z0-9_]+', ' ', text)
    text = re.sub(r'u/[A-Za-z0-9_]+', ' ', text)

    # remove fenced code blocks ```...```
    text = re.sub(r'```.*?```', ' ', text, flags=re.DOTALL)

    # remove inline code `...`
    text = re.sub(r'`[^`]*`', ' ', text)

    # remove runs of asterisks/underscores (markdown emphasis) safely
    text = re.sub(r'[\*_]{1,}', ' ', text)

    # remove blockquote lines starting with >
    text = re.sub(r'(^|\n)>\s*.*', ' ', text)

    # keep letters, numbers, space, ? and !
    text = re.sub(r'[^a-z0-9?! ]+', ' ', text)

    # normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Load original CSV and create clean_text
orig = pd.read_csv(DATA_PATH)
orig['clean_text'] = orig['text'].apply(clean_text_func)

# Load saved splits
if not TRAIN_PATH.exists() or not TEST_PATH.exists():
    raise FileNotFoundError(f"Expected saved splits at {TRAIN_PATH} and {TEST_PATH}. Run earlier split cells first.")

train_raw = pd.read_csv(TRAIN_PATH)
test_raw  = pd.read_csv(TEST_PATH)

# Merge clean_text into train/test using orig_index
if 'orig_index' not in train_raw.columns or 'orig_index' not in test_raw.columns:
    raise KeyError("train_raw.csv or test_frozen_raw.csv missing 'orig_index' column.")

train_merged = train_raw.merge(orig[['clean_text']], left_on='orig_index', right_index=True, how='left')
test_merged  = test_raw.merge(orig[['clean_text']],  left_on='orig_index', right_index=True, how='left')

# Save updated files
train_out_path = OUT_DIR / "train_raw_with_clean_text.csv"
test_out_path  = OUT_DIR / "test_frozen_raw_with_clean_text.csv"

train_merged.to_csv(train_out_path, index=False)
test_merged.to_csv(test_out_path, index=False)

# Print results
print("Saved updated train ->", train_out_path.resolve())
print("Saved updated test  ->", test_out_path.resolve())
print()
print("train_merged shape:", train_merged.shape)
print("test_merged shape: ", test_merged.shape)
print()
print("label distribution (train):")
print(train_merged['label'].value_counts().to_string())
print()
print("label distribution (test):")
print(test_merged['label'].value_counts().to_string())
print()
print("Sample clean_text (train):")
print(train_merged[['orig_index','label','clean_text']].head(5).to_string(index=False))
print()
print("Sample clean_text (test):")
print(test_merged[['orig_index','label','clean_text']].head(5).to_string(index=False))


Saved updated train -> C:\Users\AYUSH SINGH\Documents\GitHub\NervSightX\Machine learning\dreaddit_cv_raw_splits\train_raw_with_clean_text.csv
Saved updated test  -> C:\Users\AYUSH SINGH\Documents\GitHub\NervSightX\Machine learning\dreaddit_cv_raw_splits\test_frozen_raw_with_clean_text.csv

train_merged shape: (572, 114)
test_merged shape:  (143, 114)

label distribution (train):
label
1    295
0    277

label distribution (test):
label
1    74
0    69

Sample clean_text (train):
 orig_index  label                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

In [4]:
# Cell — TF-IDF: fit on train_clean_text, transform train & frozen test, save artifacts
from pathlib import Path
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as sp

OUT_DIR = Path("dreaddit_cv_raw_splits")
TRAIN_IN = OUT_DIR / "train_raw_with_clean_text.csv"
TEST_IN  = OUT_DIR / "test_frozen_raw_with_clean_text.csv"
VECT_DIR = OUT_DIR / "tfidf"
VECT_DIR.mkdir(exist_ok=True, parents=True)

# TF-IDF hyperparams (edit before running if you want)
ngram_range = (1,2)
min_df = 5           # min doc freq (int) or float (proportion)
max_df = 0.9         # max doc freq (float proportion) or int
max_features = 50000 # None or int

print("Loading train/test with clean_text...")
train_df = pd.read_csv(TRAIN_IN)
test_df  = pd.read_csv(TEST_IN)

print("Train shape:", train_df.shape)
print("Test  shape:", test_df.shape)

if 'clean_text' not in train_df.columns or 'clean_text' not in test_df.columns:
    raise KeyError("clean_text column missing in train/test files. Run cleaning merge step first.")

# Fit vectorizer on train.clean_text only
tfidf = TfidfVectorizer(ngram_range=ngram_range, min_df=min_df, max_df=max_df, max_features=max_features, strip_accents='unicode', lowercase=True)
print("Fitting TF-IDF on train clean_text ...")
X_train_tfidf = tfidf.fit_transform(train_df['clean_text'].fillna('').astype(str))
print("Fitted. Vocabulary size:", len(tfidf.vocabulary_))

# Transform frozen test
X_test_tfidf = tfidf.transform(test_df['clean_text'].fillna('').astype(str))

# Save vectorizer and sparse matrices
joblib.dump(tfidf, VECT_DIR / "tfidf_vectorizer.joblib")
sp.save_npz(VECT_DIR / "X_train_tfidf.npz", X_train_tfidf)
sp.save_npz(VECT_DIR / "X_test_tfidf.npz", X_test_tfidf)

print("\nSaved vectorizer ->", (VECT_DIR / "tfidf_vectorizer.joblib").resolve())
print("Saved X_train_tfidf ->", (VECT_DIR / "X_train_tfidf.npz").resolve())
print("Saved X_test_tfidf  ->", (VECT_DIR / "X_test_tfidf.npz").resolve())

# Print shape + sparsity info
def sparse_info(mat, name):
    nnz = mat.nnz
    shape = mat.shape
    density = nnz / (shape[0]*shape[1])
    print(f"\n{name} shape: {shape}, nnz: {nnz}, density: {density:.6f}")

sparse_info(X_train_tfidf, "X_train_tfidf")
sparse_info(X_test_tfidf, "X_test_tfidf")

# Show top 20 features by idf (lowest idf => most common; highest idf => rare)
import numpy as np
idf = np.array(tfidf.idf_)
top_common_idx = np.argsort(idf)[:20]
top_rare_idx   = np.argsort(-idf)[:20]
vocab = np.array(tfidf.get_feature_names_out())
print("\nTop 20 most common n-grams (lowest idf):")
print(vocab[top_common_idx].tolist())
print("\nTop 20 rarest n-grams (highest idf):")
print(vocab[top_rare_idx].tolist())

# Print sample row nonzero counts for first 6 train & test rows
train_row_nnz = (X_train_tfidf != 0).sum(axis=1).A1
test_row_nnz  = (X_test_tfidf != 0).sum(axis=1).A1
print("\nSample train nonzero counts (first 6 rows):", train_row_nnz[:6].tolist())
print("Sample test nonzero counts  (first 6 rows):", test_row_nnz[:6].tolist())


Loading train/test with clean_text...
Train shape: (572, 114)
Test  shape: (143, 114)
Fitting TF-IDF on train clean_text ...
Fitted. Vocabulary size: 2051

Saved vectorizer -> C:\Users\AYUSH SINGH\Documents\GitHub\NervSightX\Machine learning\dreaddit_cv_raw_splits\tfidf\tfidf_vectorizer.joblib
Saved X_train_tfidf -> C:\Users\AYUSH SINGH\Documents\GitHub\NervSightX\Machine learning\dreaddit_cv_raw_splits\tfidf\X_train_tfidf.npz
Saved X_test_tfidf  -> C:\Users\AYUSH SINGH\Documents\GitHub\NervSightX\Machine learning\dreaddit_cv_raw_splits\tfidf\X_test_tfidf.npz

X_train_tfidf shape: (572, 2051), nnz: 37944, density: 0.032343

X_test_tfidf shape: (143, 2051), nnz: 8627, density: 0.029414

Top 20 most common n-grams (lowest idf):
['and', 'the', 'my', 'of', 'it', 'in', 'that', 'me', 'for', 'but', 'with', 'is', 'this', 'have', 'was', 'on', 'so', 'not', 'like', 'or']

Top 20 rarest n-grams (highest idf):
['x200b', 'writing', 'would love', 'afraid to', 'advice on', 'advance', 'pass', 'over to'

In [5]:
# Cell — TruncatedSVD on TF-IDF (200 components) — fit on train only
from pathlib import Path
import joblib
import numpy as np
import scipy.sparse as sp
from sklearn.decomposition import TruncatedSVD

OUT_DIR = Path("dreaddit_cv_raw_splits")
VECT_DIR = OUT_DIR / "tfidf"
SVD_DIR  = OUT_DIR / "svd"
SVD_DIR.mkdir(exist_ok=True, parents=True)

# paths
X_train_path = VECT_DIR / "X_train_tfidf.npz"
X_test_path  = VECT_DIR / "X_test_tfidf.npz"
tfidf_path   = VECT_DIR / "tfidf_vectorizer.joblib"

if not X_train_path.exists() or not X_test_path.exists() or not tfidf_path.exists():
    raise FileNotFoundError("TF-IDF artifacts missing. Run TF-IDF cell first.")

print("Loading TF-IDF matrices...")
X_train_tfidf = sp.load_npz(X_train_path)
X_test_tfidf  = sp.load_npz(X_test_path)

print("Shapes — train:", X_train_tfidf.shape, " test:", X_test_tfidf.shape)

# set n_components
n_components = 200
if n_components >= X_train_tfidf.shape[1]:
    raise ValueError(f"n_components ({n_components}) must be < n_features ({X_train_tfidf.shape[1]})")

print(f"Fitting TruncatedSVD with n_components = {n_components} on train TF-IDF ...")
svd = TruncatedSVD(n_components=n_components, random_state=42)
X_train_svd = svd.fit_transform(X_train_tfidf)   # fit on train only
X_test_svd  = svd.transform(X_test_tfidf)

# Save model and arrays
joblib.dump(svd, SVD_DIR / "tfidf_svd_200.joblib")
np.save(SVD_DIR / "X_train_svd_200.npy", X_train_svd)
np.save(SVD_DIR / "X_test_svd_200.npy", X_test_svd)

# Print diagnostics
print("Saved SVD model ->", (SVD_DIR / "tfidf_svd_200.joblib").resolve())
print("Saved X_train_svd ->", (SVD_DIR / "X_train_svd_200.npy").resolve())
print("Saved X_test_svd  ->", (SVD_DIR / "X_test_svd_200.npy").resolve())

explained = svd.explained_variance_ratio_.sum()
print(f"\nExplained variance (sum of top {n_components} components): {explained:.4f}")
print("Explained variance (first 10 components):", svd.explained_variance_ratio_[:10].tolist())

print("\nSVD output shapes:")
print("X_train_svd shape:", X_train_svd.shape)
print("X_test_svd shape :", X_test_svd.shape)

# Preview numeric ranges / sample rows
print("\nSample train SVD row (first 3 rows, first 6 dims):")
print(np.round(X_train_svd[:3,:6], 6))
print("\nSample test SVD row (first 3 rows, first 6 dims):")
print(np.round(X_test_svd[:3,:6], 6))

# Save small metadata
with open(SVD_DIR / "svd_info.txt", "w") as f:
    f.write(f"n_components={n_components}\n")
    f.write(f"explained_variance_sum={explained}\n")
    f.write("explained_variance_first10=" + ",".join(map(str, svd.explained_variance_ratio_[:10].tolist())) + "\n")

print("\nDone — paste the printed output here and I will continue with lexical feature processing and fusion.")


Loading TF-IDF matrices...
Shapes — train: (572, 2051)  test: (143, 2051)
Fitting TruncatedSVD with n_components = 200 on train TF-IDF ...
Saved SVD model -> C:\Users\AYUSH SINGH\Documents\GitHub\NervSightX\Machine learning\dreaddit_cv_raw_splits\svd\tfidf_svd_200.joblib
Saved X_train_svd -> C:\Users\AYUSH SINGH\Documents\GitHub\NervSightX\Machine learning\dreaddit_cv_raw_splits\svd\X_train_svd_200.npy
Saved X_test_svd  -> C:\Users\AYUSH SINGH\Documents\GitHub\NervSightX\Machine learning\dreaddit_cv_raw_splits\svd\X_test_svd_200.npy

Explained variance (sum of top 200 components): 0.6561
Explained variance (first 10 components): [0.003290324031305188, 0.01618299569160921, 0.015497219632266246, 0.010922794256107135, 0.010451368146609291, 0.008473328006405134, 0.0070924188533376, 0.006683235428587484, 0.00654402249290138, 0.006318146476979274]

SVD output shapes:
X_train_svd shape: (572, 200)
X_test_svd shape : (143, 200)

Sample train SVD row (first 3 rows, first 6 dims):
[[ 0.292626 -0

In [6]:
# Cell — Lexical feature processing: impute (train-only) -> StandardScale (train-only) -> save dense arrays
from pathlib import Path
import pandas as pd
import numpy as np
import joblib
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

OUT_DIR = Path("dreaddit_cv_raw_splits")
TRAIN_IN = OUT_DIR / "train_raw_with_clean_text.csv"
TEST_IN  = OUT_DIR / "test_frozen_raw_with_clean_text.csv"
LEX_DIR  = OUT_DIR / "lexical"
LEX_DIR.mkdir(exist_ok=True, parents=True)

# Load
train_df = pd.read_csv(TRAIN_IN)
test_df  = pd.read_csv(TEST_IN)

print("Loaded train/test with clean_text:")
print(" train:", train_df.shape)
print(" test :", test_df.shape)

# Identify lexical columns
liwc_cols = [c for c in train_df.columns if c.startswith("lex_liwc_")]
dal_cols  = [c for c in train_df.columns if c.startswith("lex_dal_")]
syntax_cols = [c for c in train_df.columns if c.startswith("syntax_") or c in ("syntax_ari","syntax_fk_grade")]
social_cols = [c for c in train_df.columns if c.startswith("social_")]
extra = [c for c in ("sentiment","token_len","char_len") if c in train_df.columns]

lexical_cols = liwc_cols + dal_cols + syntax_cols + social_cols + extra
# dedupe while preserving order
seen = set(); lexical_cols = [x for x in lexical_cols if not (x in seen or seen.add(x))]

print("\nLexical columns detected:", len(lexical_cols))
print("Sample (first 40):", lexical_cols[:40])

# Build matrices (keep orig_index for traceability)
train_lex = train_df[lexical_cols].copy()
test_lex  = test_df[lexical_cols].copy()

# Missingness report
missing_train = train_lex.isna().sum().sort_values(ascending=False)
missing_test  = test_lex.isna().sum().sort_values(ascending=False)
print("\nMissing values in train (columns with missing > 0):")
print(missing_train[missing_train>0].to_string() if missing_train.sum()>0 else "No missing values in train lexical columns")
print("\nMissing values in test (columns with missing > 0):")
print(missing_test[missing_test>0].to_string() if missing_test.sum()>0 else "No missing values in test lexical columns")

# Imputer fit on train only
imputer = SimpleImputer(strategy='mean')
imputer.fit(train_lex)
train_lex_imp = pd.DataFrame(imputer.transform(train_lex), columns=lexical_cols)
test_lex_imp  = pd.DataFrame(imputer.transform(test_lex), columns=lexical_cols)

# Save imputer
joblib.dump(imputer, LEX_DIR / "lex_imputer.joblib")

# Scaler fit on train only
scaler = StandardScaler()
scaler.fit(train_lex_imp)
train_lex_scaled = pd.DataFrame(scaler.transform(train_lex_imp), columns=lexical_cols)
test_lex_scaled  = pd.DataFrame(scaler.transform(test_lex_imp), columns=lexical_cols)

# Save scaler and arrays
joblib.dump(scaler, LEX_DIR / "lex_scaler.joblib")
np.save(LEX_DIR / "X_train_lexical.npy", train_lex_scaled.values)
np.save(LEX_DIR / "X_test_lexical.npy", test_lex_scaled.values)

print("\nSaved lexical artifacts to:", LEX_DIR.resolve())
print("\nShapes after processing:")
print(" X_train_lexical:", train_lex_scaled.shape)
print(" X_test_lexical :", test_lex_scaled.shape)

# Per-column mean/std (should be ~0/1 on train)
means = train_lex_scaled.mean().round(6)
stds  = train_lex_scaled.std().round(6)
print("\nTrain lexical columns mean (first 10):")
print(means.head(10).to_string())
print("\nTrain lexical columns std (first 10):")
print(stds.head(10).to_string())

# Print sample rows
print("\nSample train_lex_scaled (first 3 rows, first 6 cols):")
print(np.round(train_lex_scaled.values[:3,:6], 6))
print("\nSample test_lex_scaled (first 3 rows, first 6 cols):")
print(np.round(test_lex_scaled.values[:3,:6], 6))

# Save column list
pd.Series(lexical_cols).to_csv(LEX_DIR / "lexical_columns_list.csv", index=False)
print("\nSaved lexical column list.")


Loaded train/test with clean_text:
 train: (572, 114)
 test : (143, 114)

Lexical columns detected: 111
Sample (first 40): ['lex_liwc_WC', 'lex_liwc_Analytic', 'lex_liwc_Clout', 'lex_liwc_Authentic', 'lex_liwc_Tone', 'lex_liwc_WPS', 'lex_liwc_Sixltr', 'lex_liwc_Dic', 'lex_liwc_function', 'lex_liwc_pronoun', 'lex_liwc_ppron', 'lex_liwc_i', 'lex_liwc_we', 'lex_liwc_you', 'lex_liwc_shehe', 'lex_liwc_they', 'lex_liwc_ipron', 'lex_liwc_article', 'lex_liwc_prep', 'lex_liwc_auxverb', 'lex_liwc_adverb', 'lex_liwc_conj', 'lex_liwc_negate', 'lex_liwc_verb', 'lex_liwc_adj', 'lex_liwc_compare', 'lex_liwc_interrog', 'lex_liwc_number', 'lex_liwc_quant', 'lex_liwc_affect', 'lex_liwc_posemo', 'lex_liwc_negemo', 'lex_liwc_anx', 'lex_liwc_anger', 'lex_liwc_sad', 'lex_liwc_social', 'lex_liwc_family', 'lex_liwc_friend', 'lex_liwc_female', 'lex_liwc_male']

Missing values in train (columns with missing > 0):
No missing values in train lexical columns

Missing values in test (columns with missing > 0):
No m

In [7]:
# Cell — Feature fusion: concatenate SVD TF-IDF (200) + lexical dense (111) -> fused dense features
from pathlib import Path
import numpy as np
import pandas as pd
import os
import sys

OUT_DIR = Path("dreaddit_cv_raw_splits")
SVD_DIR = OUT_DIR / "svd"
LEX_DIR = OUT_DIR / "lexical"
FUSED_DIR = OUT_DIR / "fused"
FUSED_DIR.mkdir(exist_ok=True, parents=True)

# load arrays
X_train_svd = np.load(SVD_DIR / "X_train_svd_200.npy")
X_test_svd  = np.load(SVD_DIR / "X_test_svd_200.npy")
X_train_lex = np.load(LEX_DIR / "X_train_lexical.npy")
X_test_lex  = np.load(LEX_DIR / "X_test_lexical.npy")

print("Loaded arrays:")
print(" X_train_svd:", X_train_svd.shape)
print(" X_test_svd :", X_test_svd.shape)
print(" X_train_lex:", X_train_lex.shape)
print(" X_test_lex :", X_test_lex.shape)

# sanity check: number of rows must match
assert X_train_svd.shape[0] == X_train_lex.shape[0], "Train row mismatch between SVD and lexical"
assert X_test_svd.shape[0]  == X_test_lex.shape[0],  "Test row mismatch between SVD and lexical"

# Concatenate horizontally
X_train_fused = np.hstack([X_train_svd, X_train_lex])
X_test_fused  = np.hstack([X_test_svd, X_test_lex])

# Save fused arrays
np.save(FUSED_DIR / "X_train_fused.npy", X_train_fused)
np.save(FUSED_DIR / "X_test_fused.npy", X_test_fused)

# Save manifest
manifest = {
    "X_train_svd_shape": X_train_svd.shape,
    "X_train_lex_shape": X_train_lex.shape,
    "X_train_fused_shape": X_train_fused.shape,
    "X_test_svd_shape": X_test_svd.shape,
    "X_test_lex_shape": X_test_lex.shape,
    "X_test_fused_shape": X_test_fused.shape,
    "svd_components": X_train_svd.shape[1],
    "lexical_features": X_train_lex.shape[1],
    "fused_features": X_train_fused.shape[1]
}
pd.Series(manifest).to_csv(FUSED_DIR / "fused_manifest.csv", index=True)

# Print info
def approx_bytes(arr):
    return arr.nbytes

print("\nFused arrays saved to:", FUSED_DIR.resolve())
print("\nFused shapes:")
print(" X_train_fused:", X_train_fused.shape)
print(" X_test_fused :", X_test_fused.shape)
print("\nDtypes:", X_train_fused.dtype, X_test_fused.dtype)
print("\nApprox memory (bytes):")
print(" train:", approx_bytes(X_train_fused))
print(" test :", approx_bytes(X_test_fused))

# Print small numeric preview
print("\nSample X_train_fused first 3 rows (first 8 dims):")
print(np.round(X_train_fused[:3,:8], 6))
print("\nSample X_test_fused first 3 rows (first 8 dims):")
print(np.round(X_test_fused[:3,:8], 6))

# Show fused features count
print("\nFeature counts -> SVD:", X_train_svd.shape[1], "LEX:", X_train_lex.shape[1], "FUSED:", X_train_fused.shape[1])


Loaded arrays:
 X_train_svd: (572, 200)
 X_test_svd : (143, 200)
 X_train_lex: (572, 111)
 X_test_lex : (143, 111)

Fused arrays saved to: C:\Users\AYUSH SINGH\Documents\GitHub\NervSightX\Machine learning\dreaddit_cv_raw_splits\fused

Fused shapes:
 X_train_fused: (572, 311)
 X_test_fused : (143, 311)

Dtypes: float64 float64

Approx memory (bytes):
 train: 1423136
 test : 355784

Sample X_train_fused first 3 rows (first 8 dims):
[[ 0.292626 -0.084161 -0.039586 -0.070352 -0.002818 -0.006795 -0.015907
   0.042966]
 [ 0.206315 -0.039224  0.032789 -0.042089  0.031427 -0.098834 -0.082695
  -0.057838]
 [ 0.261952  0.106224  0.036714 -0.003368 -0.225875 -0.025593  0.047436
  -0.089772]]

Sample X_test_fused first 3 rows (first 8 dims):
[[ 2.87179e-01 -1.06008e-01 -1.25240e-02  2.92000e-04 -4.82160e-02
   5.52900e-03  7.96800e-03 -5.59310e-02]
 [ 3.00614e-01 -5.67200e-03 -6.10030e-02  6.16830e-02 -4.11990e-02
   4.87370e-02  2.52660e-02 -3.05440e-02]
 [ 2.76051e-01 -1.27460e-01 -4.00740e-02 -

In [8]:
# Cell — L1 Logistic Regression Feature Selection on fused features
from pathlib import Path
import numpy as np
import pandas as pd
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

OUT_DIR = Path("dreaddit_cv_raw_splits")
FUSED_DIR = OUT_DIR / "fused"
SEL_DIR = OUT_DIR / "selected_features"
SEL_DIR.mkdir(exist_ok=True, parents=True)

# Load fused features
X_train_fused = np.load(FUSED_DIR / "X_train_fused.npy")
X_test_fused  = np.load(FUSED_DIR / "X_test_fused.npy")

print("Loaded fused:")
print(" X_train_fused:", X_train_fused.shape)
print(" X_test_fused :", X_test_fused.shape)

# Load labels
train_df = pd.read_csv(OUT_DIR / "train_raw_with_clean_text.csv")
y_train = train_df["label"].values

print("Loaded labels:", y_train.shape)

# Fit L1-regularized Logistic Regression (Linear Model)
# Smaller C => stronger regularization => more feature selection
clf = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    C=0.1,
    max_iter=2000,
    class_weight="balanced"
)

print("\nFitting L1 Logistic Regression on fused features...")
clf.fit(X_train_fused, y_train)
print("Done fitting.")

# Select features with non-zero coefficients
selector = SelectFromModel(clf, prefit=True, threshold="mean")

selected_mask = selector.get_support()
X_train_sel = selector.transform(X_train_fused)
X_test_sel  = selector.transform(X_test_fused)

# Save artifacts
np.save(SEL_DIR / "X_train_fused_selected.npy", X_train_sel)
np.save(SEL_DIR / "X_test_fused_selected.npy", X_test_sel)
np.save(SEL_DIR / "selected_mask.npy", selected_mask)

joblib.dump(selector, SEL_DIR / "selector_L1.joblib")
joblib.dump(clf, SEL_DIR / "L1_logistic_model.joblib")

print("\nSaved selected feature arrays and model artifacts to:", SEL_DIR.resolve())

# Report selected features
n_total = X_train_fused.shape[1]
n_selected = X_train_sel.shape[1]
print(f"\nFeature selection complete:")
print(f"  Total features: {n_total}")
print(f"  Selected features: {n_selected}")
print(f"  Reduction: {n_total - n_selected}")

selected_indices = np.where(selected_mask)[0]
print("\nFirst 20 selected feature indices:", selected_indices[:20])

# Preview numeric values
print("\nPreview of X_train_selected (first 3 rows, first 8 dims):")
print(np.round(X_train_sel[:3, :8], 6))

print("\nPreview of X_test_selected (first 3 rows, first 8 dims):")
print(np.round(X_test_sel[:3, :8], 6))


Loaded fused:
 X_train_fused: (572, 311)
 X_test_fused : (143, 311)
Loaded labels: (572,)

Fitting L1 Logistic Regression on fused features...
Done fitting.

Saved selected feature arrays and model artifacts to: C:\Users\AYUSH SINGH\Documents\GitHub\NervSightX\Machine learning\dreaddit_cv_raw_splits\selected_features

Feature selection complete:
  Total features: 311
  Selected features: 34
  Reduction: 277

First 20 selected feature indices: [202 204 211 213 216 221 222 227 230 231 232 236 239 241 243 246 248 250
 251 261]

Preview of X_train_selected (first 3 rows, first 8 dims):
[[-0.626649 -0.902659 -0.810858 -0.385364  0.347508  0.31842  -0.478641
   0.763188]
 [-0.111885  0.833614  1.661692 -0.385364 -0.694275 -1.391037 -1.220262
   0.249805]
 [ 0.534795 -0.874243  0.443465 -0.385364  1.171244  1.266168 -0.489013
  -0.690551]]

Preview of X_test_selected (first 3 rows, first 8 dims):
[[-0.684383  1.826433  0.330666  0.407265  0.514072 -1.068101 -0.421593
  -0.690551]
 [-0.646324 

In [12]:
# Cell — Prepare per-fold selected feature datasets (class_weight strategy)
from pathlib import Path
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

ROOT = Path("dreaddit_cv_raw_splits")
FOLDS_RAW = ROOT / "folds_raw"
TFIDF_DIR = ROOT / "tfidf"
SVD_DIR   = ROOT / "svd"
LEX_DIR   = ROOT / "lexical"
SEL_DIR   = ROOT / "selected_features"
TRAIN_WITH_CLEAN = ROOT / "train_raw_with_clean_text.csv"

OUT_DIR = ROOT / "folds_selected"
OUT_DIR.mkdir(exist_ok=True, parents=True)

# artifacts (must exist)
tfidf_path = TFIDF_DIR / "tfidf_vectorizer.joblib"
svd_path   = SVD_DIR / "tfidf_svd_200.joblib"
imputer_path = LEX_DIR / "lex_imputer.joblib"
scaler_path  = LEX_DIR / "lex_scaler.joblib"
selector_path = SEL_DIR / "selector_L1.joblib"
lex_cols_path = LEX_DIR / "lexical_columns_list.csv"

for p in (tfidf_path, svd_path, imputer_path, scaler_path, selector_path, lex_cols_path, TRAIN_WITH_CLEAN):
    if not p.exists():
        raise FileNotFoundError(f"Missing required artifact: {p}")

# load artifacts
tfidf = joblib.load(tfidf_path)
svd   = joblib.load(svd_path)
imputer = joblib.load(imputer_path)
scaler  = joblib.load(scaler_path)
selector = joblib.load(selector_path)
lex_cols = pd.read_csv(lex_cols_path, header=None)[0].tolist()

# load master train_with_clean (to get clean_text and lexical columns)
train_with_clean = pd.read_csv(TRAIN_WITH_CLEAN)
train_with_clean_indexed = train_with_clean.set_index("orig_index")
print("Loaded train_with_clean:", train_with_clean.shape)

# list fold files
fold_files = sorted([p for p in (FOLDS_RAW).glob("fold_*_train_raw.csv")])
if not fold_files:
    raise FileNotFoundError(f"No fold_train files found in {FOLDS_RAW}")

manifest = []

for train_file in fold_files:
    # infer fold number
    name = train_file.stem  # e.g., fold_01_train_raw
    fold_no = name.split("_")[1]
    val_file = FOLDS_RAW / f"fold_{fold_no}_val_raw.csv"
    if not val_file.exists():
        raise FileNotFoundError(f"Expected val file for fold {fold_no} at {val_file}")

    # load fold train & val raw (these include orig_index and label)
    df_tr = pd.read_csv(train_file)
    df_val = pd.read_csv(val_file)

    # fetch clean_text and lexical columns from master train_with_clean using orig_index
    # ensure orig_index exists
    if 'orig_index' not in df_tr.columns or 'orig_index' not in df_val.columns:
        raise KeyError("fold files must include 'orig_index' column")

    # helper to build fused features for a subset
    def build_fused_selected(sub_df):
        # sub_df has orig_index, label and numeric columns
        # get clean_text rows by orig_index from master
        idx = sub_df['orig_index'].values
        clean_texts = train_with_clean_indexed.loc[idx, 'clean_text'].astype(str).values

        # TF-IDF transform -> SVD transform
        X_tfidf = tfidf.transform(clean_texts)
        X_svd = svd.transform(X_tfidf)   # dense (n_rows, n_svd)

        # lexical matrix: pick lex_cols from sub_df (ensure columns present)
        missing_lex = [c for c in lex_cols if c not in sub_df.columns]
        if missing_lex:
            raise KeyError(f"Missing lexical columns in fold file: {missing_lex[:5]}... (total {len(missing_lex)})")
        lex_mat = sub_df[lex_cols].values

        # impute then scale using saved imputer/scaler (they were fitted on global train)
        lex_imp = imputer.transform(lex_mat)
        lex_scaled = scaler.transform(lex_imp)

        # fuse
        fused = np.hstack([X_svd, lex_scaled])

        # apply selector (SelectFromModel) to get selected dims
        fused_selected = selector.transform(fused)

        return fused_selected

    X_tr_sel = build_fused_selected(df_tr)
    X_val_sel = build_fused_selected(df_val)

    # Save arrays + CSVs with metadata (orig_index + label + selected feature columns)
    fold_out_dir = OUT_DIR / f"fold_{fold_no}"
    fold_out_dir.mkdir(parents=True, exist_ok=True)

    np.save(fold_out_dir / f"fold_{fold_no}_train_selected.npy", X_tr_sel)
    np.save(fold_out_dir / f"fold_{fold_no}_val_selected.npy", X_val_sel)

    # Save CSVs with orig_index and label plus selected features (column names x0,x1,...)
    sel_cols = [f"x{i}" for i in range(X_tr_sel.shape[1])]
    train_csv = pd.concat([df_tr[['orig_index','label']].reset_index(drop=True),
                           pd.DataFrame(X_tr_sel, columns=sel_cols)], axis=1)
    val_csv   = pd.concat([df_val[['orig_index','label']].reset_index(drop=True),
                           pd.DataFrame(X_val_sel, columns=sel_cols)], axis=1)

    train_csv.to_csv(fold_out_dir / f"fold_{fold_no}_train_selected.csv", index=False)
    val_csv.to_csv(fold_out_dir / f"fold_{fold_no}_val_selected.csv", index=False)

    manifest.append({
        "fold": fold_no,
        "train_rows": X_tr_sel.shape[0],
        "val_rows": X_val_sel.shape[0],
        "train_selected_path": str(fold_out_dir / f"fold_{fold_no}_train_selected.npy"),
        "val_selected_path": str(fold_out_dir / f"fold_{fold_no}_val_selected.npy"),
        "train_csv": str(fold_out_dir / f"fold_{fold_no}_train_selected.csv"),
        "val_csv": str(fold_out_dir / f"fold_{fold_no}_val_selected.csv")
    })

    print(f"Fold {fold_no}: saved selected train ({X_tr_sel.shape}) and val ({X_val_sel.shape})")

# Save manifest
manifest_df = pd.DataFrame(manifest)
manifest_df.to_csv(OUT_DIR / "folds_selected_manifest.csv", index=False)
print("\nSaved folds_selected_manifest ->", (OUT_DIR / "folds_selected_manifest.csv").resolve())

# Save an unfitted pipeline stub with class_weight='balanced' logistic regression
IMB_DIR = ROOT / "imbalance"
IMB_DIR.mkdir(exist_ok=True, parents=True)
pipe = Pipeline([("clf", LogisticRegression(class_weight='balanced'Y, solver='liblinear', max_iter=2000))])
joblib.dump(pipe, IMB_DIR / "pipeline_class_weight_balanced_unfitted.joblib")
print("Saved unfitted pipeline with class_weight='balanced' ->", (IMB_DIR / "pipeline_class_weight_balanced_unfitted.joblib").resolve())

# Print summary
print("\nSummary of folds prepared:")
print(manifest_df.to_string(index=False))

print("\nDone. You now have per-fold selected datasets (unfitted). The saved pipeline is configured to use class_weight='balanced' and is not fitted.")


SyntaxError: invalid syntax. Perhaps you forgot a comma? (1830655867.py, line 139)

In [18]:
# Fixed robust fold-selection cell (no nonlocal, returns missing_count)
from pathlib import Path
import pandas as pd
import numpy as np
import joblib

ROOT = Path("dreaddit_cv_raw_splits")
FOLDS_RAW = ROOT / "folds_raw"
TFIDF_DIR = ROOT / "tfidf"
SVD_DIR   = ROOT / "svd"
LEX_DIR   = ROOT / "lexical"
SEL_DIR   = ROOT / "selected_features"
TRAIN_WITH_CLEAN = ROOT / "train_raw_with_clean_text.csv"
OUT_DIR = ROOT / "folds_selected"
OUT_DIR.mkdir(exist_ok=True, parents=True)

# artifact paths
tfidf_path = TFIDF_DIR / "tfidf_vectorizer.joblib"
svd_path   = SVD_DIR / "tfidf_svd_200.joblib"
imputer_path = LEX_DIR / "lex_imputer.joblib"
scaler_path  = LEX_DIR / "lex_scaler.joblib"
selector_path = SEL_DIR / "selector_L1.joblib"
lex_cols_path = LEX_DIR / "lexical_columns_list.csv"

for p in (tfidf_path, svd_path, imputer_path, scaler_path, selector_path, lex_cols_path, TRAIN_WITH_CLEAN):
    if not p.exists():
        raise FileNotFoundError(f"Missing required artifact: {p}")

# load artifacts
tfidf = joblib.load(tfidf_path)
svd   = joblib.load(svd_path)
imputer = joblib.load(imputer_path)
scaler  = joblib.load(scaler_path)
selector = joblib.load(selector_path)
lex_cols = pd.read_csv(lex_cols_path, header=None)[0].tolist()

# get imputer expected feature count
try:
    imp_n_features = int(imputer.statistics_.shape[0])
except Exception:
    imp_n_features = len(lex_cols)

print("Imputer expects features:", imp_n_features)
print("Len(lex_cols) from lexical_columns_list.csv:", len(lex_cols))

# load master
train_master = pd.read_csv(TRAIN_WITH_CLEAN)
train_master['orig_index'] = train_master['orig_index'].astype(int)
train_master_indexed = train_master.set_index('orig_index')
print("Loaded train_master shape:", train_master.shape)
print("Master lexical columns available:", len([c for c in train_master.columns if c in lex_cols]), "of", len(lex_cols))

# align lex_cols to imputer expectation
aligned_lex_cols = list(lex_cols)
if len(aligned_lex_cols) > imp_n_features:
    aligned_lex_cols = aligned_lex_cols[:imp_n_features]
    print(f"Trimming lex_cols -> {len(aligned_lex_cols)}")
elif len(aligned_lex_cols) < imp_n_features:
    pad_count = imp_n_features - len(aligned_lex_cols)
    pad_names = [f"__pad_lex_{i}" for i in range(pad_count)]
    aligned_lex_cols += pad_names
    print(f"Padding lex_cols with {pad_count} zero-columns -> total {len(aligned_lex_cols)}")

# prepare fold files
fold_train_files = sorted([p for p in FOLDS_RAW.glob("fold_*_train_raw.csv")])
if not fold_train_files:
    raise FileNotFoundError(f"No fold_train files found in {FOLDS_RAW}")

manifest = []
any_missing = []

def build_fused_selected_from_master(sub_df, fold_no):
    # returns fused_selected array and missing_count
    # get orig_index as ints where possible
    try:
        idx = sub_df['orig_index'].astype(int).values
    except Exception:
        idx = sub_df['orig_index'].values

    # lookup rows in master (order preserved)
    master_rows = train_master_indexed.reindex(idx)

    # handle missing clean_text / lexical gracefully
    if 'clean_text' in master_rows.columns:
        missing_mask = master_rows['clean_text'].isna()
        missing_count = int(missing_mask.sum())
        if missing_count > 0:
            any_missing.append((fold_no, missing_count))
            master_rows['clean_text'] = master_rows['clean_text'].fillna('')
    else:
        master_rows['clean_text'] = ''
        missing_count = 0

    clean_texts = master_rows['clean_text'].astype(str).values

    # TF-IDF -> SVD
    X_tfidf = tfidf.transform(clean_texts)
    X_svd = svd.transform(X_tfidf)

    # build aligned lexical matrix matching imp_n_features
    n_rows = len(master_rows)
    lex_mat = np.zeros((n_rows, imp_n_features), dtype=float)
    for j, col in enumerate(aligned_lex_cols):
        if col in master_rows.columns:
            lex_mat[:, j] = pd.to_numeric(master_rows[col].fillna(0), errors='coerce').fillna(0).astype(float).values
        else:
            lex_mat[:, j] = 0.0

    # impute & scale
    lex_imp = imputer.transform(lex_mat)
    lex_scaled = scaler.transform(lex_imp)

    # fuse & select
    fused = np.hstack([X_svd, lex_scaled])
    fused_selected = selector.transform(fused)
    return fused_selected, missing_count

for train_file in fold_train_files:
    name = train_file.stem
    fold_no = name.split("_")[1]
    val_file = FOLDS_RAW / f"fold_{fold_no}_val_raw.csv"
    if not val_file.exists():
        raise FileNotFoundError(f"Missing validation file for fold {fold_no}: {val_file}")

    df_tr = pd.read_csv(train_file)
    df_val = pd.read_csv(val_file)
    if 'orig_index' not in df_tr.columns or 'orig_index' not in df_val.columns:
        raise KeyError("fold files must include 'orig_index' column")

    X_tr_sel, miss_tr = build_fused_selected_from_master(df_tr, fold_no)
    X_val_sel, miss_val = build_fused_selected_from_master(df_val, fold_no)

    # save outputs
    fold_out = OUT_DIR / f"fold_{fold_no}"
    fold_out.mkdir(exist_ok=True, parents=True)

    np.save(fold_out / f"fold_{fold_no}_train_selected.npy", X_tr_sel)
    np.save(fold_out / f"fold_{fold_no}_val_selected.npy", X_val_sel)

    sel_cols = [f"x{i}" for i in range(X_tr_sel.shape[1])]
    train_csv = pd.concat([df_tr[['orig_index','label']].reset_index(drop=True),
                           pd.DataFrame(X_tr_sel, columns=sel_cols)], axis=1)
    val_csv   = pd.concat([df_val[['orig_index','label']].reset_index(drop=True),
                           pd.DataFrame(X_val_sel, columns=sel_cols)], axis=1)

    train_csv.to_csv(fold_out / f"fold_{fold_no}_train_selected.csv", index=False)
    val_csv.to_csv(fold_out / f"fold_{fold_no}_val_selected.csv", index=False)

    manifest.append({
        "fold": fold_no,
        "train_rows": X_tr_sel.shape[0],
        "val_rows": X_val_sel.shape[0],
        "selected_features": X_tr_sel.shape[1],
        "train_csv": str(fold_out / f"fold_{fold_no}_train_selected.csv"),
        "val_csv": str(fold_out / f"fold_{fold_no}_val_selected.csv")
    })

    print(f"Fold {fold_no} -> saved selected: train {X_tr_sel.shape}, val {X_val_sel.shape} (missing_rows train={miss_tr}, val={miss_val})")

# save manifest
manifest_df = pd.DataFrame(manifest)
manifest_df.to_csv(OUT_DIR / "folds_selected_manifest.csv", index=False)

print("\nSaved folds_selected_manifest ->", (OUT_DIR / "folds_selected_manifest.csv").resolve())
if any_missing:
    print("Warning: some folds had missing orig_index rows in the master. Details (fold_no, missing_count):")
    print(any_missing)
print("Done. Per-fold selected datasets recreated.")


Imputer expects features: 111
Len(lex_cols) from lexical_columns_list.csv: 112
Loaded train_master shape: (572, 114)
Master lexical columns available: 111 of 112
Trimming lex_cols -> 111




Fold 01 -> saved selected: train (457, 34), val (115, 34) (missing_rows train=0, val=0)
Fold 02 -> saved selected: train (457, 34), val (115, 34) (missing_rows train=0, val=0)




Fold 03 -> saved selected: train (458, 34), val (114, 34) (missing_rows train=0, val=0)
Fold 04 -> saved selected: train (458, 34), val (114, 34) (missing_rows train=0, val=0)
Fold 05 -> saved selected: train (458, 34), val (114, 34) (missing_rows train=0, val=0)

Saved folds_selected_manifest -> C:\Users\AYUSH SINGH\Documents\GitHub\NervSightX\Machine learning\dreaddit_cv_raw_splits\folds_selected\folds_selected_manifest.csv
Done. Per-fold selected datasets recreated.


