# In [None]:  (Jupyter notebook cell marker left over from export)

import os
import re
import json
import math
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder

# TensorFlow is an optional dependency (needed for CNN/LSTM and the contrastive model).
# TF_AVAILABLE records whether the import worked so those models can be skipped with a note.
try:
    import tensorflow as tf
    from tensorflow.keras import layers, models, callbacks, optimizers
except Exception:
    TF_AVAILABLE = False
else:
    TF_AVAILABLE = True

# ---------------------- Configuration ----------------------
# CSV files to benchmark; main() passes each path to process_dataset() in this order.
DATASETS = [
    "/mnt/data/Instagram Dataset 4322.csv",
    "/mnt/data/2022 temple_Mosque_624.csv",
    "/mnt/data/2021 Cast Full Data_442.csv",
    "/mnt/data/2021 Indian Politics Full Data_1205.csv",
    "/mnt/data/2021 Religious Conflicts Full Data_537.csv",
    "/mnt/data/2022 Hinduphobia_306.csv",
    "/mnt/data/2022 Historical_hindu_Mu_299.csv",
    "/mnt/data/2022 Islamophobia_284.csv",
    "/mnt/data/2022 Namaz_Public_52.csv",
    "/mnt/data/Youtube comment datset 1295.csv",
]

# Output directory for the confusion-matrix CSVs and the summary/skipped CSVs.
# NOTE: it is created at import time (module-level side effect).
RESULTS_DIR = "/mnt/data/hate_speech_results_v2"
os.makedirs(RESULTS_DIR, exist_ok=True)

# Seed passed to train_test_split and to the seedable sklearn estimators;
# it also seeds NumPy's global RNG here, at import time.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# If you know exact columns, set them here; otherwise auto-detection will be used.
# Keys are file base names (os.path.basename of the dataset path); each value is a
# dict with "text" and "label" entries naming the columns to use.
COLUMN_MAP = {
    # "2022 temple_Mosque_624.csv": {"text": "comment_text", "label": "label"},
    # "2021 Cast Full Data_442.csv": {"text": "text", "label": "is_hate"},
}

# Vectorizer params for ML models
TFIDF_PARAMS_ML = dict(
    ngram_range=(1, 2),
    min_df=2,
    max_features=50000
)

# Smaller feature space for neural nets to keep memory reasonable
# (the neural and contrastive runners convert the TF-IDF matrix to a dense array).
TFIDF_PARAMS_NN = dict(
    ngram_range=(1, 2),
    min_df=2,
    max_features=5000
)

# Substrings looked for in normalized column names when auto-detecting the text
# column; earlier keywords take priority over later ones.
TEXT_CANDIDATE_KEYWORDS = [
    "text", "comment", "content", "tweet", "message", "post", "body", "caption", "clean"
]
# Same idea for the label column; earlier keywords take priority over later ones.
LABEL_CANDIDATE_KEYWORDS = [
    "label", "labels", "target", "class", "category", "hate", "is_hate", "toxic", "tag"
]

# ---------------------- Helpers ----------------------
def pick_text_and_label_columns(df: pd.DataFrame):
    """Heuristically pick the text column and the label column of ``df``.

    Text column: the first textual column whose normalized name contains one
    of ``TEXT_CANDIDATE_KEYWORDS`` (keywords are tried in list order); if no
    name matches, the textual column with the longest average string length.

    Label column: the first column (other than the text column) whose
    normalized name contains one of ``LABEL_CANDIDATE_KEYWORDS``; if no name
    matches, a column with between 2 and 6 distinct non-null values,
    preferring non-textual columns and otherwise the one with the fewest
    distinct values.

    Returns:
        ``(text_col, label_col)``; either element is ``None`` when nothing
        suitable was found.
    """
    def _is_textual(col):
        # Accept classic object columns as well as pandas' dedicated string
        # dtypes, which do not compare equal to ``object``.
        dtype = df[col].dtype
        return pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)

    # Normalized name -> original name. Names are lower-cased and every run of
    # non-alphanumerics becomes "_" so keyword matching ignores case/punctuation.
    cols_norm = {c: re.sub(r'[^a-z0-9]+', '_', str(c).strip().lower()) for c in df.columns}
    inv = {v: k for k, v in cols_norm.items()}

    # --- text column ---
    text_col = next(
        (
            original
            for key in TEXT_CANDIDATE_KEYWORDS
            for norm, original in inv.items()
            if key in norm and _is_textual(original)
        ),
        None,
    )
    if text_col is None:
        textual_cols = [c for c in df.columns if _is_textual(c)]
        if textual_cols:
            text_col = max(
                textual_cols,
                key=lambda c: df[c].fillna("").astype(str).str.len().mean(),
            )

    # --- label column ---
    label_col = next(
        (
            original
            for key in LABEL_CANDIDATE_KEYWORDS
            for norm, original in inv.items()
            if key in norm and original != text_col
        ),
        None,
    )
    if label_col is None:
        candidates = []
        for c in df.columns:
            if c == text_col:
                continue
            nunq = df[c].nunique(dropna=True)
            # A usable label needs at least two classes; constant or all-null
            # columns would only lead to an "only one class" skip later on.
            if 2 <= nunq <= 6:
                candidates.append((c, nunq))
        if candidates:
            non_textual = [c for c, _ in candidates if not _is_textual(c)]
            if non_textual:
                label_col = non_textual[0]
            else:
                label_col = min(candidates, key=lambda item: item[1])[0]
    return text_col, label_col

def clean_text(s: pd.Series) -> pd.Series:
    """Return ``s`` as strings with missing values blanked and whitespace tidied.

    Missing entries become ``""``; every run of whitespace is collapsed to a
    single space and leading/trailing whitespace is removed.
    """
    as_strings = s.fillna("").astype(str)
    collapsed = as_strings.str.replace(r"\s+", " ", regex=True)
    return collapsed.str.strip()

def normalize_binary_labels(y: pd.Series) -> pd.Series:
    """Map common hate/non-hate label spellings to 1/0.

    Missing values become NaN. A value is looked up as-is first, then as a
    stripped, lower-cased string. Values that match neither way are returned
    unchanged.
    """
    positives = ("hate", "hatespeech", "toxic", "offensive", "abusive", "hs", "yes", "y", "1", 1, True)
    negatives = ("non-hate", "not_hate", "nonhate", "clean", "normal", "none", "no", "n", "0", 0, False)
    lookup = {**dict.fromkeys(positives, 1), **dict.fromkeys(negatives, 0)}

    def _to_binary(value):
        if pd.isna(value):
            return np.nan
        if value in lookup:
            return lookup[value]
        # Fall back to a case/whitespace-insensitive match; keep the raw value otherwise.
        return lookup.get(str(value).strip().lower(), value)

    return y.map(_to_binary)

def compute_metrics(y_true, y_pred, labels_sorted):
    """Compute accuracy, macro precision/recall/F1 and the confusion matrix.

    Returns:
        ``(accuracy, precision_macro, recall_macro, f1_macro, confusion_matrix)``.
        The confusion matrix rows/columns follow ``labels_sorted``; undefined
        precision/recall values are reported as 0 (``zero_division=0``).
    """
    macro_prec, macro_rec, macro_f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average="macro", zero_division=0
    )
    return (
        accuracy_score(y_true, y_pred),
        macro_prec,
        macro_rec,
        macro_f1,
        confusion_matrix(y_true, y_pred, labels=labels_sorted),
    )

def save_cm(cm, labels_sorted, path):
    """Write confusion matrix ``cm`` to ``path`` as CSV.

    Rows are named ``true_<label>`` and columns ``pred_<label>``, in the order
    given by ``labels_sorted``.
    """
    row_names = [f"true_{l}" for l in labels_sorted]
    col_names = [f"pred_{l}" for l in labels_sorted]
    pd.DataFrame(cm, index=row_names, columns=col_names).to_csv(path)

# ---------------------- Classic ML ----------------------
def run_classic_ml(X_text, y, dataset_name):
    """Train and evaluate the classic TF-IDF classifiers on one dataset.

    The data is split 80/20 (stratified when more than one class is present),
    vectorized with ``TFIDF_PARAMS_ML`` (fitted on the training split only),
    and each of SVM, LR, NB, KNN and DT is fitted and scored. A confusion
    matrix CSV is written per model under ``RESULTS_DIR``.

    Returns:
        One result-row dict per model. If a model fails, its row carries NaN
        metrics, an empty confusion-matrix path and the exception text under
        ``"error"``.
    """
    stratify_on = y if pd.Series(y).nunique()>1 else None
    X_train_txt, X_test_txt, y_train, y_test = train_test_split(
        X_text, y, test_size=0.2, random_state=RANDOM_SEED, stratify=stratify_on
    )
    tfidf = TfidfVectorizer(**TFIDF_PARAMS_ML)
    X_train = tfidf.fit_transform(X_train_txt)
    X_test = tfidf.transform(X_test_txt)

    classifiers = [
        ("SVM", LinearSVC(class_weight="balanced", random_state=RANDOM_SEED)),
        ("LR", LogisticRegression(max_iter=2000, class_weight="balanced", solver="liblinear")),
        ("NB", MultinomialNB()),
        ("KNN", KNeighborsClassifier(n_neighbors=5)),
        ("DT", DecisionTreeClassifier(class_weight="balanced", random_state=RANDOM_SEED)),
    ]
    labels_sorted = sorted(pd.unique(y_test))

    results = []
    for model_name, estimator in classifiers:
        # Fields shared by the success row and the failure row.
        row = {
            "dataset": dataset_name, "model": model_name, "kind": "classic_ml",
            "n_train": len(y_train), "n_test": len(y_test),
            "classes": ";".join(map(str, labels_sorted)),
        }
        try:
            estimator.fit(X_train, y_train)
            predictions = estimator.predict(X_test)
            acc, prec, rec, f1, cm = compute_metrics(y_test, predictions, labels_sorted)
            cm_path = os.path.join(
                RESULTS_DIR,
                f"{re.sub('[^A-Za-z0-9]+','_',dataset_name)}__{model_name}__confusion_matrix.csv",
            )
            save_cm(cm, labels_sorted, cm_path)
            row.update({
                "accuracy": round(acc,4), "precision_macro": round(prec,4),
                "recall_macro": round(rec,4), "f1_macro": round(f1,4),
                "confusion_matrix_path": cm_path,
            })
        except Exception as e:
            # Best effort: record the failure and carry on with the next model.
            row.update({
                "accuracy": np.nan, "precision_macro": np.nan,
                "recall_macro": np.nan, "f1_macro": np.nan,
                "confusion_matrix_path": "", "error": str(e),
            })
        results.append(row)
    return results

# ---------------------- CNN/LSTM with TF-IDF-as-sequence ----------------------
def build_cnn_model(seq_len, n_classes):
    """Build and compile a small 1-D CNN classifier.

    Args:
        seq_len: Length of the input sequence; inputs have shape ``(seq_len, 1)``.
        n_classes: Number of target classes.

    Returns:
        A compiled Keras model. With more than two classes the head is a
        ``n_classes``-way softmax trained with sparse categorical
        cross-entropy (integer targets). Otherwise the head is a single
        sigmoid unit trained with binary cross-entropy.
    """
    multiclass = n_classes > 2
    # Binary targets are one 0/1 value per sample, so the head must emit
    # exactly one probability per sample for binary_crossentropy to line up.
    out_units = n_classes if multiclass else 1
    out_activation = "softmax" if multiclass else "sigmoid"
    loss = "sparse_categorical_crossentropy" if multiclass else "binary_crossentropy"

    model = models.Sequential([
        layers.Input(shape=(seq_len, 1)),
        layers.Conv1D(64, kernel_size=5, padding="same"),
        layers.ReLU(),
        layers.MaxPooling1D(pool_size=2),
        layers.Conv1D(64, kernel_size=3, padding="same"),
        layers.ReLU(),
        layers.GlobalMaxPooling1D(),
        layers.Dense(64), layers.ReLU(),
        layers.Dense(out_units, activation=out_activation)
    ])
    model.compile(optimizer=optimizers.Adam(1e-3), loss=loss, metrics=["accuracy"])
    return model

def build_lstm_model(seq_len, n_classes):
    """Build and compile a single-layer LSTM classifier.

    Args:
        seq_len: Length of the input sequence; inputs have shape ``(seq_len, 1)``.
        n_classes: Number of target classes.

    Returns:
        A compiled Keras model. With more than two classes the head is a
        ``n_classes``-way softmax trained with sparse categorical
        cross-entropy (integer targets). Otherwise the head is a single
        sigmoid unit trained with binary cross-entropy.
    """
    multiclass = n_classes > 2
    # Binary targets are one 0/1 value per sample, so the head must emit
    # exactly one probability per sample for binary_crossentropy to line up.
    out_units = n_classes if multiclass else 1
    out_activation = "softmax" if multiclass else "sigmoid"
    loss = "sparse_categorical_crossentropy" if multiclass else "binary_crossentropy"

    model = models.Sequential([
        layers.Input(shape=(seq_len, 1)),
        layers.LSTM(64, return_sequences=False),
        layers.Dense(64), layers.ReLU(),
        layers.Dense(out_units, activation=out_activation)
    ])
    model.compile(optimizer=optimizers.Adam(1e-3), loss=loss, metrics=["accuracy"])
    return model

def run_nn_tfidf(X_text, y, dataset_name, arch="CNN", epochs=3, batch_size=64):
    """Train and evaluate a CNN or LSTM on TF-IDF vectors treated as sequences.

    Each document's dense TF-IDF vector (``TFIDF_PARAMS_NN``) is reshaped to
    ``(n_features, 1)`` and fed to the network as a 1-channel sequence.

    Args:
        X_text: Texts to classify.
        y: Labels aligned with ``X_text``.
        dataset_name: Name used in the result row and the output file name.
        arch: ``"CNN"`` (case-insensitive) selects the CNN; anything else the LSTM.
        epochs: Maximum number of training epochs.
        batch_size: Training batch size.

    Returns:
        A one-element list holding the result row. When TensorFlow is not
        available the row has NaN metrics and an ``"error"`` note instead.
    """
    if not TF_AVAILABLE:
        return [{
            "dataset": dataset_name, "model": arch, "kind": "neural_tfidf",
            "n_train": 0, "n_test": 0, "classes": "", "accuracy": np.nan,
            "precision_macro": np.nan, "recall_macro": np.nan, "f1_macro": np.nan,
            "confusion_matrix_path": "", "error": "TensorFlow not available"
        }]

    vec = TfidfVectorizer(**TFIDF_PARAMS_NN)
    X_train_txt, X_test_txt, y_train_raw, y_test_raw = train_test_split(
        X_text, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y if pd.Series(y).nunique()>1 else None
    )
    X_train = vec.fit_transform(X_train_txt).astype(np.float32)
    X_test  = vec.transform(X_test_txt).astype(np.float32)

    # Convert to dense and reshape to (n_samples, seq_len, 1).
    X_train = X_train.toarray()
    X_test  = X_test.toarray()
    seq_len = X_train.shape[1]
    X_train = X_train.reshape((-1, seq_len, 1))
    X_test  = X_test.reshape((-1, seq_len, 1))

    # Encode labels to 0..C-1. The encoder is fitted on train+test labels so
    # every class seen in either split has a code.
    le = LabelEncoder()
    y_all = pd.concat([pd.Series(y_train_raw), pd.Series(y_test_raw)], axis=0)
    y_all_enc = le.fit_transform(y_all)
    n_classes = len(le.classes_)
    y_train = y_all_enc[:len(y_train_raw)]
    y_test  = y_all_enc[len(y_train_raw):]

    es = callbacks.EarlyStopping(monitor="val_accuracy", patience=2, restore_best_weights=True)
    if arch.upper() == "CNN":
        model = build_cnn_model(seq_len, n_classes)
    else:
        model = build_lstm_model(seq_len, n_classes)
    model.fit(
        X_train, (y_train if n_classes>2 else y_train.astype(np.float32)),
        validation_split=0.1, epochs=epochs, batch_size=batch_size, verbose=0, callbacks=[es]
    )

    # Predictions
    probs = np.asarray(model.predict(X_test, verbose=0))
    if n_classes > 2:
        y_pred_enc = np.argmax(probs, axis=1)
    else:
        # Reduce to exactly one positive-class probability per sample. Blindly
        # flattening would yield more predictions than samples whenever the
        # head emits more than one sigmoid unit per sample; averaging over the
        # unit axis is a no-op for a single-unit head.
        pos_prob = probs.reshape(probs.shape[0], -1).mean(axis=1)
        y_pred_enc = (pos_prob >= 0.5).astype(int)

    # Map back to original label values for reporting.
    y_test_lbl = le.inverse_transform(y_test)
    y_pred_lbl = le.inverse_transform(y_pred_enc)

    labels_sorted = sorted(pd.unique(y_test_lbl))
    acc, prec, rec, f1, cm = compute_metrics(y_test_lbl, y_pred_lbl, labels_sorted)
    cm_path = os.path.join(RESULTS_DIR, f"{re.sub('[^A-Za-z0-9]+','_',dataset_name)}__{arch}__confusion_matrix.csv")
    save_cm(cm, labels_sorted, cm_path)

    return [{
        "dataset": dataset_name, "model": arch, "kind": "neural_tfidf",
        "n_train": len(y_train), "n_test": len(y_test),
        "classes": ";".join(map(str, labels_sorted)),
        "accuracy": round(acc,4), "precision_macro": round(prec,4),
        "recall_macro": round(rec,4), "f1_macro": round(f1,4),
        "confusion_matrix_path": cm_path
    }]

# ---------------------- Dual Contrastive Learning ----------------------
def stochastic_feature_dropout(X_dense, drop_prob=0.1):
    """Return a copy of ``X_dense`` with entries randomly zeroed.

    An entry is kept when a uniform draw from NumPy's global RNG exceeds
    ``drop_prob``, and set to zero otherwise.
    """
    keep = np.random.rand(*X_dense.shape) > drop_prob
    return X_dense * keep.astype(np.float32)

def supcon_loss_cosine(z1, z2, y, temperature=0.2):
    """
    Supervised contrastive loss (NumPy) over two views of the same batch.

    Both embedding sets are L2-normalized and stacked into 2B anchors. For
    each anchor, the positives are all other anchors with the same label. The
    loss is the mean over anchors of the negative average log-softmax
    probability of the positives, using cosine similarity / temperature.
    y are integer class labels, one per row of z1 (and z2).
    """
    def _unit_rows(m):
        return m / (np.linalg.norm(m, axis=1, keepdims=True) + 1e-8)

    embeddings = np.concatenate([_unit_rows(z1), _unit_rows(z2)], axis=0)  # [2B, d]
    labels = np.concatenate([y, y], axis=0)                                # [2B]

    # Temperature-scaled cosine similarities, with self-similarity pushed to
    # a hugely negative value so it drops out of the softmax.
    logits = embeddings @ embeddings.T  # [2B, 2B]
    np.fill_diagonal(logits, -1e9)
    logits /= temperature

    # 1.0 where two different anchors share a label.
    same_class = (labels.reshape(-1,1) == labels.reshape(1,-1)).astype(np.float32)
    np.fill_diagonal(same_class, 0.0)

    # Row-wise log-softmax, shifted by the row max for numerical stability.
    shifted = logits - logits.max(axis=1, keepdims=True)
    row_totals = np.exp(shifted).sum(axis=1, keepdims=True)
    log_prob = shifted - np.log(row_totals + 1e-12)

    # Average the log-probability over each anchor's positives.
    n_positives = same_class.sum(axis=1, keepdims=True) + 1e-8
    per_anchor = -(log_prob * same_class).sum(axis=1, keepdims=True) / n_positives
    return per_anchor.mean()

def _supcon_loss_tf(z1, z2, y, temperature=0.2):
    """Supervised contrastive loss over two views, written with TensorFlow ops.

    Uses the same formulation as the NumPy ``supcon_loss_cosine`` (cosine
    similarity / temperature, positives = other anchors with the same label),
    but stays inside the TF graph so ``tf.GradientTape`` can differentiate it
    with respect to the projection weights.
    """
    z = tf.concat(
        [tf.math.l2_normalize(z1, axis=1), tf.math.l2_normalize(z2, axis=1)], axis=0
    )  # [2B, d]
    labels = tf.concat([y, y], axis=0)  # [2B]

    sim = tf.matmul(z, z, transpose_b=True) / temperature  # [2B, 2B]
    is_self = tf.cast(tf.eye(tf.shape(z)[0]), tf.bool)
    # Exclude each anchor's similarity with itself from the softmax.
    sim = tf.where(is_self, tf.constant(-1e9, dtype=sim.dtype), sim)

    same_class = tf.equal(tf.expand_dims(labels, 1), tf.expand_dims(labels, 0))
    pos_mask = tf.cast(tf.logical_and(same_class, tf.logical_not(is_self)), sim.dtype)

    log_prob = sim - tf.reduce_logsumexp(sim, axis=1, keepdims=True)
    # Every anchor has at least one positive (its other view); the floor only
    # guards the division.
    pos_counts = tf.maximum(tf.reduce_sum(pos_mask, axis=1), 1.0)
    per_anchor = -tf.reduce_sum(log_prob * pos_mask, axis=1) / pos_counts
    return tf.reduce_mean(per_anchor)


def run_dual_contrastive(X_text, y, dataset_name, epochs=10, batch_size=128, proj_dim=128, lr=1e-3, drop_prob=0.1):
    """
    Two TF-IDF views via stochastic feature dropout.
    Train a small projection head with supervised contrastive loss.
    Then train LR on frozen embeddings and evaluate.

    Args:
        X_text: Texts to classify.
        y: Labels aligned with ``X_text``.
        dataset_name: Name used in the result row and the output file name.
        epochs: Number of passes over the training split for the projection head.
        batch_size: Mini-batch size for contrastive training.
        proj_dim: Dimension of the projected embedding.
        lr: Adam learning rate.
        drop_prob: Per-feature drop probability used to create each view.

    Returns:
        A one-element list holding the result row. When TensorFlow is not
        available the row has NaN metrics and an ``"error"`` note instead.
    """
    if not TF_AVAILABLE:
        return [{
            "dataset": dataset_name, "model": "DualContrastive", "kind": "contrastive",
            "n_train": 0, "n_test": 0, "classes": "", "accuracy": np.nan,
            "precision_macro": np.nan, "recall_macro": np.nan, "f1_macro": np.nan,
            "confusion_matrix_path": "", "error": "TensorFlow not available"
        }]

    vec = TfidfVectorizer(**TFIDF_PARAMS_NN)
    X_train_txt, X_test_txt, y_train, y_test = train_test_split(
        X_text, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y if pd.Series(y).nunique()>1 else None
    )
    X_train = vec.fit_transform(X_train_txt).astype(np.float32).toarray()
    X_test  = vec.transform(X_test_txt).astype(np.float32).toarray()

    # Integer class codes for the contrastive loss and the LR head.
    le = LabelEncoder()
    y_train_enc = le.fit_transform(y_train)

    # Projection head (simple MLP in TF)
    d_in = X_train.shape[1]
    inp = layers.Input(shape=(d_in,))
    x = layers.Dense(256, activation="relu")(inp)
    x = layers.Dense(proj_dim, activation=None)(x)
    proj_model = models.Model(inp, x)
    opt = optimizers.Adam(lr)

    # Manual training loop. The loss must be built from TF ops inside the
    # tape: a loss computed in NumPy from ``.numpy()`` copies is a constant to
    # TensorFlow, so every gradient would be None and nothing would train.
    n_train = X_train.shape[0]
    order = np.arange(n_train)
    for _ in range(epochs):
        np.random.shuffle(order)
        for start in range(0, n_train, batch_size):
            idx = order[start:start + batch_size]
            xb = X_train[idx]
            yb = tf.convert_to_tensor(y_train_enc[idx])

            # Two stochastic views of the same batch.
            v1 = tf.convert_to_tensor(stochastic_feature_dropout(xb, drop_prob=drop_prob))
            v2 = tf.convert_to_tensor(stochastic_feature_dropout(xb, drop_prob=drop_prob))

            with tf.GradientTape() as tape:
                z1 = proj_model(v1, training=True)
                z2 = proj_model(v2, training=True)
                loss = _supcon_loss_tf(z1, z2, yb, temperature=0.2)

            grads = tape.gradient(loss, proj_model.trainable_variables)
            opt.apply_gradients(zip(grads, proj_model.trainable_variables))

    # Freeze projection, compute embeddings
    Z_train = proj_model.predict(X_train, verbose=0)
    Z_test  = proj_model.predict(X_test, verbose=0)

    # Train LR head on embeddings
    head = LogisticRegression(max_iter=2000, class_weight="balanced", solver="liblinear")
    head.fit(Z_train, y_train_enc)
    y_pred_enc = head.predict(Z_test)
    y_pred = le.inverse_transform(y_pred_enc)

    labels_sorted = sorted(pd.unique(y_test))
    acc, prec, rec, f1, cm = compute_metrics(y_test, y_pred, labels_sorted)
    cm_path = os.path.join(RESULTS_DIR, f"{re.sub('[^A-Za-z0-9]+','_',dataset_name)}__DualContrastive__confusion_matrix.csv")
    save_cm(cm, labels_sorted, cm_path)

    return [{
        "dataset": dataset_name, "model": "DualContrastive", "kind": "contrastive",
        "n_train": len(y_train), "n_test": len(y_test),
        "classes": ";".join(map(str, labels_sorted)),
        "accuracy": round(acc,4), "precision_macro": round(prec,4),
        "recall_macro": round(rec,4), "f1_macro": round(f1,4),
        "confusion_matrix_path": cm_path
    }]

# ---------------------- Main loop ----------------------
def _run_model_safely(runner, model_label, kind, X_text, y, dataset_name, **kwargs):
    """Call one model runner; turn an unexpected failure into an error row.

    The error row uses the same keys as the "TensorFlow not available" rows,
    so it fits into the summary table alongside normal results.
    """
    try:
        return runner(X_text, y, dataset_name, **kwargs)
    except Exception as exc:
        # Best effort: one failing model family must not discard the results
        # of the others, or abort the remaining datasets.
        return [{
            "dataset": dataset_name, "model": model_label, "kind": kind,
            "n_train": 0, "n_test": 0, "classes": "", "accuracy": np.nan,
            "precision_macro": np.nan, "recall_macro": np.nan, "f1_macro": np.nan,
            "confusion_matrix_path": "", "error": str(exc)
        }]


def process_dataset(path):
    """Load one CSV dataset, pick its text/label columns and run every model.

    Args:
        path: Path of the CSV file.

    Returns:
        ``(rows, skip)``. ``rows`` is the list of result-row dicts and ``skip``
        is ``None`` when the dataset was processed. When the dataset cannot be
        used (missing file, read error, empty file, undetectable columns, or a
        single class after cleaning), ``rows`` is empty and ``skip`` is a dict
        describing the reason. A failure inside a model runner does not skip
        the dataset; it is reported as a row with an ``"error"`` entry.
    """
    name = os.path.basename(path)
    if not os.path.exists(path):
        return [], {"dataset": name, "reason": "file not found"}

    # Read the CSV, retrying with latin-1 if the default read fails.
    try:
        try:
            df = pd.read_csv(path)
        except Exception:
            df = pd.read_csv(path, encoding="latin-1")
    except Exception as e:
        return [], {"dataset": name, "reason": f"read error: {e}"}

    df = df.dropna(axis=1, how="all")
    if df.empty or df.shape[1] == 0:
        return [], {"dataset": name, "reason": "empty file"}

    # Column selection: an explicit mapping wins over auto-detection.
    if name in COLUMN_MAP:
        text_col = COLUMN_MAP[name]["text"]
        label_col = COLUMN_MAP[name]["label"]
        if text_col not in df.columns or label_col not in df.columns:
            return [], {"dataset": name, "reason": "column mapping not found in file",
                        "columns": list(df.columns)}
    else:
        text_col, label_col = pick_text_and_label_columns(df)
    if text_col is None or label_col is None:
        return [], {"dataset": name, "reason": f"could not detect columns (text={text_col}, label={label_col})",
                    "columns": list(df.columns)}

    X_text = clean_text(df[text_col])
    y = normalize_binary_labels(df[label_col])

    # Keep only rows with non-empty text and a non-missing label.
    mask = (X_text.str.len() > 0) & (~pd.Series(y).isna())
    X_text = X_text[mask]
    y = pd.Series(y)[mask]

    if pd.Series(y).nunique() < 2:
        return [], {"dataset": name, "reason": "only one class after cleaning", "label_col": label_col}

    results = []
    # Classic ML ("classic_ml" labels a failure of the whole family, e.g. in the split)
    results += _run_model_safely(run_classic_ml, "classic_ml", "classic_ml", X_text, y, name)
    # CNN & LSTM
    results += _run_model_safely(run_nn_tfidf, "CNN", "neural_tfidf", X_text, y, name,
                                 arch="CNN", epochs=3, batch_size=64)
    results += _run_model_safely(run_nn_tfidf, "LSTM", "neural_tfidf", X_text, y, name,
                                 arch="LSTM", epochs=3, batch_size=64)
    # Dual Contrastive
    results += _run_model_safely(run_dual_contrastive, "DualContrastive", "contrastive", X_text, y, name,
                                 epochs=10, batch_size=128, proj_dim=128, lr=1e-3, drop_prob=0.1)

    return results, None

def main():
    """Run every configured dataset and write the two report CSVs.

    Result rows go to ``summary_metrics.csv`` and skip records to
    ``skipped_datasets.csv``, both under ``RESULTS_DIR``; the paths of the
    written files are printed at the end.
    """
    all_rows, skipped = [], []
    for dataset_path in DATASETS:
        rows, skip = process_dataset(dataset_path)
        if rows:
            all_rows.extend(rows)
        if skip:
            skipped.append(skip)

    # Write both reports first, then announce them in the same order.
    written = []
    for filename, records in (("summary_metrics.csv", all_rows),
                              ("skipped_datasets.csv", skipped)):
        target = os.path.join(RESULTS_DIR, filename)
        pd.DataFrame(records).to_csv(target, index=False)
        written.append(target)

    for target in written:
        print("Saved:", target)

if __name__ == "__main__":
    main()
