In [None]:
# ==============================
# LSTM-Only Advanced Deep Learning Pipeline with Preprocessing
# ==============================
import pandas as pd
import numpy as np
import re
import emoji
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Fix randomness.
# Keras 3 derives its default initializer/dropout seeds from Python's built-in
# `random` module, so seeding only TensorFlow and NumPy leaves weight
# initialization non-deterministic. `set_random_seed` seeds Python `random`,
# NumPy and TensorFlow in a single call.
tf.keras.utils.set_random_seed(42)


In [None]:

# ==============================
# Preprocessing Functions
# ==============================
def clean_text(text):
    """Normalize one raw text value into plain ASCII letters and basic punctuation.

    URLs, HTML tags and emojis are replaced by a single space (not the empty
    string) so that the words on either side of them are not glued into one
    token, e.g. ``"foo<br>bar"`` becomes ``"foo bar"`` rather than ``"foobar"``.
    The extra spaces are removed by the final whitespace collapse.

    Args:
        text: A scalar value; anything ``pd.isna`` reports as missing yields "".

    Returns:
        str: The cleaned text, containing only ``a-zA-Z``, whitespace and
        ``.,!?``, with runs of whitespace collapsed and the ends stripped.
    """
    if pd.isna(text):
        return ""
    text = str(text)

    # Remove URLs
    text = re.sub(r"http\S+|www\S+", " ", text)

    # Remove HTML tags
    text = re.sub(r"<.*?>", " ", text)

    # Remove emojis
    text = emoji.replace_emoji(text, replace=" ")

    # Remove non-alphabetical characters (keep .,!? for sentence structure)
    text = re.sub(r"[^a-zA-Z\s.,!?]", " ", text)

    # Collapse multiple spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text

def preprocess_dataframe(df, text_col, remove_value='"'):
    """Return a cleaned copy of ``df`` with junk and empty text rows removed.

    The input frame is never modified. Work is done on an explicit copy of the
    filtered rows, which avoids pandas' ``SettingWithCopyWarning`` that the
    assign-on-a-filtered-slice pattern triggers.

    Args:
        df: Source DataFrame.
        text_col: Name of the column holding the raw text.
        remove_value: Rows whose stripped text equals this value are dropped
            before cleaning (default: a lone double quote).

    Returns:
        pandas.DataFrame: Rows whose ``text_col`` is non-empty after
        ``clean_text``, with a fresh 0..n-1 index.
    """
    # Drop rows with unwanted text (like just a " )
    is_wanted = df[text_col].astype(str).str.strip() != remove_value
    cleaned = df.loc[is_wanted].copy()

    # Apply cleaning
    cleaned[text_col] = cleaned[text_col].apply(clean_text)

    # Drop empty rows after cleaning. astype(str) keeps the .str accessor valid
    # even when the filter above left zero rows and the column lost its
    # string dtype.
    has_text = cleaned[text_col].astype(str).str.strip() != ""

    return cleaned.loc[has_text].reset_index(drop=True)


In [None]:

# ==============================
# TextClassifier Class
# ==============================
class TextClassifier:
    """LSTM-based binary text classifier with tokenization and training helpers.

    Attributes:
        max_features: Vocabulary size given to the tokenizer (``num_words``)
            and used as the embedding input dimension.
        max_length: Length every token sequence is padded/truncated to.
        tokenizer: Keras ``Tokenizer``; ``None`` until ``prepare_sequences`` runs.
        models: Trained models keyed by model name.
        histories: Keras ``History`` objects keyed by model name.
    """

    def __init__(self, max_features=20000, max_length=200):
        self.max_features = max_features
        self.max_length = max_length
        self.tokenizer = None
        self.models = {}
        self.histories = {}

    def handle_imbalanced_data(self, X_train, y_train, method='balanced_weights'):
        """Compute class weights for an imbalanced binary target.

        The data itself is returned unchanged; only a weight mapping is derived.
        With ``method='balanced_weights'`` the positive class gets weight
        ``sqrt(neg / pos)`` capped at 5.0 and the negative class gets 1.0.

        Args:
            X_train: Training inputs (passed through untouched).
            y_train: Binary labels (0/1); any array-like is accepted.
            method: ``'balanced_weights'`` to compute weights; anything else
                disables weighting.

        Returns:
            tuple: ``(X_train, y_train, class_weight)`` where ``class_weight``
            is a ``{0: float, 1: float}`` dict or ``None``.
        """
        # np.asarray makes the comparison element-wise for lists/Series too.
        labels = np.asarray(y_train)
        neg = int(np.count_nonzero(labels == 0))
        pos = int(np.count_nonzero(labels == 1))
        print(f"Original distribution: 0={neg}, 1={pos}")

        if method != 'balanced_weights':
            return X_train, y_train, None

        if pos == 0 or neg == 0:
            # A ratio is undefined (division by zero) or would zero out every
            # sample's loss when a class is missing; fall back to neutral weights.
            print("Only one class present; falling back to uniform class weights")
            class_weight = {0: 1.0, 1: 1.0}
        else:
            weight_ratio = float(np.sqrt(neg / pos))
            class_weight = {0: 1.0, 1: min(weight_ratio, 5.0)}
        print(f"Using class weights: {class_weight}")
        return X_train, y_train, class_weight

    def prepare_sequences(self, X_train, X_val, X_test):
        """Fit the tokenizer on the training texts and vectorize all three splits.

        The tokenizer is fitted on ``X_train`` only and stored on
        ``self.tokenizer``. Every split is converted to integer sequences and
        padded at the end to ``self.max_length``.

        Returns:
            tuple: Padded arrays for ``(X_train, X_val, X_test)``.
        """
        self.tokenizer = Tokenizer(num_words=self.max_features, oov_token="<OOV>")
        self.tokenizer.fit_on_texts(X_train)

        def pad(X):
            return pad_sequences(self.tokenizer.texts_to_sequences(X),
                                 maxlen=self.max_length, padding='post')

        return pad(X_train), pad(X_val), pad(X_test)

    def create_improved_lstm_model(self, embedding_dim=128):
        """Build the (uncompiled) two-layer LSTM binary classifier.

        Args:
            embedding_dim: Size of the learned token embeddings.

        Returns:
            A Keras ``Sequential`` model ending in a single sigmoid unit.
        """
        model = Sequential([
            # An explicit Input replaces Embedding's `input_length` argument,
            # which Keras 3 deprecates.
            tf.keras.Input(shape=(self.max_length,)),
            # Sequences are post-padded with index 0. mask_zero=True makes the
            # LSTMs skip those pad steps, so the final state reflects the last
            # real token instead of a long run of padding.
            Embedding(self.max_features, embedding_dim, mask_zero=True),
            Dropout(0.2),
            LSTM(64, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
            BatchNormalization(),
            LSTM(32, dropout=0.2, recurrent_dropout=0.2),
            BatchNormalization(),
            Dense(64, activation='relu'),
            Dropout(0.3),
            Dense(32, activation='relu'),
            Dropout(0.2),
            Dense(1, activation='sigmoid')
        ])
        return model

    def focal_loss(self, gamma=2., alpha=0.25):
        """Return a binary focal-loss function usable as a Keras ``loss``.

        Args:
            gamma: Focusing exponent applied to ``(1 - p_t)``.
            alpha: Weight applied to positive examples; negatives get
                ``1 - alpha``.

        Returns:
            callable: ``f(y_true, y_pred)`` returning the mean focal loss.
        """
        def focal_loss_fixed(y_true, y_pred):
            eps = tf.keras.backend.epsilon()
            # Make the label dtype explicit so the equality test below does not
            # depend on how the caller encoded the labels.
            y_true = tf.cast(y_true, y_pred.dtype)
            # Clip to keep log() finite.
            y_pred = tf.clip_by_value(y_pred, eps, 1. - eps)
            is_positive = tf.equal(y_true, 1)
            p_t = tf.where(is_positive, y_pred, 1 - y_pred)
            alpha_t = tf.where(is_positive, alpha, 1 - alpha)
            return tf.reduce_mean(-alpha_t * tf.pow(1 - p_t, gamma) * tf.math.log(p_t))
        return focal_loss_fixed

    def train_model(self, model, X_train, X_val, y_train, y_val,
                    model_name, class_weight=None, use_focal_loss=False, epochs=10):
        """Compile and fit ``model``, checkpointing the best validation loss.

        Args:
            model: Uncompiled Keras model.
            X_train, X_val: Padded input sequences.
            y_train, y_val: Binary labels.
            model_name: Used in log output and for the checkpoint file name
                ``"<model_name lowercased>_best.keras"``.
            class_weight: Optional ``{class: weight}`` dict passed to ``fit``.
                Batch size is 32 when this is truthy, otherwise 64.
            use_focal_loss: Use ``self.focal_loss()`` instead of binary
                cross-entropy.
            epochs: Maximum number of epochs.

        Returns:
            tuple: ``(model, history)``.
        """
        print(f"\nTraining {model_name}...")
        loss_fn = self.focal_loss() if use_focal_loss else "binary_crossentropy"
        if use_focal_loss:
            print("Using Focal Loss")

        model.compile(
            optimizer=Adam(learning_rate=5e-4),
            loss=loss_fn,
            metrics=["accuracy", tf.keras.metrics.Precision(name="precision"),
                     tf.keras.metrics.Recall(name="recall")]
        )

        callbacks = [
            EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True),
            ReduceLROnPlateau(monitor="val_loss", patience=3, factor=0.5, min_lr=1e-7),
            ModelCheckpoint(f"{model_name.lower()}_best.keras",
                            save_best_only=True, monitor="val_loss", mode="min")
        ]

        history = model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=epochs,
            batch_size=32 if class_weight else 64,
            class_weight=class_weight,
            callbacks=callbacks,
            verbose=1
        )
        return model, history

    def train_lstm_models(self, X_train, X_val, X_test, y_train, y_val, y_test,
                          balance_method="balanced_weights"):
        """Train every configured LSTM model and score it on the test split.

        Trained models and their histories are stored in ``self.models`` and
        ``self.histories`` under the configuration name.

        Args:
            X_train, X_val, X_test: Raw text arrays.
            y_train, y_val: Binary labels for the train/validation splits.
            y_test: Test labels, or ``None`` to skip metric computation.
            balance_method: Forwarded to ``handle_imbalanced_data``.

        Returns:
            tuple: ``(results, X_test_pad)`` where ``results`` is a DataFrame
            with columns ``Model``, ``Accuracy``, ``F1``, ``ROC_AUC`` (metrics
            are ``None`` when ``y_test`` is ``None``) and ``X_test_pad`` is the
            padded test input.
        """
        X_train_bal, y_train_bal, class_weight = self.handle_imbalanced_data(
            X_train, y_train, balance_method)
        X_train_pad, X_val_pad, X_test_pad = self.prepare_sequences(X_train_bal, X_val, X_test)

        # name -> (model factory, use focal loss?)
        configs = {"LSTM_Focal": (self.create_improved_lstm_model, True)}

        results = []
        for name, (fn, focal) in configs.items():
            model = fn()
            model.build(input_shape=(None, self.max_length))
            model.summary()

            # Focal loss already re-weights the classes, so class weights are
            # intentionally not combined with it; say so explicitly because
            # handle_imbalanced_data has just printed "Using class weights".
            if focal and class_weight is not None:
                print(f"{name}: focal loss in use, class weights are not applied")
            fit_weights = None if focal else class_weight

            trained, hist = self.train_model(model, X_train_pad, X_val_pad, y_train_bal, y_val,
                                             name, fit_weights, focal)
            self.models[name], self.histories[name] = trained, hist

            probs = trained.predict(X_test_pad).flatten()
            preds = (probs > 0.5).astype(int)
            res = {"Model": name, "Accuracy": None, "F1": None, "ROC_AUC": None}
            if y_test is not None:
                res["Accuracy"] = (preds == y_test).mean()
                res["F1"] = f1_score(y_test, preds)
                res["ROC_AUC"] = roc_auc_score(y_test, probs)
            results.append(res)

        return pd.DataFrame(results), X_test_pad

In [None]:


# ==============================
# Main Execution
# ==============================
if __name__ == "__main__":
    # Load data. Forward slashes are used because a backslash-separated
    # relative path only resolves on Windows.
    train_df = pd.read_csv("Dataset/train.csv")
    test_df = pd.read_csv("Dataset/test.csv")

    TEXT_COL = "comment_text"
    TARGET_COL = "psychotic_depression"
    ID_COL = "id" if "id" in test_df.columns else None

    # Training rows that are junk or empty after cleaning carry no signal, so
    # they are dropped.
    train_df = preprocess_dataframe(train_df, TEXT_COL, remove_value='"')

    # Test rows are cleaned but never dropped: every row of the test file must
    # receive a prediction, and the fallback positional index written to the
    # output must keep matching the file's row order.
    test_df = test_df.copy()
    test_df[TEXT_COL] = test_df[TEXT_COL].apply(clean_text)

    print("Train shape after cleaning:", train_df.shape)
    print("Test shape after cleaning:", test_df.shape)

    # Prepare X, y
    X = train_df[TEXT_COL].astype(str).values
    y = train_df[TARGET_COL].values

    # String labels are mapped to integer class ids.
    if y.dtype == 'O':
        le = LabelEncoder()
        y = le.fit_transform(y)

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    X_test = test_df[TEXT_COL].astype(str).values
    y_test = test_df[TARGET_COL].values if TARGET_COL in test_df else None

    clf = TextClassifier(max_features=20000, max_length=200)

    results, X_test_pad = clf.train_lstm_models(
        X_train, X_val, X_test, y_train, y_val, y_test,
        balance_method="balanced_weights"
    )

    print("\n=== Results ===")
    print(results)

    # Save predictions from best model. Rank by the first metric that was
    # actually computed; when the test set has no labels every metric is None
    # and the first trained model is used.
    rank_metric = next(
        (m for m in ("F1", "ROC_AUC") if results[m].notna().any()), None
    )
    ranked = results.sort_values(rank_metric, ascending=False) if rank_metric else results
    best_model_name = ranked.iloc[0]["Model"]
    best_model = clf.models[best_model_name]

    test_probs = best_model.predict(X_test_pad).flatten()
    test_preds = (test_probs > 0.5).astype(int)

    output = pd.DataFrame({
        ID_COL if ID_COL else "index": test_df[ID_COL] if ID_COL else range(len(test_df)),
        "prediction": test_preds
    })

    output.to_csv("output.csv", index=False)

Train shape after cleaning: (159552, 3)
Test shape after cleaning: (152781, 2)
Original distribution: 0=115406, 1=12235
Using class weights: {0: 1.0, 1: 3.0712290529008253}





Training LSTM_Focal...
Using Focal Loss
Epoch 1/10
[1m1995/1995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m880s[0m 435ms/step - accuracy: 0.9022 - loss: 0.0343 - precision: 0.1003 - recall: 0.0025 - val_accuracy: 0.9041 - val_loss: 0.0301 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00 - learning_rate: 5.0000e-04
Epoch 2/10
[1m1995/1995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1140s[0m 544ms/step - accuracy: 0.9041 - loss: 0.0309 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.9041 - val_loss: 0.0299 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00 - learning_rate: 5.0000e-04
Epoch 3/10
[1m1995/1995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m965s[0m 484ms/step - accuracy: 0.9047 - loss: 0.0301 - precision: 0.8350 - recall: 0.0070 - val_accuracy: 0.9058 - val_loss: 0.0296 - val_precision: 0.8462 - val_recall: 0.0216 - learning_rate: 5.0000e-04
Epoch 4/10
[1m1995/1995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m798s[0m 400ms/step - accura