In [1]:
!pip install fasttext seaborn --quiet

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import fasttext
import fasttext.util
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix,
    classification_report, roc_curve
)
from sklearn.utils import class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Dropout, Masking, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone


In [None]:
# Fetch the pre-trained Polish fastText model; the call returns the local file
# name ('cc.pl.300.bin', as shown in the cell output). if_exists='ignore'
# reuses an already-downloaded file instead of fetching it again.
fasttext.util.download_model('pl', if_exists='ignore')

Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.bin.gz



'cc.pl.300.bin'

In [3]:
# Load the downloaded binary into the module-level `ft_model`;
# get_sequence_ft_vectors reads this name as a global.
ft_model = fasttext.load_model('cc.pl.300.bin')

In [4]:
def get_sequence_ft_vectors(text, max_len=25):
    """Convert one text into a list of fastText word vectors.

    The text is tokenised with Keras' ``text_to_word_sequence`` (case is
    preserved because ``lower=False``) and each of the first ``max_len``
    tokens is looked up in the module-level ``ft_model``.

    Args:
        text: The input text. Non-string values are stringified, except for
            missing values (``None`` or a float NaN), which yield an empty
            sequence.
        max_len: Maximum number of tokens to embed; later tokens are dropped.

    Returns:
        A list with one vector per kept token, in token order. The list is
        empty for empty or missing text.
    """
    # A missing CSV cell arrives from pandas as float NaN. Stringifying it
    # would produce the token "nan" (or "None"), which would then be embedded
    # as if it were a real word. `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return []

    words = text_to_word_sequence(str(text), lower=False)
    return [ft_model.get_word_vector(word) for word in words[:max_len]]

In [5]:
def prepare_sequence_data(train_path, test_path, max_len=25):
    """Load the train/test CSVs and turn their texts into padded vector sequences.

    Both CSVs are read with pandas and must provide a ``text`` and a ``label``
    column. Every text is embedded with ``get_sequence_ft_vectors`` and the
    resulting sequences are post-padded / post-truncated to ``max_len`` steps
    as float32.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the test CSV.
        max_len: Number of timesteps per padded sequence.

    Returns:
        Tuple ``(X_train_pad, y_train, X_test_pad, y_test)``.
    """
    frames = {
        "train": pd.read_csv(train_path),
        "test": pd.read_csv(test_path),
    }

    def _embed_and_pad(frame):
        # One variable-length list of word vectors per row of the frame.
        sequences = []
        for text in frame['text']:
            sequences.append(get_sequence_ft_vectors(text, max_len=max_len))
        return pad_sequences(
            sequences,
            maxlen=max_len,
            dtype='float32',
            padding='post',
            truncating='post',
        )

    train_padded = _embed_and_pad(frames["train"])
    test_padded = _embed_and_pad(frames["test"])

    train_labels = frames["train"]['label'].values
    test_labels = frames["test"]['label'].values

    print(f"Train shape: {train_padded.shape}, Test shape: {test_padded.shape}")
    print(f"Class distribution in train: {np.bincount(train_labels)}")

    return train_padded, train_labels, test_padded, test_labels

In [None]:
def _build_bilstm(input_shape, lstm_units):
    """Assemble and compile the Masking -> BiLSTM -> Dropout -> sigmoid classifier."""
    model = Sequential()
    model.add(Input(shape=input_shape))
    # Timesteps whose features are all 0.0 (the zero post-padding produced by
    # prepare_sequence_data) are masked so the LSTM skips them.
    model.add(Masking(mask_value=0.0))
    model.add(Bidirectional(LSTM(lstm_units)))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['accuracy', 'Precision', 'Recall', 'AUC'])
    return model


def _save_confusion_matrix(y_true, y_pred, plot_label, save_dir):
    """Render the confusion matrix as a heatmap PNG in ``save_dir``; return its path."""
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Purples', ax=ax)
    ax.set_title(f"Confusion Matrix for {plot_label}")
    ax.set_xlabel("Prediction")
    ax.set_ylabel("Actual")
    cm_path = os.path.join(save_dir, "confusion_matrix.png")
    fig.savefig(cm_path)
    plt.close(fig)  # free the figure; this function may be called many times
    return cm_path


def _save_roc_curve(y_true, y_proba, auc, plot_label, save_dir):
    """Render the ROC curve (with chance diagonal) as a PNG in ``save_dir``; return its path."""
    fpr, tpr, _ = roc_curve(y_true, y_proba)
    fig, ax = plt.subplots()
    ax.plot(fpr, tpr, label=f'AUC = {auc:.4f}')
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title(f"ROC Curve for {plot_label}")
    ax.legend()
    ax.grid(True)
    roc_path = os.path.join(save_dir, "roc_curve.png")
    fig.savefig(roc_path)
    plt.close(fig)
    return roc_path


def train_and_evaluate_bilstm(X_train, y_train, X_test, y_test,
                               title="LSTM",
                               max_len=25, embedding_dim=300,
                               lstm_units=64, batch_size=32, epochs=15,
                               save_dir="results/bilstm_model",
                               plot_label="LSTM - The Best Variant"):
    """Train a bidirectional LSTM binary classifier and evaluate it on the test set.

    The model is fitted with balanced class weights and early stopping on
    ``val_loss`` (patience 3, best weights restored). Test-set metrics are
    printed, and the model plus a confusion-matrix and a ROC-curve PNG are
    written to ``save_dir``.

    Args:
        X_train: Training sequences; expected as a 3-D array
            ``(samples, timesteps, features)``.
        y_train: Binary training labels.
        X_test: Test sequences with the same trailing shape as ``X_train``.
        y_test: Binary test labels.
        title: Heading printed above the metric summary.
        max_len: Timesteps per sequence. Only used as a fallback when
            ``X_train`` does not expose a 3-D ``shape``.
        embedding_dim: Features per timestep. Fallback only, like ``max_len``.
        lstm_units: Units of each LSTM direction.
        batch_size: Mini-batch size for fitting.
        epochs: Upper bound on training epochs.
        save_dir: Directory for the saved model and plots (created if needed).
        plot_label: Text appended to "Confusion Matrix for " / "ROC Curve for "
            in the figure titles.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``roc_auc`` (test-set metrics) and ``model_path``, ``cm_path``,
        ``roc_path`` (artifact locations).
    """
    os.makedirs(save_dir, exist_ok=True)

    # Take the input shape from the data itself, so a max_len / embedding_dim
    # default that disagrees with the prepared arrays cannot produce a model
    # that rejects its own training data.
    data_shape = tuple(getattr(X_train, "shape", ()))
    if len(data_shape) == 3:
        input_shape = data_shape[1:]
    else:
        input_shape = (max_len, embedding_dim)

    model = _build_bilstm(input_shape, lstm_units)

    early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    # Key each weight by its class label rather than by its position in the
    # weights array; the two differ whenever the labels are not exactly 0..k-1.
    classes = np.unique(y_train)
    weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=classes,
        y=y_train
    )
    class_weights = {int(label): float(weight) for label, weight in zip(classes, weights)}

    # NOTE: Keras carves the validation split from the tail of the arrays
    # before shuffling, so the training data should not be sorted by label.
    model.fit(
        X_train, y_train,
        validation_split=0.2,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[early_stop],
        verbose=1,
        class_weight=class_weights
    )

    y_proba = model.predict(X_test).ravel()
    y_pred = (y_proba > 0.5).astype(int)

    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 matches the classification report below and avoids a
    # warning when the model predicts no positives at all.
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    auc = roc_auc_score(y_test, y_proba)

    print(f"\n📌 {title}")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1-score:  {f1:.4f}")
    print(f"ROC AUC:   {auc:.4f}")
    print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))

    model_path = os.path.join(save_dir, "bilstm_model.h5")
    model.save(model_path)
    print(f"✅ Model saved to: {model_path}")

    cm_path = _save_confusion_matrix(y_test, y_pred, plot_label, save_dir)
    print(f"🖼️ Confusion matrix saved to: {cm_path}")

    roc_path = _save_roc_curve(y_test, y_proba, auc, plot_label, save_dir)
    print(f"🖼️ ROC curve saved to: {roc_path}")

    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "roc_auc": auc,
        "model_path": model_path,
        "cm_path": cm_path,
        "roc_path": roc_path,
    }

In [None]:
def run_pipeline_bilstm(train_path, test_path, max_len=25,
                        title="LSTM", save_dir="results/bilstm_model"):
    """Prepare the data from two CSVs, then train and evaluate the BiLSTM.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the test CSV.
        max_len: Sequence length used both for padding the data and for the
            model's input shape.
        title: Heading printed above the metric summary.
        save_dir: Directory that receives the saved model and plots. Pass a
            distinct directory per run to keep the artifacts of several runs.

    Returns:
        Whatever ``train_and_evaluate_bilstm`` returns.
    """
    X_train, y_train, X_test, y_test = prepare_sequence_data(train_path, test_path, max_len=max_len)
    # max_len must reach the model builder too: otherwise a non-default value
    # pads the data to one length while the model is built for another.
    return train_and_evaluate_bilstm(
        X_train, y_train, X_test, y_test,
        title=title,
        max_len=max_len,
        save_dir=save_dir,
    )

In [19]:
# Variant 1 ("raw" CSV pair). Artifacts go to results/bilstm_model (see the
# output), the same directory the later variant runs write to and overwrite.
# NOTE(review): the recorded output reports sequence length 20 while the
# current default max_len is 25, so it appears to predate the current code.
run_pipeline_bilstm("v1_training_variant1_raw.csv", "v1_test_variant1_raw.csv")

Train shape: (10041, 20, 300), Test shape: (1000, 20, 300)
Class distribution in train: [9190  851]
Epoch 1/15
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 60ms/step - AUC: 0.7093 - Precision: 0.1461 - Recall: 0.5545 - accuracy: 0.7247 - loss: 0.5512 - val_AUC: 0.8340 - val_Precision: 0.6182 - val_Recall: 0.5320 - val_accuracy: 0.8636 - val_loss: 0.3679
Epoch 2/15
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 60ms/step - AUC: 0.8993 - Precision: 0.2791 - Recall: 0.7973 - accuracy: 0.8501 - loss: 0.3616 - val_AUC: 0.8569 - val_Precision: 0.5477 - val_Recall: 0.6337 - val_accuracy: 0.8477 - val_loss: 0.3702
Epoch 3/15
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 57ms/step - AUC: 0.9172 - Precision: 0.3014 - Recall: 0.8148 - accuracy: 0.8659 - loss: 0.3186 - val_AUC: 0.8321 - val_Precision: 0.6496 - val_Recall: 0.4797 - val_accuracy: 0.8666 - val_loss: 0.3490
Epoch 4/15
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 




📌 LSTM
Accuracy:  0.9010
Precision: 0.6357
Recall:    0.6119
F1-score:  0.6236
ROC AUC:   0.9023

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.95      0.94       866
           1       0.64      0.61      0.62       134

    accuracy                           0.90      1000
   macro avg       0.79      0.78      0.78      1000
weighted avg       0.90      0.90      0.90      1000

✅ Model saved to: results/bilstm_model/bilstm_model.h5
🖼️ Confusion matrix saved to: results/bilstm_model/confusion_matrix.png
🖼️ ROC curve saved to: results/bilstm_model/roc_curve.png


In [25]:
# Variant 2 ("light" CSV pair). Writes to results/bilstm_model as well (see the
# output), replacing the artifacts of the previous run.
# NOTE(review): recorded output shows sequence length 20, not the current
# default of 25 — re-run to refresh.
run_pipeline_bilstm("v1_training_variant2_light.csv", "v1_test_variant2_light.csv")

Train shape: (10041, 20, 300), Test shape: (1000, 20, 300)
Class distribution in train: [9190  851]
Epoch 1/15
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 66ms/step - AUC: 0.6651 - Precision: 0.1467 - Recall: 0.2884 - accuracy: 0.8599 - loss: 0.5467 - val_AUC: 0.7981 - val_Precision: 0.5657 - val_Recall: 0.4884 - val_accuracy: 0.8482 - val_loss: 0.3919
Epoch 2/15
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 54ms/step - AUC: 0.8865 - Precision: 0.2714 - Recall: 0.7803 - accuracy: 0.8531 - loss: 0.3757 - val_AUC: 0.8399 - val_Precision: 0.4944 - val_Recall: 0.6424 - val_accuracy: 0.8263 - val_loss: 0.4122
Epoch 3/15
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 54ms/step - AUC: 0.9250 - Precision: 0.3071 - Recall: 0.8490 - accuracy: 0.8589 - loss: 0.3205 - val_AUC: 0.8235 - val_Precision: 0.5173 - val_Recall: 0.5640 - val_accuracy: 0.8352 - val_loss: 0.3835
Epoch 4/15
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 




📌 LSTM
Accuracy:  0.8870
Precision: 0.5868
Recall:    0.5299
F1-score:  0.5569
ROC AUC:   0.8947

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.94      0.94       866
           1       0.59      0.53      0.56       134

    accuracy                           0.89      1000
   macro avg       0.76      0.74      0.75      1000
weighted avg       0.88      0.89      0.88      1000

✅ Model saved to: results/bilstm_model/bilstm_model.h5
🖼️ Confusion matrix saved to: results/bilstm_model/confusion_matrix.png
🖼️ ROC curve saved to: results/bilstm_model/roc_curve.png


In [29]:
# Variant 3 ("full" CSV pair). Writes to results/bilstm_model as well (see the
# output), so only this last run's model and plots remain on disk.
# NOTE(review): recorded output shows sequence length 20, not the current
# default of 25 — re-run to refresh.
run_pipeline_bilstm("v1_training_variant3_full.csv", "v1_test_variant3_full.csv")

Train shape: (10008, 20, 300), Test shape: (999, 20, 300)
Class distribution in train: [9157  851]
Epoch 1/15
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 67ms/step - AUC: 0.7138 - Precision: 0.1767 - Recall: 0.3825 - accuracy: 0.8523 - loss: 0.5466 - val_AUC: 0.8014 - val_Precision: 0.4708 - val_Recall: 0.6337 - val_accuracy: 0.8147 - val_loss: 0.4584
Epoch 2/15
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 62ms/step - AUC: 0.8667 - Precision: 0.2150 - Recall: 0.7755 - accuracy: 0.8124 - loss: 0.4058 - val_AUC: 0.8153 - val_Precision: 0.5640 - val_Recall: 0.5378 - val_accuracy: 0.8492 - val_loss: 0.3924
Epoch 3/15
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 67ms/step - AUC: 0.8896 - Precision: 0.2662 - Recall: 0.7862 - accuracy: 0.8405 - loss: 0.3860 - val_AUC: 0.8188 - val_Precision: 0.6185 - val_Recall: 0.4855 - val_accuracy: 0.8601 - val_loss: 0.3703
Epoch 4/15
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 




📌 LSTM
Accuracy:  0.8909
Precision: 0.6050
Recall:    0.5373
F1-score:  0.5692
ROC AUC:   0.8566

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.95      0.94       865
           1       0.61      0.54      0.57       134

    accuracy                           0.89       999
   macro avg       0.77      0.74      0.75       999
weighted avg       0.89      0.89      0.89       999

✅ Model saved to: results/bilstm_model/bilstm_model.h5
🖼️ Confusion matrix saved to: results/bilstm_model/confusion_matrix.png
🖼️ ROC curve saved to: results/bilstm_model/roc_curve.png
