In [2]:
!pip install fasttext seaborn --quiet
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import fasttext
import fasttext.util
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix,
    classification_report, roc_curve
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Dropout, Masking, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Fetch the pre-trained Polish ('pl') fastText vectors; the recorded cell output
# shows the resulting file is cc.pl.300.bin. if_exists='ignore' is passed so an
# existing local copy is presumably reused instead of re-downloaded — confirm
# against the fasttext.util documentation.
fasttext.util.download_model('pl', if_exists='ignore')

Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.bin.gz



'cc.pl.300.bin'

In [3]:
# Load the downloaded binary model. ft_model is a notebook-level global that
# get_sequence_ft_vectors reads for its per-word vector lookups, so this cell
# must run before any cell that embeds text.
ft_model = fasttext.load_model('cc.pl.300.bin')

In [None]:
def get_sequence_ft_vectors(text, max_len=25):
    """Embed a text as a list of fastText word vectors, one per token.

    The input is coerced to ``str`` and tokenised with Keras'
    ``text_to_word_sequence`` with ``lower=False`` (original casing is kept).
    Only the first ``max_len`` tokens are embedded.

    Args:
        text: Value to embed; anything accepted by ``str()``.
        max_len: Maximum number of leading tokens to embed.

    Returns:
        list: One vector per kept token, as returned by
        ``ft_model.get_word_vector``. Empty when tokenisation yields no tokens.
    """
    tokens = text_to_word_sequence(str(text), lower=False)

    embedded = []
    for token in tokens[:max_len]:
        # Lookup goes through the notebook-level ft_model global.
        embedded.append(ft_model.get_word_vector(token))
    return embedded

In [None]:
def prepare_sequence_data_from_single_file(data_path, max_len=25, test_size=0.2, random_state=42,
                                           text_col='Text', label_col='Class'):
    """Load a labelled CSV, embed each text as a padded vector sequence, and split it.

    Each text is turned into a sequence of word vectors via
    ``get_sequence_ft_vectors``; sequences are then post-padded / post-truncated
    to ``max_len`` steps with ``pad_sequences`` and split into stratified
    train and test sets.

    Rows whose text cell is missing (NaN) are dropped before embedding.
    Without this, ``str(NaN)`` would be embedded as the literal token "nan".
    Texts that are present but tokenise to nothing are kept; they become
    all-padding sequences.

    Args:
        data_path: Path to the CSV file.
        max_len: Number of time steps every sequence is padded/truncated to.
        test_size: Passed to ``train_test_split``.
        random_state: Seed passed to ``train_test_split``.
        text_col: Name of the column holding the raw text.
        label_col: Name of the column holding the class labels. The class
            distribution is printed with ``np.bincount``, so labels are
            expected to be non-negative integers.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)``. Note the order differs
        from the order returned by ``train_test_split``.

    Raises:
        KeyError: If ``text_col`` or ``label_col`` is not a column of the CSV.
    """
    df = pd.read_csv(data_path)

    absent = [col for col in (text_col, label_col) if col not in df.columns]
    if absent:
        raise KeyError(f"{data_path}: missing required column(s) {absent}")

    # Drop rows with no text instead of embedding the string "nan".
    has_text = df[text_col].notna()
    n_missing = int((~has_text).sum())
    if n_missing:
        print(f"Dropped {n_missing} row(s) with missing '{text_col}'")
        df = df.loc[has_text]

    X = [get_sequence_ft_vectors(text, max_len=max_len) for text in df[text_col]]
    y = df[label_col].to_numpy()

    X_pad = pad_sequences(X, maxlen=max_len, dtype='float32', padding='post', truncating='post')

    X_train, X_test, y_train, y_test = train_test_split(
        X_pad, y, test_size=test_size, random_state=random_state, stratify=y
    )

    print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")
    print(f"Class distribution: {np.bincount(y)}")

    return X_train, y_train, X_test, y_test


In [None]:
def _save_confusion_matrix_plot(y_true, y_pred, title, path):
    """Draw the confusion matrix of ``y_pred`` vs ``y_true`` as a heatmap and save it to ``path``."""
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Purples', ax=ax)
    ax.set_title(f"Confusion Matrix for {title}")
    ax.set_xlabel("Prediction")
    ax.set_ylabel("Actual")
    fig.savefig(path)
    plt.close(fig)  # release the figure; the caller may run once per dataset


def _save_roc_curve_plot(y_true, y_score, auc, title, path):
    """Draw the ROC curve for ``y_score`` with a chance diagonal and save it to ``path``."""
    fpr, tpr, _ = roc_curve(y_true, y_score)
    fig, ax = plt.subplots()
    ax.plot(fpr, tpr, label=f'AUC = {auc:.4f}')
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title(f"ROC Curve for {title}")
    ax.legend()
    ax.grid(True)
    fig.savefig(path)
    plt.close(fig)


def train_and_evaluate_bilstm(X_train, y_train, X_test, y_test,
                               title="LSTM",
                               max_len=25, embedding_dim=300,
                               lstm_units=128, batch_size=64, epochs=15,
                               save_dir="results/bilstm_model"):
    """Train a bidirectional-LSTM binary classifier, then evaluate and persist it.

    The network is ``Masking(0.0) -> Bidirectional(LSTM) -> Dropout(0.3) ->
    Dense(1, sigmoid)`` trained with Adam and binary cross-entropy. The last
    20% of the training data is held out for validation
    (``validation_split=0.2``), and early stopping on ``val_loss``
    (patience 3) restores the best weights.

    Test metrics are printed. The model is written to
    ``<save_dir>/bilstm_model.h5``, and a confusion-matrix heatmap and ROC
    curve are written alongside it as PNGs. Calling this again with the same
    ``save_dir`` overwrites those files.

    Args:
        X_train, X_test: Feature arrays fed to the model; expected to be
            3-D ``(samples, time steps, features)`` with all-zero padding
            steps, which the ``Masking`` layer skips.
        y_train, y_test: Binary labels.
        title: Label used in the printed header and in both plot titles.
        max_len, embedding_dim: Declared input shape. When ``X_train`` is a
            3-D array, its own ``(time steps, features)`` takes precedence,
            so a stale default cannot disagree with the data.
        lstm_units: Units per LSTM direction.
        batch_size, epochs: Passed to ``model.fit``.
        save_dir: Output directory; created if it does not exist.

    Returns:
        None.
    """
    os.makedirs(save_dir, exist_ok=True)

    # Trust the data over the keyword defaults: the recorded runs feed
    # (N, 50, 300) arrays while max_len defaults to 25.
    data_shape = getattr(X_train, "shape", None)
    if data_shape is not None and len(data_shape) == 3:
        data_len, data_dim = int(data_shape[1]), int(data_shape[2])
        if (data_len, data_dim) != (max_len, embedding_dim):
            print(f"Note: using input shape ({data_len}, {data_dim}) taken from X_train "
                  f"instead of the declared ({max_len}, {embedding_dim})")
        max_len, embedding_dim = data_len, data_dim

    model = Sequential()
    model.add(Input(shape=(max_len, embedding_dim)))
    model.add(Masking(mask_value=0.0))
    model.add(Bidirectional(LSTM(lstm_units)))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'Precision', 'Recall', 'AUC'])

    early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    model.fit(
        X_train, y_train,
        validation_split=0.2,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[early_stop],
        verbose=1
    )

    # Sigmoid outputs are thresholded at 0.5 to obtain hard class predictions.
    y_proba = model.predict(X_test).ravel()
    y_pred = (y_proba > 0.5).astype(int)

    # zero_division=0 matches the classification_report call below.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    auc = roc_auc_score(y_test, y_proba)

    print(f"\n📌 {title}")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1-score:  {f1:.4f}")
    print(f"ROC AUC:   {auc:.4f}")
    print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))

    model_path = os.path.join(save_dir, "bilstm_model.h5")
    model.save(model_path)
    print(f"✅ Model saved to: {model_path}")

    _save_confusion_matrix_plot(
        y_test, y_pred, title, os.path.join(save_dir, "confusion_matrix.png")
    )
    _save_roc_curve_plot(
        y_test, y_proba, auc, title, os.path.join(save_dir, "roc_curve.png")
    )

In [None]:
def run_pipeline_bilstm_from_single_file(data_path, max_len=25, title="LSTM",
                                         save_dir="results/bilstm_model"):
    """Prepare sequence data from one CSV file, then train and evaluate the BiLSTM.

    ``max_len`` is forwarded to both stages so the model's declared input
    length matches the length the data was padded to.

    Args:
        data_path: CSV path handed to ``prepare_sequence_data_from_single_file``.
        max_len: Sequence length used for padding and for the model input.
        title: Label forwarded to ``train_and_evaluate_bilstm``.
        save_dir: Output directory forwarded to ``train_and_evaluate_bilstm``.
            Runs that share a ``save_dir`` overwrite each other's saved model
            and plots, so pass a distinct directory per dataset to keep them.

    Returns:
        None.
    """
    X_train, y_train, X_test, y_test = prepare_sequence_data_from_single_file(data_path, max_len=max_len)
    train_and_evaluate_bilstm(
        X_train, y_train, X_test, y_test,
        title=title,
        max_len=max_len,
        save_dir=save_dir,
    )

In [8]:
# Run the full prepare -> train -> evaluate pipeline on BAN-PL_raw.csv,
# padding/truncating every text to 50 tokens.
run_pipeline_bilstm_from_single_file("BAN-PL_raw.csv", max_len=50)

Train shape: (19199, 50, 300), Test shape: (4800, 50, 300)
Class distribution: [12000 11999]
Epoch 1/15
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 303ms/step - AUC: 0.7819 - Precision: 0.7310 - Recall: 0.6441 - accuracy: 0.7121 - loss: 0.5582 - val_AUC: 0.8947 - val_Precision: 0.8524 - val_Recall: 0.7674 - val_accuracy: 0.8138 - val_loss: 0.4179
Epoch 2/15
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 323ms/step - AUC: 0.8952 - Precision: 0.8301 - Recall: 0.7911 - accuracy: 0.8182 - loss: 0.4102 - val_AUC: 0.9073 - val_Precision: 0.8706 - val_Recall: 0.7531 - val_accuracy: 0.8172 - val_loss: 0.4026
Epoch 3/15
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 315ms/step - AUC: 0.9121 - Precision: 0.8438 - Recall: 0.8298 - accuracy: 0.8394 - loss: 0.3785 - val_AUC: 0.9193 - val_Precision: 0.8310 - val_Recall: 0.8696 - val_accuracy: 0.8435 - val_loss: 0.3676
Epoch 4/15
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m




📌 LSTM
Accuracy:  0.8529
Precision: 0.8562
Recall:    0.8483
F1-score:  0.8522
ROC AUC:   0.9295

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.86      0.85      2400
           1       0.86      0.85      0.85      2400

    accuracy                           0.85      4800
   macro avg       0.85      0.85      0.85      4800
weighted avg       0.85      0.85      0.85      4800

✅ Model saved to: results/bilstm_model/bilstm_model.h5


In [9]:
# Same pipeline on BAN-PL_light.csv (50-token sequences).
# NOTE: artifacts go to the default results/bilstm_model directory, so this run
# overwrites the model and plots saved by any earlier run that used the default.
run_pipeline_bilstm_from_single_file("BAN-PL_light.csv", max_len=50)

Train shape: (19178, 50, 300), Test shape: (4795, 50, 300)
Class distribution: [11994 11979]
Epoch 1/15
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 336ms/step - AUC: 0.7821 - Precision: 0.7315 - Recall: 0.6629 - accuracy: 0.7131 - loss: 0.5654 - val_AUC: 0.8794 - val_Precision: 0.8176 - val_Recall: 0.7733 - val_accuracy: 0.7967 - val_loss: 0.4402
Epoch 2/15
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 307ms/step - AUC: 0.8853 - Precision: 0.8104 - Recall: 0.7928 - accuracy: 0.8065 - loss: 0.4292 - val_AUC: 0.8918 - val_Precision: 0.8332 - val_Recall: 0.7794 - val_accuracy: 0.8081 - val_loss: 0.4175
Epoch 3/15
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 321ms/step - AUC: 0.9015 - Precision: 0.8282 - Recall: 0.8165 - accuracy: 0.8264 - loss: 0.3999 - val_AUC: 0.9049 - val_Precision: 0.8365 - val_Recall: 0.8193 - val_accuracy: 0.8264 - val_loss: 0.3906
Epoch 4/15
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m




📌 LSTM
Accuracy:  0.8359
Precision: 0.8219
Recall:    0.8573
F1-score:  0.8392
ROC AUC:   0.9113

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.81      0.83      2399
           1       0.82      0.86      0.84      2396

    accuracy                           0.84      4795
   macro avg       0.84      0.84      0.84      4795
weighted avg       0.84      0.84      0.84      4795

✅ Model saved to: results/bilstm_model/bilstm_model.h5


In [10]:
# Same pipeline on BAN-PL_full.csv (50-token sequences).
# NOTE: artifacts go to the default results/bilstm_model directory, so this run
# overwrites the model and plots saved by any earlier run that used the default.
run_pipeline_bilstm_from_single_file("BAN-PL_full.csv", max_len=50)

Train shape: (19138, 50, 300), Test shape: (4785, 50, 300)
Class distribution: [11955 11968]
Epoch 1/15
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 330ms/step - AUC: 0.8018 - Precision: 0.7322 - Recall: 0.7217 - accuracy: 0.7302 - loss: 0.5515 - val_AUC: 0.8790 - val_Precision: 0.8531 - val_Recall: 0.7647 - val_accuracy: 0.8130 - val_loss: 0.4681
Epoch 2/15
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 309ms/step - AUC: 0.8882 - Precision: 0.8183 - Recall: 0.8103 - accuracy: 0.8161 - loss: 0.4270 - val_AUC: 0.8982 - val_Precision: 0.8719 - val_Recall: 0.7504 - val_accuracy: 0.8166 - val_loss: 0.4126
Epoch 3/15
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 327ms/step - AUC: 0.9009 - Precision: 0.8378 - Recall: 0.8003 - accuracy: 0.8236 - loss: 0.4012 - val_AUC: 0.9060 - val_Precision: 0.8082 - val_Recall: 0.8637 - val_accuracy: 0.8260 - val_loss: 0.3963
Epoch 4/15
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m




📌 LSTM
Accuracy:  0.8387
Precision: 0.8463
Recall:    0.8279
F1-score:  0.8370
ROC AUC:   0.9151

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.85      0.84      2391
           1       0.85      0.83      0.84      2394

    accuracy                           0.84      4785
   macro avg       0.84      0.84      0.84      4785
weighted avg       0.84      0.84      0.84      4785

✅ Model saved to: results/bilstm_model/bilstm_model.h5
