In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE



In [3]:
df  = pd.read_csv("data/preprocessing_df.csv")
df = df.dropna(subset=['text'])

X = df.text
y = df.category

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=114, stratify=y)

tfidf = TfidfVectorizer(ngram_range=(1,3),max_features=8000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

smote = SMOTE(random_state=42)
X_train_tfidf, y_train = smote.fit_resample(X_train_tfidf, y_train)


lightgbm_model = LGBMClassifier(
    objective='multiclass',
    num_class=3,
    metric="multi_logloss",
    is_unbalance=True,
    class_weight="balanced",
    reg_alpha=0.1,  # L1 regularization
    reg_lambda=0.1,  # L2 regularization,
    learning_rate=0.08081298097796712,
    n_estimators=367,
    max_depth=20
)

logreg_model = LogisticRegression(max_iter=1000, class_weight='balanced', solver='lbfgs', multi_class='multinomial')

# Meta-learner
knn_meta_learner = KNeighborsClassifier(n_neighbors=5)

# Create the StackingClassifier with LightGBM and LogisticRegression as base models, and KNN as meta-learner
stacking_model = StackingClassifier(
    estimators=[
        ('lightgbm', lightgbm_model),
        ('logistic_regression', logreg_model)
    ],
    final_estimator=knn_meta_learner,
    cv=5
)

# Train the stacking model
stacking_model.fit(X_train_tfidf, y_train)

# Make predictions on the test data
y_pred = stacking_model.predict(X_test_tfidf)

# Generate classification report
print(classification_report(y_test, y_pred))



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.045350 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 591020
[LightGBM] [Info] Number of data points in the train set: 211167, number of used features: 7931
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.534162 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 525932
[LightGBM] [Info] Number of data points in the train set: 168933, number of used features: 7918
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.680950 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 526236
[LightGBM] [Info] Number of data points in the train set: 168933, number of used features: 7917
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.665712 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 525714
[LightGBM] [Info] Number of data points in the train set: 168934, number of used features: 7920
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.867843 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 524534
[LightGBM] [Info] Number of data points in the train set: 168934, number of used features: 7921
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.747391 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 523193
[LightGBM] [Info] Number of data points in the train set: 168934, number of used features: 7921
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




              precision    recall  f1-score   support

          -1       0.81      0.83      0.82      8751
           0       0.88      0.94      0.91     13543
           1       0.93      0.87      0.90     17597

    accuracy                           0.89     39891
   macro avg       0.87      0.88      0.88     39891
weighted avg       0.89      0.89      0.89     39891

