In [2]:
# loading the libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_curve, average_precision_score, auc, accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [4]:
datasets = ['train', 'val', 'test']
loaded_data = {}
for dataset in datasets:
    loaded_data[f'{dataset}'] = pd.read_csv(f'E:/Coding/Applied Machine Learning/02-experimental-tracking/data/prepared/{dataset}.csv')

# Accessing the loaded datasets:
train_X = loaded_data['train'].text
train_y = loaded_data['train'].spam
val_X = loaded_data['val'].text
val_y = loaded_data['val'].spam
test_X = loaded_data['test'].text
test_y = loaded_data['test'].spam

In [13]:
def train_models(train_X, train_y):
    # XGBoost model
    lgb_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=10000)),
        ('clf', LGBMClassifier(random_state=1, force_row_wise=True))
    ])
    lgb_pipeline.fit(train_X, train_y)

    # Logistic Regression model
    lr_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=10000)),
        ('clf', LogisticRegression(random_state=1))
    ])
    lr_pipeline.fit(train_X, train_y)

    # Multinomial Naive Bayes model
    nb_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=10000)),
        ('clf', MultinomialNB())
    ])
    nb_pipeline.fit(train_X, train_y)

    return lgb_pipeline, lr_pipeline, nb_pipeline

In [15]:
lgb_pipeline, lr_pipeline, nb_pipeline = train_models(train_X, train_y)

[LightGBM] [Info] Number of positive: 804, number of negative: 2632
[LightGBM] [Info] Total Bins 76314
[LightGBM] [Info] Number of data points in the train set: 3436, number of used features: 2519
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.233993 -> initscore=-1.185900
[LightGBM] [Info] Start training from score -1.185900


In [16]:
# function for evaluating the models
def AUCPR_models(pipelines, test_X, test_y):
    results = {}
    for name, pipeline in pipelines.items():
        predictions = pipeline.predict(test_X)
        precision, recall, _ = precision_recall_curve(test_y, predictions)
        auc_precision_recall = auc(recall, precision)
        print(f'{name} AUCPR: {auc_precision_recall:.4f}')
    return results

In [17]:
# Storing the pipelines in a dictionary for easy reference
pipelines = {
    'Light GBM': lgb_pipeline,
    'Logistic Regression': lr_pipeline,
    'Multinomial Naive Bayes': nb_pipeline
}

# Evaluate models on test data
evaluation_results = AUCPR_models(pipelines, test_X, test_y)

Light GBM AUCPR: 0.9717, aucpr: 0.9443996420691166
Logistic Regression AUCPR: 0.9695, aucpr: 0.9394915513607727
Multinomial Naive Bayes AUCPR: 0.9398, aucpr: 0.8796925476216999


In [18]:
# Define the base models
models_gcv = {
    'lightgbm': Pipeline([
        ('tfidf', TfidfVectorizer(max_features=10000)),
        ('clf', LGBMClassifier(random_state=1, force_row_wise=True))
    ]),

    'logistic_regression': Pipeline([
        ('tfidf', TfidfVectorizer(max_features=10000)),
        ('clf', LogisticRegression(random_state=1))
    ]),

    'naive_bayes': Pipeline([
        ('tfidf', TfidfVectorizer(max_features=10000)),
        ('clf', MultinomialNB())
    ])
}

# Define the parameter grids for each model
param_grids = {
    'lightgbm': {
        'tfidf__max_df': [0.5, 0.75],
        'tfidf__ngram_range': [(1, 1), (1, 2)],
        'clf__learning_rate': [0.01, 0.1, 0.5],
        'clf__num_leaves': [15,31,63],
        'clf__max_depth': [6, 8]
    },

    'logistic_regression': {
        'tfidf__max_df': [0.5, 0.75],
        'tfidf__ngram_range': [(1, 1), (1, 2)],
        'clf__C': [0.1, 1, 10]
    },
    
    'naive_bayes': {
        'tfidf__max_df': [0.5, 0.75],
        'tfidf__ngram_range': [(1, 1), (1, 2)],
        'clf__alpha': [0.01, 0.1, 1]
    }
}

In [19]:
from sklearn.model_selection import GridSearchCV

In [20]:
def fit_models_with_grid_search(models, param_grids, X_train, y_train):
    fitted_models = {}
    
    for name, model in models.items():
        print(f"Fitting {name}...")
        grid_search = GridSearchCV(model, param_grids[name], cv=5, n_jobs=-1)
        grid_search.fit(X_train, y_train)
        fitted_models[name] = grid_search
        
    return fitted_models

In [21]:
# Call the function to fit the models
fitted_models = fit_models_with_grid_search(models_gcv, param_grids, train_X, train_y)

Fitting lightgbm...
[LightGBM] [Info] Number of positive: 804, number of negative: 2632
[LightGBM] [Info] Total Bins 91922
[LightGBM] [Info] Number of data points in the train set: 3436, number of used features: 3415
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.233993 -> initscore=-1.185900
[LightGBM] [Info] Start training from score -1.185900
Fitting logistic_regression...
Fitting naive_bayes...


In [22]:
# Now you can access the best models with best hyperparameters by accessing fitted_models dictionary
best_lightgbm_model = fitted_models['lightgbm'].best_estimator_
best_logistic_regression_model = fitted_models['logistic_regression'].best_estimator_
best_naive_bayes_model = fitted_models['naive_bayes'].best_estimator_

In [23]:
def AUCPR_models(fitted_models, test_X, test_y):
    results = {}
    for name, grid_search_cv in fitted_models.items():
        best_estimator = grid_search_cv.best_estimator_
        predictions = best_estimator.predict(test_X)
        precision, recall, _ = precision_recall_curve(test_y, predictions)
        auc_precision_recall = auc(recall, precision)
        results[name] = auc_precision_recall
        print(f'{name} AUCPR: {auc_precision_recall:.4f}')
    return results

In [24]:
# Call the function to evaluate the best models
AUCPR_results = AUCPR_models(fitted_models, test_X, test_y)

lightgbm AUCPR: 0.9707
logistic_regression AUCPR: 0.9906
naive_bayes AUCPR: 0.9794
