In [2]:
import os
import string
import time

import numpy as np
import pandas as pd
from joblib import dump
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:

# Read the stop-word list: one entry per line, with surrounding whitespace removed.
with open('tagalog_stop_words.txt', mode='r', encoding='utf-8') as stop_words_file:
    stop_words = list(map(str.strip, stop_words_file))

# Read the full dataset from CSV.
df = pd.read_csv('Dataset/full.csv', encoding='utf-8')

# Inputs come from the 'article' column and targets from the 'label' column.
# 20% of the rows are held out for testing; the fixed seed makes the split repeatable.
articles = df['article']
labels = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    articles,
    labels,
    test_size=0.2,
    random_state=42,
)


# Candidate estimators to compare. Every estimator constructed with a seed
# receives the same one so repeated runs are reproducible; MultinomialNB and
# KNeighborsClassifier are constructed without a seed argument.
random_state = 42

seed_kwargs = {'random_state': random_state}

classifiers = [
    MultinomialNB(),
    LogisticRegression(**seed_kwargs),
    RandomForestClassifier(**seed_kwargs),
    DecisionTreeClassifier(**seed_kwargs),
    GradientBoostingClassifier(**seed_kwargs),
    AdaBoostClassifier(**seed_kwargs),
    SVC(**seed_kwargs),
    KNeighborsClassifier(),
    SGDClassifier(**seed_kwargs),
    MLPClassifier(**seed_kwargs),
]
# Baseline run: each classifier is fitted inside a TF-IDF pipeline with a single
# fixed vectorizer setting (max_df=0.9) and the classifier's own constructor
# arguments. GridSearchCV is used only for its 5-fold cross-validation and refit;
# with one candidate there is nothing to tune.
test_results_list = []
train_accuracy_list = []  # Mean cross-validation accuracy and elapsed time per classifier
models = []
# NOTE(review): machine-specific absolute path; consider a configurable directory.
joblib_path = r'C:\Users\LEGION\UMS Group - GBIC Dropbox\Jemar Laag\ACADS\AI 201\Fake News Detection\No Hyperparameterer tuning/'
train_results_list = []

# Create the output directory up front so dump() cannot fail after an expensive fit.
os.makedirs(joblib_path, exist_ok=True)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _binary_metrics(classifier_label, y_true, y_pred):
    """Summarise binary predictions as a single result row.

    Parameters
    ----------
    classifier_label : str
        Value stored under the 'Classifier' key.
    y_true, y_pred : array-like
        True and predicted labels. The confusion matrix must be 2x2, because
        it is unpacked into exactly four counts (tn, fp, fn, tp).

    Returns
    -------
    dict
        Accuracy, recall, precision, F1, specificity and the four confusion
        counts. A ratio whose denominator is zero is reported as 0.0 instead
        of NaN.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    recall = _safe_ratio(tp, tp + fn)
    precision = _safe_ratio(tp, tp + fp)
    specificity = _safe_ratio(tn, tn + fp)
    f1 = _safe_ratio(2 * (precision * recall), precision + recall)
    return {
        'Classifier': classifier_label,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Recall': recall,
        'Precision': precision,
        'F1 Score': f1,
        'Specificity': specificity,
        'TP': tp,
        'TN': tn,
        'FP': fp,
        'FN': fn
    }


for classifier in classifiers:
    print(classifier)
    clf_name = type(classifier).__name__

    # Single-candidate grid: only the TF-IDF max_df value is pinned.
    param_grids = {
        'tfidf__max_df': [0.9],
    }

    # Vectorizer + classifier pipeline
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=stop_words,lowercase=True, token_pattern=r'\b\w+\b')),
        ('classifier', classifier)
    ])

    # Time the whole search (cross-validation fits plus the final refit) with a
    # monotonic clock.
    start_time = time.perf_counter()
    grid_search = GridSearchCV(pipeline, param_grids, scoring='accuracy', cv=5, n_jobs=-1, verbose=3)
    grid_search.fit(X_train, y_train)
    total_training_time = time.perf_counter() - start_time

    # Persist the refitted pipeline
    best_model = grid_search.best_estimator_
    model_filename = f"{joblib_path}{clf_name}_best_model.joblib"
    dump(best_model, model_filename)

    # best_score_ is the mean cross-validated accuracy over the 5 folds, not the
    # accuracy on the full training set; the 'Train Accuracy' key is kept so the
    # exported CSV columns stay the same.
    train_accuracy_score = grid_search.best_score_
    train_accuracy_list.append({
        'Classifier': clf_name,
        'Train Accuracy': train_accuracy_score,
        'Total Training Time (s)': total_training_time
    })

    # Held-out test set evaluation
    predictions = grid_search.predict(X_test)
    test_metrics = _binary_metrics(clf_name, y_test, predictions)
    test_results_list.append(test_metrics)
    accuracy = test_metrics['Accuracy']

    # Same metrics on the full training set (resubstitution)
    train_predictions = grid_search.predict(X_train)
    train_metrics = _binary_metrics(clf_name, y_train, train_predictions)
    train_results_list.append(train_metrics)
    train_accuracy = train_metrics['Accuracy']

    print(f"Test Accuracy for {clf_name}: {accuracy}")
    print(f"Training Accuracy Score for {clf_name}: {train_accuracy_score}")
    print(f"Training Accuracy all training data for {clf_name}: {train_accuracy}")
    print("\n")

# Convert the lists of dictionaries to DataFrames
test_results_df = pd.DataFrame(test_results_list)
train_accuracy_df = pd.DataFrame(train_accuracy_list)
train_accuracy_df_all  = pd.DataFrame(train_results_list)

MultinomialNB()
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Test Accuracy for MultinomialNB: 0.8598130841121495
Training Accuracy Score for MultinomialNB: 0.8685664900097466
Training Accuracy all training data for MultinomialNB: 0.9602184087363494


LogisticRegression(random_state=42)
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Test Accuracy for LogisticRegression: 0.9485981308411215
Training Accuracy Score for LogisticRegression: 0.9356420565302145
Training Accuracy all training data for LogisticRegression: 0.9781591263650546


RandomForestClassifier(random_state=42)
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Test Accuracy for RandomForestClassifier: 0.940809968847352
Training Accuracy Score for RandomForestClassifier: 0.9411024305555555
Training Accuracy all training data for RandomForestClassifier: 1.0


DecisionTreeClassifier(random_state=42)
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Test Accuracy for DecisionTreeClassi

In [4]:
# Write each baseline results table to its own CSV file in the working directory.
for results_frame, csv_name in (
    (test_results_df, 'TEST_ACCURACY_DEFINED_HYPERPARAMETER.csv'),
    (train_accuracy_df, 'TRAIN_ACCURACY_DEFINED_HYPERPARAMETER.csv'),
    (train_accuracy_df_all, 'TRAIN_ACCURACY_OVERALL_TRAIN_DATA_DEFINED_HYPERPARAMETER.csv'),
):
    results_frame.to_csv(csv_name)

In [5]:

# Tuned run: each classifier is grid-searched (5-fold CV, accuracy) over the
# TF-IDF n-gram range and max_df plus a small classifier-specific grid. The
# refitted best pipeline is saved and then scored on the test and training sets.
test_results_list_tuned = []
train_accuracy_list_tuned= []  # Mean cross-validation accuracy and elapsed time per classifier
models = []
# NOTE(review): machine-specific absolute path; consider a configurable directory.
joblib_path = r'C:\Users\LEGION\UMS Group - GBIC Dropbox\Jemar Laag\ACADS\AI 201\Fake News Detection\With Hyperparameter tuning/'
train_results_list_tuned = []

# Create the output directory up front so dump() cannot fail after an expensive fit.
os.makedirs(joblib_path, exist_ok=True)

# Classifier-specific search spaces, keyed by estimator class. Built once,
# outside the loop; a classifier with no entry is searched over the TF-IDF
# parameters only.
classifier_param_grids = {
    MultinomialNB: {
        'classifier__alpha': [0.1, 0.5, 1.0],
    },
    LogisticRegression: {
        'classifier__C': [0.1, 1.0, 10.0],
    },
    RandomForestClassifier: {
        'classifier__n_estimators': [50, 100, 200],
    },
    # hidden_layer_sizes and activation are not searched (constructor defaults apply).
    MLPClassifier: {
        'classifier__alpha': [0.0001, 0.001],
        'classifier__learning_rate_init': [0.001, 0.1],
    },
    AdaBoostClassifier: {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__learning_rate': [0.01, 0.1, 1.0],
    },
    SVC: {
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__kernel': ['linear', 'rbf'],
    },
    KNeighborsClassifier: {
        'classifier__n_neighbors': [3, 5, 7],
        'classifier__weights': ['uniform', 'distance'],
    },
    SGDClassifier: {
        'classifier__alpha': [0.0001, 0.001],
        'classifier__penalty': ['l2', 'l1', 'elasticnet'],
    },
    DecisionTreeClassifier: {
        'classifier__max_depth': [None, 10,20],
        'classifier__min_samples_split': [2, 5],
        'classifier__min_samples_leaf': [1, 2],
    },
    # Deliberately small grid for gradient boosting
    GradientBoostingClassifier: {
        'classifier__n_estimators': [50, 100],
        'classifier__learning_rate': [0.01, 0.1],
        'classifier__max_depth': [3, 5],
    },
}


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _binary_metrics(classifier_label, y_true, y_pred):
    """Summarise binary predictions as a single result row.

    Parameters
    ----------
    classifier_label : str
        Value stored under the 'Classifier' key.
    y_true, y_pred : array-like
        True and predicted labels. The confusion matrix must be 2x2, because
        it is unpacked into exactly four counts (tn, fp, fn, tp).

    Returns
    -------
    dict
        Accuracy, recall, precision, F1, specificity and the four confusion
        counts. A ratio whose denominator is zero is reported as 0.0 instead
        of NaN.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    recall = _safe_ratio(tp, tp + fn)
    precision = _safe_ratio(tp, tp + fp)
    specificity = _safe_ratio(tn, tn + fp)
    f1 = _safe_ratio(2 * (precision * recall), precision + recall)
    return {
        'Classifier': classifier_label,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Recall': recall,
        'Precision': precision,
        'F1 Score': f1,
        'Specificity': specificity,
        'TP': tp,
        'TN': tn,
        'FP': fp,
        'FN': fn
    }


for classifier in classifiers:
    print(classifier)
    clf_name = type(classifier).__name__

    # TF-IDF search space shared by every classifier
    param_grids = {
        'tfidf__ngram_range': [(1, 1), (1, 2)],
        'tfidf__max_df': [ 0.7, 0.8, 0.9],
    }

    # Add the first matching classifier-specific grid, if any (isinstance
    # matching, in table order).
    for estimator_type, estimator_grid in classifier_param_grids.items():
        if isinstance(classifier, estimator_type):
            param_grids.update(estimator_grid)
            break

    # Vectorizer + classifier pipeline
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=stop_words,lowercase=True, token_pattern=r'\b\w+\b')),
        ('classifier', classifier)
    ])

    # Time the whole search (all cross-validation fits plus the final refit)
    # with a monotonic clock.
    start_time = time.perf_counter()
    grid_search = GridSearchCV(pipeline, param_grids, scoring='accuracy', cv=5, n_jobs=-1, verbose=3)
    grid_search.fit(X_train, y_train)
    total_training_time = time.perf_counter() - start_time

    # Persist the refitted best pipeline and remember the winning parameters
    best_model = grid_search.best_estimator_
    model_filename = f"{joblib_path}{clf_name}_best_model.joblib"
    dump(best_model, model_filename)
    best_params = grid_search.best_params_

    # best_score_ is the mean cross-validated accuracy of the best candidate,
    # not the accuracy on the full training set; the 'Train Accuracy' key is
    # kept so the exported CSV columns stay the same.
    train_accuracy_score = grid_search.best_score_
    train_accuracy_list_tuned.append({
        'Classifier': clf_name,
        'Train Accuracy': train_accuracy_score,
        'Total Training Time (s)': total_training_time
    })

    # Held-out test set evaluation (the row also records the chosen hyperparameters)
    predictions = grid_search.predict(X_test)
    test_metrics = _binary_metrics(clf_name, y_test, predictions)
    test_results_list_tuned.append({**test_metrics, 'Best Hyperparameters': best_params})
    accuracy = test_metrics['Accuracy']

    # Same metrics on the full training set (resubstitution)
    train_predictions = grid_search.predict(X_train)
    train_metrics = _binary_metrics(clf_name, y_train, train_predictions)
    train_results_list_tuned.append(train_metrics)
    train_accuracy = train_metrics['Accuracy']

    print(f"Test Accuracy for {clf_name}: {accuracy}")
    print(f"Training Accuracy Score for {clf_name}: {train_accuracy_score}")
    print(f"Training Accuracy all training data for {clf_name}: {train_accuracy}")
    print(f"Best Parameters for {clf_name}: {best_params}")
    print("\n")

# Convert the lists of dictionaries to DataFrames
test_results_df_tuned = pd.DataFrame(test_results_list_tuned)
train_accuracy_df_tuned = pd.DataFrame(train_accuracy_list_tuned)
train_accuracy_df_all_tuned  = pd.DataFrame(train_results_list_tuned)

MultinomialNB()
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Test Accuracy for MultinomialNB: 0.9174454828660437
Training Accuracy Score for MultinomialNB: 0.9181004812378166
Training Accuracy all training data for MultinomialNB: 0.9910296411856474
Best Parameters for MultinomialNB: {'classifier__alpha': 0.1, 'tfidf__max_df': 0.7, 'tfidf__ngram_range': (1, 1)}


LogisticRegression(random_state=42)
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Test Accuracy for LogisticRegression: 0.9595015576323987
Training Accuracy Score for LogisticRegression: 0.952416087962963
Training Accuracy all training data for LogisticRegression: 1.0
Best Parameters for LogisticRegression: {'classifier__C': 10.0, 'tfidf__max_df': 0.7, 'tfidf__ngram_range': (1, 1)}


RandomForestClassifier(random_state=42)
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Test Accuracy for RandomForestClassifier: 0.9485981308411215
Training Accuracy Score for RandomForestClassifier: 0.9

In [7]:
# Write each tuned-run results table to its own CSV file in the working directory.
for tuned_frame, tuned_csv_name in (
    (test_results_df_tuned, 'TEST_ACCURACY_tuned_DEFINED_HYPERPARAMETER.csv'),
    (train_accuracy_df_tuned, 'TRAIN_ACCURACY_tuned_DEFINED_HYPERPARAMETER.csv'),
    (train_accuracy_df_all_tuned, 'TRAIN_ACCURACY_OVERALL_tuned_TRAIN_DATA_DEFINED_HYPERPARAMETER.csv'),
):
    tuned_frame.to_csv(tuned_csv_name)