In [1]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Resolve the project root as the parent of the current working directory.
# Paths are assembled with os.path so the separators are right on both
# Windows and Linux.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Make the project root importable, adding it only if it is not already listed.
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# Input dataset: the raw CSV.
data_path = os.path.join(
    os.path.join(project_root, 'data', 'raw'),
    'fake_and_real_news.csv',
)
# Alternative input (pre-processed data), currently disabled:
# data_path = os.path.join(project_root, 'data', 'processed', 'processed_data.csv')

# Destination path for the tuned model.
# NOTE(review): the extension is '.pk1' (digit one), which looks like a typo for
# '.pkl' — confirm against whatever loads this file before renaming.
model_destination = os.path.join(project_root, 'models', 'gbc_tuned.pk1')

In [2]:
# Load the dataset at `data_path` into a DataFrame.
data = pd.read_csv(filepath_or_buffer=data_path)

In [3]:
# Integer encoding for the string labels: 'Fake' -> 1, 'Real' -> 0.
# Any label value missing from this mapping becomes NaN in `label_binary`.
mapping = dict(Fake=1, Real=0)

# Add the encoded label column, then keep only the first row for each
# distinct "Text" value (the original index labels are preserved).
data = (
    data
    .assign(label_binary=lambda frame: frame['label'].map(mapping))
    .drop_duplicates(subset="Text")
)

In [4]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
from sklearn.model_selection import train_test_split

# Features are the "Text" column; the target is the binary label column.
X, y = data['Text'], data['label_binary']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2024,
)

In [5]:
# Build the modelling pipeline from the project's own `src.pipelines` module.
# The first cell appends the project root to sys.path, presumably so that this
# `src` import resolves when the notebook runs from a sub-directory.
# NOTE(review): going by the "gbc" name this is presumably a gradient-boosting
# classifier pipeline — confirm in src/pipelines. The grid search further down
# tunes parameters on a step named `classifier`.
from src.pipelines import create_gbc_pipe

gbc_pipe = create_gbc_pipe()

In [7]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the pipeline's `classifier` step:
# 3 x 3 x 3 = 27 candidate combinations.
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.01, 0.05, 0.1],
    'classifier__max_depth': [3, 4, 5],
}

# Exhaustive search scored by accuracy with 5-fold cross-validation on the
# training split. `fit` returns the fitted search object itself, so it can be
# chained onto the constructor.
grid_search = GridSearchCV(
    estimator=gbc_pipe,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
).fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy.
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

Best Parameters: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 4, 'classifier__n_estimators': 200}
Best Cross-Validation Accuracy: 0.9850832729905864
