In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Load the preprocessed data
def load_cleaned_data(file_to_load):
    """Read a tab-separated file into a pandas DataFrame.

    Parameters
    ----------
    file_to_load : str, path-like or file-like
        Location of the tab-separated file; passed straight to
        ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, with the first row used as the header
        (``pd.read_csv`` default).
    """
    frame = pd.read_csv(file_to_load, sep='\t')
    return frame

In [None]:
# Split the data into train and test portions
def split(X, y, test_size=0.2, random_state=0):
    """Split features and labels into train and test portions.

    Parameters
    ----------
    X : array-like
        Feature values (one entry per sample).
    y : array-like
        Labels aligned with ``X``.
    test_size : float, default 0.2
        Fraction of the samples held out for testing (80% train / 20% test
        by default).
    random_state : int, default 0
        Seed forwarded to ``train_test_split`` so the shuffle is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [None]:
def tfidf(X_train, X_test):
    """Vectorize the train and test texts with TF-IDF.

    The vectorizer is fitted on the training texts only and then applied to
    the test texts, so the vocabulary and document frequencies come from the
    training portion alone.

    Parameters
    ----------
    X_train : iterable of str
        Training documents.
    X_test : iterable of str
        Test documents.

    Returns
    -------
    tuple
        ``(X_train_tfidf, X_test_tfidf)`` -- TF-IDF feature matrices that
        share the vocabulary learned from ``X_train``.
    """
    vectorizer = TfidfVectorizer()
    # fit_transform learns the vocabulary/IDF weights; transform only reuses them.
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
    return X_train_tfidf, X_test_tfidf

In [None]:
# Uses MultinomialNB for the baseline model
def naive_bayes(X_train_tfidf, X_test_tfidf, y_train):
    """Train a MultinomialNB baseline and predict labels for the test features.

    Parameters
    ----------
    X_train_tfidf : feature matrix used to fit the classifier.
    X_test_tfidf : feature matrix to predict on.
    y_train : labels aligned with ``X_train_tfidf``.

    Returns
    -------
    tuple
        ``(model, y_pred)`` -- the fitted classifier and its predictions for
        ``X_test_tfidf``.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    classifier = MultinomialNB().fit(X_train_tfidf, y_train)
    predictions = classifier.predict(X_test_tfidf)
    return classifier, predictions

In [None]:
# get the metrics for accuracy, precision, recall, f1 and roc curve
def metrics(model, X_test, y_test, y_pred):
    """Print accuracy, precision, recall and F1, then plot the ROC curve.

    Parameters
    ----------
    model : fitted classifier
        Must expose ``predict_proba``; used only for the ROC curve.
    X_test : feature matrix
        Test features passed to ``model.predict_proba``.
    y_test : array-like
        True labels for the test samples.
    y_pred : array-like
        Predicted labels for the test samples.

    Returns
    -------
    None
        Scores are printed and the ROC figure is shown.
    """
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")

    precision = precision_score(y_test, y_pred, average='binary')
    print(f"Precision: {precision:.2f}")

    recall = recall_score(y_test, y_pred, average='binary')
    print(f"Recall: {recall:.2f}")

    # 'binary' is already the default; spelled out to match precision/recall.
    f1 = f1_score(y_test, y_pred, average='binary')
    print(f"F1: {f1:.2f}")

    # Column 1 is the probability of the second class in model.classes_;
    # assumes that class is the positive label -- TODO confirm label encoding.
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    # roc_curve returns (fpr, tpr, thresholds); the thresholds are not needed.
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    _, ax = plt.subplots()
    ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    # Diagonal reference line: a classifier that guesses at random.
    ax.plot([0, 1], [0, 1], color='black', linestyle='--', linewidth=2, label='Random Chance')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.0])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curve for Hyperpartisan Classification')
    ax.legend()
    plt.show()


In [None]:
# Read the cleaned TSV and pick out the text (features) and label (target) columns
data = load_cleaned_data("../data/byarticle_clean.tsv")
X, y = data['full_text'], data['label']

# Hold out a test portion
X_train, X_test, y_train, y_test = split(X, y)

# Turn the raw text into TF-IDF features
X_train_tfidf, X_test_tfidf = tfidf(X_train, X_test)

# Fit the baseline classifier and predict on the test portion
model_tfidf, y_pred_tfidf = naive_bayes(X_train_tfidf, X_test_tfidf, y_train)

# Report the evaluation scores and the ROC curve
print("\nMetrics for Naive Bayes model:")
metrics(model_tfidf, X_test_tfidf, y_test, y_pred_tfidf)