# Baseline ML model

In [44]:
import mlflow
import sys
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.metrics import classification_report, auc, accuracy_score, f1_score, recall_score, precision_score, roc_curve


sys.path.append("../")

# repository code
from libs import configs

In [4]:
# Read the preprocessed, tab-separated dataset whose path is configured in libs/configs.
data = pd.read_csv(
    configs.PREPROCESSED_DATA,
    sep="\t",
)

In [7]:
# Binary target column: 1 when the row is labelled "F", 0 for every other label.
data["fake"] = data["label"].eq("F").astype(int)

In [8]:
# Show the frame, now including the derived `fake` column, as this cell's output.
data

Unnamed: 0,index,label,title,filename,title_length,news,fake
0,n0,F,Hacer gargaras con agua y sal elimina el coron...,../data/data_fake_news/fake/F - Hacer gargaras...,52,Beber mucha agua y hacer gárgaras con agua cal...,1
1,n1,F,Helicópteros no rociarán desinfectante contra ...,../data/data_fake_news/fake/F - Helicópteros n...,68,Helicópteros no rociarán desinfectante contra ...,1
2,n2,F,Nostradamus predijo el COVID19 y lo describió ...,../data/data_fake_news/fake/F - Nostradamus pr...,59,Nostradamus predijo el COVID19 y lo describió ...,1
3,n3,F,Nostradamus predijo el COVID19 y lo describió ...,../data/data_fake_news/fake/F - Nostradamus pr...,60,Nostradamus predijo el COVID19 y lo describió ...,1
4,n4,F,Sostener la respiración por 10 segundos no ayu...,../data/data_fake_news/fake/F - Sostener la re...,85,Sostener la respiración por 10 segundos no ayu...,1
...,...,...,...,...,...,...,...
528,n528,V,"_Unidos para seguir cuidándote_, la nueva estr...",../data/data_fake_news/true/V-_Unidos para seg...,68,"'Unidos para seguir cuidándote', la nueva estr...",0
529,n529,V,_Vacuna contra covid en EE. UU. no estará disp...,../data/data_fake_news/true/V-_Vacuna contra c...,68,'Vacuna contra covid en EE. UU. no estará disp...,0
530,n530,V,"‘Datos del covid tienen rezagos, no falseamien...",../data/data_fake_news/true/V-‘Datos del covid...,69,"‘Datos del covid tienen rezagos, no falseamien...",0
531,n531,V,‘Este virus muestra cuán vulnerables somos’_Pe...,../data/data_fake_news/true/V-‘Este virus mues...,60,‘Este virus muestra cuán vulnerables somos’: P...,0


In [41]:
from dotenv import load_dotenv

# Load the .env file before any MLflow call is made
# (presumably the tracking credentials live there — TODO confirm).
load_dotenv()  # take environment variables from .env.
# Send runs to the tracking server configured in libs/configs.
mlflow.set_tracking_uri(configs.MLFLOW_TRACKING_URI)
# Turn on MLflow's scikit-learn autologging for the estimators fitted later on.
mlflow.sklearn.autolog()




In [46]:
def train_baseline(random_state=1121218):
    """
    Train and evaluate a TF-IDF + Gaussian Naive Bayes baseline on the news text.

    The notebook-level ``data`` frame is split into train and test parts,
    ``data['news']`` is converted to TF-IDF features (vocabulary and IDF
    weights are fitted on the training part only), a ``GaussianNB`` classifier
    is fitted inside a single MLflow run, and the held-out metrics are logged
    to that run.

    Parameters
    ----------
    random_state : int, default 1121218
        Seed forwarded to ``train_test_split`` so the train/test split is
        reproducible. (It was previously accepted but ignored in favour of a
        hard-coded ``0``.)

    Returns
    -------
    dict
        Test-set metrics as plain floats, keyed by ``test_accuracy``,
        ``test_f1``, ``test_recall``, ``test_precision`` and ``test_auc``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data["news"], data["fake"], random_state=random_state
    )

    clf = GaussianNB()

    # The context manager ends the run on exit (also when an exception is
    # raised), so no explicit mlflow.end_run() is needed afterwards.
    with mlflow.start_run():

        # model training
        count_vect = CountVectorizer()
        X_train_counts = count_vect.fit_transform(X_train)
        tfidf_transformer = TfidfTransformer()
        X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
        # GaussianNB needs dense input, hence the .toarray() conversions.
        clf.fit(X_train_tfidf.toarray(), y_train)

        # Reuse the transformers fitted on the training split; never refit on test data.
        test_data_transformed = tfidf_transformer.transform(
            count_vect.transform(X_test)
        ).toarray()

        test_predictions = clf.predict(test_data_transformed)
        # Second column of predict_proba is the probability of class 1 (fake).
        test_probabilities = clf.predict_proba(test_data_transformed)[:, 1]

        fpr, tpr, _ = roc_curve(y_test, test_probabilities)

        metrics = {
            "test_accuracy": float(accuracy_score(y_test, test_predictions)),
            "test_f1": float(f1_score(y_test, test_predictions)),
            "test_recall": float(recall_score(y_test, test_predictions)),
            "test_precision": float(precision_score(y_test, test_predictions)),
            "test_auc": float(auc(fpr, tpr)),
        }
        # Record the held-out metrics on the active run under explicit names.
        mlflow.log_metrics(metrics)

    return metrics

In [47]:
# Run the baseline once with the default arguments.
train_baseline()