## imports

In [1]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV,RandomizedSearchCV,cross_val_score,KFold,ShuffleSplit,cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report,roc_auc_score,roc_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.dummy import DummyClassifier
from skopt import BayesSearchCV
from matplotlib import pyplot as plt
import numpy as np
from IPython.display import  Markdown
import joblib

In [2]:
import warnings
# Suppress every warning for the remainder of the kernel session.
# NOTE(review): this blanket filter may also hide deprecation or convergence
# warnings raised by the libraries used below — confirm that is intended.
warnings.filterwarnings('ignore')

## Utils

In [3]:
# Relative directory string; the dataset CSV paths listed later in this
# notebook point inside this same folder.
# NOTE(review): no visible cell reads this variable — confirm it is still needed.
path = "../data/interim/"

In [4]:

def cross_score_v2(data_X, data_Y, data_name, estimator, estimator_name, estimator_params, random_state=42):
    """Cross-validate a Bayesian hyper-parameter search for one estimator on one dataset.

    The estimator is wrapped in a ``BayesSearchCV`` that optimises accuracy over
    ``estimator_params``; that search object is then evaluated with
    ``cross_validate`` on 3 shuffled 80/20 splits, so the search is refitted on
    the training part of every split.

    Parameters
    ----------
    data_X, data_Y
        Features and labels forwarded unchanged to ``cross_validate``.
    data_name : str
        Dataset label; printed and used in the ``model_name`` column.
    estimator
        Unfitted estimator handed to ``BayesSearchCV``.
    estimator_name : str
        Estimator label; printed and used in the ``model_name`` column.
    estimator_params : dict
        Search space passed as ``search_spaces`` to ``BayesSearchCV``.
    random_state : int, default 42
        Seed used for both the split generator and the Bayesian search, so that
        repeated runs explore the same candidates on the same splits.

    Returns
    -------
    dict
        The ``cross_validate`` result (timings plus ``test_recall``,
        ``test_accuracy`` and ``test_f1``) with an extra ``"model_name"`` entry
        holding ``"<estimator_name>-<data_name>"`` once per split.
    """
    scores = ["recall", "accuracy", "f1"]
    print("data name: ", data_name)

    print("model name: ", estimator_name)

    # Seed the search as well as the splitter; previously only the splits were
    # reproducible while the explored hyper-parameters changed on every run.
    model_gs = BayesSearchCV(
        estimator,
        search_spaces=estimator_params,
        scoring='accuracy',
        random_state=random_state,
    )
    splitter = ShuffleSplit(n_splits=3, test_size=0.2, random_state=random_state)

    results = cross_validate(model_gs, data_X, data_Y, scoring=scores, cv=splitter)
    # One label per split so the dict can be turned into a tidy DataFrame later.
    results["model_name"] = [f"{estimator_name}-{data_name}"] * len(results["score_time"])

    return results

In [5]:
# TODO: revise this helper.
def build_score(y_pred, y_true, probs):
    """Print classification metrics and plot the ROC curve for a binary classifier.

    Parameters
    ----------
    y_pred
        Predicted class labels.
    y_true
        Ground-truth class labels.
    probs
        Two-dimensional array of class scores; column 1 is used as the score
        of the positive class for the ROC AUC and ROC curve.

    Notes
    -----
    The classification report names the two classes ``FAKE`` and ``TRUE`` in
    that order, which presumably matches a 0 = fake / 1 = true encoding of the
    labels — verify against the caller.
    """
    print("Acuracia : {}%".format(round(accuracy_score(y_pred=y_pred,y_true=y_true)*100,3)))
    print("F1_score: {}%".format(round(f1_score(y_pred=y_pred,y_true=y_true)*100,3)))
    print("Precison: {}%".format(round(precision_score(y_pred=y_pred,y_true=y_true)*100,3)))
    print("Recall: {}%".format(round(recall_score(y_pred=y_pred,y_true=y_true)*100,3)))
    print("-"*20)
    print('Classification Report')
    # Use the y_true argument: the previous code read a global `y_test`, which
    # fails on a fresh kernel or silently reports against the wrong labels.
    print(classification_report(y_true,y_pred,target_names=["FAKE","TRUE"]))
    print("-"*20)
    print("Plot curva roc")
    positive_scores = probs[:, 1]
    lr_auc = roc_auc_score(y_true, positive_scores)
    print('ROC AUC=%.3f' % (lr_auc))
    # The thresholds returned by roc_curve are not needed for the plot.
    fpr, tpr, _ = roc_curve(y_true, positive_scores)
    plt.plot(fpr, tpr, marker='.')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

In [6]:
def highlight_max(s, props=''):
    """Build the per-cell style list that highlights the best value of one row.

    The first cell of the row is the metric name; every other cell is a string
    whose first whitespace-separated token is a number. The cell with the
    lowest number is highlighted when the metric name ends in ``time``,
    otherwise the cell with the highest number.

    Parameters
    ----------
    s
        Row whose ``.values`` hold the metric name followed by the score strings.
    props : str
        Style string to place on the winning cell.

    Returns
    -------
    list of str
        One entry per cell: ``props`` for the winning cell, ``''`` elsewhere.
    """
    cells = s.values
    means = [float(cell.split()[0]) for cell in cells[1:]]
    # Timing rows are "lower is better"; every other metric is "higher is better".
    pick = np.argmin if cells[0].endswith('time') else np.argmax
    styles = [''] * len(cells)
    # +1 skips the leading metric-name cell.
    styles[pick(means) + 1] = props
    return styles

def get_winner(s):
    """Return the label of the best model in one row of the score summary.

    The first cell of the row is the metric name; every other cell is a string
    whose first whitespace-separated token is a number, stored under the
    model's label in the row index. The lowest number wins when the metric
    name ends in ``time``, otherwise the highest.

    Parameters
    ----------
    s : pandas.Series
        One row of the summary table, e.g. as passed by
        ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    The index label of the winning cell.
    """
    metric = s.values[0]
    means = [float(cell.split()[0]) for cell in s.values[1:]]
    # Take the labels from the row itself. The previous code read them from a
    # global `results` frame, which does not exist when the summary is built
    # inside a function and made this helper fail with a NameError.
    models = s.index[1:]

    pick = np.argmin if metric.endswith('time') else np.argmax
    return models[pick(means)]


In [7]:
# Candidate models as (short name, unfitted estimator, hyper-parameter search space)
# triples; the search space is handed to the hyper-parameter search as-is.
estimators = [
    # Random forest
    (
        "RF",
        RandomForestClassifier(),
        {
            "n_estimators": [300, 800, 1200],
            "max_depth": [5, 8, 15, 25],
            "max_features": ["log2", "sqrt"],
        },
    ),
    # Logistic regression
    (
        "LR",
        LogisticRegression(solver="liblinear", max_iter=10000),
        {
            "penalty": ["l1", "l2"],
            "C": [0.001, 0.01, 0.1, 1],
        },
    ),
    # Single decision tree
    (
        "Tree",
        DecisionTreeClassifier(),
        {
            "criterion": ["gini", "entropy"],
            "max_depth": [6, 8, 10, 12],
            "max_features": [2, 4, 6, 8],
        },
    ),
    # AdaBoost
    (
        "ADA",
        AdaBoostClassifier(),
        {
            "n_estimators": [500, 800, 1200],
            "learning_rate": [0.001, 0.01, 0.1, 1],
        },
    ),
]

**TODO:** rodar e guardar o resultado de cada conjunto de dados separadamente (modificar amanhã).

In [None]:
# Compare the candidate models on each dataset, then refit and save the winner.
def _load_tfidf_features(data_path, max_features=200):
    """Read one dataset CSV and return its TF-IDF feature frame and 0/1 labels."""
    df = pd.read_csv(data_path)
    vetorizar = TfidfVectorizer(lowercase=False, max_features=max_features)
    bag_of_words_ = vetorizar.fit_transform(df["text"])
    X = pd.DataFrame(bag_of_words_.toarray(), columns=vetorizar.get_feature_names_out())
    y = df["label"].replace({"true": 1, "fake": 0}).to_numpy().ravel()
    return X, y


def compara_modelos(datasets: list, estimators: list, max_features: int = 200):
    """Cross-validate every estimator on every dataset, show the summary and save the winner.

    For each ``(name, csv_path)`` in ``datasets`` the ``text`` column is
    vectorised with TF-IDF and every ``(name, estimator, search_space)`` in
    ``estimators`` is scored through ``cross_score_v2``. The per-split scores
    are summarised as ``mean ± std`` per ``<estimator>-<dataset>`` combination,
    displayed with the best cell of each row highlighted, and the combination
    that wins the most non-timing metrics is refitted on the whole dataset with
    a fresh ``BayesSearchCV`` and dumped with ``joblib`` under ``../models/``.

    Parameters
    ----------
    datasets : list
        ``(dataset_name, csv_path)`` pairs. Each CSV must provide ``text`` and
        ``label`` columns; labels ``"true"``/``"fake"`` are mapped to 1/0.
    estimators : list
        ``(estimator_name, estimator, search_space)`` triples.
    max_features : int, default 200
        Vocabulary size limit passed to ``TfidfVectorizer``.

    Raises
    ------
    ValueError
        If ``datasets`` or ``estimators`` is empty, so there is nothing to compare.

    Notes
    -----
    - Relies on the notebook-level helpers ``cross_score_v2``, ``get_winner``
      and ``highlight_max``, and on ``display`` being provided by the notebook
      environment (it is not imported explicitly in the import cell).
    - The winner label is split on the first ``-``, so estimator names must not
      contain that character.
    - NOTE(review): the TF-IDF vectoriser is fitted on the full dataset before
      cross-validation and is not saved next to the model; confirm whether the
      saved artefact is meant to be used on raw text.
    """
    # Score every estimator on every dataset, keeping one frame per combination.
    frames = []
    for data_name, data_path in datasets:
        X, y = _load_tfidf_features(data_path, max_features)
        for estimator_name, estimator_obj, estimator_params in estimators:
            model_results = cross_score_v2(X, y, data_name, estimator_obj, estimator_name, estimator_params)
            frames.append(pd.DataFrame(model_results))

    if not frames:
        raise ValueError("compara_modelos needs at least one dataset and one estimator")

    # Summarise the comparison: one row per metric, one column per combination.
    df_results = pd.concat(frames, ignore_index=True)
    results = (
        df_results
        .groupby(['model_name'])
        .agg([lambda x: f"{np.mean(x):.3f} ± {np.std(x):.3f}"])
        .transpose()
        .reset_index()
        .rename(columns={"level_0": "score"})
        .drop(columns="level_1")
    )

    # The winner is the combination that is best on the most quality metrics;
    # timing rows are excluded from the vote.
    time_scores = ['fit_time', 'score_time']
    quality_rows = results[~results["score"].isin(time_scores)]
    winner = quality_rows.apply(get_winner, axis=1).value_counts().index[0]

    # Style the table, highlighting the best cell of each row in gray.
    results.columns.name = ''
    styled = (
        results
        .style
        .hide(axis='index')
        .apply(highlight_max, props='color:white;background-color:gray', axis=1)
    )
    display(styled)
    display(Markdown(f'O melhor modelo é o : **{winner}**'))

    # Map the "<estimator>-<dataset>" label back to its estimator and dataset.
    model_winner, _, data_winner = winner.partition("-")
    _, model, model_params = next(item for item in estimators if item[0] == model_winner)
    data_name, data_path = next(item for item in datasets if item[0] == data_winner)

    # Refit the winning estimator, with hyper-parameter search, on all texts.
    X, y = _load_tfidf_features(data_path, max_features)
    model_BS = BayesSearchCV(model, search_spaces=model_params, scoring='accuracy')
    model_BS.fit(X, y)

    # Save the fitted search object.
    path = f"../models/model_{model.__class__.__name__}_{data_name}.joblib"
    joblib.dump(model_BS, path)

In [None]:
# (dataset name, CSV path) pairs for the "lemma" variants of the corpus,
# one without and one with stop words.
datasets_lemma = [
    (
        "no_stop_words_lemma",
        "../data/interim/no_stopWords_lemma.csv",
    ),
    (
        "with_stop_words_lemma",
        "../data/interim/with_stopWords_lemma.csv",
    ),
]

# Same layout for the "stemma" variants.
datasets_stemma = [
    (
        "no_stop_words_stemma",
        "../data/interim/no_stopWords_stemma.csv",
    ),
    (
        "with_stop_words_stemma",
        "../data/interim/with_stopWords_stemma.csv",
    ),
]

data name:  no_stop_words_lemma
model name:  RF
data name:  no_stop_words_lemma
model name:  LR
data name:  no_stop_words_lemma
model name:  Tree
data name:  no_stop_words_lemma
model name:  ADA


In [1]:
# results={}
# for data_name,data_path in datasets:
#     vetorizar = TfidfVectorizer(lowercase=False, max_features=200)#mex_features -> 300
#     df = pd.read_csv(data_path)
#     bag_of_words_ = vetorizar.fit_transform(df["text"])
#     X = pd.DataFrame(bag_of_words_.toarray(),columns=vetorizar.get_feature_names_out())
#     y = df["label"].replace({"true":1,"fake":0}).to_numpy().ravel()
#     for estimator_name, estimator_obj, estimator_params in estimators:

#         model_results = cross_score_v2(X,y,data_name,estimator_obj,estimator_name,estimator_params)
#         if results:
#             for key, value in model_results.items():
#                 results[key] = np.append(results[key], value)
#         else:
#             results = model_results


# df_results = pd.DataFrame(results)



# results = (
#     pd
#     .DataFrame(df_results)
#     .groupby(['model_name'])
#     .agg([lambda x: f"{np.mean(x):.3f} ± {np.std(x):.3f}"])#
#     .transpose()
#     .reset_index()
#     .rename(columns={"level_0": "score"})
#     .drop(columns="level_1")
#     # .set_index('score')
# )
# time_scores = ['fit_time', 'score_time']
# winner = results.query('score not in @time_scores').apply(get_winner, axis=1).value_counts().index[0]
# results.columns.name = ''
# results = (
#     results
#     .style
#     .hide(axis='index')
#     .apply(highlight_max, props='color:white;background-color:gray', axis=1)
# )
# display(results)
# display(Markdown(f'O melhor modelo é o : **{winner}**'))



# # Realizando treino do modelo completo
# model_winner = winner.split("-")[0]
# data_winner= winner.split("-")[1]
# model_name, model, model_params  = [foo for foo in estimators if foo[0] == model_winner][0]
# data_name, data_path  = [foo for foo in datasets if foo[0] == data_winner][0]

# vetorizar = TfidfVectorizer(lowercase=False, max_features=50)
# df = pd.read_csv(data_path)
# bag_of_words_ = vetorizar.fit_transform(df["text"])
# X = pd.DataFrame(bag_of_words_.toarray(),columns=vetorizar.get_feature_names_out())
# y = df["label"].replace({"true":1,"fake":0}).to_numpy().ravel()

# model_gs = GridSearchCV(model, model_params, scoring='accuracy')
# model_gs.fit(X, y)


# # Salvando modelo treinado
# joblib.dump(model_gs, '../models/model.joblib')