## imports

In [1]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV,RandomizedSearchCV,cross_val_score,KFold,ShuffleSplit,cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report,roc_auc_score,roc_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.dummy import DummyClassifier
from skopt import BayesSearchCV
from matplotlib import pyplot as plt
import numpy as np
from IPython.display import  Markdown
import joblib

In [2]:
import warnings
warnings.filterwarnings('ignore')

## Utils

In [3]:
path = "../data/interim/"

In [4]:

def cross_score_v2(data_X,data_Y,data_name,estimator,estimator_name,estimator_params,random_state=42):
    """Cross-validate a Bayesian hyper-parameter search for one estimator on one dataset.

    A ``BayesSearchCV`` (optimising accuracy) is wrapped around ``estimator`` and
    evaluated with ``cross_validate`` on 3 shuffled 80/20 splits, so the search is
    re-run inside every outer split.

    Parameters
    ----------
    data_X, data_Y : feature matrix and target passed straight to ``cross_validate``.
    data_name : str
        Label of the dataset; printed and embedded in the ``model_name`` entry.
    estimator : unfitted scikit-learn compatible estimator.
    estimator_name : str
        Short label of the estimator; printed and embedded in ``model_name``.
    estimator_params : dict
        Search space handed to ``BayesSearchCV(search_spaces=...)``.
    random_state : int, default 42
        Seed used for both the outer ``ShuffleSplit`` and the Bayesian search, so
        that repeated runs explore the same candidates on the same splits.

    Returns
    -------
    dict
        The ``cross_validate`` result (fit/score times plus ``test_recall``,
        ``test_accuracy`` and ``test_f1``) with an extra ``model_name`` key holding
        ``"<estimator_name>-<data_name>"`` repeated once per split.
    """
    scores = ["recall","accuracy","f1"]
    print("data name: ", data_name)
    print("model name: ",estimator_name)

    # Seed the search as well as the splitter; previously only the splitter was seeded.
    model_gs = BayesSearchCV(
        estimator,
        search_spaces=estimator_params,
        scoring='accuracy',
        random_state=random_state,
    )
    outer_cv = ShuffleSplit(n_splits=3, test_size=0.2, random_state=random_state)

    results = cross_validate(model_gs, data_X, data_Y, scoring=scores, cv=outer_cv)
    # One label per split so the dict can be turned into a tidy DataFrame later.
    # The "-" separator is split on by the caller, so neither name may contain "-".
    results["model_name"] = [f"{estimator_name}-{data_name}"] * len(results["score_time"])

    return results

In [5]:
#modificar
def build_score(y_pred,y_true,probs):
    """Print classification metrics and plot the ROC curve for a binary classifier.

    Parameters
    ----------
    y_pred : predicted class labels.
    y_true : ground-truth class labels.
    probs : 2-D array of class probabilities; column 1 is used as the score of the
        positive class for the ROC AUC and ROC curve.

    Prints accuracy, F1, precision and recall as percentages, the scikit-learn
    classification report (classes labelled ``FAKE`` / ``TRUE``) and the ROC AUC,
    then shows the ROC curve. Returns ``None``.
    """
    print("Acuracia : {}%".format(round(accuracy_score(y_pred=y_pred,y_true=y_true)*100,3)))
    print("F1_score: {}%".format(round(f1_score(y_pred=y_pred,y_true=y_true)*100,3)))
    print("Precison: {}%".format(round(precision_score(y_pred=y_pred,y_true=y_true)*100,3)))
    print("Recall: {}%".format(round(recall_score(y_pred=y_pred,y_true=y_true)*100,3)))
    print("-"*20)
    print('Classification Report')
    # Use the y_true argument: the original read a global `y_test`, which is
    # undefined on a fresh kernel and may not match the labels passed in.
    print(classification_report(y_true,y_pred,target_names=["FAKE","TRUE"]))
    print("-"*20)
    print("Plot curva roc")
    positive_scores = probs[:, 1]
    lr_auc = roc_auc_score(y_true, positive_scores)
    print('ROC AUC=%.3f' % (lr_auc))
    fpr, tpr, _ = roc_curve(y_true, positive_scores)
    plt.plot(fpr, tpr, marker='.')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

In [6]:
def highlight_max(s, props=''):
    """Build the per-cell style list that highlights the best entry of one row.

    The first cell of ``s`` is the metric name; every other cell is a string
    whose first whitespace-separated token is a number (e.g. ``"0.934 ± 0.006"``).
    For metrics whose name ends in ``'time'`` the smallest number is the best,
    otherwise the largest is.

    Returns a list with one entry per cell of ``s``: ``props`` at the position of
    the best cell and ``''`` everywhere else.
    """
    metric_name = s.values[0]
    scores = [float(cell.split()[0]) for cell in s.values[1:]]
    styles = ['' for _ in s.values]
    # Lower is better for timings, higher is better for quality metrics.
    pick_best = np.argmin if metric_name.endswith('time') else np.argmax
    # +1 skips the leading metric-name cell.
    styles[pick_best(scores) + 1] = props
    return styles

def get_winner(s):
    """Return the label of the best-scoring model in one row of the results table.

    ``s`` is a row whose first cell is the metric name and whose remaining cells
    are strings starting with a number (e.g. ``"0.934 ± 0.006"``); the row's index
    labels (after the first) are the model names. For metrics whose name ends in
    ``'time'`` the smallest value wins, otherwise the largest.
    """
    values = [float(value.split()[0]) for value in s.values[1:]]
    # Take the model names from the row itself instead of a global `results`
    # frame, so the function works on any row regardless of notebook state.
    models = s.index[1:]

    if s.values[0].endswith('time'):
        return models[np.argmin(values)]
    return models[np.argmax(values)]


In [7]:
# Candidate models for the comparison. Each entry is
# (short name, unfitted estimator, hyper-parameter search space); the search space
# is passed to BayesSearchCV as `search_spaces`.
# The short names are later joined with the dataset name using "-" and split on
# it again, so they must not contain "-".
# Every estimator is seeded so the comparison is reproducible, in line with the
# seeded sampling and cross-validation splits used elsewhere in the notebook.
estimators = [
    (
        "RF",
        RandomForestClassifier(random_state=42),
        {
            "max_depth": [5, 8, 15, 25],
            "max_features": ['log2', 'sqrt'],
        },
    ),
    (
        "LR",
        LogisticRegression(solver='liblinear', max_iter=10000, random_state=42),
        {
            "penalty": ['l1', 'l2'],
            "C": [0.001, 0.01, 0.1, 1],
        },
    ),
    (
        "Tree",
        DecisionTreeClassifier(random_state=42),
        {
            'criterion': ['gini', 'entropy'],
            'max_depth': [6, 8, 10, 12],
            'max_features': [2, 4, 6, 8],
        },
    ),
]

**TODO:** rodar e guardar o resultado de cada conjunto de dados separadamente (modificar amanhã).

In [8]:
# # função para comprar os modelos
# def compara_modelos(datasets:list, estimators:list):
#     # compara os modelos por dataset, já tunando os hyperparametros compara o melhor com melhor
#     results={}
#     for data_name,data_path in datasets:
#         vetorizar = TfidfVectorizer(lowercase=False, max_features=200)#mex_features -> 300
#         df = pd.read_csv(data_path)
#         # dividir dataset para validar o modelo após o treino.
#         df_train = df.sample(n=6000)
#         bag_of_words_ = vetorizar.fit_transform(df_train["text"])
#         X = pd.DataFrame(bag_of_words_.toarray(),columns=vetorizar.get_feature_names_out())
#         y = df_train["label"].replace({"true":1,"fake":0}).to_numpy().ravel()
#         for estimator_name, estimator_obj, estimator_params in estimators:

#             model_results = cross_score_v2(X,y,data_name,estimator_obj,estimator_name,estimator_params)
#             if results:
#                 for key, value in model_results.items():
#                     results[key] = np.append(results[key], value)
#             else:
#                 results = model_results
                
                
#     #guarda o resultado da comparação
#     df_results = pd.DataFrame(results)
#     results = (
#         pd
#         .DataFrame(df_results)
#         .groupby(['model_name'])
#         .agg([lambda x: f"{np.mean(x):.3f} ± {np.std(x):.3f}"])
#         .transpose()
#         .reset_index()
#         .rename(columns={"level_0": "score"})
#         .drop(columns="level_1")
#             # .set_index('score')
#     )
#     # estiliza o dataframe deixando em cinza o melhor modelo
#     time_scores = ['fit_time', 'score_time']
#     winner = results.query('score not in @time_scores').apply(get_winner, axis=1).value_counts().index[0]
#     results.columns.name = ''
#     results = (
#         results
#         .style
#         .hide(axis='index')
#         .apply(highlight_max, props='color:white;background-color:gray', axis=1)
#     )
#     display(results)
#     display(Markdown(f'O melhor modelo é o : **{winner}**'))
#     # escolhe o melhor modelo
#     model_winner = winner.split("-")[0]
#     data_winner= winner.split("-")[1]
#     model_name, model, model_params  = [foo for foo in estimators if foo[0] == model_winner][0]
#     data_name, data_path  = [foo for foo in datasets if foo[0] == data_winner][0]
    
#     # treina o melhor modelo com todos os textos
#     vetorizar = TfidfVectorizer(lowercase=False, max_features=200)
#     df = pd.read_csv(data_path)
#     df_train = df.sample(n=6000)
#     bag_of_words_ = vetorizar.fit_transform(df_train["text"])
#     X = pd.DataFrame(bag_of_words_.toarray(),columns=vetorizar.get_feature_names_out())
#     y = df_train["label"].replace({"true":1,"fake":0}).to_numpy().ravel()
    
#     # tuna os hyperparametros do melhor modelo
#     model_BS = BayesSearchCV(model, search_spaces=model_params, scoring='accuracy')
#     model_BS.fit(X, y)
    
#     # Salvando modelo treinado
#     path =  f"../models/model_{model.__class__.__name__}_{data_name}.joblib"
#     joblib.dump(model_BS,path)

In [9]:
# (dataset name, CSV path) pairs: every combination of stop-word handling
# ("no" / "with") and normalisation ("lemma" / "stemma"), in that order.
datasets = [
    (
        f"{stop_words}_stop_words_{normalisation}",
        f"../data/interim/{stop_words}_stopWords_{normalisation}.csv",
    )
    for stop_words in ("no", "with")
    for normalisation in ("lemma", "stemma")
]

In [10]:
# compara_modelos(datasets_no_stop, estimators)

In [11]:
# compara_modelos(datasets_with_Stop, estimators)

In [10]:
# --- 1. Cross-validate every estimator on every dataset variant -------------------
# One DataFrame of per-split scores is collected per (dataset, estimator) pair and
# concatenated once at the end.
fold_frames = []
for data_name, data_path in datasets:
    vetorizar = TfidfVectorizer(lowercase=False, max_features=200)  # original note: try max_features -> 300
    df = pd.read_csv(data_path)
    # Seeded, fixed-size subsample used for the model comparison.
    df_train = df.sample(n=6000, random_state=42)
    # NOTE(review): the vectorizer is fitted on the whole subsample before
    # cross-validation, so its vocabulary/IDF weights also see the validation folds.
    bag_of_words_ = vetorizar.fit_transform(df_train["text"])
    X = pd.DataFrame(bag_of_words_.toarray(), columns=vetorizar.get_feature_names_out())
    y = df_train["label"].replace({"true":1,"fake":0}).to_numpy().ravel()
    for estimator_name, estimator_obj, estimator_params in estimators:
        model_results = cross_score_v2(X, y, data_name, estimator_obj, estimator_name, estimator_params)
        fold_frames.append(pd.DataFrame(model_results))

df_results = pd.concat(fold_frames, ignore_index=True)

# --- 2. Summarise as "mean ± std" per model, one row per metric -------------------
results = (
    df_results
    .groupby(['model_name'])
    .agg([lambda x: f"{np.mean(x):.3f} ± {np.std(x):.3f}"])
    .transpose()
    .reset_index()
    .rename(columns={"level_0": "score"})
    .drop(columns="level_1")
)

# The winner is the model that comes out best on the most quality metrics
# (timing rows are excluded from the vote).
time_scores = ['fit_time', 'score_time']
winner = results.query('score not in @time_scores').apply(get_winner, axis=1).value_counts().index[0]
results.columns.name = ''

# Keep `results` as a DataFrame and give the styled view its own name, so this
# part can be re-run without `results` having turned into a Styler.
results_styled = (
    results
    .style
    .hide(axis='index')
    .apply(highlight_max, props='color:white;background-color:gray', axis=1)
)
display(results_styled)
display(Markdown(f'O melhor modelo é o : **{winner}**'))

# --- 3. Retrain the winning model on the full winning dataset ---------------------
# `winner` has the form "<estimator name>-<dataset name>".
model_winner = winner.split("-")[0]
data_winner = winner.split("-")[1]
model_name, model, model_params = next(entry for entry in estimators if entry[0] == model_winner)
data_name, data_path = next(entry for entry in datasets if entry[0] == data_winner)

# NOTE(review): the final vectorizer uses max_features=50 while the comparison
# above used 200 — confirm this difference is intentional.
vetorizar = TfidfVectorizer(lowercase=False, max_features=50)
df = pd.read_csv(data_path).sample(frac=1, random_state=42)  # seeded shuffle of all rows
bag_of_words_ = vetorizar.fit_transform(df["text"])
X = pd.DataFrame(bag_of_words_.toarray(), columns=vetorizar.get_feature_names_out())
y = df["label"].replace({"true":1,"fake":0}).to_numpy().ravel()

model_Bs = BayesSearchCV(model, search_spaces=model_params, scoring='accuracy', random_state=42)
model_Bs.fit(X, y)

# --- 4. Persist the fitted search and its preprocessor ----------------------------
# Dedicated path variables are used so the global `path` is not overwritten.
model_path = f"../models/model_{model.__class__.__name__}_{data_name}.joblib"
joblib.dump(model_Bs, model_path)
# Save the fitted vectorizer itself: new text can only be transformed with the
# vectorizer, not with the already-transformed training matrix that was saved before.
preprocessor_path = "../models/preprocessor.joblib"
joblib.dump(vetorizar, preprocessor_path)

data name:  no_stop_words_lemma
model name:  RF
data name:  no_stop_words_lemma
model name:  LR
data name:  no_stop_words_lemma
model name:  Tree
data name:  no_stop_words_stemma
model name:  RF
data name:  no_stop_words_stemma
model name:  LR
data name:  no_stop_words_stemma
model name:  Tree
data name:  with_stop_words_lemma
model name:  RF
data name:  with_stop_words_lemma
model name:  LR
data name:  with_stop_words_lemma
model name:  Tree
data name:  with_stop_words_stemma
model name:  RF
data name:  with_stop_words_stemma
model name:  LR
data name:  with_stop_words_stemma
model name:  Tree


score,LR-no_stop_words_lemma,LR-no_stop_words_stemma,LR-with_stop_words_lemma,LR-with_stop_words_stemma,RF-no_stop_words_lemma,RF-no_stop_words_stemma,RF-with_stop_words_lemma,RF-with_stop_words_stemma,Tree-no_stop_words_lemma,Tree-no_stop_words_stemma,Tree-with_stop_words_lemma,Tree-with_stop_words_stemma
fit_time,98.596 ± 2.924,115.698 ± 9.819,211.645 ± 7.145,131.707 ± 8.549,309.997 ± 8.729,311.079 ± 11.182,374.757 ± 1.076,311.020 ± 11.178,109.047 ± 5.298,122.243 ± 4.629,88.335 ± 1.840,79.566 ± 2.277
score_time,0.011 ± 0.001,0.009 ± 0.002,0.007 ± 0.002,0.006 ± 0.000,0.034 ± 0.004,0.029 ± 0.002,0.027 ± 0.002,0.022 ± 0.001,0.005 ± 0.000,0.005 ± 0.000,0.004 ± 0.000,0.005 ± 0.000
test_recall,0.926 ± 0.008,0.930 ± 0.005,0.943 ± 0.004,0.946 ± 0.002,0.944 ± 0.003,0.944 ± 0.004,0.950 ± 0.002,0.949 ± 0.008,0.845 ± 0.017,0.846 ± 0.007,0.847 ± 0.005,0.833 ± 0.017
test_accuracy,0.934 ± 0.006,0.934 ± 0.006,0.920 ± 0.005,0.938 ± 0.004,0.952 ± 0.002,0.951 ± 0.003,0.950 ± 0.004,0.954 ± 0.005,0.860 ± 0.011,0.862 ± 0.003,0.867 ± 0.012,0.868 ± 0.012
test_f1,0.934 ± 0.005,0.935 ± 0.005,0.924 ± 0.006,0.939 ± 0.004,0.952 ± 0.003,0.952 ± 0.003,0.951 ± 0.004,0.954 ± 0.005,0.859 ± 0.009,0.863 ± 0.005,0.867 ± 0.013,0.864 ± 0.011


O melhor modelo é o : **RF-with_stop_words_stemma**

['../models/model_RandomForestClassifier_with_stop_words_stemma.joblib']

In [24]:
# df.sample(frac=1, random_state=42)

Unnamed: 0,text,category,label,site
3098,chor as pitang bonn post fras de dor de cotove...,tv_celebridades,fake,https://www.diariodobrasil.org
2531,na bah adolesc de 14 ano prev a própr mort e a...,tv_celebridades,fake,https://www.diariodobrasil.org
4071,segundaf 12 de març bom dia aqu est os princip...,sociedade_cotidiano,true,https://g1.globo.com
1287,a insan de lul se me prend eu vir heró se me d...,politica,fake,https://www.diariodobrasil.org
2540,crianç apanh de mulh petist só porqu us uma ca...,politica,fake,https://www.diariodobrasil.org
...,...,...,...,...
3772,segundaf 11 de setembr de 2017 boa noit aqu es...,politica,true,https://g1.globo.com
5191,rodrig fonsec quas um ano após sua estre no ci...,tv_celebridades,true,http://cultura.estadao.com.br
5226,quartaf 6 de setembr de 2017 boa noit aqu est ...,politica,true,https://g1.globo.com
5390,manifest mantêm protest em charlott após políc...,sociedade_cotidiano,true,http://internacional.estadao.com.br


In [26]:
# winner = 'RF-with_stop_words_stemma'
# # Realizando treino do modelo completo
# model_winner = winner.split("-")[0]
# data_winner= winner.split("-")[1]
# model_name, model, model_params  = [foo for foo in estimators if foo[0] == model_winner][0]
# data_name, data_path  = [foo for foo in datasets if foo[0] == data_winner][0]

# vetorizar = TfidfVectorizer(lowercase=False, max_features=50)
# df = pd.read_csv(data_path).sample(frac=1, random_state=42)
# #df_train = df.sample(n=6000)
# bag_of_words_ = vetorizar.fit_transform(df["text"])
# X = pd.DataFrame(bag_of_words_.toarray(),columns=vetorizar.get_feature_names_out())
# y = df["label"].replace({"true":1,"fake":0}).to_numpy().ravel()

# model_Bs = BayesSearchCV(model, search_spaces=model_params, scoring='accuracy')
# model_Bs.fit(X, y)


# # Salvando modelo treinado
# path =  f"../models/model_{model.__class__.__name__}_{data_name}.joblib"
# joblib.dump(model_Bs,path)
# # Salvando o preprocessor
# path = f"../models/preprocessor.joblib"
# joblib.dump(vetorizar, path)

['../models/preprocessor.joblib']

In [41]:
# from scipy.spatial.distance import cdist
# # df

# text_vet = vetorizar.transform("""Luiz Inácio Lula da Silva (PT) é o novo presidente da República.""".split(' '))

# dist_fake = cdist(text_vet.todense(), X[y==0]).mean(axis=1)
# dist_True = cdist(text_vet.todense(), X[y==1]).mean(axis=1)

# print(dist_fake)
# print('')
# print(dist_True)
# print('')
# text_vet.todense()

[1.         1.         1.         1.26803651 1.         1.
 1.         1.         1.         1.         1.26803651 1.        ]

[1.         1.         1.         1.22693474 1.         1.
 1.         1.         1.         1.         1.22693474 1.        ]



matrix([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

In [15]:
# path = f"../models/preprocessor.joblib"

# joblib.dump(vetorizar, path)

['../models/preprocessor.joblib']

In [None]:
# results={}
# for data_name,data_path in datasets_with_Stop:
#     vetorizar = TfidfVectorizer(lowercase=False, max_features=200)#mex_features -> 300
#     df = pd.read_csv(data_path)
#     df_train = df.sample(n=6000)
#     bag_of_words_ = vetorizar.fit_transform(df_train["text"])
#     X = pd.DataFrame(bag_of_words_.toarray(),columns=vetorizar.get_feature_names_out())
#     y = df_train["label"].replace({"true":1,"fake":0}).to_numpy().ravel()
#     for estimator_name, estimator_obj, estimator_params in estimators:

#         model_results = cross_score_v2(X,y,data_name,estimator_obj,estimator_name,estimator_params)
#         if results:
#             for key, value in model_results.items():
#                 results[key] = np.append(results[key], value)
#         else:
#             results = model_results


# df_results = pd.DataFrame(results)



# results = (
#     pd
#     .DataFrame(df_results)
#     .groupby(['model_name'])
#     .agg([lambda x: f"{np.mean(x):.3f} ± {np.std(x):.3f}"])#
#     .transpose()
#     .reset_index()
#     .rename(columns={"level_0": "score"})
#     .drop(columns="level_1")
#     # .set_index('score')
# )
# time_scores = ['fit_time', 'score_time']
# winner = results.query('score not in @time_scores').apply(get_winner, axis=1).value_counts().index[0]
# results.columns.name = ''
# results = (
#     results
#     .style
#     .hide(axis='index')
#     .apply(highlight_max, props='color:white;background-color:gray', axis=1)
# )
# display(results)
# display(Markdown(f'O melhor modelo é o : **{winner}**'))

In [16]:
# Realizando treino do modelo completo
# model_winner = winner.split("-")[0]
# data_winner= winner.split("-")[1]
# model_name, model, model_params  = [foo for foo in estimators if foo[0] == model_winner][0]
# data_name, data_path  = [foo for foo in datasets_with_Stop if foo[0] == data_winner][0]

# vetorizar = TfidfVectorizer(lowercase=False, max_features=50)
# df = pd.read_csv(data_path)
# bag_of_words_ = vetorizar.fit_transform(df["text"])
# X = pd.DataFrame(bag_of_words_.toarray(),columns=vetorizar.get_feature_names_out())
# y = df["label"].replace({"true":1,"fake":0}).to_numpy().ravel()

# model_BS =  BayesSearchCV(model, search_spaces=model_params, scoring='accuracy')
# model_BS.fit(X, y)


# # Salvando modelo treinado
# path =  f"../models/model_{model.__class__.__name__}_{data_name}.joblib"
# joblib.dump(model_BS,path)

['../models/model_RandomForestClassifier_with_stop_words_lemma.joblib']