In [307]:
import nltk
nltk.download('stopwords')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
import urllib3

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alfredo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [308]:
df = pd.read_csv("database-news.csv")

In [309]:
df.head()

Unnamed: 0.1,Unnamed: 0,noticia,label
0,0,Kátia Abreu diz que vai colocar sua expulsão e...,fake
1,1,Blog esquerdista dá a entender que reclamar de...,fake
2,2,"Alckmin diz que por ele PSDB desembarca, mas...",fake
3,3,Cara de pau não tem limites: Zé Celso aciona M...,fake
4,4,Temer resolve o problema de Luislinda: liberd...,fake


In [310]:
df.tail()

Unnamed: 0.1,Unnamed: 0,noticia,label
7195,7195,Ficou longe das notícias no fim de ano? Veja o...,True
7196,7196,A nova denúncia contra o ex-presidente Luiz I...,True
7197,7197,Como a Bahia virou uma potência mundial do mar...,True
7198,7198,"Alvo da Lava Jato, Bendine tinha passagem comp...",True
7199,7199,Chefs convidados do Encontro Mundial das Cidad...,True


In [311]:
df2 = pd.read_csv('CoronaTest.csv')

In [312]:
df2.head()

Unnamed: 0,noticia,classificacao
0,Café previne o coronavírus,1
1,Beber água de 15 em 15 minutos cura o coronavírus,1
2,Chá de limão com bicarbonato quente cura coron...,1
3,Ministro da Saúde pede para compartilhar áudio...,1
4,"Aplicativo Coronavírus-SUS, do Governo do Bras...",1


In [313]:
df2 = df2[['noticia', 'classificacao']]
df2 = df2.rename(columns={"noticia": "noticia", "classificacao": "label"})
df2['label'] = df2['label'].replace({0: 'true', 1: 'fake'})
df2.head()
df = df.append(df2, ignore_index=True)


In [314]:
x_train,x_test,y_train,y_test=train_test_split(df['noticia'], df['label'], test_size=0.2, random_state=42)

In [315]:
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('portuguese'),
                                analyzer='word',
                                ngram_range=(1, 1),
                                lowercase=True,
                                use_idf=True)
tfidf_train = tfidf_vectorizer.fit_transform(x_train) 
tfidf_test = tfidf_vectorizer.transform(x_test)

In [316]:
rf = RandomForestClassifier(random_state=0)
rf.fit(tfidf_train,y_train)

y_pred = rf.predict(tfidf_test)
score = accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 95.29%


In [317]:
model = SVC()
model.fit (tfidf_train,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [323]:
predictions = model.predict(tfidf_test)

In [324]:
print(confusion_matrix(y_test, predictions))

[[715  20]
 [ 43 667]]


In [325]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

        fake       0.94      0.97      0.96       735
        true       0.97      0.94      0.95       710

    accuracy                           0.96      1445
   macro avg       0.96      0.96      0.96      1445
weighted avg       0.96      0.96      0.96      1445



In [326]:
param_grid = {'C': [0.1, 1, 10, 100, 1000], 'kernel': ['linear']}

In [327]:
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=5)
grid.fit(tfidf_train, y_train)
grid.best_params_

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] C=0.1, kernel=linear ............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................ C=0.1, kernel=linear, score=0.939, total=  42.4s
[CV] C=0.1, kernel=linear ............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   42.3s remaining:    0.0s


[CV] ................ C=0.1, kernel=linear, score=0.934, total=  43.1s
[CV] C=0.1, kernel=linear ............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.4min remaining:    0.0s


[CV] ................ C=0.1, kernel=linear, score=0.946, total=  43.0s
[CV] C=0.1, kernel=linear ............................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.1min remaining:    0.0s


[CV] ................ C=0.1, kernel=linear, score=0.930, total=  43.3s
[CV] C=0.1, kernel=linear ............................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.9min remaining:    0.0s


[CV] ................ C=0.1, kernel=linear, score=0.945, total=  43.3s
[CV] C=1, kernel=linear ..............................................
[CV] .................. C=1, kernel=linear, score=0.959, total=  24.9s
[CV] C=1, kernel=linear ..............................................
[CV] .................. C=1, kernel=linear, score=0.962, total=  24.7s
[CV] C=1, kernel=linear ..............................................
[CV] .................. C=1, kernel=linear, score=0.962, total=  25.0s
[CV] C=1, kernel=linear ..............................................
[CV] .................. C=1, kernel=linear, score=0.959, total=  24.9s
[CV] C=1, kernel=linear ..............................................
[CV] .................. C=1, kernel=linear, score=0.965, total=  25.2s
[CV] C=10, kernel=linear .............................................
[CV] ................. C=10, kernel=linear, score=0.955, total=  24.3s
[CV] C=10, kernel=linear .............................................
[CV] .

[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed: 12.6min finished


{'C': 1, 'kernel': 'linear'}

In [328]:
grid_predictions = grid.predict(tfidf_test)
print(confusion_matrix(y_test, grid_predictions))

[[717  18]
 [ 32 678]]


In [329]:
print(classification_report(y_test,  grid_predictions))

              precision    recall  f1-score   support

        fake       0.96      0.98      0.97       735
        true       0.97      0.95      0.96       710

    accuracy                           0.97      1445
   macro avg       0.97      0.97      0.97      1445
weighted avg       0.97      0.97      0.97      1445



In [330]:
df3 = pd.read_csv('CoronaTest.csv')

In [331]:
df3

Unnamed: 0,noticia,classificacao
0,Café previne o coronavírus,1
1,Beber água de 15 em 15 minutos cura o coronavírus,1
2,Chá de limão com bicarbonato quente cura coron...,1
3,Ministro da Saúde pede para compartilhar áudio...,1
4,"Aplicativo Coronavírus-SUS, do Governo do Bras...",1
5,Café não previne o coronavírus,0
6,Beber água de 15 em 15 minutos cura o coronavírus,1
7,China anuncia vacina para coronavírus,1
8,Todos os países adotam as mesmas medidas para ...,1
9,O Brasil teve 968 novas mortes registradas em ...,0


In [332]:
tfidf_target = tfidf_vectorizer.transform(df3['noticia'])

In [334]:
result_target = grid.predict(tfidf_target)
print(result_target)

['fake' 'fake' 'fake' 'fake' 'fake' 'fake' 'fake' 'fake' 'fake' 'fake'
 'fake' 'fake' 'fake' 'fake' 'fake' 'fake' 'fake' 'fake' 'fake' 'fake'
 'fake' 'fake' 'fake' 'fake']


In [335]:
for i in range(len(df3['noticia'])):
    if result_target[i]=='fake' :
        df3['classificacao'][i] = False
    else:
        df3['classificacao'][i] = True
    
display(df3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,noticia,classificacao
0,Café previne o coronavírus,False
1,Beber água de 15 em 15 minutos cura o coronavírus,False
2,Chá de limão com bicarbonato quente cura coron...,False
3,Ministro da Saúde pede para compartilhar áudio...,False
4,"Aplicativo Coronavírus-SUS, do Governo do Bras...",False
5,Café não previne o coronavírus,False
6,Beber água de 15 em 15 minutos cura o coronavírus,False
7,China anuncia vacina para coronavírus,False
8,Todos os países adotam as mesmas medidas para ...,False
9,O Brasil teve 968 novas mortes registradas em ...,False
