In [1]:
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
#import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedKFold

import time
import sys
import os

# Get the current working directory
current_dir = os.getcwd()

# Adjust the path to the project root (assuming the notebook is two levels deep in the folder structure)
project_root = os.path.abspath(os.path.join(current_dir, '..', '..'))

# Add the project root to the Python path
sys.path.append(project_root)

from clean_data import clean_text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marce\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\marce\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Carregar e Pré-processar o Dataset

In [2]:
# Carregar o dataset
csv_path = os.path.abspath(os.path.join(current_dir, '..', '..', 'datasets', 'trainEN.csv'))

df = pd.read_csv(csv_path)
# Combinar colunas de título e texto
df['combined_text'] = df['title'] + " " + df['text']
df['clean_text'] = df['combined_text'].apply(clean_text)

# Exibir as primeiras linhas do dataframe
df.head()

Unnamed: 0,id,title,author,text,label,combined_text,clean_text
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide: We Didn’t Even See Comey’s Let...,house dem aide didnt even see comeys letter ja...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",flynn hillary clinton big woman campus breitba...
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired Why the Trut...,truth might get fired truth might get fired oc...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,15 Civilians Killed In Single US Airstrike Hav...,15 civilian killed single u airstrike identifi...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \r\nAn Iranian woman has been sentenced ...,1,Iranian woman jailed for fictional unpublished...,iranian woman jailed fictional unpublished sto...


## Vetorização dos Textos

In [3]:
# Vetorizar o dataset
#vectorizer = TfidfVectorizer(max_features=5000, min_df=3, max_df=0.7, ngram_range=(1, 2))
X = df['clean_text']
y = df['label']

# Exibir a forma da matriz resultante
X.shape

(20166,)

## Divisão dos Dados em Treino e Teste

In [4]:
# Dividir o dataset em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Exibir a forma dos conjuntos de treino e teste
X_train.shape, X_test.shape

((14116,), (6050,))

## Definição e Treinamento da Pipeline

In [5]:
# Definição da pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, min_df=3, max_df=0.7, ngram_range=(1, 2))),
    ('scaler', StandardScaler(with_mean=False)),
    ('classifier', SVC(kernel='linear'))
])

# Treino do modelo
start_time = time.time()
pipeline.fit(df['clean_text'], df['label'])
print(f'Tempo total de execução: {time.time() - start_time:.2f} segundos')

KeyboardInterrupt: 

## Avaliação do Modelo

In [None]:
# Previsão e avaliação
y_pred = pipeline.predict(df['clean_text'])
accuracy = accuracy_score(df['label'], y_pred)
print(f'Precisao do modelo: {accuracy:.2f}')

# Relatório de classificação
print(classification_report(df['label'], y_pred))

# Definição do k-fold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Executar cross-validation
cross_val_scores = cross_val_score(pipeline, df['clean_text'], df['label'], cv=cv, scoring='accuracy')
print(f'Precisão média do cross-validation: {cross_val_scores.mean():.4f}')
print(f'Desvio padrão do cross-validation: {cross_val_scores.std():.4f}')


Precisao do modelo: 1.00
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10384
           1       1.00      1.00      1.00      9782

    accuracy                           1.00     20166
   macro avg       1.00      1.00      1.00     20166
weighted avg       1.00      1.00      1.00     20166
Precisão média do cross-validation: 0.94
Desvio padrão do cross-validation: 0.00


## Grid Search

In [6]:
# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, min_df=3, max_df=0.7, ngram_range=(1, 2))),
    ('scaler', StandardScaler(with_mean=False)),
    ('classifier', SVC())
])

# Define the parameter grid for GridSearchCV
param_grid = {
    'tfidf__max_features': [3000, 5000, 7000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'classifier__kernel': ['linear', 'rbf'],
    'classifier__C': [0.1, 1, 10]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit GridSearchCV
grid_search.fit(df['clean_text'], df['label'])

# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best parameters: {best_params}")
print(f"Best cross-validation accuracy: {best_score:.2f}")

# Predict using the best estimator
y_pred = grid_search.best_estimator_.predict(df['clean_text'])
accuracy = accuracy_score(df['label'], y_pred)
print(f"Accuracy on the full dataset: {accuracy:.2f}")

# Classification report
print(classification_report(df['label'], y_pred, digits=4))


Best parameters: {'classifier__C': 10, 'classifier__kernel': 'rbf', 'tfidf__max_features': 7000, 'tfidf__ngram_range': (1, 2)}
Best cross-validation accuracy: 0.96
Accuracy on the full dataset: 1.00
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000     10384
           1     1.0000    1.0000    1.0000      9782

    accuracy                         1.0000     20166
   macro avg     1.0000    1.0000    1.0000     20166
weighted avg     1.0000    1.0000    1.0000     20166



In [7]:
print(classification_report(df['label'], y_pred, digits=4))
#plot confusion matrix

              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000     10384
           1     1.0000    1.0000    1.0000      9782

    accuracy                         1.0000     20166
   macro avg     1.0000    1.0000    1.0000     20166
weighted avg     1.0000    1.0000    1.0000     20166



In [8]:
import joblib
joblib.dump(grid_search.best_estimator_, 'clf.pkl') 

['clf.pkl']

In [9]:
classifier = joblib.load('clf.pkl')



In [None]:
#type(classifier)
classifier.predict(X_test)
