In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

import time
from clean_data import clean_text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marce\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\marce\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Carregar e Pré-processar o Dataset

In [2]:
# Carregar o dataset
df = pd.read_csv('./data/trainEN.csv')

# Combinar colunas de título e texto
df['combined_text'] = df['title'] + " " + df['text']
df['clean_text'] = df['combined_text'].apply(clean_text)

# Exibir as primeiras linhas do dataframe
df.head()

Unnamed: 0,id,title,author,text,label,combined_text,clean_text
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide: We Didn’t Even See Comey’s Let...,house dem aide didnt even see comeys letter ja...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",flynn hillary clinton big woman campus breitba...
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired Why the Trut...,truth might get fired truth might get fired oc...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,15 Civilians Killed In Single US Airstrike Hav...,15 civilian killed single u airstrike identifi...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \r\nAn Iranian woman has been sentenced ...,1,Iranian woman jailed for fictional unpublished...,iranian woman jailed fictional unpublished sto...


## Vetorização dos Textos

In [3]:
# Vetorizar o dataset
vectorizer = TfidfVectorizer(max_features=5000, min_df=3, max_df=0.7, ngram_range=(1, 2))
X = vectorizer.fit_transform(df['clean_text'])
y = df['label']

# Exibir a forma da matriz resultante
X.shape

(20166, 5000)

## Divisão dos Dados em Treino e Teste

In [4]:
# Dividir o dataset em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Exibir a forma dos conjuntos de treino e teste
X_train.shape, X_test.shape

((14116, 5000), (6050, 5000))

## Definição e Treinamento da Pipeline

In [5]:
# Definição da pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, min_df=3, max_df=0.7, ngram_range=(1, 2))),
    ('scaler', StandardScaler(with_mean=False)),
    ('classifier', SVC(kernel='linear'))
])

# Treinamento do modelo
start_time = time.time()
pipeline.fit(df['clean_text'], df['label'])
print(f'Tempo total de execução: {time.time() - start_time:.2f} segundos')

Tempo total de execução: 207.95 segundos


## Avaliação do Modelo

In [6]:
# Previsão e avaliação
y_pred = pipeline.predict(df['clean_text'])
accuracy = accuracy_score(df['label'], y_pred)
print(f'Precisao do modelo: {accuracy:.2f}')

# Relatório de classificação
print(classification_report(df['label'], y_pred))

Acurácia do modelo: 0.99
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     10384
           1       0.99      0.99      0.99      9782

    accuracy                           0.99     20166
   macro avg       0.99      0.99      0.99     20166
weighted avg       0.99      0.99      0.99     20166

