In [1]:
import pandas as pd

In [2]:
import sklearn

In [3]:
from sklearn.linear_model import LogisticRegression

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

## Carregar os dados para o Dataframe

In [6]:
df = pd.read_csv("../../../dados/nlp/news_sentiment_analysis.csv", encoding="utf-8")

## Pegou apenas as colunas necessárias

In [8]:
dados = df[["Title", "Sentiment"]]

## Criar uma nova coluna com um número correspondente ao texto do sentimento

In [10]:
dados["sentimento_number"] = dados["Sentiment"].replace(['positive', 'negative', 'neutral'], [1, 0, 0])

  dados["sentimento_number"] = dados["Sentiment"].replace(['positive', 'negative', 'neutral'], [1, 0, 0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dados["sentimento_number"] = dados["Sentiment"].replace(['positive', 'negative', 'neutral'], [1, 0, 0])


## Criar o Vetorizador por frequencia com as 1000 palavras mais utilizadas

In [12]:
encoder = CountVectorizer(binary=True, max_features = 1000)

## Treinar com o Vetorizador para os textos de entrada

In [14]:
resultado = encoder.fit_transform(dados["Title"])
resultado.shape

(3500, 1000)

## Pega o dicionario com as palavras mais utilizadas

In [16]:
colunas = encoder.get_feature_names_out()

## Transforma a matrix densa gerada pelo Vetorizador em uma matrix sparsa

In [18]:
count_sparsed = pd.DataFrame.sparse.from_spmatrix(resultado, columns=colunas)
count_sparsed

Unnamed: 0,00,000,038,10,100,11,12,13,14,15,...,workers,world,worth,wwe,year,years,york,you,your,zu
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3495,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3497,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3498,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


## Separa os dados de entrada (da matrix sparsa) e os dados de label em treinamento e testes

In [20]:
X_train = count_sparsed[0:2700]

In [21]:
X_test = count_sparsed[2700:]

In [22]:
Y_train = dados['sentimento_number'][0:2700]

In [23]:
Y_test = dados['sentimento_number'][2700:]

## Cria o mecanismo de Regressão Logistica

In [25]:
reg_log = LogisticRegression()

## Treina o mecanismo de regressão com base nos dados de treino

In [27]:
reg_log.fit(X_train, Y_train)

## Testa a acurácia do modelo 

In [29]:
acuracia = reg_log.score(X_test, Y_test)
acuracia

0.80875

## Faz previsão usando uma frase de exemplo

In [31]:
frase = "US FTC issues warning to franchisors over unfair business practices"

In [32]:
encoder_previsao = CountVectorizer(binary=True, vocabulary=colunas)

In [33]:
frase_densa = encoder_previsao.fit_transform( [frase] )

In [34]:
frase_sparsed = pd.DataFrame.sparse.from_spmatrix(frase_densa, columns=colunas)
frase_sparsed

Unnamed: 0,00,000,038,10,100,11,12,13,14,15,...,workers,world,worth,wwe,year,years,york,you,your,zu
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
reg_log.predict( frase_sparsed )

array([1], dtype=int64)