# TRATAMENTO DE DADOS

## IMPORTAÇÕES

In [None]:
# Standard library
import re  # used further down to strip links from the sentences

# Third-party
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.utils import resample

# NLTK resources fetched up front, in the same order as before
for recurso in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(recurso)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the raw dataset from the CSV file into a DataFrame
dataset = pd.read_csv("data.csv")

In [None]:
# Keep the news sentences (column "Sentence") in their own variable
noticia = dataset['Sentence']

In [None]:
# Keep the sentiment labels (column "Sentiment") in their own variable
sentimento = dataset['Sentiment']

In [None]:
# Remove repeated news items, dropping the sentiment attached to each duplicate
# so that both lists stay aligned position by position.
boole = noticia.duplicated()

# Walk the three sequences in lockstep and keep only first occurrences
pares_unicos = [
    (texto, rotulo)
    for texto, rotulo, repetida in zip(noticia, sentimento, boole)
    if not repetida
]

noticia_nao_repetida = [texto for texto, _ in pares_unicos]
sentimento_nao_repetido = [rotulo for _, rotulo in pares_unicos]

In [None]:
def remove_links(sentence):
    """Return `sentence` with every link removed.

    A link is anything starting with ``http://``, ``https://`` or ``www.``
    and running up to the next whitespace character. Each match is replaced
    by an empty string; the surrounding whitespace is left untouched.
    """
    url_regex = re.compile(r'https?://\S+|www\.\S+')
    return url_regex.sub('', sentence)

# Strip the links from every de-duplicated news item
noticia_sem_links = list(map(remove_links, noticia_nao_repetida))
noticia_sem_links

["The GeoSolutions technology will leverage Benefon 's GPS solutions by providing Location Based Search Technology , a Communities Platform , location relevant multimedia content and a new and powerful commercial model .",
 '$ESI on lows, down $1.50 to $2.50 BK a real possibility',
 "For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .",
 'According to the Finnish-Russian Chamber of Commerce , all the major construction companies of Finland are operating in Russia .',
 'The Swedish buyout firm has sold its remaining 22.4 percent stake , almost eighteen months after taking the company public in Finland .',
 "$SPY wouldn't be surprised to see a green close",
 "Shell's $70 Billion BG Deal Meets Shareholder Skepticism",
 'SSH COMMUNICATIONS SECURITY CORP STOCK EXCHANGE RELEASE OCTOBER 14 , 2008 AT 2:45 PM The Company updates its full year outlook and 

In [None]:
# Tokenization: split every link-free sentence into a list of word tokens
noticia_tokens = [nltk.word_tokenize(sentenca) for sentenca in noticia_sem_links]

In [None]:
# Normalize the news: for each tokenized sentence keep only the tokens made
# entirely of letters/digits (str.isalnum) and lower-case them.
noticia_normalizado = [
    [palavra.lower() for palavra in tokens if palavra.isalnum()]
    for tokens in noticia_tokens
]

In [None]:
# Inspect the normalized tokens of the second news item
noticia_normalizado[1]

['esi', 'on', 'lows', 'down', 'to', 'bk', 'a', 'real', 'possibility']

In [None]:
# Normalize the sentiment labels by lower-casing them.
# No filtering is applied on purpose: the previous `if word.isalpha` tested the
# bound method object (always truthy) instead of calling it, so it never
# filtered anything. A real filter here would drop labels and break the
# one-to-one alignment with the news lists.
sentimento_normalizado = [rotulo.lower() for rotulo in sentimento_nao_repetido]

In [None]:
# Remove the English stop words from every normalized sentence
stop_words = set(stopwords.words('english'))
noticia_filtrado = [
    [token.lower() for token in tokens if token not in stop_words]
    for tokens in noticia_normalizado
]

In [None]:
# Lemmatize the news: reduce every remaining token to its WordNet lemma.
# Iterates directly over noticia_filtrado (the list actually being read)
# instead of indexing it with a range taken from noticia_tokens, so the result
# always has exactly one entry per filtered sentence.
lematizador = WordNetLemmatizer()
noticia_lematizado = [
    [lematizador.lemmatize(token) for token in tokens]
    for tokens in noticia_filtrado
]

In [None]:
# Rebuild a DataFrame from the processed sentences and their normalized labels
colunas_processadas = {
    'Sentence': noticia_lematizado,
    'Sentiment': sentimento_normalizado,
}
novo_dataset = pd.DataFrame(colunas_processadas)

In [None]:
# Undersampling: shrink every class down to the size of the smallest one.
counts = novo_dataset['Sentiment'].value_counts()

# Sentiment with the fewest instances
sentimento_minimo = counts.idxmin()
# Sentiment with the most instances
sentimento_maximo = counts.idxmax()
tamanho_minimo = int(counts[sentimento_minimo])

# Per-class subsets for the two extreme classes
df_minimo = novo_dataset[novo_dataset['Sentiment'] == sentimento_minimo]
df_maximo = novo_dataset[novo_dataset['Sentiment'] == sentimento_maximo]

# Downsample EVERY class that is larger than the minority class. Handling only
# the majority and minority classes would silently drop any intermediate class
# from the balanced frame, and would duplicate rows when all classes tie.
partes_balanceadas = {}
for classe in counts.index:
    df_classe = novo_dataset[novo_dataset['Sentiment'] == classe]
    if len(df_classe) > tamanho_minimo:
        df_classe = resample(
            df_classe,
            replace=False,  # sample without replacement
            n_samples=tamanho_minimo,
            random_state=42,  # fixed seed for reproducibility
        )
    partes_balanceadas[classe] = df_classe

# Downsampled majority class, still exposed under its original name
df_majority_downsampled = partes_balanceadas[sentimento_maximo]

# Combine the balanced subsets
df_balanceado = pd.concat(list(partes_balanceadas.values()))

# Show the balanced DataFrame
print(df_balanceado)



                                               Sentence Sentiment
866   [customer, include, company, energy, process, ...   neutral
2643     [investment, worth, approximately, eur, 100mn]   neutral
4371  [honkarakenne, also, decided, yesterday, sell,...   neutral
2920  [company, said, offshore, segment, represented...   neutral
2016  [vaisala, present, weather, detector, measure,...   neutral
...                                                 ...       ...
5296                    [fb, hitting, everything, twtr]  negative
5298                                  [bobe, premarket]  negative
5303                                   [acad, 2, today]  negative
5313         [sbux, pm, db, downgrade, pt, cut, 70, 64]  negative
5317  [hsbc, say, unit, book, 585, million, charge, ...  negative

[1184 rows x 2 columns]


In [None]:
# Number of rows per sentiment class in the balanced frame
contagem_por_classe = df_balanceado['Sentiment'].value_counts()
contagem_por_classe

neutral     592
negative    592
Name: Sentiment, dtype: int64

neutral     2878
positive    1852
negative     592
Name: Sentiment, dtype: int64