In [1]:
import re
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import nltk
from datetime import datetime, timezone
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import spacy



# Fetch the NLTK data this notebook relies on so it runs on a fresh
# environment: "rslp" is used by nltk.stem.RSLPStemmer in stemming() and
# "stopwords" by nltk.corpus.stopwords in remover_stop_words().
nltk.download("rslp")
nltk.download("stopwords")

[nltk_data] Downloading package rslp to /home/marvin-
[nltk_data]     linux/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


True

In [2]:
_RSLP_STEMMER = None  # RSLPStemmer instance, built on first use and then reused


def stemming(texto: str) -> str:
    """Stem every whitespace-separated word of ``texto`` with NLTK's RSLP stemmer.

    The stemmer is created once and shared by all calls, so applying this
    function to a whole DataFrame column does not rebuild it for every row.

    Args:
        texto: Text to process; it is tokenised with ``str.split()``.

    Returns:
        The stemmed words joined by single spaces ("" for empty/blank input).
    """
    global _RSLP_STEMMER
    if _RSLP_STEMMER is None:
        _RSLP_STEMMER = nltk.stem.RSLPStemmer()
    return " ".join(_RSLP_STEMMER.stem(palavra) for palavra in texto.split())

_SPACY_PT_NLP = None  # spaCy "pt_core_news_sm" pipeline, loaded on first use


def lematization(texto: str) -> str:
    """Replace every token of ``texto`` by its lemma using spaCy.

    The "pt_core_news_sm" pipeline is loaded on the first call only and then
    reused; previously it was reloaded for every single text, which dominates
    the runtime when the function is applied row by row.

    Args:
        texto: Text to lemmatize.

    Returns:
        The ``lemma_`` of each token produced by the pipeline, joined by
        single spaces.
    """
    global _SPACY_PT_NLP
    if _SPACY_PT_NLP is None:
        _SPACY_PT_NLP = spacy.load("pt_core_news_sm")
    return " ".join(token.lemma_ for token in _SPACY_PT_NLP(texto))



_STOPWORDS_PT = None  # frozenset of NLTK's Portuguese stop words, loaded on first use


def remover_stop_words(texto: str) -> str:
    """Remove Portuguese stop words from ``texto``.

    The stop-word list comes from ``nltk.corpus.stopwords.words("portuguese")``.
    It is read once, kept as a frozenset for fast membership tests, and reused
    by later calls. Nothing is printed (the former debug ``print`` of the whole
    list on every call was removed).

    Args:
        texto: Text to filter; it is tokenised with ``str.split()`` and the
            comparison is exact (case-sensitive), as before.

    Returns:
        The remaining words joined by single spaces.
    """
    global _STOPWORDS_PT
    if _STOPWORDS_PT is None:
        _STOPWORDS_PT = frozenset(nltk.corpus.stopwords.words("portuguese"))
    return " ".join(
        palavra for palavra in texto.split() if palavra not in _STOPWORDS_PT
    )


def formatar_texto(texto: str) -> str:
    """Normalise a raw tweet.

    Steps, in order: strip links (``http...``) and ``@user`` mentions, delete
    the characters ".", ";" and "—", collapse runs of two or more spaces into
    one, lower-case the result and trim surrounding whitespace.

    Args:
        texto: Raw tweet text.

    Returns:
        The normalised text.
    """
    limpo = re.sub(r"(http\S+)|(@\w+)", "", texto)  # links and @mentions
    for simbolo in (".", ";", "—"):
        limpo = limpo.replace(simbolo, "")

    compactado = re.sub(r"(  +)", " ", limpo)  # 2+ consecutive spaces -> one
    return compactado.lower().strip()

In [4]:
MIN_WORDS = 5  # tweets with fewer words than this are discarded

# Load the raw tweets, drop the columns that are not needed and remove
# tweets whose raw text is duplicated.
# NOTE(review): duplicates are dropped on the raw text, before normalisation;
# tweets that only become identical after formatar_texto() are kept — confirm
# this is intended.
df_raw = (
    pd.read_csv("../data/raw/NoThemeTweets.csv")
    .drop(columns=["id", "query_used"])
    .drop_duplicates(["tweet_text"])
)

# Normalise the tweet text.
df_raw["tweet_text"] = df_raw["tweet_text"].apply(formatar_texto)

# Encode the label as an integer.
df_raw["sentiment"] = df_raw["sentiment"].replace({"Positivo": 1, "Negativo": 0})

# Word count per tweet. str.split() without a separator splits on any run of
# whitespace, so tabs/newlines separate words and an empty text counts as 0
# words (splitting on " " counted it as 1 and ignored tabs/newlines).
df_raw = df_raw.assign(number_words=df_raw["tweet_text"].str.split().str.len())

# Keep only tweets with at least MIN_WORDS words, then drop helper columns.
formated_df_raw = df_raw[df_raw["number_words"] >= MIN_WORDS].drop(
    columns=["tweet_date", "number_words"]
)

formated_df_raw.head(50)

Unnamed: 0,tweet_text,sentiment
0,14 para eu ir :),1
1,o meu like eu já dei na época :),1
2,eu só queria conseguir comer alguma coisa pra ...,1
3,:d que lindo dia !,1
4,"pq da pr jeito!!é uma ""oferta"", ha q aproveita...",1
5,"eu entendi, mas isso foi mais porque eu pensei...",1
6,[+] carcinoma hepatico (cancer de figado) deiv...,1
7,"aquela mina da limpeza, que tinha um marido co...",1
8,"aqui, espero que você melhore logo :)",1
9,se és feliz trabalhando nisso então não pare s...,1


In [5]:
# "no_stopwords" variants: no stop-word removal is applied — the text comes
# straight from formated_df_raw.
df_no_stopwords = formated_df_raw.copy()

# Same frame with tweet_text replaced by its stemmed version.
df_steamed_no_stopwords = formated_df_raw.assign(
    tweet_text=df_no_stopwords["tweet_text"].map(stemming)
)

# Same frame with tweet_text replaced by its lemmatized version.
df_lemmetized_no_stopwords = formated_df_raw.assign(
    tweet_text=df_no_stopwords["tweet_text"].map(lematization)
)



In [None]:
# First 10 rows of the lemmatized tweets (stop words not removed).
df_lemmetized_no_stopwords.head(n=10)

In [None]:
# First 10 rows of the stemmed tweets (stop words not removed).
df_steamed_no_stopwords.head(n=10)

In [None]:
# "with_stopwords" variants: built WITH the stop-word removal step, i.e.
# tweet_text has been passed through remover_stop_words() first.
df_with_stopwords = formated_df_raw.assign(
    tweet_text=formated_df_raw["tweet_text"].map(remover_stop_words)
)

# Stemmed version of the stop-word-filtered text.
df_steamed_with_stopwords = formated_df_raw.assign(
    tweet_text=df_with_stopwords["tweet_text"].map(stemming)
)

# Lemmatized version of the stop-word-filtered text.
df_lemmetized_with_stopwords = formated_df_raw.assign(
    tweet_text=df_with_stopwords["tweet_text"].map(lematization)
)

In [None]:
# First 10 rows of the stemmed tweets (after stop-word removal).
df_steamed_with_stopwords.head(n=10)

In [None]:
# First 10 rows of the lemmatized tweets (after stop-word removal).
df_lemmetized_with_stopwords.head(n=10)

In [None]:
# Print the shape of the four processed frames on one line.
print(
    *(
        quadro.shape
        for quadro in (
            df_steamed_no_stopwords,
            df_lemmetized_no_stopwords,
            df_steamed_with_stopwords,
            df_lemmetized_with_stopwords,
        )
    )
)

In [None]:
# Write each processed frame to its CSV file, without the index column.
for caminho, quadro in (
    ('../data/processed/df_steamed_no_stopwords.csv', df_steamed_no_stopwords),
    ('../data/processed/df_lemmetized_no_stopwords.csv', df_lemmetized_no_stopwords),
    ('../data/processed/df_steamed_with_stopwords.csv', df_steamed_with_stopwords),
    ('../data/processed/df_lemmetized_with_stopwords.csv', df_lemmetized_with_stopwords),
):
    quadro.to_csv(caminho, index=False)