In [5]:
import pandas as pd
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

# Load the dataset
df = pd.read_csv("dataset_final.csv")

# Tokenizer callable handed to the preprocessor (built with lowercase=True)
tokenizer = SocialTokenizer(lowercase=True).tokenize

# Configure the Ekphrasis preprocessing pipeline
text_processor = TextPreProcessor(
    # tokenizer callable and token-replacement dictionaries
    tokenizer=tokenizer,
    dicts=[emoticons],
    # terms that will be normalized
    normalize=[
        "url", "email", "percent", "money", "phone",
        "user", "time", "date", "number",
    ],
    # terms that will be annotated
    annotate={
        "hashtag", "allcaps", "elongated",
        "repeated", "emphasis", "censored",
    },
    fix_html=True,
    # corpus whose word statistics are used for segmentation / spell correction
    segmenter="twitter",
    corrector="twitter",
    unpack_hashtags=True,
    unpack_contractions=True,
    # spell correction for elongated words is enabled in this configuration
    spell_correct_elong=True,
)

# Text-cleaning helper
def clean_text(text):
    """Return `text` preprocessed by `text_processor` as one space-joined string.

    Missing values (anything `pd.isna` reports as missing) yield an empty
    string and are never passed to the preprocessor.
    """
    if not pd.isna(text):
        return ' '.join(text_processor.pre_process_doc(text))
    return ""

# Apply the cleaning step.
# Missing values are detected BEFORE the string cast: on pandas versions where
# astype(str) stringifies missing values, casting the whole column first turns
# NaN into the literal text "nan", which would then be tokenized and saved as
# if it were real content. Here missing entries become "" and every other
# value is coerced to str before being cleaned.
df['text'] = df['text'].map(
    lambda value: "" if pd.isna(value) else clean_text(str(value))
)

# Save the result
df.to_csv("dataset_ekphrasis.csv", index=False)



Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


In [6]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

# Load the dataset
df = pd.read_csv("dataset_final.csv")

# Configure the Ekphrasis preprocessing pipeline
text_processor = TextPreProcessor(
    # terms that will be normalized (each term listed once; the original list
    # repeated 'url')
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own;
    # the tokenizer should take a string as input and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text
    # with other expressions. You can pass more than one dictionary.
    dicts=[emoticons]
)



# Text-cleaning helper
def clean_text(text):
    """Preprocess one text value and return its tokens joined by single spaces.

    A missing value (per `pd.isna`) short-circuits to an empty string without
    calling `text_processor`.
    """
    is_missing = pd.isna(text)
    return "" if is_missing else ' '.join(text_processor.pre_process_doc(text))

# Apply the cleaning step.
# Missing values are checked before any string cast: on pandas versions where
# astype(str) stringifies missing values, a column-wide cast turns NaN into the
# literal text "nan", so the pd.isna guard in clean_text would never trigger
# and "nan" would be processed as real text. Missing entries map to "" here;
# all other values are coerced to str and then cleaned.
df['text'] = df['text'].map(
    lambda value: "" if pd.isna(value) else clean_text(str(value))
)

# Save the result
df.to_csv("dataset_ekphrasis.csv", index=False)

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...
