In [1]:
import numpy as np
import pandas as pd
import emoji
import re
from enelvo.normaliser import Normaliser
from collections import Counter
import spacy
# !python -m spacy download pt_core_news_sm
import langdetect

from nltk.probability import FreqDist

# Display options: never truncate cell text and show at most 80 rows.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 80)

# Download the NLTK stopwords corpus and keep the Portuguese list as a set
# so membership tests during stopword removal are cheap.
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk_stopwords = set(stopwords.words('portuguese'))

# Module-level tweet tokenizer; extract_emojis reads this global name.
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()



[nltk_data] Downloading package stopwords to /home/andre/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Helper Functions

In [2]:
def get_language(s):
    '''
    Detect the language of a text with langdetect.

    Returns whatever ``langdetect.detect`` returns for ``s``, or ``np.nan``
    when detection raises (best-effort: a tweet that cannot be classified
    must not abort the whole run).
    '''
    try:
        return langdetect.detect(s)
    except Exception:
        # Deliberately broad, but not a bare ``except``: KeyboardInterrupt and
        # SystemExit must still be able to stop a long-running apply().
        return np.nan

### Users

In [3]:

# Treat Users
def treat_users(tweet_tkz, frequent_users):
    '''
    Run both user-handling steps on a tokenized tweet.

    First the tokens matching ``frequent_users`` go through
    remove_ats_frequet_users, then the result goes through remove_users.
    Returns the resulting token list.
    '''
    without_frequent_ats = remove_ats_frequet_users(tweet_tkz, frequent_users)
    cleaned = remove_users(without_frequent_ats)
    return cleaned



def get_users_rank(corpus, limit = 40):
    '''
    Rank the most frequent user mentions in the corpus.

    A mention is any whitespace-separated word containing '@'.

    Parameters:
        corpus: iterable of raw tweet strings; entries that are not strings
            (e.g. None / NaN) are skipped.
        limit: how many entries to return (passed to Counter.most_common).

    Returns:
        List of (mention, count) tuples, most frequent first; ties keep the
        order of first appearance.
    '''
    # Counting directly avoids the DataFrame round-trip, which assigned a new
    # column on a filtered slice (SettingWithCopyWarning).
    mentions = Counter(
        word
        for doc in corpus
        if isinstance(doc, str)
        for word in doc.split()
        if '@' in word
    )
    return mentions.most_common(limit)



def get_frequent_users(corpus, limit=40):
    '''
    Return only the mentions (without their counts) from get_users_rank,
    keeping the rank order.
    '''
    return [user for user, _count in get_users_rank(corpus, limit)]



def remove_ats_frequet_users(tweet_tkz, frequent_users):
    '''
    Strip the '@' from tokens that match a frequent user, so those users
    survive the later mention removal and are kept in the corpus.

    Parameters:
        tweet_tkz: list of tokens of one tweet.
        frequent_users: iterable of mentions (including the '@') to keep.

    Returns:
        A new token list; tokens not listed in ``frequent_users`` are
        returned unchanged. The input list is not modified.
    '''
    # One pass with set membership instead of rebuilding the whole token list
    # once per matched user.
    frequent = set(frequent_users)
    return [token.replace('@', '') if token in frequent else token for token in tweet_tkz]


def remove_users(tweet_tkz):
    '''
    Remove user mentions (tokens that start with '@' followed by at least one
    more character) from a tokenized tweet.

    Tokens are filtered directly instead of joining the tweet, substituting a
    'user' placeholder and splitting again. That round-trip also dropped the
    legitimate word 'user', turned tokens with an embedded '@' (such as e-mail
    addresses) into '<prefix>user', and returned [''] for an empty tweet.

    Returns a new list; a lone '@' token is kept.
    '''
    return [
        token for token in tweet_tkz
        if not (len(token) > 1 and token.startswith('@'))
    ]









### Emojis

In [4]:
import json

# Processing Emojis

def remove_brs(tweet_tkz, replacement = 'brasil'):
    '''
    Replace the Brazilian flag with a word.

    After tokenization the flag 🇧🇷 shows up as the two consecutive tokens
    '🇧' and '🇷'; each such pair is collapsed into ``replacement``.

    Parameters:
        tweet_tkz: list of tokens of one tweet (not modified).
        replacement: token that takes the place of the pair.

    Returns:
        A new token list with the pairs replaced and empty-string tokens
        removed. A '🇧' or '🇷' that is not part of a '🇧','🇷' pair (including
        one at the very end of the tweet) is left untouched.
    '''
    merged = []
    i = 0
    total = len(tweet_tkz)
    while i < total:
        token = tweet_tkz[i]
        # ``i + 1 < total`` guards the look-ahead, so a trailing indicator
        # cannot index past the end of the list.
        if token == '🇧' and i + 1 < total and tweet_tkz[i + 1] == '🇷':
            merged.append(replacement)
            i += 2
        else:
            merged.append(token)
            i += 1
    return [token for token in merged if token != '']


# Emojis


def treat_emojis(tweet_tkz, top_rank_emjs):
    '''
    Translate or drop the emojis of a tokenized tweet.

    A token that is a key of ``top_rank_emjs`` is replaced by its mapped
    value; any other token found in ``emoji.UNICODE_EMOJI['pt']`` is dropped;
    everything else is kept as is.
    '''
    known_emojis = emoji.UNICODE_EMOJI['pt']

    return [
        top_rank_emjs[token] if token in top_rank_emjs else token
        for token in tweet_tkz
        if token in top_rank_emjs or token not in known_emojis
    ]




def get_top_rank_emjs(corpus, limit=40, filename='../../data/3-preprocessed/emojis_dict.json'):
    '''
    Return the translation dictionary restricted to the most frequent emojis.

    Parameters:
        corpus: iterable of raw tweet strings.
        limit: number of top-ranked emojis to consider.
        filename: JSON file with the translations, forwarded to
            get_emojis_words.

    Returns:
        Dict mapping each emoji that is both in the top ``limit`` of the
        corpus and present in the translations to its replacement word.
    '''
    emojs_translation = get_emojis_words(corpus, limit, filename)
    # Forward ``limit``: previously the ranking always used its own default,
    # so the parameter had no effect on which emojis were kept.
    top_rank_emjs = {emj for emj, _count in get_emojis_rank(corpus, limit)}
    return {emj: word for emj, word in emojs_translation.items() if emj in top_rank_emjs}




def get_emojis_words(corpus, limit=40, filename='../../data/3-preprocessed/emojis_dict.json'):
    '''
    Return a dict mapping emojis to the word that should replace them in the
    documents.

    Parameters:
        corpus: unused; kept so existing callers keep working.
        limit: unused; kept so existing callers keep working.
        filename: JSON file whose keys are emoji alias names (the values of
            ``emoji.UNICODE_EMOJI['pt']`` with the ':' removed) and whose
            values are the replacement words.

    Returns:
        Dict {emoji: word} containing only emojis that have a translation.
    '''
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(filename, 'r', encoding='utf-8') as file:
        translations = json.load(file)

    # NOTE(review): ``emoji.UNICODE_EMOJI`` is presumably only available in
    # emoji < 2.0 - confirm the pinned version of the package.
    emjs_dict = emoji.UNICODE_EMOJI['pt']

    emjs_words = {}
    for emj, alias in emjs_dict.items():
        word = translations.get(alias.replace(':', ''))
        if word is not None:
            emjs_words[emj] = word
    return emjs_words
    



def get_emojis_rank(corpus, limit=40):
    '''
    Rank the most frequent emojis in the corpus.

    Parameters:
        corpus: iterable of raw tweet strings.
        limit: how many entries to return (passed to Counter.most_common).

    Returns:
        List of (emoji, count) tuples, most frequent first.
    '''
    counts = Counter()
    for doc in corpus:
        # Extract once per document; each call tokenizes the tweet again.
        found = extract_emojis(doc)
        # extract_emojis returns the float np.nan when the tweet has no emoji.
        if isinstance(found, float):
            continue
        counts.update(found)
    return counts.most_common(limit)


def extract_emojis(tweet):
    '''
    Return the list of emojis found in a tweet, in order of appearance.

    The tweet is tokenized with the module-level ``tknzr`` and every token
    present in ``emoji.UNICODE_EMOJI['pt']`` is collected. When no emoji is
    found, ``np.nan`` is returned instead of an empty list.
    '''
    known_emojis = emoji.UNICODE_EMOJI['pt']
    found = [token for token in tknzr.tokenize(tweet) if token in known_emojis]
    return found if found else np.nan






In [5]:


# Combined stopword sets keyed by file name, filled on first use.
_CUSTOM_STOPWORDS_CACHE = {}


def remove_stopwords(tweet_tkz, filename = '../../data/3-preprocessed/stopwords.json'):
    '''
    Remove stopwords from a tokenized tweet.

    The stopword set is the union of ``nltk_stopwords`` and the entries of
    the JSON file ``filename``. The file is read once per file name and the
    combined set is cached, because this function is called for every tweet.
    Re-run this cell to reset the cache after editing the file.

    Parameters:
        tweet_tkz: list of tokens of one tweet.
        filename: JSON file with the additional stopwords.

    Returns:
        New list with the tokens whose lower-cased form is not a stopword.
    '''
    if filename not in _CUSTOM_STOPWORDS_CACHE:
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(filename, 'r', encoding='utf-8') as file:
            _CUSTOM_STOPWORDS_CACHE[filename] = nltk_stopwords.union(json.load(file))

    all_stopwords = _CUSTOM_STOPWORDS_CACHE[filename]
    return [token for token in tweet_tkz if token.lower() not in all_stopwords]



def treat_kkk(tweet_tkz):
    '''
    Replace the laugh "kkk" (three or more lower-case k's and nothing else)
    with the word "risada".

    Parameters:
        tweet_tkz: list of tokens of one tweet.

    Returns:
        New token list with every laugh token replaced by 'risada'.
    '''
    # Iterate over the argument, not over a global variable, and match by
    # pattern instead of rebuilding a list of candidate strings per token.
    return ['risada' if re.fullmatch(r'k{3,}', token) else token for token in tweet_tkz]


### Split joined words

In [6]:

def split_joined_words(token):
    '''
    Return the list of words of a CamelCase-joined token.

    A split happens before an upper-case letter when it starts a new word:
    either the previous character is not upper-case ("ForaLula" ->
    "Fora", "Lula") or the previous one is upper-case and the next one is
    lower-case, i.e. an acronym just ended ("STFOrganizacao" -> "STF",
    "Organizacao"). Accented capitals count as upper-case.

    Tokens that are entirely upper-case are returned unchanged as a
    one-element list; an empty token yields an empty list.
    '''
    if token.isupper():
        return [token]

    pieces = []
    for idx, char in enumerate(token):
        if idx > 0 and char.isupper():
            prev_is_upper = token[idx - 1].isupper()
            next_is_lower = idx + 1 < len(token) and token[idx + 1].islower()
            # Keep runs of capitals together so acronyms are not exploded
            # into single letters.
            if not prev_is_upper or next_is_lower:
                pieces.append(' ')
        pieces.append(char)

    return ''.join(pieces).split()


def tokenize_joined_words(tweet_tkz):
    '''
    Expand every token of a tweet with split_joined_words and return the
    flattened list of resulting sub-tokens, in order.
    '''
    return [piece for token in tweet_tkz for piece in split_joined_words(token)]


# PROCESSING WHOLE DATA

In [7]:
from IPython.display import clear_output



# Load Data

# Set to True to rebuild the intermediate pickle from the raw parquet files;
# otherwise the previously saved pickle is loaded.
reload_data = False

if reload_data:
    twt = pd.read_parquet('../../data/1-raw/tweets.parquet')
    places = pd.read_parquet('../../data/1-raw/places.parquet')
    users = pd.read_parquet('../../data/1-raw/users.parquet')

    # Derived columns: retweet flag and the date / time parts of created_at.
    twt['is_retweet'] = np.where(twt.text.str.contains('RT @'),1,0)
    twt['created_at_date'] = twt.created_at.dt.date
    twt['created_at_time'] = twt.created_at.dt.time

    # Emojis of each tweet joined into one string (NaN when there are none).
    twt['emoji'] = twt.text.apply(extract_emojis)
    twt.emoji = twt.emoji.apply(lambda x: ''.join(x) if isinstance(x, list) else x)

    twt['lang'] = twt.text.apply(get_language)
    twt.to_pickle('../../data/2-intermediate/tweets.pkl')

else:
    # NOTE: unpickling can execute arbitrary code - only load files written
    # by this notebook.
    twt = pd.read_pickle('../../data/2-intermediate/tweets.pkl')

# Build the working frame on both paths (it used to be created only when the
# pickle was loaded, leaving `df` undefined after a reload): original tweets
# only, bookkeeping columns dropped, Portuguese only.
# Assumes the raw parquet carries the same 'import_date' / 'file_name'
# columns as the saved pickle - TODO confirm.
df = twt[twt.is_retweet == 0]
df = df.drop(columns=['import_date','file_name'])
df = df[df.lang=='pt']











corpus = df.text.tolist()
top_rank_emjs = get_top_rank_emjs(corpus)
frequent_users = get_frequent_users(corpus)

norm = Normaliser(tokenizer='readable')
spc = spacy.load("pt_core_news_sm")

# Loop-invariant objects are built once instead of once per tweet.
tknzr = TweetTokenizer()
url_pattern = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+))')
punctuation_pattern = re.compile(r'[^\w\s]')

# Manual corrections applied to the final tokens.
manual_fixes = {'suboficial': 'lulaoficial', 'vagar': 'vagabundo', 'firsar': 'risada'}

# Refresh the progress display roughly once per percent; clearing the output
# on every single tweet slows the loop down.
progress_step = max(1, len(corpus) // 100)

bag = []
for i, tweet in enumerate(corpus):
    if i % progress_step == 0:
        clear_output(wait=True)
        print(round(i/len(corpus)*100,2), " %")

    # Drop URLs directly; going through a 'url' placeholder also removed the
    # letters "url" from inside ordinary words.
    tweet = url_pattern.sub('', tweet)

    tweet = tweet.replace('R$','reais').replace('-','')
    tweet = tknzr.tokenize(tweet)
    tweet = treat_users(tweet, frequent_users)

    tweet = treat_emojis(tweet, top_rank_emjs)

    tweet = remove_brs(tweet)

    tweet = remove_stopwords(tweet)

    # Strip punctuation last among the symbol steps because it also removes emojis.
    tweet = punctuation_pattern.sub('', ' '.join(tweet)).split(' ')
    tweet = [token for token in tweet if token != ''] # remove empty strings

    tweet = [token.lemma_.lower() for token in spc(' '.join(tweet))]

    tweet = treat_kkk(tweet)

    # NOTE(review): tokens were lower-cased by the lemmatisation step above,
    # so a splitter that relies on capital letters has nothing left to split
    # here - confirm whether this step should run before lemmatisation.
    tweet = tokenize_joined_words(tweet)

    tweet = norm.normalise(' '.join([token.lower() for token in tweet])).split(' ')

    # Second pass: normalisation may produce new stopwords.
    tweet = remove_stopwords(tweet)

    tweet = [manual_fixes.get(token, token) for token in tweet]

    bag.append(tweet)


df_processed = pd.DataFrame({'text':bag})
# NOTE(review): the extension is '.plk' (not '.pkl'); kept as is because
# other code may read this exact path - confirm.
df_processed.to_pickle('../../data/2-intermediate/corpus.plk')

# Attach the token lists to the tweet metadata; both frames use a fresh
# 0..n-1 index, so rows line up by position.
df_processed = pd.concat([df.reset_index(drop=True), df_processed.rename(columns={'text':'tokens'})], axis=1)
df_processed.to_parquet('../../data/2-intermediate/tweets.parquet')

99.99  %


In [8]:
# Preview the final frame: the tweet columns plus the new `tokens` column.
df_processed

Unnamed: 0,id,created_at,author_id,text,place_id,subject,is_retweet,created_at_date,created_at_time,emoji,lang,tokens
0,1542153434534641665,2022-06-29 14:30:05+00:00,1251632043764703245,@Jouberth19 #com Bolsonaro juntos,97bcdfca1a2dca59,bolsonaro,0,2022-06-29,14:30:05,,pt,"[bolsonaro, junto]"
1,1542153432106164226,2022-06-29 14:30:04+00:00,1341510046929346566,"Toyjo: “Desde o início do governo Bolsonaro, Brasil recebeu cerca de R$ ... https://t.co/lADVL6POXT via @YouTube",,bolsonaro,0,2022-06-29,14:30:04,,pt,"[tojo, desde, início, governo, bolsonaro, brasil, receber, cerca, real, youtube]"
2,1542153431401521153,2022-06-29 14:30:04+00:00,1469115605257048064,@Eduardomg_95 Isso aqui que é fazer um Governo sem roubalheira ou corrupção. Governo Bolsonaro! Bolsonaro reeleito. https://t.co/64Wx5beI4Q,,bolsonaro,0,2022-06-29,14:30:04,,pt,"[aqui, governo, roubalheira, corrupção, governo, bolsonaro, bolsonaro, reeleger]"
3,1542153431137361923,2022-06-29 14:30:04+00:00,42459902,DESMENTINDO BOLSONARO Serviço de Utilidade Pública https://t.co/pv8ekLSo7W,68e019afec7d0ba5,bolsonaro,0,2022-06-29,14:30:04,,pt,"[desmentindo, bolsonaro, serviço, utilidade, pública]"
4,1542153429098831873,2022-06-29 14:30:03+00:00,1395125418651435011,"@UOLNoticias Acho engraçado os bolsominions justificando que é pra atingir o governo Bolsonaro. Faz o seguinte, põe sua mãe ou sua filha pra trabalhar com ele então e deixa ele colocar as mãos nas partes íntimas delas, depois justifica que elas estão reclamando pra atingir o governo Bolsonaro",,bolsonaro,0,2022-06-29,14:30:03,,pt,"[monotipia, achar, engraçar, homofóbicos, justificar, atingir, governo, bolsonaro, faz, seguinte, pôr, mãe, filha, trabalhar, deixar, colocar, mão, partes, íntima, justificar, reclamar, atingir, governo, bolsonaro]"
...,...,...,...,...,...,...,...,...,...,...,...,...
18642,1544447539071619073,2022-07-05 22:26:02+00:00,346927346,@lsentoes1 Será que o mito também não estaria pensando assim? O Lula veio na frente e Jair acabou perdendo o contato. E olha que o Jair sair primeiro heim.,,lula,0,2022-07-05,22:26:02,,pt,"[mito, pensar, assim, lula, frente, jair, acabar, perder, contato, olhar, jair, sair, primeiro, heim]"
18643,1544447519601479680,2022-07-05 22:25:57+00:00,58943137,@Anaceli65806423 @ThiagoResiste Pelo mesmo motivo ao qual o Lula não foi preso antes.. imunidade parlamentar. Mas não se preocupe. Todos eles dividirão celas em breve...lula dorme embaixo e bozo em cima.,,lula,0,2022-07-05,22:25:57,,pt,"[thiagoresiste, motivo, lula, prender, antes, imunidade, parlamentar, preocupe, dividir, cela, breve, lula, dorme, embaixo, bozó, cima]"
18644,1544447517965926401,2022-07-05 22:25:57+00:00,1443602071,Coloca guardiões do Lula nisso #STFOrganizacaoCriminosa https://t.co/lJAy31dbe9,,lula,0,2022-07-05,22:25:57,,pt,"[colocar, guardião, lula, stforganizacaocriminoso]"
18645,1544447516057501696,2022-07-05 22:25:56+00:00,727669276207923200,@JanainaDoBrasil Eu tb elogio o presidente Lula assim.,,lula,0,2022-07-05,22:25:56,,pt,"[janainadobrasil, elogio, presidente, lula, assim]"
