In [1]:
import polars as pl
import pandas as pd
from utils import *

In [2]:
# Root folder of the project data; every path below is derived from it.
data_folder = '../data/'
# `data_folder` already ends with '/', so sub-paths are appended without a
# leading '/' (avoids building '../data//train/').
train_folder = data_folder + 'train/'

# Input parquet files with the unlabeled and the positive tweets.
unlabel_filename = data_folder + 'final_database/unlabel.parquet'
positive = data_folder + 'final_database/positive.parquet'

# Columns carried over into the training set.
cols = ['id', 'text']

# Train

## Positive

In [3]:
# Read the positive tweets and report how many rows were loaded.
pos = pd.read_parquet(positive)
print('Total positive tweets: ', len(pos))

# Convert both columns in one step: timestamps to datetimes, author ids to numbers.
pos = pos.assign(
    created_at=pd.to_datetime(pos['created_at']),
    author_id=pd.to_numeric(pos['author_id']),
)

# Date range covered by the positive tweets.
min_date, max_date = pos['created_at'].min(), pos['created_at'].max()
print('Min date: ', min_date)
print('Max date: ', max_date)

Total positive tweets:  36241
Min date:  2016-07-09 23:32:01+00:00
Max date:  2023-05-24 19:59:20+00:00


In [4]:
# Run the tweet filters first, then keep only tweets created before 2023-01-01.
pos = (
    delete_tweets(pos)
    .loc[lambda df: df['created_at'] < '2023-01-01']
)

# Number of positives left; reused later to size the negative sample.
total_pos = pos.shape[0]
print('Total positive tweets after deleting: ', total_pos)

After removing Fogo Cruzado tweets: (36241, 45)
After removing duplicates texts: (33078, 45)
After removing retweets: (33074, 45)
After removing replies: (28404, 45)
After removing is temporarily unavailable: (28379, 45)
After removing @fogocruzadoapp: (28301, 45)
After removing fogocruzado: (28280, 45)
After removing #FogoCruzadoRJ: (26280, 45)
Total positive tweets after deleting:  24353


## Negative train

In [5]:
# Shape (rows, columns) of the raw unlabeled dataset.
# NOTE(review): this reads the whole parquet file into memory just to display its shape.
pl.read_parquet(unlabel_filename).shape

(12803338, 9)

In [6]:
# Load the unlabeled tweets and parse the `created_at` strings into datetimes.
# NOTE(review): with strict=False unparseable values presumably become null
# instead of raising — confirm against the polars docs for the installed version.
unlabel = pl.read_parquet(unlabel_filename).with_columns(
    pl.col("created_at").str.strptime(
        pl.Datetime, format="%Y-%m-%dT%H:%M:%S.%fZ", strict=False
    )
)

# print min and max date for unlabel
min_date, max_date = (
    unlabel.get_column('created_at').min(),
    unlabel.get_column('created_at').max(),
)
print('Min date: ', min_date)
print('Max date: ', max_date)

  pl.col("created_at").str.strptime(pl.Datetime,


Min date:  2020-12-05 16:31:51
Max date:  2023-05-07 23:59:58


In [None]:
# Reload the unlabeled tweets, parse their timestamps, keep those created before
# 2023-01-01, draw a seeded 50% sample and hand the result over to pandas.
unlabel = (
    pl.read_parquet(unlabel_filename)
    .with_columns(
        pl.col("created_at").str.strptime(
            pl.Datetime, format="%Y-%m-%dT%H:%M:%S.%fZ", strict=False
        )
    )
    .filter(pl.col("created_at") < pl.datetime(2023, 1, 1))
    .sample(fraction=0.5, seed=RANDOM_STATE_SEED)
    .to_pandas()
)

In [6]:
# Pass the unlabeled tweets through the `delete_tweets` filters.
unlabel = unlabel.pipe(delete_tweets)

After removing Fogo Cruzado tweets: (5208454, 9)
After removing duplicates texts: (3121062, 9)
After removing retweets: (2874420, 9)
After removing replies: (1639636, 9)
After removing is temporarily unavailable: (1638210, 9)
After removing @fogocruzadoapp: (1638205, 9)
After removing fogocruzado: (1638183, 9)
After removing #FogoCruzadoRJ: (1638179, 9)


In [7]:
# Count how many unlabeled tweets share an id with a positive tweet.
unlabel['id'].isin(pos['id']).value_counts()

False    1633565
True        4614
Name: id, dtype: int64

In [8]:
# Drop every unlabeled tweet whose id also appears among the positives.
unlabel = unlabel.loc[~unlabel['id'].isin(pos['id'])]

unlabel.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1633565 entries, 3 to 5208845
Data columns (total 9 columns):
 #   Column               Non-Null Count    Dtype         
---  ------               --------------    -----         
 0                        1633565 non-null  object        
 1   id                   1633565 non-null  object        
 2   text                 1633565 non-null  object        
 3   created_at           1633565 non-null  datetime64[ns]
 4   url                  1633565 non-null  object        
 5   author_id            1633565 non-null  object        
 6   in_reply_to_user_id  0 non-null        object        
 7   user_location        1126743 non-null  object        
 8   geo.place_id         52435 non-null    object        
dtypes: datetime64[ns](1), object(8)
memory usage: 124.6+ MB


In [9]:
# Columns that carry location information.
location_cols = ['user_location', 'geo.place_id']

# Keep only the rows where none of the location columns holds a value
# (equivalent to "all location columns are missing").
unlabel = (
    unlabel
    .loc[lambda df: ~df[location_cols].notna().any(axis=1)]
    .reset_index(drop=True)
    .copy()
)

unlabel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497319 entries, 0 to 497318
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0                        497319 non-null  object        
 1   id                   497319 non-null  object        
 2   text                 497319 non-null  object        
 3   created_at           497319 non-null  datetime64[ns]
 4   url                  497319 non-null  object        
 5   author_id            497319 non-null  object        
 6   in_reply_to_user_id  0 non-null       object        
 7   user_location        0 non-null       object        
 8   geo.place_id         0 non-null       object        
dtypes: datetime64[ns](1), object(8)
memory usage: 34.1+ MB


In [10]:
# sample tweets
unlabel = unlabel.sample(n=total_pos*3, random_state=RANDOM_STATE_SEED).reset_index(drop=True).copy()

unlabel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73059 entries, 0 to 73058
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0                        73059 non-null  object        
 1   id                   73059 non-null  object        
 2   text                 73059 non-null  object        
 3   created_at           73059 non-null  datetime64[ns]
 4   url                  73059 non-null  object        
 5   author_id            73059 non-null  object        
 6   in_reply_to_user_id  0 non-null      object        
 7   user_location        0 non-null      object        
 8   geo.place_id         0 non-null      object        
dtypes: datetime64[ns](1), object(8)
memory usage: 5.0+ MB


## Merge train

In [11]:
# Positive examples: keep only the training columns and tag every row with label 1.
train_pos = pos[cols].assign(label=1)

train_pos

Unnamed: 0,id,text,label
966,1569648211488346114,Tá tudo dominado no Grande Rio: a milícia quin...,1
967,1555601122349424640,Teve muito tiroteio no Campinho e Cascadura no...,1
1618,1609332609091358729,"Tiroteio começando, a treta vindo, todo mundo ...",1
1619,1609323695796797441,"Toma-lhe tiro pro alto, munição infinita meus ...",1
1620,1609314236907376643,"É fogos, é chuva, é tiro, é trovão, fechando o...",1
...,...,...,...
35584,816792652263936001,PM morre e duas pessoas são baleadas em assalt...,1
35586,816731693247844358,Só hj 3 pessoas foram baleadas no #ComplexoDoA...,1
35587,816703992310984704,"Um policial foi baleado na #Fazendinha, no #Co...",1
35590,804036809156132867,"Entre 22 e 26.11, #Caxias foi a área que mais ...",1


In [12]:
# Negative examples: keep only the training columns and tag every row with label 0.
train_neg = unlabel[cols].assign(label=0)

train_neg

Unnamed: 0,id,text,label
0,1409001088074891264,meu deus eu prefiro um tiro do que viver essa ...,0
1,1607742659275198465,não consigo fazer um dump do mês pq eu não tir...,0
2,1563282433063018503,hoje eu tô mais perdido que cego em tiroteio p...,0
3,1456641025397645312,"Coloco óculos meus olhos doem, tiro meus olhos...",0
4,1418699717790375936,Homem morre no HUT após ser baleado com vários...,0
...,...,...,...
73054,1603553920680435712,essa frase ta na minha bio do insta e eu nao t...,0
73055,1349169579164102659,"Troca tiro sendo frio e calculista, MAJOR JÁ É...",0
73056,1433212455493242881,Graças a deus essa chata me tiro do cf dela,0
73057,1575783634358874113,ve la se sou louca de levar tiro na rua vou to...,0


In [4]:
# Expected size of the merged training set, computed from the frames themselves
# so the figure cannot drift from the data when earlier cells change.
len(train_pos) + len(train_neg)

97412

In [13]:
# Stack positives on top of negatives and renumber the rows from 0.
train = pd.concat([train_pos, train_neg], ignore_index=True)

# Run `cleantxt` over every tweet text.
train['text'] = train['text'].apply(cleantxt)

In [14]:
# Relative frequency of each label in the merged training set.
train.value_counts(subset='label', normalize=True)

label
0    0.75
1    0.25
dtype: float64

In [15]:
# Preview ten random rows; seeded so the same rows are shown on every run.
train.sample(10, random_state=RANDOM_STATE_SEED)

Unnamed: 0,id,text,label
44909,1495610575270862853,Eu nunca voto em paredão mas nesse aí eu vou v...,0
47914,1541535442859921408,Escolher Braga Neto como vice só comprova o ta...,0
11690,1382156161898713089,Do nada deram muito tiro aqui perto de casa m...,1
37153,1385429280180654082,É fogos ou tiro essa porra Kkkkk,0
69780,1533894992086896641,amo vê foto das pessoas sorrindo mas ao mesmo...,0
35988,1556144961397170176,Não tem nada melhor que tá na paz consigo mesm...,0
17545,1196473253252026369,Só Jesus cara tô acabei de ouvir tiro e tô so...,1
33035,1496860111633014785,Só os sonhos q me faz acordar querendo levar u...,0
44176,1574521324931801100,qnd o gringo me chama de babe eu fico toda fel...,0
49343,1337937777061732352,Matheus Carrieri deu um tiro no próprio pé mo...,0


In [16]:
# Write the merged training set to `train.csv` inside `train_folder`, without the index column.
train.to_csv(path_or_buf=train_folder + 'train.csv', index=False)