In [1]:
from pathlib import Path

import pandas as pd
import polars as pl

from utils import *

In [2]:
# Root of the data directory. The trailing slash matters: every path below is
# built by plain string concatenation.
data_folder = '../data/'
# Output directory for the test split. No leading slash here, because
# data_folder already ends with one (the old value was '../data//test/').
test_folder = data_folder + 'test/'

# Input tables: the unlabelled search results and the labelled positives.
unlabel_filename = data_folder + 'final_database/combined_search.parquet'
positive = data_folder + 'final_database/positive.parquet'

# Columns carried over into the exported test set.
cols = ['id','text']

# Test

## Positive

In [3]:
# Load the labelled-positive tweets and report how many we start with.
pos = pd.read_parquet(positive)
print('Initial positive tweets: ', len(pos))

# Run the shared cleaning filter, keep tweets created on or after the cutoff
# (compared against the raw `created_at` values, before any conversion), and
# only then convert `created_at` / `author_id` to datetime / numeric.
pos = (
    delete_tweets(pos)
    .loc[lambda frame: frame['created_at'] >= '2023-01-01']
    .assign(
        created_at=lambda frame: pd.to_datetime(frame['created_at']),
        author_id=lambda frame: pd.to_numeric(frame['author_id']),
    )
)

# Number of positives that survive the filters.
total_pos = pos.shape[0]
print('Final positive tweets: ', total_pos)

Initial positive tweets:  36241
After removing Fogo Cruzado tweets: (31632, 45)
After removing duplicates texts: (28661, 45)
After removing retweets: (28657, 45)
After removing replies: (26080, 45)
After removing is temporarily unavailable: (26055, 45)
After removing @fogocruzadoapp: (25978, 45)
After removing fogocruzado: (25977, 45)
After removing #FogoCruzadoRJ: (25975, 45)
Final positive tweets:  1909


## Negative test

In [4]:
# Load the unlabelled search results with polars, parse `created_at`, keep the
# tweets after the 2023-01-01 cutoff and hand the result over to pandas.
#
# `%.f` is the chrono specifier for "decimal point + fractional seconds".
# The previous `.%f` reads the digits after the dot as a nanosecond count
# rather than a decimal fraction, and polars warns about that pattern.
#
# NOTE(review): strict=False turns unparseable timestamps into nulls, and the
# filter below then drops those rows silently.
# NOTE(review): the cutoff here is strict (`>`), whereas the positive set is
# filtered with `>=` - confirm which boundary is intended.
unlabel = pl.read_parquet(unlabel_filename).with_columns([
    pl.col("created_at").str.strptime(pl.Datetime,
                                      format="%Y-%m-%dT%H:%M:%S%.fZ", strict=False)
]).filter(pl.col("created_at") > pl.datetime(2023, 1, 1)).to_pandas()

  pl.col("created_at").str.strptime(pl.Datetime,


In [5]:
# Apply the same cleaning filter used for the positive tweets.
unlabel = unlabel.pipe(delete_tweets)

After removing Fogo Cruzado tweets: (6807, 46)
After removing duplicates texts: (6776, 46)
After removing retweets: (6776, 46)
After removing replies: (5047, 46)
After removing is temporarily unavailable: (5046, 46)
After removing @fogocruzadoapp: (5046, 46)
After removing fogocruzado: (5046, 46)
After removing #FogoCruzadoRJ: (5046, 46)


In [6]:
# How many unlabelled tweets share an id with a positive tweet?
unlabel['id'].isin(pos['id']).value_counts()

False    4815
True      231
Name: id, dtype: int64

In [7]:
# Keep only the unlabelled tweets whose id does not appear among the positives.
unlabel = unlabel.loc[lambda frame: ~frame['id'].isin(pos['id'])]

## Merge test

In [8]:
# Positive half of the test set: keep only the export columns and tag every
# row with label 1.
test_pos = pos.loc[:, cols].assign(label=1)

test_pos

Unnamed: 0,id,text,label
0,1644522101624193032,"Todo dia tiroteio gente, que issu",1
1,1644456817009762305,Tiro demais Jesus,1
2,1644454575208890369,Do nadaaa um tiroteio fudido que isso,1
3,1644455365046681600,quanto tiro 😰,1
4,1644442932286464002,crc tiro pra kct 🤦,1
...,...,...,...
36235,1661336677967691777,Muito tiro tô passando mal,1
36236,1661321502422081537,Ir trabalhar no meio do tiroteio é o fim cara,1
36238,1661192288251379712,eita quanto tiro,1
36239,1661188662778814464,"Tiro pra crl, que isso mano toda hora",1


In [9]:
# Negative half of the test set: the remaining unlabelled tweets, restricted
# to the export columns and tagged with label 0.
test_neg = unlabel.loc[:, cols].assign(label=0)

test_neg

Unnamed: 0,id,text,label
2,1642302547108003849,Uma vez com 15a tava andando na S.Pena +- 13h ...,0
4,1642298712058724355,Tô com a minha auto estima tão baixa ultimamen...,0
6,1642298020250230786,"É por isso que eu tiro pra nada , nem pra 1 real",0
8,1642297846501195776,19:48 e o tiro comendo solto na praça seca 😩,0
9,1642288647679426561,"Hoje e tiro,porrada e bomba. \nPra cima deles ...",0
...,...,...,...
9137,1653582995368935425,"Foi chuva de tiro, que medo 🥺",0
9139,1653571114214277120,A vida do suburbano carioca é turow. Ouvimos t...,0
9140,1653567457708191744,A ordem é pra dar tiro e não pra recuar. \nEss...,0
9143,1653562692370681859,Twitter só aparece página de gente se batendo ...,0


In [10]:
# Stack positives on top of negatives with a fresh 0..n-1 index, then run the
# text-cleaning helper over every tweet.
test = pd.concat([test_pos, test_neg], ignore_index=True)

test = test.assign(text=test['text'].apply(cleantxt))


In [11]:
# Class balance of the merged test set.
test['label'].value_counts()


0    4815
1    1909
Name: label, dtype: int64

In [12]:
# Eyeball ten random rows. The fixed seed keeps the displayed sample identical
# across Restart & Run All, so the saved output stays reproducible.
test.sample(10, random_state=42)

Unnamed: 0,id,text,label
1840,1660405075850690563,Que isso ta rolando tiro em Niterói,1
5116,1614226858077487105,Tiro o Spotify boto o Youtube premium Tiro o ...,0
2101,1640503547098800129,Eu tiro umas fotos que somente algumas pessoas...,0
4629,1618427264353587201,Um dos motivos dos meus surtos nesse fds foi e...,0
1509,1649056332052578308,ouvi foi nada de tiro,1
2850,1633666222695784449,Meu Deus mt tiro,0
5003,1615418117353779203,Tiro forças de onde não tem se puder matar eu...,0
790,1624832903728271360,Geral curtindo o domingão eu aguentando traba...,1
3107,1632420106474082307,na comida eu tiro onda tá doido kk,0
3141,1632024975472316416,Tiro pra caralho pra mulher mais inteligente ...,0


In [13]:
# Final sanity check on the merged frame: row count, column dtypes and
# non-null counts, printed before the frame is written to disk.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6724 entries, 0 to 6723
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      6724 non-null   object
 1   text    6724 non-null   object
 2   label   6724 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 157.7+ KB


In [14]:
# Write the merged test set to CSV. The output folder is created first so the
# export does not fail on a fresh checkout where it is missing; index=False
# keeps the positional index out of the file.
test_raw_path = Path(test_folder) / 'test_raw.csv'
test_raw_path.parent.mkdir(parents=True, exist_ok=True)
test.to_csv(test_raw_path, index=False)