In [79]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import itertools
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold



In [80]:
def get_data(dir= 'data', split= 'train', track= 'a', language= 'ptbr'):
    """Read one CSV of the dataset into a DataFrame.

    The file is looked up at ``{dir}/{split}/track_{track}/<file name>``.
    For the ``'train'`` split the file name is ``<language>.csv``; for any
    other split it is ``<language>_<track>.csv``.

    Parameters
    ----------
    dir : root directory of the data.
    split : name of the split sub-directory.
    track : track identifier used in the ``track_<track>`` directory name.
    language : language code used as the file-name stem.

    Returns
    -------
    The DataFrame produced by ``pd.read_csv`` for that path.
    """
    if split == 'train':
        file_name = language + '.csv'
    else:
        file_name = language + '_' + track + '.csv'

    return pd.read_csv(f'{dir}/{split}/track_{track}/{file_name}')

In [81]:
# Load both tracks with get_data's defaults for directory, split and language.
track_a, track_b = (get_data(track= name) for name in ('a', 'b'))

In [82]:
# Both helpers below take a complete DataFrame row (DataFrame.apply with axis=1).

def estratifica_track_a(row):
    """Build a stratification label from the non-zero emotion columns of a row.

    Reads ``Anger``, ``Disgust``, ``Fear``, ``Joy`` and ``Sadness`` (in that
    order) and concatenates one tag per non-zero column: ``'A '``, ``'D '``,
    ``'F '``, ``'H '`` (for ``Joy``) and ``'S '``. Every tag carries a
    trailing space. A row where all five columns are zero yields ``''``.
    No other column is read.
    """
    column_tags = (
        ('Anger', 'A '),
        ('Disgust', 'D '),
        ('Fear', 'F '),
        ('Joy', 'H '),
        ('Sadness', 'S '),
    )
    return ''.join(tag for column, tag in column_tags if row[column] != 0)



def estratifica_track_b(row):
    """Build a stratification label that also encodes each column's value.

    Reads ``Anger``, ``Disgust``, ``Fear``, ``Joy`` and ``Sadness`` (in that
    order). Each non-zero column contributes ``<letter>_<value>`` using the
    letters ``A``, ``D``, ``F``, ``H`` (for ``Joy``) and ``S``. Every token
    except the ``Sadness`` one is followed by a space, so a label ending in
    another emotion keeps a trailing space (e.g. ``'A_1 '``) while one ending
    in ``Sadness`` does not (e.g. ``'A_1 S_1'``). A row where all five columns
    are zero yields ``''``. No other column is read.
    """
    # (column, letter, separator appended after the token)
    layout = (
        ('Anger', 'A', ' '),
        ('Disgust', 'D', ' '),
        ('Fear', 'F', ' '),
        ('Joy', 'H', ' '),
        ('Sadness', 'S', ''),
    )

    tokens = []
    for column, letter, separator in layout:
        value = row[column]
        if value != 0:
            tokens.append(f'{letter}_{value}{separator}')

    return ''.join(tokens)

In [83]:
# Row-wise apply: each row of track_b is passed to estratifica_track_b and the
# returned label is stored in the new `classes` column.
track_b['classes'] = track_b.apply(estratifica_track_b, axis= 'columns')

In [85]:
# Display track_b, which now carries the `classes` stratification column.
track_b

Unnamed: 0,id,text,Anger,Disgust,Fear,Joy,Sadness,Surprise,classes
0,ptbr_train_track_b_00001,Essa é a democracia e liberdade que a e seu p...,1,0,0,0,0,0,A_1
1,ptbr_train_track_b_00002,fiz isso! vou ficar prestando atenção se o che...,0,0,0,0,0,0,
2,ptbr_train_track_b_00003,anao meu irmao ta em casa kk lixo,1,0,0,0,1,0,A_1 S_1
3,ptbr_train_track_b_00004,"Mas... a pandemia vem pra ficar, meu chapa. At...",0,0,2,0,0,0,F_2
4,ptbr_train_track_b_00005,Não entendi o post. Está enaltecendo a invasão...,2,1,0,0,0,0,A_2 D_1
...,...,...,...,...,...,...,...,...,...
2221,ptbr_train_track_b_02222,Vc merece muito mais,0,0,0,1,0,0,H_1
2222,ptbr_train_track_b_02223,"Também , maioria sem nada para fazer , aí acab...",0,0,0,0,0,0,
2223,ptbr_train_track_b_02224,acho que chegou a hora de esquecer os erros do...,0,0,0,0,0,0,
2224,ptbr_train_track_b_02225,mas é óbvio oq vc esperava,0,0,0,0,0,1,


In [86]:
# Stratification labels that occur exactly once are set aside: those rows are
# excluded from the stratified split and tagged separately below.
class_counts = track_b['classes'].value_counts()
rare_classes = class_counts[class_counts == 1].index

is_rare = track_b['classes'].isin(rare_classes)
rare_data = track_b[is_rare]
remaining_data = track_b[~is_rare]

n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# The column ends up holding integer fold ids AND the string "rare_only", so it
# is created with object dtype from the start. Creating it as int64 and then
# writing a string into it is an incompatible-dtype assignment, which pandas
# reports with a FutureWarning and has announced will become an error.
track_b['folds'] = pd.Series(-1, index=track_b.index, dtype=object)

# Each non-rare row belongs to the test part of exactly one split; that split's
# number becomes the row's fold id. Positions returned by the splitter are
# relative to remaining_data, hence the .iloc -> .index translation.
for fold_number, (train_idx, test_idx) in enumerate(skf.split(remaining_data, remaining_data['classes'])):
    track_b.loc[remaining_data.iloc[test_idx].index, 'folds'] = fold_number

# Rare classes stay in training only: they get a marker instead of a fold id.
track_b.loc[rare_data.index, 'folds'] = "rare_only"

# Every row must have received either a fold id or the marker.
assert (track_b['folds'] != -1).all(), "some rows were left without a fold"

# Check the result
print(track_b['folds'].value_counts())


folds
1            443
2            443
0            443
4            442
3            442
rare_only     13
Name: count, dtype: int64


  track_b.loc[rare_data.index, 'folds'] = "rare_only"


In [87]:
# `classes` was only needed to build the folds. errors='ignore' keeps this cell
# re-runnable: without it, a second execution raises KeyError because the
# column has already been removed.
track_b = track_b.drop(columns= 'classes', errors= 'ignore')
track_b['folds'].unique()

array([4, 1, 3, 2, 0, 'rare_only'], dtype=object)

In [88]:
# Display track_b with the fold assignment in the `folds` column.
track_b

Unnamed: 0,id,text,Anger,Disgust,Fear,Joy,Sadness,Surprise,folds
0,ptbr_train_track_b_00001,Essa é a democracia e liberdade que a e seu p...,1,0,0,0,0,0,4
1,ptbr_train_track_b_00002,fiz isso! vou ficar prestando atenção se o che...,0,0,0,0,0,0,1
2,ptbr_train_track_b_00003,anao meu irmao ta em casa kk lixo,1,0,0,0,1,0,3
3,ptbr_train_track_b_00004,"Mas... a pandemia vem pra ficar, meu chapa. At...",0,0,2,0,0,0,3
4,ptbr_train_track_b_00005,Não entendi o post. Está enaltecendo a invasão...,2,1,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...
2221,ptbr_train_track_b_02222,Vc merece muito mais,0,0,0,1,0,0,0
2222,ptbr_train_track_b_02223,"Também , maioria sem nada para fazer , aí acab...",0,0,0,0,0,0,4
2223,ptbr_train_track_b_02224,acho que chegou a hora de esquecer os erros do...,0,0,0,0,0,0,3
2224,ptbr_train_track_b_02225,mas é óbvio oq vc esperava,0,0,0,0,0,1,4


In [89]:
# Copy the fold assignment over to track A by joining on identical `text`.
# Being an inner join, only rows whose text appears in both frames are kept.
fold_by_text = track_b[['text', 'folds']]
track_a = track_a.merge(fold_by_text, how= 'inner', on= 'text')

In [90]:
# Display track_a after the merge that added the `folds` column.
track_a

Unnamed: 0,id,text,Anger,Disgust,Fear,Joy,Sadness,Surprise,folds
0,ptbr_train_track_a_00001,"minha vó me disse que era frango e eu comi, ti...",0,0,0,0,1,0,2
1,ptbr_train_track_a_00002,Está e a nossa deputada Benedita linda guerrei...,0,0,0,1,0,0,3
2,ptbr_train_track_a_00003,só falta as roupas kkkkkkkkkkk,0,0,0,1,0,0,0
3,ptbr_train_track_a_00004,Eu tmb. Comecei a sair de casa agora (fui pela...,0,0,0,0,1,0,3
4,ptbr_train_track_a_00005,Peço a Deus que nossos dirigentes tenham realm...,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...
2221,ptbr_train_track_a_02222,Eu acho que o CAP vai surpreender hein.,0,0,0,0,0,1,2
2222,ptbr_train_track_a_02223,23:59 - Lula sabia de toda a corrupção no seu ...,1,0,0,0,0,0,1
2223,ptbr_train_track_a_02224,O Brasil precisa URGENTE de pessoas sérias e c...,1,0,0,0,0,0,1
2224,ptbr_train_track_a_02225,Sera que só eu acho que ta passando da hora de...,1,0,0,0,0,0,0


In [92]:
# Write each track to its own stratify.csv, leaving the DataFrame index out.
export_targets = {
    'validation/data/folds_a/stratify.csv': track_a,
    'validation/data/folds_b/stratify.csv': track_b,
}
for destination, frame in export_targets.items():
    frame.to_csv(destination, index= False)