## Binary Datasets

In [1]:
import os

import pandas as pd
from colorama import Fore, Style

In [2]:
# Binarize a single DataFrame into a balanced "cyberbullying vs. not" dataset
def binarize_single_df(df, positive_categories=('religion', 'age', 'ethnicity', 'gender')):
    """Build a class-balanced binary dataset from a multi-class frame.

    Every row labelled 'not_cyberbullying' is kept as the negative class.
    For each entry of ``positive_categories`` only the first
    ``count_negative // len(positive_categories)`` rows (in the frame's
    current order, no shuffling) are kept as the positive class. Rows whose
    label is neither 'not_cyberbullying' nor one of ``positive_categories``
    are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'cyberbullying_type' column holding category names.
    positive_categories : iterable of str or str, optional
        Categories that make up the positive class. The default reproduces
        the four categories that were previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        New frame (``df`` is not modified): negatives first, then each
        positive category in the given order, original index labels kept,
        with 'cyberbullying_type' replaced by 0 (not_cyberbullying) / 1.

    Raises
    ------
    KeyError
        If ``df`` has no 'cyberbullying_type' column.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(positive_categories, str):
        positive_categories = [positive_categories]
    positive_categories = list(positive_categories)

    labels = df['cyberbullying_type']
    is_negative = labels == 'not_cyberbullying'

    # Count with a boolean mask instead of value_counts()[...]: the latter
    # raises KeyError when the frame contains no negative rows at all.
    count_negative = int(is_negative.sum())
    # Quota of rows per positive category so that positives ~= negatives.
    count_positive_single = count_negative // len(positive_categories) if positive_categories else 0

    print(Fore.BLUE + Style.BRIGHT + f"\n Inizialmente sono presenti {count_negative} elementi negativi e {len(df) - count_negative} elementi positivi." + Style.RESET_ALL)

    # Keep only the first 'count_positive_single' rows of every positive category.
    positive_parts = [
        df[labels == category].head(count_positive_single)
        for category in positive_categories
    ]
    df_negative = df[is_negative]

    # Stack negatives and the per-category positive samples into one frame.
    final_df = pd.concat([df_negative, *positive_parts])

    # Collapse the category names into a binary label: 0 = not_cyberbullying, 1 = anything else.
    final_df['cyberbullying_type'] = final_df['cyberbullying_type'].ne('not_cyberbullying').astype('int64')

    return final_df

# Binarize all three DataFrames (train, evaluation, test)
def _binary_output_path(csv_path, suffix="_binary.csv"):
    """Return ``csv_path`` with the file name's '.csv' tail replaced by ``suffix``."""
    csv_path = os.fspath(csv_path)
    file_name = os.path.basename(csv_path)
    # Keep the directory part byte-for-byte; only the file name is rewritten,
    # so a '.csv' occurring inside a directory name cannot truncate the path.
    directory_prefix = csv_path[:len(csv_path) - len(file_name)]
    # Cut at the LAST '.csv' of the file name; names without '.csv' are kept whole.
    stem = file_name.rsplit(".csv", 1)[0]
    return directory_prefix + stem + suffix


def binarize_dfs(train_df_url, eval_df_url, test_df_url):
    """Binarize the train, evaluation and test CSV files and save the results.

    Each file is read with ``pd.read_csv``, passed through
    ``binarize_single_df`` and written next to its source as
    ``<name>_binary.csv`` (without the index).

    Parameters
    ----------
    train_df_url, eval_df_url, test_df_url : str or os.PathLike
        Locations of the three CSV files.

    Returns
    -------
    tuple
        ``(train, evaluation, test)`` binarized DataFrames.
    """
    sources = [train_df_url, eval_df_url, test_df_url]
    banners = ["Train set: ", "Evaluation set: ", "Test set: "]

    # Load everything first so that a missing input fails before any file is written.
    frames = [pd.read_csv(source) for source in sources]

    binarized = []
    for banner, frame in zip(banners, frames):
        # The banner stays on the same line as the summary printed by binarize_single_df.
        print(banner, end="")
        binarized.append(binarize_single_df(frame))

    for source, result in zip(sources, binarized):
        result.to_csv(_binary_output_path(source), index=False)

    return tuple(binarized)

In [3]:
# Binarize the BERT (Transformers) train / evaluation / test splits.
(
    train_set_binarized_BERT,
    evaluation_set_binarized_BERT,
    test_set_binarized_BERT,
) = binarize_dfs(
    '../../data/New dataset/BERT/train_tweets_Transformers_new.csv',
    '../../data/New dataset/BERT/eval_tweets_Transformers_new.csv',
    '../../data/New dataset/BERT/test_tweets_Transformers_new.csv',
)

Train set: [34m[1m
 Inizialmente sono presenti 4952 elementi negativi e 20203 elementi positivi.[0m
Evaluation set: [34m[1m
 Inizialmente sono presenti 1246 elementi negativi e 5079 elementi positivi.[0m
Test set: [34m[1m
 Inizialmente sono presenti 1338 elementi negativi e 6181 elementi positivi.[0m


In [4]:
# Binarize the LSTM (preprocessed) train / evaluation / test splits.
# NOTE(review): the test file is named '..._LSTM_p_new.csv' while train/eval use
# '..._LSTM_pre_new.csv' -- confirm this is the intended file name.
(
    train_set_binarized_LSTM,
    evaluation_set_binarized_LSTM,
    test_set_binarized_LSTM,
) = binarize_dfs(
    '../../data/New dataset/LSTM/preprocessing/train_tweets_LSTM_pre_new.csv',
    '../../data/New dataset/LSTM/preprocessing/eval_tweets_LSTM_pre_new.csv',
    '../../data/New dataset/LSTM/preprocessing/test_tweets_LSTM_p_new.csv',
)

Train set: [34m[1m
 Inizialmente sono presenti 4162 elementi negativi e 19627 elementi positivi.[0m
Evaluation set: [34m[1m
 Inizialmente sono presenti 1066 elementi negativi e 4962 elementi positivi.[0m
Test set: [34m[1m
 Inizialmente sono presenti 1338 elementi negativi e 6181 elementi positivi.[0m


In [5]:
# Binarize the Naive Bayes train / evaluation / test splits.
(
    train_set_binarized_NB,
    evaluation_set_binarized_NB,
    test_set_binarized_NB,
) = binarize_dfs(
    '../../data/New dataset/Naive Bayes/train_tweets_Naive_Bayes_new.csv',
    '../../data/New dataset/Naive Bayes/eval_tweets_Naive_Bayes_new.csv',
    '../../data/New dataset/Naive Bayes/test_tweets_Naive_Bayes_new.csv',
)

Train set: [34m[1m
 Inizialmente sono presenti 4319 elementi negativi e 19846 elementi positivi.[0m
Evaluation set: [34m[1m
 Inizialmente sono presenti 1101 elementi negativi e 5000 elementi positivi.[0m
Test set: [34m[1m
 Inizialmente sono presenti 1338 elementi negativi e 6181 elementi positivi.[0m


### Test set

In [6]:
# Label counts (0 = not cyberbullying, 1 = cyberbullying) of the binarized test splits.
for binarized_split in (test_set_binarized_BERT, test_set_binarized_LSTM, test_set_binarized_NB):
    print(binarized_split['cyberbullying_type'].value_counts())

cyberbullying_type
0    1338
1    1336
Name: count, dtype: int64
cyberbullying_type
0    1338
1    1336
Name: count, dtype: int64
cyberbullying_type
0    1338
1    1336
Name: count, dtype: int64


In [7]:
# First five rows of the binarized BERT test split.
test_set_binarized_BERT.head(n=5)

Unnamed: 0,tweet_text,cyberbullying_type
2,it's nikki's hot pot but she has katie make th...,0
3,omg we still have 15 minutes,0
4,haters always will be haters.. don't care abou...,0
5,"akhir2 ini disekolah jadi serba salah, di bull...",0
18,smh her mother sent her to school with unfinis...,0


In [8]:
# First five rows of the binarized LSTM test split.
test_set_binarized_LSTM.head(n=5)

Unnamed: 0,tweet_text,cyberbullying_type
2,nick s hot pot hate make,0
3,om still 15 minutes,0
4,always care live moment happy baby never forget,0
5,awhirl in jade sera salad salah i di bully mul...,0
18,sh mother sent school unfinished micro,0


In [9]:
# First five rows of the binarized Naive Bayes test split.
test_set_binarized_NB.head(n=5)

Unnamed: 0,tweet_text,cyberbullying_type
2,nikki s hot pot katie make sausage,0
3,omg still 15 minute,0
4,hater always hater care live moment happy baby...,0
5,akhir2 ini disekolah jadi serba salah di bully...,0
18,smh mother send school unfinished micro braid,0


### Train set

In [10]:
# Label counts (0 = not cyberbullying, 1 = cyberbullying) of the binarized train splits.
for binarized_split in (train_set_binarized_BERT, train_set_binarized_LSTM, train_set_binarized_NB):
    print(binarized_split['cyberbullying_type'].value_counts())

cyberbullying_type
0    4952
1    4952
Name: count, dtype: int64
cyberbullying_type
0    4162
1    4160
Name: count, dtype: int64
cyberbullying_type
0    4319
1    4316
Name: count, dtype: int64


In [11]:
# First five rows of the binarized BERT train split.
train_set_binarized_BERT.head(n=5)

Unnamed: 0,tweet_text,cyberbullying_type
3,nah dont need explain hope ur well how r u,0
4,hahahaha,0
5,kkkkkkkkkkk eu e a mamae zombando com o que a ...,0
11,which was my first choice.,0
14,'bezza?' 'omg this is bezza' 'becca?' this is ...,0


In [12]:
# First five rows of the binarized LSTM train split.
train_set_binarized_LSTM.head(n=5)

Unnamed: 0,tweet_text,cyberbullying_type
3,need explain hope well r,0
4,ka e eu e mama commando come due muller fall b...,0
12,pizza om pizza mecca om,0
25,would pay money see feline mother,0
31,yep plenty barney council anything else honest,0


In [13]:
# First five rows of the binarized Naive Bayes train split.
train_set_binarized_NB.head(n=5)

Unnamed: 0,tweet_text,cyberbullying_type
3,nah need explain hope well r,0
4,kk eu e mamae zombando com que mulher fala pra...,0
12,bezza omg bezza becca omg bully,0
25,would pay money see celine mother eliminate,0
31,yep plenty clichs barnet council expect anythi...,0


### Evaluation set

In [14]:
# Label counts (0 = not cyberbullying, 1 = cyberbullying) of the binarized evaluation splits.
for binarized_split in (evaluation_set_binarized_BERT, evaluation_set_binarized_LSTM, evaluation_set_binarized_NB):
    print(binarized_split['cyberbullying_type'].value_counts())

cyberbullying_type
0    1246
1    1244
Name: count, dtype: int64
cyberbullying_type
0    1066
1    1064
Name: count, dtype: int64
cyberbullying_type
0    1101
1    1100
Name: count, dtype: int64


In [15]:
# First five rows of the binarized BERT evaluation split.
evaluation_set_binarized_BERT.head(n=5)

Unnamed: 0,tweet_text,cyberbullying_type
0,"hey, do you have a good way to consume multipl...",0
1,argg need sleep xx fuckin school today and foc...,0
6,now these are the texts i wanna see gt;gt;gt;g...,0
7,mkr maybe dan ; dennis will have a cook-off to...,0
11,~10% of the posts i've read on facebook today ...,0


In [16]:
# First five rows of the binarized LSTM evaluation split.
evaluation_set_binarized_LSTM.head(n=5)

Unnamed: 0,tweet_text,cyberbullying_type
0,hey good way consume multiple son dumps single...,0
1,are need sleep ex xx fucking school today focu...,0
5,want see it it it it it it it it it it it,0
6,maybe can dan tennis cookout see crisis divorce,0
10,10 read casebook today people looking work tho...,0


In [17]:
# First five rows of the binarized Naive Bayes evaluation split.
evaluation_set_binarized_NB.head(n=5)

Unnamed: 0,tweet_text,cyberbullying_type
0,hey good way consume multiple json dump single...,0
1,argg need sleep xx fuckin school today focus d...,0
5,text want see gt gt gt gt gt gt gt gt gt gt gt,0
6,maybe dan dennis cookoff see win midlife crisi...,0
10,10 post read facebook today people look work j...,0
