In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# Load the raw tweets CSV; the cells below read its 'tweet_text' and
# 'cyberbullying_type' columns.
df = pd.read_csv('../../data/tweets.csv')

# 1. Duplicates analysis

Check duplicated rows

In [35]:
# Number of rows that repeat an earlier row on every column
n_duplicated_rows = df.duplicated().sum()
print(f"I duplicati sono {n_duplicated_rows}")

I duplicati sono 36


In [36]:
# Rows whose tweet_text AND cyberbullying_type both match another row;
# keep=False flags every member of a duplicated group, not only the repeats
full_duplicate_mask = df.duplicated(keep=False)
df.loc[full_duplicate_mask]

Unnamed: 0,tweet_text,cyberbullying_type
829,Our pancakes are selling like hotcakes Shaz - ...,not_cyberbullying
1712,This is the opportunity to prove ourselves lik...,not_cyberbullying
1758,Our pancakes are selling like hotcakes Shaz - ...,not_cyberbullying
1984,@TVWEEKmag: There is only 1 way to stay in the...,not_cyberbullying
2611,It wouldn't be fair. Kat knows NOTHING of fair...,not_cyberbullying
...,...,...
20604,A Pakistani court has sentenced 86 members of ...,religion
41403,"Still, Davis, who is gay, said he pays a socia...",ethnicity
46915,Racism won't stop as long as u stil select ur ...,ethnicity
46962,"Still, Davis, who is gay, said he pays a socia...",ethnicity


### Now, we also observe tweets that are identical in tweet text but differ in cyberbullying type

In [37]:
# Rows whose tweet_text already appeared in an earlier row (label ignored)
repeated_text_mask = df.duplicated(subset=['tweet_text'])
repeated_text_mask.sum()

1675

In [38]:
# All rows that share their tweet_text with at least one other row
shared_text_mask = df.duplicated(subset=['tweet_text'], keep=False)
d = df[shared_text_mask]
len(d)

3350

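At this point, we have found 1675 pairs of tweets that share the same tweet_text (3350 rows); excluding the 36 exact duplicates, 1639 of these pairs differ in cyberbullying type. Let's observe how the duplicated rows are divided across the classes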

In [39]:
# Text-duplicated rows labelled 'religion'
int(d['cyberbullying_type'].eq('religion').sum())

8

In [40]:
# Text-duplicated rows labelled 'ethnicity'
int(d['cyberbullying_type'].eq('ethnicity').sum())

11

In [41]:
# Text-duplicated rows labelled 'age'
int(d['cyberbullying_type'].eq('age').sum())

0

In [42]:
# Text-duplicated rows labelled 'gender'
int(d['cyberbullying_type'].eq('gender').sum())

226

In [43]:
# Text-duplicated rows labelled 'not_cyberbullying'
int(d['cyberbullying_type'].eq('not_cyberbullying').sum())

1525

In [44]:
# Text-duplicated rows labelled 'other_cyberbullying'
int(d['cyberbullying_type'].eq('other_cyberbullying').sum())

1580

Now, we ask ourselves: how many of these pairs are composed of one 'other_cyberbullying' tweet and one 'not_cyberbullying' tweet?

In [45]:
# Restrict the text-duplicated rows to the two classes of interest, then
# count the texts that still appear twice inside that restricted subset
other_or_not_mask = d['cyberbullying_type'].isin(['other_cyberbullying', 'not_cyberbullying'])
dataset = d[other_or_not_mask]
dataset.duplicated(subset=['tweet_text']).sum()

1464

As can be seen, most of the duplicated tweets involve the "other_cyberbullying" class, so we must drop the tweets of that class

In [46]:
def drop_duplicates(df):
    """Return a copy of ``df`` without exact duplicate rows and without
    the rows labelled 'other_cyberbullying'.

    The input frame is left untouched; the caller must use the returned frame.
    """
    deduplicated = df.drop_duplicates()
    keep_mask = deduplicated['cyberbullying_type'].ne('other_cyberbullying')
    return deduplicated.loc[keep_mask]

In [47]:
# drop_duplicates returns a new frame and leaves df unchanged, so the result
# must be captured; writing df itself would save the unfiltered data.
df_clean = drop_duplicates(df)
df_clean.to_csv(r"../../data/updated_tweets.csv", index=False)

# 2. Datasplit

In [48]:

def split_dataset(input_file, train_file, eval_file, test_file, train_size=0.8, eval_size=0.1, random_state=None):
    """Split a CSV dataset into training, evaluation and test CSV files.

    The split happens in two steps: ``train_size`` of the rows are first
    separated from the test set, then the evaluation set is carved out of
    that training portion. With the default values the final proportions
    are therefore roughly 70% train / 10% eval / 20% test.

    Args:
        input_file: Path of the CSV file to split.
        train_file: Destination path of the training set CSV.
        eval_file: Destination path of the evaluation set CSV.
        test_file: Destination path of the test set CSV.
        train_size: Size of the training + evaluation portion produced by the
            first split (forwarded to ``train_test_split``).
        eval_size: Size of the evaluation set, expressed in the same unit as
            ``train_size``; must be strictly between 0 and ``train_size``.
        random_state: Seed forwarded to both ``train_test_split`` calls; pass
            an int to obtain a reproducible split.

    Raises:
        ValueError: If ``eval_size`` is not strictly between 0 and
            ``train_size``.
    """
    # Fail early with a clear message: otherwise the second split receives a
    # fraction outside (0, 1) and fails only after the file has been loaded.
    if not 0 < eval_size < train_size:
        raise ValueError(
            f"eval_size must be strictly between 0 and train_size, "
            f"got eval_size={eval_size!r} and train_size={train_size!r}"
        )

    # Load the dataset
    df = pd.read_csv(input_file)

    # First split: separate the training portion from the test set
    train_df, test_df = train_test_split(df, train_size=train_size, random_state=random_state)

    # Second split: take the evaluation set out of the training portion.
    # eval_df_size is the evaluation share relative to that portion.
    eval_df_size = eval_size / train_size
    train_df, eval_df = train_test_split(train_df, train_size=1 - eval_df_size, random_state=random_state)

    # Save each dataframe to its own file
    train_df.to_csv(train_file, index=False)
    eval_df.to_csv(eval_file, index=False)
    test_df.to_csv(test_file, index=False)

    print(f"Training set salvato in: {train_file}")
    print(f"Evaluation set salvato in: {eval_file}")
    print(f"Test set salvato in: {test_file}")

In [49]:

# Run the split; a fixed seed keeps the train/eval/test files identical across runs
split_dataset('../../data/updated_tweets.csv', '../../data/train_tweets.csv', '../../data/eval_tweets.csv', '../../data/test_tweets.csv', random_state=42)

Training set salvato in: ../../data/train_tweets.csv
Evaluation set salvato in: ../../data/eval_tweets.csv
Test set salvato in: ../../data/test_tweets.csv
