In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd 
import numpy as np  
import matplotlib.pyplot as plt

import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

In [2]:
from Preprocess.data_cleaning import DataCleaning


## 1. Data exploration

In [3]:
df1 = pd.read_csv('C:/Users/iyadh/Desktop/hate_speech/Data/labeled_data.csv')
df2 = pd.read_csv('C:/Users/iyadh/Desktop/hate_speech/Data/Ethos_Dataset_Binary.csv', sep=';')


In [4]:
df1.head(4)

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...


In [5]:
df1.shape

(24783, 7)

In [6]:
df1.columns

Index(['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither',
       'class', 'tweet'],
      dtype='object')

In [7]:
df1 = df1.drop(columns=['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither'])

In [8]:
df1.head(4)

Unnamed: 0,class,tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...


In [9]:
df2.head(4)

Unnamed: 0,comment,isHate
0,You should know women's sports are a joke,1.0
1,You look like Sloth with deeper Down’s syndrome,1.0
2,You look like Russian and speak like Indian. B...,1.0
3,"Women deserve to be abused, I guess.",1.0


In [10]:
# df2 = df2.rename(columns={'comment': 'tweet', 'isHate': 'class'})

In [11]:
# Check class value counts (1 hate true , 2 neutral ,0 non hate)
#Were going to work with 1 and 0
df1['class'].value_counts()

class
1    19190
2     4163
0     1430
Name: count, dtype: int64

In [12]:
#Take only 1 and 0
df1 = df1[df1['class'].isin([0, 1])]

In [13]:
#Check results
df1['class'].value_counts()

class
1    19190
0     1430
Name: count, dtype: int64

In [14]:
#Rename twwet to comment
df1 = df1.rename(columns={'tweet': 'comment'})


In [15]:
#Check df2 hate value counts()
#As we can see its not cleaned sowergoing to clean it
df2['isHate'].value_counts()

isHate
0.000000    354
1.000000    163
0.166667    106
0.833333    100
0.333333     80
0.500000     74
0.666667     70
0.250000      6
0.750000      6
0.857143      3
0.903846      2
0.400000      2
0.018868      2
0.966667      1
0.954545      1
0.973333      1
0.978261      1
0.983871      1
0.983607      1
0.945455      1
0.937500      1
0.678571      1
0.821429      1
0.603448      1
0.722222      1
0.846154      1
0.849057      1
0.296875      1
0.302326      1
0.530612      1
0.200000      1
0.111111      1
0.103448      1
0.160714      1
0.152542      1
0.038961      1
0.090909      1
0.037736      1
0.031746      1
0.029851      1
0.030303      1
0.026316      1
0.016393      1
Name: count, dtype: int64

In [16]:
#Apply clean_class_column  
df2 = DataCleaning.clean_class_column(df2, 'isHate')


In [17]:
# Display clean_class_column result
df2['class'].value_counts()

class
0                 565
1                 358
not considered     75
Name: count, dtype: int64

In [18]:
#Drop is hate because we have class column 
df2 = df2.drop(columns=['isHate'])

In [19]:
df2

Unnamed: 0,comment,class
0,You should know women's sports are a joke,1
1,You look like Sloth with deeper Down’s syndrome,1
2,You look like Russian and speak like Indian. B...,1
3,"Women deserve to be abused, I guess.",1
4,Women are made for making babies and cooking d...,1
...,...,...
993,From the midnight sun where the hot springs blow,0
994,Don't say I'm not your type,0
995,And therefore never send to know for whom the...,0
996,And I can't stand another day,0


In [20]:
df = pd.concat([df1, df2], ignore_index=True)


In [21]:
df.shape

(21618, 2)

In [22]:
# Display target value counts
df['class'].value_counts()

class
1                 19548
0                  1995
not considered       75
Name: count, dtype: int64

In [23]:
# Nombre de valeurs NaN dans la colonne 'comment'
nan_number = df['comment'].isna().sum()
print("Number of NaN values in the column 'comment' :", nan_number)

Number of NaN values in the column 'comment' : 0


In [24]:
df["class"].value_counts(normalize=True)

class
1                 0.904246
0                 0.092284
not considered    0.003469
Name: proportion, dtype: float64

In [25]:
# On remarque que la class target is unbalanced so we must first treat this problem
# TO adress this problem were going to use : 
# - use T5 model
# - use PEGASUS paraphraser model 
# - use GPT-2 model 

# - use smote before training (when splitting data)

## 2. Data augmentation

In [26]:
# 
print(df['class'].value_counts())
minority_class = df['class'].value_counts().idxmin()  
minority_data = df[df['class'] == minority_class]['comment']  

class
1                 19548
0                  1995
not considered       75
Name: count, dtype: int64


In [38]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

model_name = "t5-large"  # Or "t5-base" or "t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [80]:
def paraphrase_text(text, num_return_sequences=3):
    input_text = f"Rephrase this text: {text.lower()}"  # Convert the input text to lowercase
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(
        inputs["input_ids"],
        max_length=20,
        num_return_sequences=num_return_sequences,  # Generate multiple paraphrases
        do_sample=True,  # Enable sampling for diversity
        top_k=50,
        top_p=0.95,
        no_repeat_ngram_size=2  # Prevents repeating 2-word sequences
    )
    paraphrases = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    
    # Convert the paraphrases to lowercase and remove the lowercase version of the prompt
    paraphrases = [paraphrase.replace("rephrase", "").strip().lower() for paraphrase in paraphrases]
    
    return paraphrases



In [110]:
# Test 
sample_text = "white people are shit human beings"
paraphrases = paraphrase_text(sample_text)
# print(f"Original: {sample_text}")
# print(f"Paraphrases: {paraphrases}")

In [111]:
print(f"Paraphrases: {paraphrases}")

Paraphrases: ['white people are shit.', 'white people are shit human beings.', 'white people are not human beings.']


In [84]:
#Display another example (Original text : i love this movie)
paraphrases[2]

'i love this movie.  : this film is awesome'

In [None]:
# peagsus paraphraser generation function
model_name = 'tuner007/pegasus_paraphrase'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

def get_response(input_text,num_return_sequences,num_beams):
  batch = tokenizer([input_text],truncation=True,padding='longest',max_length=60, return_tensors="pt").to(torch_device)
  translated = model.generate(**batch,max_length=60,num_beams=num_beams, num_return_sequences=num_return_sequences, temperature=1.5)
  tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
  return tgt_text

In [106]:
num_beams = 10
num_return_sequences = 10

In [109]:
response=get_response("white people are shit human beings",num_return_sequences,num_beams)
response

['White people are not human beings.',
 'White people are scum.',
 'White people are not human.',
 'White people are not good people.',
 'White people are stupid.',
 'White people are dumb.',
 'People of white colour are not human beings.',
 'White people are not nice.',
 'People who are white are scum.',
 'People who are white are not human beings.']

In [113]:
#Another example
response=get_response("i love this movie",num_return_sequences,num_beams)
response

['I like this movie.',
 'I really like this movie.',
 'I enjoy this movie.',
 'I like this movie a lot.',
 'I love this movie.',
 'I like this film.',
 'I like the movie.',
 'I love the movie.',
 'I think this movie is great.',
 'This is a movie I really like.']

In [None]:
# augmented_data = []
# for text in minority_data:
#     paraphrases = paraphrase_text(text, num_return_sequences=2)  # Generate 2 new samples per original
#     augmented_data.extend([(para, minority_class) for para in paraphrases])

# # Convert to dataframe
# augmented_df = pd.DataFrame(augmented_data, columns=['comment', 'label'])

In [145]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

def generate_text(text, max_length=20, num_samples=5):
    prompt = f"Parphrase this text: {text}"  # Clearer phrasing for paraphrasing
    inputs = tokenizer(prompt, return_tensors="pt")
    
    # Generate multiple samples
    outputs = model.generate(inputs["input_ids"], max_length=max_length, do_sample=True, top_k=50, num_return_sequences=num_samples)
    
    # Decode and return all the generated samples
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

# Example usage
text = "i love you"
generated_samples = generate_text(text)

# Print all generated samples
for i, sample in enumerate(generated_samples, 1):
    print(f"Sample {i}: {sample}")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sample 1: Parphrase this text: i love you!
Sample 2: Parphrase this text: i love you more than anything because I'm a pretty decent person


Sample 3: Parphrase this text: i love you so much but i dont want to talk anymore as i dont
Sample 4: Parphrase this text: i love you (or my wife) my heart desires your devotion to me
Sample 5: Parphrase this text: i love you lmao


Click to expand...


##### As we can see the models are generating some decent data especially Pegasus so we're going to mainly focus on data generated from Pegasus

##### Since we dont have cuda were going to run in colab and u can find the notebooks in folder colab notebooks

## 3. Data Perprocessing

##### After solving the unmatched data count we're going to preprocess it so we can feed it to the model