In [46]:
from langchain_community.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from datasets import load_dataset
import pandas as pd
import ast
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re





In [47]:
#dowlnoad the dataset
data_load = load_dataset("nedjmaou/MLMA_hate_speech")
data_train = data_load["train"]
dataset = data_train.to_pandas()
print(dataset.head())
print(dataset.shape)

   HITId                                              tweet       sentiment  \
0      1  صلاة الفجر خير لك من ترديد بول البعير وسبي الن...  hateful_normal   
1      2  صراحة نفسي اشوف ولاد الوسخة اللي قالوا مدرب اج...       offensive   
2      3  طيب! هي متبرجة وعبايتها ملونه وطالعة من بيتهم ...       offensive   
3      4  @user @user انا اوافقك بخصوص السوريين و العراق...          normal   
4      5  هذه السعودية التي شعبها شعب الخيم و بول البعير...          normal   

  directness                           annotator_sentiment  target       group  
0   indirect                                         shock  gender  individual  
1   indirect  anger_confusion_sadness_indifference_disgust   other       other  
2   indirect                                  indifference   other  individual  
3     direct                                  indifference  origin       other  
4   indirect                                  indifference  origin       other  
(18661, 7)


In [48]:
#dataset separate by languages
arab_path = "C:/Users/masmoudi/deeplearning/dataset/ar_dataset_600.csv"
ar_dataset = pd.read_csv(arab_path)
#print(ar_dataset.shape)
#print(ar_dataset.head())
ar_dataset = ar_dataset.sample(frac=1, random_state=42)



en_path="C:/Users/masmoudi/deeplearning/dataset/en_dataset_600.csv"
en_dataset = pd.read_csv(en_path)
#print(en_dataset.shape)
#print(en_dataset.head())
en_dataset = en_dataset.sample(frac=1, random_state=42)


fr_path="C:/Users/masmoudi/deeplearning/dataset/fr_dataset_600.csv"
fr_dataset = pd.read_csv(fr_path)
#print(fr_dataset.shape)
#print(fr_dataset.head())
fr_dataset = fr_dataset.sample(frac=1, random_state=42)

#create binary datasets
en_dataset_binary=en_dataset.copy()
en_dataset_binary = en_dataset_binary[en_dataset_binary['sentiment_binary'].isin([1, 3])]
en_dataset_binary['sentiment_binary'] = en_dataset_binary['sentiment_binary'].replace(2, 0)
en_dataset_binary['sentiment_binary'] = en_dataset_binary['sentiment_binary'].replace(3, 1)

ar_dataset_binary=ar_dataset.copy()
ar_dataset_binary = ar_dataset_binary[ar_dataset_binary['sentiment_binary'].isin([1, 3])]
ar_dataset_binary['sentiment_binary'] = ar_dataset_binary['sentiment_binary'].replace(2, 0)
ar_dataset_binary['sentiment_binary'] = ar_dataset_binary['sentiment_binary'].replace(3, 1)

fr_dataset_binary=fr_dataset.copy()
fr_dataset_binary = fr_dataset_binary[fr_dataset_binary['sentiment_binary'].isin([1, 3])]
fr_dataset_binary['sentiment_binary'] = fr_dataset_binary['sentiment_binary'].replace(2, 0)
fr_dataset_binary['sentiment_binary'] = fr_dataset_binary['sentiment_binary'].replace(3, 1)


In [49]:
#dataset of arabic and french tweets translated to english
arab_tr_path = "C:/Users/masmoudi/deeplearning/dataset/ar_dataset_600_translated.csv"
ar_tr_dataset = pd.read_csv(arab_tr_path)
print(ar_tr_dataset.shape)
print(ar_tr_dataset.head())
ar_tr_dataset = ar_tr_dataset.sample(frac=1, random_state=42)


fr_tr_path="C:/Users/masmoudi/deeplearning/dataset/fr_dataset_600_translated.csv"
fr_tr_dataset = pd.read_csv(fr_tr_path)
print(fr_tr_dataset.shape)
print(fr_tr_dataset.head())
fr_tr_dataset = fr_tr_dataset.sample(frac=1, random_state=42)

#create binary datasets
ar_tr_dataset_binary=ar_tr_dataset.copy()
ar_tr_dataset_binary = ar_tr_dataset_binary[ar_tr_dataset_binary['sentiment_binary'].isin([1, 3])]
ar_tr_dataset_binary['sentiment_binary'] = ar_tr_dataset_binary['sentiment_binary'].replace(2, 0)
ar_tr_dataset_binary['sentiment_binary'] = ar_tr_dataset_binary['sentiment_binary'].replace(3, 1)

fr_tr_dataset_binary=fr_tr_dataset.copy()
fr_tr_dataset_binary = fr_tr_dataset_binary[fr_tr_dataset_binary['sentiment_binary'].isin([1, 3])]
fr_tr_dataset_binary['sentiment_binary'] = fr_tr_dataset_binary['sentiment_binary'].replace(2, 0)
fr_tr_dataset_binary['sentiment_binary'] = fr_tr_dataset_binary['sentiment_binary'].replace(3, 1)

(600, 9)
   Unnamed: 0  HITId sentiment directness   annotator_sentiment    target  \
0           0   1486    normal   indirect  indifference_disgust     other   
1           1   3194    normal   indirect          indifference  religion   
2           2   1384    normal   indirect                 shock    origin   
3           3   1391    normal   indirect          indifference     other   
4           4   1420    normal     direct               sadness     other   

   group  sentiment_binary                                              tweet  
0  other                 1  The urine of the camel and the treatment of ca...  
1  other                 1  Islam deals with issues before it occurs, and ...  
2  other                 1  @user @user is not a question .. white object ...  
3  other                 1  Lecture of Harassment and extortion, Faculty o...  
4  women                 1  A fierce war on the mirror and its veil from e...  
(600, 9)
   Unnamed: 0  HITId sentiment directne

In [52]:
#prompts
default_prompt = "Is the following text hateful or not? Just answer in Yes or No. Don't provide explanations. Text:"
default_prompt_en_hate_speech_def = "The definition of hate speech is ‘Hate speech’ is speech that attacks a person or group on the basis of attributes such as race, religion, ethnic origin, national origin, sex, disability, sexual orientation, or gender identity. Is the following text hateful or not? Just answer in Yes or No. Don't provide explanations. Text:"
default_prompt_fr_hate_speech_def = "La définition du discours de haine est la suivante : le « discours de haine » est un discours qui attaque une personne ou un groupe en raison d’attributs tels que la race, la religion, l'origine ethnique, l'origine nationale, le sexe, le handicap, l'orientation sexuelle ou l'identité de genre. Is the following text hateful or not? Just answer in Yes or No. Don't provide explanations. Text:"
default_prompt_ar_hate_speech_def = "تعريف خطاب الكراهية هو: 'خطاب الكراهية' هو خطاب يهاجم شخصًا أو مجموعة بناءً على سمات مثل العرق، الدين، الأصل العرقي، الأصل الوطني، الجنس، الإعاقة، التوجه الجنسي، أو الهوية الجندرية. Is the following text hateful or not? Just answer in Yes or No. Don't provide explanations. Text:"
default_prompt_hate_speech_def_3_languages = "The definition of hate speech is ‘Hate speech’ is speech that attacks a person or group on the basis of attributes such as race, religion, ethnic origin, national origin, sex, disability, sexual orientation, or gender identity. تعريف خطاب الكراهية هو: 'خطاب الكراهية' هو خطاب يهاجم شخصًا أو مجموعة بناءً على سمات مثل العرق، الدين، الأصل العرقي، الأصل الوطني، الجنس، الإعاقة، التوجه الجنسي، أو الهوية الجندرية. La définition du discours de haine est la suivante : le « discours de haine » est un discours qui attaque une personne ou un groupe en raison d’attributs tels que la race, la religion, l'origine ethnique, l'origine nationale, le sexe, le handicap, l'orientation sexuelle ou l'identité de genre. Is the following text hateful or not? Just answer in Yes or No. Don't provide explanations. Text:"
few_shot_en_examples = "Is the following text hateful or not? Just answer in Yes or No. Don't provide explanations. Example 1:  Text: 'i dont think im getting my baby them white 9 he has two white j and nikes not even touched'. Answer: No. Example 2: Text: 'we cannot continue calling ourselves feminists if the right of all womxn arent addressed yes to a sexual offences public list but will a trans lesbian bisexual and queer womxn be able to enter their information on the reporting sheet gender forum'. Answer: No. Example 3: Text: 'tariq nasheed beating the fuck out of that drum beloved lmao somebody put a biscuit in that niggas mouth already'. Answer: No. Example 3: Text: 'tariq nasheed beating the fuck out of that drum beloved lmao somebody put a biscuit in that niggas mouth already'. Answer: No. Example 4: Text: 'cotton thigh highs to avoid contact with any gay chemicals'. Answer: No. Example 5: Text: 'fellas be careful she texting other niggas'. Answer: No. Example 6: Text: 'laura loomer raped me while screaming at me in her disgusting kike language and said we must exterminate the goyim laura loomer loomergate'. Answer: Yes. Example 7: Text: 'lol what a stupid kike'. Answer: Yes. Example 8: Text: 'this sounds like a kike national socialist'. Answer: Yes. Example 9: Text: 'just another dumb nigger'. Answer: Yes. Example 10: Text: 'they are going to figure out a way to kike bitcoin'. Answer: Yes. Text:"
few_shot_en_fr_ar_examples = "Is the following text hateful or not? Just answer in Yes or No. Don't provide explanations. Example 1:  Text: 'i dont think im getting my baby them white 9 he has two white j and nikes not even touched'. Answer: No. Example 2: Text: 'we cannot continue calling ourselves feminists if the right of all womxn arent addressed yes to a sexual offences public list but will a trans lesbian bisexual and queer womxn be able to enter their information on the reporting sheet gender forum'. Answer: No. Example 3: Text: 'just another dumb nigger'. Answer: Yes. Example 4: Text: 'they are going to figure out a way to kike bitcoin'. Answer: Yes. Example 5: Text: 'يارب انصرنا #افريقيا_يا_اهلي'. Example 6: Text: 'الصور العارية واجسام البنات ومقاطع منحرفة ، ستشهد عليها الاصابع يوم القيامة ، كفاكم استهانة قالجنة اغلى'. Answer: No. Example 7: 'بس يا شارب بول البعير يا عبد ال سلول'. Answer: Yes. Example 8: Text: 'هههعععععع اي علوم ههههه بول البعير'. Answer: Yes. Example 9: Text: 'ohhh la violence je meuuurs'. Answer: No. Example 10: Text: 'pour ou contre l'avortement ?'. Answer: No. Example 11: Text: 'tu nous ramène une maladie de jérusalem?'. Anwer: Yes. Example 12: Text: 'le secours catholique ardent promoteur de l'ensauvagement et de l'islamisation de l'europe ! son catholicisme dévo…' Answer: Yes. Text:"

In [53]:
#run with specific prompt and different datasets
#we run the following:
#default prompt with en, fr, ar, translated fr and translated ar datasets
#few-shot (only english examples) with en, fr and ar datasets
#few-shot (examples in all languages) with en, fr and ar datasets
used_dataset = en_dataset #define the dataset to use
used_data = en_dataset_binary
used_prompt = default_prompt #define the prompt used
predictions_concat = []

# Wrap range with tqdm
for i in tqdm(range(used_data.shape[0])):
    tweet = used_dataset['tweet'].iloc[i]

    #default prompt
    prompt = [
        {"role": "system", "content": used_prompt},
        {"role": "user", "content": tweet}
    ]

    # LLM initialization
    llm = Ollama(model="llama3")
    response = llm.invoke(prompt)
    
    if response.strip().lower() == 'yes':
        predictions = 1
    elif response.strip().lower() == 'no':
        predictions = 0
    else:
        predictions = 0  # Default to non-hateful if the response is unexpected
    
    predictions_concat.append(predictions)

print(len(predictions_concat))

# Add the predictions to the dataset
used_data['predictions'] = predictions_concat

# Metrics
acc = accuracy_score(used_data['sentiment_binary'], used_data['predictions'])
F1 = f1_score(used_data['sentiment_binary'], used_data['predictions'], average='macro')
Precision = precision_score(used_data['sentiment_binary'], used_data['predictions'], average='macro')
Recall = recall_score(used_data['sentiment_binary'], used_data['predictions'], average='macro')

print(f"Accuracy: {acc:.3f}, F1: {F1:.3f}, Precision: {Precision:.3f}, Recall: {Recall:.3f}")

  2%|▎         | 10/400 [00:22<14:46,  2.27s/it]


IndexError: single positional indexer is out-of-bounds