In [13]:
import numpy as np
import jsonlines
import random
import os 
import pandas as pd
import json
import torch
#from transformers import RobertaTokenizer, BertTokenizer, RobertaForSequenceClassification
## import packages for training
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from utils import preprocess_text, get_paraphrase_batch, get_paraphrase_dataset

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### load the data and first processing steps 

In [2]:
# Input CSV files: the misogyny dataset path is relative to the working
# directory, the sexism dataset path is absolute.
path_misogyny_dataset = 'online-misogyny-eacl2021/data/final_labels.csv'
path_sexism_dataset = '/scratch/izar/havolli/edos/data/edos_labelled_aggregated.csv'

# Fail fast, and say which file is missing, before any later cell runs.
for _dataset_path in (path_misogyny_dataset, path_sexism_dataset):
    assert os.path.exists(_dataset_path), f"Dataset file not found: {_dataset_path}"

#### let's start with the misogyny dataset

In [3]:
# Read the misogyny CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=path_misogyny_dataset)

In [4]:
# Normalise the text column: missing bodies become empty strings and every
# value is cast to str.
df['body'] = df['body'].fillna('').astype(str)

In [5]:
# Keep the four columns of interest and rename 'body' -> 'text',
# 'level_1' -> 'label'; 'entry_id' and 'split' keep their names.
filtered_df = df[['entry_id', 'body', 'level_1', 'split']].rename(
    columns={'body': 'text', 'level_1': 'label'}
)

filtered_df.head()

Unnamed: 0,entry_id,text,label,split
0,exoxn7,Do you have the skin of a 80 year old grandma?...,Nonmisogynistic,train
1,fgb3bdv,This is taking a grain of truth and extrapolat...,Nonmisogynistic,train
2,fgc6tlu,Honestly my favorite thing about this is that ...,Nonmisogynistic,test
3,fge6msg,Source? Doesnt sound right to me idk,Nonmisogynistic,test
4,fgawus5,"Damn, I saw a movie in which the old woman bat...",Misogynistic,train


In [6]:
# Destination of the JSON Lines export
path_json = '/scratch/izar/havolli/train_data.jsonl'

# One JSON record per line.
# NOTE: despite its name, train_data holds every row of filtered_df
# (no filtering on 'split' happens before this point).
train_data = filtered_df.to_json(orient='records', lines=True)

# Write the serialised records to disk, overwriting any existing file
with open(path_json, 'w') as out_file:
    out_file.write(train_data)

In [7]:
# Partition the rows by their 'label' value.
# NOTE(review): the preview of misogynistic_df lists entry_id 'fgdhmbf' twice
# (rows 58 and 59) -- confirm whether duplicated rows are intended.
misogynistic_df = filtered_df.loc[filtered_df['label'].eq('Misogynistic')]
non_misogynistic_df = filtered_df.loc[filtered_df['label'].eq('Nonmisogynistic')]

misogynistic_df.head()

Unnamed: 0,entry_id,text,label,split
4,fgawus5,"Damn, I saw a movie in which the old woman bat...",Misogynistic,train
58,fgdhmbf,Okay but even if this wasn't a stupid hyperbol...,Misogynistic,train
59,fgdhmbf,Okay but even if this wasn't a stupid hyperbol...,Misogynistic,train
95,fgmifk2,> The problem is that they removed the urinals...,Misogynistic,train
96,fgmx3lv,But using the urinals in front of girls that a...,Misogynistic,train


In [19]:
misogynistic_df.shape

(699, 4)

In [8]:
non_misogynistic_df.head()

Unnamed: 0,entry_id,text,label,split
0,exoxn7,Do you have the skin of a 80 year old grandma?...,Nonmisogynistic,train
1,fgb3bdv,This is taking a grain of truth and extrapolat...,Nonmisogynistic,train
2,fgc6tlu,Honestly my favorite thing about this is that ...,Nonmisogynistic,test
3,fge6msg,Source? Doesnt sound right to me idk,Nonmisogynistic,test
5,fgctirr,It's a question of the sales pitch involved.\r...,Nonmisogynistic,train


In [9]:
## Save each label subset as its own JSON Lines file

# Destination files
miso_json = '/scratch/izar/havolli/misogynistic_data.jsonl'
non_miso_json = '/scratch/izar/havolli/non_misogynistic_data.jsonl'

# Serialise both subsets, one JSON record per line
misogynistic_data = misogynistic_df.to_json(orient='records', lines=True)
non_misogynistic_data = non_misogynistic_df.to_json(orient='records', lines=True)

# Write each payload to its destination, overwriting any existing file
for _payload, _destination in (
    (misogynistic_data, miso_json),
    (non_misogynistic_data, non_miso_json),
):
    with open(_destination, 'w') as file:
        file.write(_payload)

In [10]:
# Use the first CUDA device when one is available, otherwise run on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [11]:
# Pretrained paraphrase model and its matching tokenizer
# (https://huggingface.co/humarin/chatgpt_paraphraser_on_T5_base)
PARAPHRASE_CHECKPOINT = "humarin/chatgpt_paraphraser_on_T5_base"

paraphrase_tokenizer = AutoTokenizer.from_pretrained(PARAPHRASE_CHECKPOINT)
paraphrase_model = AutoModelForSeq2SeqLM.from_pretrained(PARAPHRASE_CHECKPOINT)
# Move the model onto the device selected earlier
paraphrase_model = paraphrase_model.to(device)

  return self.fget.__get__(instance, owner)()


### we will use paraphrasing to augment our dataset (only in misogynistic cases)

In [14]:
# JSON Lines file holding the misogynistic records to paraphrase
misogyny_path = '/scratch/izar/havolli/misogynistic_data.jsonl'
BATCH_SIZE = 8    # presumably the number of inputs per generation batch -- confirm in utils
N_PARAPHRASE = 2  # paraphrases requested per input record

# Generate the paraphrases with the helper from utils
paraphrase_dataset = get_paraphrase_dataset(
    paraphrase_model,
    paraphrase_tokenizer,
    device,
    misogyny_path,
    BATCH_SIZE,
    N_PARAPHRASE,
)



In [16]:
len(paraphrase_dataset) #for each input misogynistic case (699 in total), we create 2 paraphrases --> 2*699=1398 

1398

In [17]:
# Persist the generated paraphrases as JSON Lines
paraphrased_data_train_path = '/scratch/izar/havolli/paraphrased_misogyny_data.jsonl'
with jsonlines.open(paraphrased_data_train_path, mode="w") as writer:
    writer.write_all(paraphrase_dataset)

In [20]:
# Reload the un-paraphrased misogynistic records as a list
with jsonlines.open(misogyny_path, "r") as reader:
    original_data = list(reader.iter())

In [21]:
len(original_data) #this is the original data for the misogynistic cases 

699

In [22]:
def write_to_jsonl(first_data, second_data, output_file_path, seed=None):
    """Concatenate two record collections, shuffle them, and write a JSON Lines file.

    Args:
        first_data: Iterable of JSON-serialisable records.
        second_data: Iterable of JSON-serialisable records.
        output_file_path: Destination path. The file is opened in write mode,
            so an existing file is overwritten.
        seed: Optional shuffle seed. When given, a private
            ``random.Random(seed)`` is used so the written order is
            reproducible. When ``None`` (the default) the module-level
            ``random.shuffle`` is used, exactly as before.

    Returns:
        None. The only effect is the file written at ``output_file_path``.
    """
    # Copy into a fresh list: the inputs are never mutated and any iterable
    # (not only list) is accepted.
    combined_data = list(first_data) + list(second_data)

    # Shuffle so the two sources are interleaved in the output
    if seed is None:
        random.shuffle(combined_data)
    else:
        random.Random(seed).shuffle(combined_data)

    # Write one record per line
    with jsonlines.open(output_file_path, mode='w') as writer:
        writer.write_all(combined_data)

In [23]:
# Shuffle the original misogynistic records together with their paraphrases
# and store the result; this file contains misogynistic cases only.
output_file_path = '/scratch/izar/havolli/augmented_misogynistic_data.jsonl'
write_to_jsonl(
    first_data=original_data,
    second_data=paraphrase_dataset,
    output_file_path=output_file_path,
)

In [26]:
# Load the augmented misogynistic records and the non-misogynistic records
# back from disk as lists.
with jsonlines.open('/scratch/izar/havolli/augmented_misogynistic_data.jsonl', "r") as reader:
    misogynistic_augmented_data = list(reader.iter())

# NOTE: rebinds non_misogynistic_data, which previously held the JSON string
with jsonlines.open(non_miso_json, "r") as reader:
    non_misogynistic_data = list(reader.iter())

In [27]:
len(misogynistic_augmented_data), len(non_misogynistic_data)

(2097, 5868)

In [30]:
# Combine both classes into a single shuffled JSON Lines file
output_file_path = '/scratch/izar/havolli/augmented_data.jsonl'
write_to_jsonl(
    first_data=misogynistic_augmented_data,
    second_data=non_misogynistic_data,
    output_file_path=output_file_path,
)

In [31]:
# Read the combined file back to check how many records were written
output_file_path = '/scratch/izar/havolli/augmented_data.jsonl'
with jsonlines.open(output_file_path, "r") as reader:
    augmented_data = list(reader.iter())

len(augmented_data)

7965

### finally create a validation set

In [32]:
# Load the combined JSON Lines file into a DataFrame.
# NOTE: this rebinds df, which earlier held the raw misogyny CSV.
df = pd.read_json(path_or_buf=output_file_path, lines=True)

In [33]:
# Rows currently assigned to the training split
train_df = df[df['split'] == 'train']

# Pick 10% of the training rows; random_state is fixed for reproducibility
validation_idx = train_df.sample(frac=0.1, random_state=42).index

# Relabel the sampled rows as 'dev' directly on df. Assigning through .loc
# avoids writing into a temporary slice (chained assignment) and the
# df.update round-trip.
# NOTE: re-running this cell moves a further 10% of the remaining 'train'
# rows to 'dev'; reload df first if the cell must be re-executed.
# NOTE(review): sampling is row-wise over the augmented data, so a paraphrase
# and its source row can land in different splits -- confirm this is acceptable.
df.loc[validation_idx, 'split'] = 'dev'

# The rows that now make up the dev split
validation_set = df.loc[validation_idx]

In [34]:
df['split'].unique()

array(['train', 'test', 'dev'], dtype=object)

In [36]:
# Overwrite the combined JSON Lines file with the updated split labels
df.to_json(
    path_or_buf='/scratch/izar/havolli/augmented_data.jsonl',
    orient='records',
    lines=True,
)

In [37]:
df.shape

(7965, 4)

## sexism dataset for pretraining

In [103]:
# Read the sexism CSV and preview its first rows
df_sexism = pd.read_csv(filepath_or_buffer=path_sexism_dataset)

df_sexism.head()

Unnamed: 0,rewire_id,text,label_sexist,label_category,label_vector,split
0,sexism2022_english-9609,"In Nigeria, if you rape a woman, the men rape ...",not sexist,none,none,dev
1,sexism2022_english-16993,"Then, she's a keeper. 😉",not sexist,none,none,train
2,sexism2022_english-13149,This is like the Metallica video where the poo...,not sexist,none,none,train
3,sexism2022_english-13021,woman?,not sexist,none,none,train
4,sexism2022_english-966,I bet she wished she had a gun,not sexist,none,none,dev


In [104]:
df_sexism.shape

(20000, 6)

In [109]:
# Normalise the text column: missing values become empty strings and every
# value is cast to str.
df_sexism['text'] = df_sexism['text'].fillna('').astype(str)

In [106]:
#unique values of df_sexism['split']
df_sexism['split'].unique()

array(['dev', 'train', 'test'], dtype=object)

In [110]:
# Keep the identifier, text, binary label and split columns. The explicit
# .copy() makes the frame independent of df_sexism before it is modified.
df_filtered_sexism = df_sexism[['rewire_id', 'text', 'label_sexist', 'split']].copy()
# Use the same column names as the misogyny data. The original cell assigned
# to the undefined name `filtered_sexism`, which raised NameError.
df_filtered_sexism.columns = ['entry_id', 'text', 'label', 'split']

In [111]:
# Output file; the path is relative, so it lands in the current working directory
path_jsonl = 'sexism_data.jsonl'

# Serialise the filtered sexism data, one JSON record per line
sexism_jsonl = df_filtered_sexism.to_json(orient='records', lines=True)

# Write the records to disk, overwriting any existing file
with open(path_jsonl, 'w') as out_file:
    out_file.write(sexism_jsonl)