In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
import os 
import sys 
import warnings 
import random
from pprint import pprint as pp
from dotenv import load_dotenv
import os
from huggingface_hub import whoami, HfFolder

import gc

import torch
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from torch.utils.data.dataloader import DataLoader

from transformers import BitsAndBytesConfig
from transformers import DataCollatorForLanguageModeling
from transformers import AutoTokenizer, AutoModelForCausalLM 
from transformers import set_seed, Seq2SeqTrainer, LlamaTokenizer

from datasets import Dataset, DatasetDict

from peft import LoraConfig, get_peft_model


  from .autonotebook import tqdm as notebook_tqdm


## Data Stuff

In [2]:
import torch
import torch.distributed as dist
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss, Softmax
from torch.utils.data.dataloader import DataLoader

from transformers import BitsAndBytesConfig
from transformers import DataCollatorForLanguageModeling
from transformers import AutoTokenizer, AutoModelForCausalLM 
from transformers import set_seed, Seq2SeqTrainer, LlamaTokenizer

from datasets import Dataset, DatasetDict

from peft import LoraConfig, get_peft_model

from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score


In [3]:
model_id = "Models/Qwen2.5-0.5B"

In [5]:
def log_hf():
    
    load_dotenv("env_vars.env")
    hf_token = os.environ.get("HF_ACCESS_TOKEN")
    HfFolder.save_token(hf_token)
    return print(whoami()["name"])

def setup():
    dist.init_process_group("nccl")
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
    return local_rank

def save_results_csv(df, experiment_name, model_id, cl_technique, result_type="specific"):

    cl_technique_clean = cl_technique.replace(" + ", "__")
    id_ = model_id.replace("/", "-") + "_" + cl_technique_clean + "_" + str(date.today())
    id_clean = experiment_name + id_.replace(" ", "_").replace(":", "-").replace(".","-") + result_type + ".csv"
    df.to_csv(id_clean, index=False)
    return print("Saved in path: ", id_clean)

def clean_cl_name(cl_name):

    regex = r'<(?:[\w\.]+)?\.([\w]+) object at'
    matches =   re.findall(regex, cl_name)
    clean_string = " + ".join(matches)
    return clean_string

def clean_metric_name(metric_name):

    reg = r"\s([a-z_1]+)\s"
    match_ = re.search(reg, metric_name)
    clean_str = match_.group().strip()

    return clean_str

def translate_class_to_label(class_):

    translation_dict = {"not_hate": "NOT HATEFUL",
                        "explicit_hate": "HATEFUL",
                        "implicit_hate": "HATEFUL"}

    translated_label = translation_dict[class_]

    return translated_label

def format_message(formatted_prompt, label=True):
    if label:
        messages = [
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": formatted_prompt},
            {"role": "assistant", "content": label}
        ]
    else:
        messages = [
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": formatted_prompt}
        ]
    return messages

base_prompt = """You are a social media content moderator.
INSTRUCTION: The following is a social media message that needs to be classified with the label HATEFUL or NOT HATEFUL.
MESSAGE: {}
OUTPUT AND FORMAT: your output should be just the label."""

def format_prompt(text, base_prompt=base_prompt):

    formatted_prompt = base_prompt.format(text)
    
    return formatted_prompt

def get_probability_distribution(logits):
    probability_dist = Softmax(dim=-1)(logits)
    return probability_dist

loss_fn = CrossEntropyLoss(ignore_index=-100) # ignore the left pad tokens
def loss_f(logits, labels):

    flat_logits = logits.view(-1, logits.size(-1))
    flat_labels = labels.view(-1)

    loss = loss_fn(flat_logits, flat_labels)
    
    return loss

def translate_prediction_to_label(text):
    if "NOT HATEFUL" in text:
        text_clean = text.replace("NOT HATEFUL", "")
        if "HATEFUL" in text_clean or "HATEFUAL" in text_clean:
            return 2
        else:
            return 0
    elif "NOT_HATEFUL" in text:
        text_clean = text.replace("NOT_HATEFUL", "")
        if "HATEFUL" in text_clean or "HATEFUAL" in text_clean:
            return 2
        else: 
            return 0
    else:
        return 1


In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_id + "/Tokenizer")
if tokenizer.pad_token is None and "Llama" in model_id: tokenizer.pad_token = '<|finetune_right_pad_id|>'
if tokenizer.pad_token is None and "Qwen" in model_id: tokenizer.pad_token = tokenizer.eos_token
tokenizer.chat_template = open(model_id + "/Tokenizer/chat_template.jinja").read()



def preprocess_and_tokenize(clean_post, label, base_prompt=base_prompt, max_length=512):
    
    prompt_plus_messages = base_prompt.format(clean_post)
    messages = [
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": prompt_plus_messages},
            {"role": "assistant", "content": label.strip("\n")}
        ]

    chat_template = tokenizer.apply_chat_template(messages, tokenize=False, continue_final_message=False, add_special_tokens=False).rstrip()
    input_ids_tokenized = tokenizer(chat_template, return_tensors="pt", add_special_tokens=False, padding="max_length", max_length=max_length)["input_ids"]

    # getting the normal text just to know how much we need to add to the left as -100 and right as pad token
    input_ids_shape = tokenizer(chat_template, return_tensors="pt", add_special_tokens=False, padding=False)["input_ids"]

    # getting the label target to only predict the actual label and ignore the prompt
    labels_tokenized = tokenizer(label + tokenizer.eos_token, add_special_tokens=True, return_tensors="pt")["input_ids"]
    shape = input_ids_shape.shape[1] - labels_tokenized.shape[1]
    zeros = torch.zeros((1, shape), dtype=labels_tokenized.dtype, device=labels_tokenized.device)
    zeros.fill_(-100) # for the cross entropy loss
    labels_left_padded = torch.cat([zeros, labels_tokenized], dim=1)

    eos_n = input_ids_tokenized.shape[1] - labels_left_padded.shape[1]
    eos_n_tensor = torch.zeros((1, eos_n), dtype=labels_tokenized.dtype, device=labels_tokenized.device)
    eos_n_tensor.fill_(tokenizer.encode(tokenizer.pad_token, add_special_tokens=False)[0])
    labels_padded = torch.cat([labels_left_padded, eos_n_tensor], dim=1)

    # print(labels_padded.shape == input_ids_tokenized.shape)

    # shifting because we dont predict the first token
    input_ids_tokenized_left_shifted = input_ids_tokenized[:, :-1]
    labels_tokenized_right_shifted = labels_padded[:, 1:]

    attention_mask = input_ids_tokenized_left_shifted != tokenizer.pad_token_id
    
    return {
        "input_ids": input_ids_tokenized_left_shifted,
        "labels": labels_tokenized_right_shifted,
        "attention_mask": attention_mask
    }


In [7]:
base_prompt = """You are a social media content moderator.
INSTRUCTION: The following is a social media message that needs to be classified with the label HATEFUL or NOT HATEFUL.
MESSAGE: {}
OUTPUT AND FORMAT: your output should be just the label."""


def format_prompt(text, base_prompt=base_prompt):

    formatted_prompt = base_prompt.format(text)
    
    return formatted_prompt

def translate_class_to_label(class_):

    translation_dict = {"not_hate": "NOT HATEFUL",
                        "explicit_hate": "HATEFUL",
                        "implicit_hate": "HATEFUL"}

    translated_label = translation_dict[class_]

    return translated_label


In [8]:
def setup():
    dist.init_process_group("nccl")
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
    return local_rank


In [9]:
dataset_path = "df_from_exp_to_imp.csv"

In [11]:
try:
    local_rank = setup()
except:
    local_rank = 1
device = torch.device(f"cuda:{local_rank}")
world_size = dist.get_world_size()


print("----------Preparing the Data-----------------")

print("_________________________________")
print("Loading and filtering the Data")

df = pd.read_csv(dataset_path)

#### Attaching the prompt to the clean post
df["formatted_prompt"] = df["clean_post"].apply(format_prompt)
df["label"] = df["class"].apply(translate_class_to_label)

# ### Turning the Df into a DatasetDict


times_array = list(df["time"].unique())
datasets = []
dataset_names = list(df["task"].unique())

for time in times_array:

    time_ds = []
    for split in df["split"].unique():

        split_df = df[(df["split"] == split) & (df["time"] == time)]
        hf_split = Dataset.from_pandas(split_df)
        time_ds.append(hf_split)
    datasets.append(time_ds)

hf_datasets = []

for i, dataset in enumerate(datasets):

    hf_ds = DatasetDict({dataset[0]["split"][0]: dataset[0], 
                        dataset[1]["split"][0]: dataset[1],
                        dataset[2]["split"][0]: dataset[2]})
    hf_ds_name = dataset_names[i]
    hf_datasets.append({hf_ds_name: hf_ds})

hf_datasets = [
    {task_name: hf_time.map(preprocess_and_tokenize, input_columns=["clean_post", "label"], batched=False)}
    for hf_data in hf_datasets
    for task_name, hf_time in hf_data.items() 
]

n_samples_per_ds = [
    len(hf_time["train"])
    for hf_data in hf_datasets
    for task_name, hf_time in hf_data.items() 
]

for ds in hf_datasets:
    for hf_data in ds.values():
        hf_data.set_format("torch")

cols_to_remove = ["clean_post", "post", "class", "implicit_class", "extra_implicit_class", 
                "target", "implied_statement", "split", "time", "task",
                "formatted_prompt", "label", "__index_level_0__"]

hf_datasets = [
    {task_name: {split: hf_time[split].remove_columns(cols_to_remove)}}
    for hf_data in hf_datasets
    for task_name, hf_time in hf_data.items()
    for split in hf_time 
    if split != "test"]

print("hf_datasets before data collator:")
print(hf_datasets)
print()
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# distributed_samplers = [
#     {task_name: {split: DistributedSampler(hf_time[split], num_replicas=world_size, rank=local_rank, shuffle=False)}}
#     for hf_data in hf_datasets
#     for task_name, hf_time in hf_data.items()
#     for split in hf_time 
#     if split != "test"
# ]

distributed_samplers = []
for ds in hf_datasets:
    ds_dict = {}
    print("ds:")
    print(ds)
    for task_name, hf_data in ds.items():
        print("task_name:")
        print(task_name)
        print("hf_data:")
        print(hf_data)
        ds_dict[task_name] = {}
        for split in hf_data:
            print("split:")
            print(split)
            if split != "test":
                distr_sampler = DistributedSampler(hf_data[split], num_replicas=world_size, rank=local_rank, shuffle=False)
                ds_dict[task_name][split] = distr_sampler
        dsitr_samplers.append(ds_dict)

data_loaders = []
for i, distr_sampler in enumerate(distributed_samplers):
    ds_name = list(distr_sampler.keys())[0]
    ds_dict = {}
    ds_dict[ds_name] = {}
    for split, distributed_sampler in distr_sampler[ds_name].items():
        data_loader = DataLoader(hf_datasets[i][ds_name][split], collate_fn=data_collator, batch_size=batch_size, sampler=distributed_sampler)
        ds_dict[ds_name][split] = data_loader
    data_loaders.append(ds_dict)



ValueError: Default process group has not been initialized, please make sure to call init_process_group.

In [None]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

distributed_samplers = [
    {task_name: {split: DistributedSampler(hf_time[split], num_replicas=world_size, rank=local_rank, shuffle=False)}}
    for hf_data in hf_datasets
    for task_name, hf_time in hf_data.items()
    for split in hf_time 
    if split != "test"
]

data_loaders = []
for i, distr_sampler in enumerate(distributed_samplers):
    ds_name = list(distr_sampler.keys())[0]
    ds_dict = {}
    ds_dict[ds_name] = {}
    for split, distributed_sampler in distr_sampler[ds_name].items():
        data_loader = DataLoader(hf_datasets[i][ds_name][split], collate_fn=data_collator, batch_size=batch_size, sampler=distributed_sampler)
        ds_dict[ds_name][split] = data_loader
    data_loaders.append(ds_dict)

# data loader = []
# each item in the list is a dictionary of {<dataset_name>: {<split>: <dataloade>}}



## Model Stuff

In [22]:
bnb_config = BitsAndBytesConfig(  
                                load_in_4bit= True,
                                bnb_4bit_quant_type= "nf4",
                                bnb_4bit_compute_dtype= torch.bfloat16,
                                bnb_4bit_use_double_quant= True,
                            )


In [27]:
model_id = "Models/Llama-3.2-1B-Instruct/Model"
tokenizer_id = "Models/Llama-3.2-1B-Instruct/Tokenizer"

model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)

# model.save_pretrained("Models/Llama-3.2-1B-Instruct/Model")
# tokenizer.save_pretrained("Models/Llama-3.2-1B-Instruct/Tokenizer")

Some parameters are on the meta device because they were offloaded to the cpu.


In [3]:
warnings.filterwarnings("ignore") 
# log_hf()
load_dotenv("env_vars.env")

set_seed(42)
random.seed(42)
torch.manual_seed(42)
np.random.seed(42)


In [4]:
batch_size = 8
n_epochs = 2
lr = 1e-5
lora_r = 8

In [5]:
########################################################## DATA WORK
print("_________________________________")
print("Preapring the Data")


df = pd.read_csv("df_from_exp_to_imp.csv")

base_prompt = """You are a social media content moderator.
INSTRUCTION: The following is a social media message that needs to be classified with the label HATEFUL or NOT HATEFUL.
MESSAGE: {}
OUTPUT AND FORMAT: your output should be just the label."""

_________________________________
Preapring the Data


In [6]:
tokenizer.SPECIAL_TOKENS_ATTRIBUTES

['bos_token',
 'eos_token',
 'unk_token',
 'sep_token',
 'pad_token',
 'cls_token',
 'mask_token',
 'additional_special_tokens']

In [7]:
# tokenizer.encode(tokenizer.pad_token)

In [8]:
print(tokenizer.mask_token)

None


In [9]:
print(tokenizer.pad_token)
if tokenizer.pad_token is None: tokenizer.pad_token = '<|finetune_right_pad_id|>'

None


In [10]:
print(tokenizer.pad_token)
print(type(tokenizer.encode(tokenizer.pad_token, add_special_tokens=False))[0])

<|finetune_right_pad_id|>
list[0]


In [11]:
tokenizer.encode(tokenizer.pad_token, add_special_tokens=True)[0]

128000

In [12]:
tokenizer.encode(tokenizer.pad_token, add_special_tokens=False)[0]

128004

In [13]:
tokens = tokenizer.encode("Hello, how are you?", return_tensors="pt")
print(tokens.shape)
second = tokens.fill_(-100)
tok = tokenizer.encode("Hello, how are you?", return_tensors="pt")

# torch_tensor = torch.tensor(tokens)


torch.Size([1, 7])


In [14]:
both = torch.cat((tokens,tok), dim=1)
both

tensor([[  -100,   -100,   -100,   -100,   -100,   -100,   -100, 128000,   9906,
             11,   1268,    527,    499,     30]])

In [15]:
second

tensor([[-100, -100, -100, -100, -100, -100, -100]])

In [16]:
# out = model(both)
# print(out)

In [17]:
tokenizer.pad_token

'<|finetune_right_pad_id|>'

In [18]:
def translate_class_to_label(class_):

    translation_dict = {"not_hate": "NOT HATEFUL",
                        "explicit_hate": "HATEFUL",
                        "implicit_hate": "HATEFUL"}

    translated_label = translation_dict[class_]

    return translated_label


In [20]:
def format_message(formatted_prompt, label=True):
    if label:
        messages = [
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": formatted_prompt},
            {"role": "assistant", "content": label}
        ]
    else:
        messages = [
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": formatted_prompt}
        ]
    return messages

def format_prompt(text, base_prompt=base_prompt):

    formatted_prompt = base_prompt.format(text)
    
    return formatted_prompt


In [21]:
def preprocess_and_tokenize(clean_post, label, base_prompt=base_prompt, max_length=312):

    # if type(label) != list:
    #     label = [label]
    # if type(clean_post) != list:
    #     clean_post = [clean_post]
    
    prompt_plus_messages = base_prompt.format(clean_post)
    # pp(prompt_plus_messages)
    # pp(label)
    messages = [
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": prompt_plus_messages},
            {"role": "assistant", "content": label.strip("\n")}
        ]

    # print(messages)
    chat_template = tokenizer.apply_chat_template(messages, tokenize=False, continue_final_message=False, add_special_tokens=False).rstrip()
    # print(chat_template)

    # why is the chat template putting a new line at the end of the end of sequence
    # pp(chat_template)
    input_ids_tokenized = tokenizer(chat_template, return_tensors="pt", add_special_tokens=False, padding="max_length", max_length=max_length)["input_ids"]

    # getting the normal text just to know how much we need to add to the left as -100 and right as pad token
    input_ids_shape = tokenizer(chat_template, return_tensors="pt", add_special_tokens=False, padding=False)["input_ids"]
    # print(input_ids_tokenized)

    # getting the label target to only predict the actual label and ignore the prompt
    labels_tokenized = tokenizer(label + tokenizer.eos_token, add_special_tokens=True, return_tensors="pt")["input_ids"]
    shape = input_ids_shape.shape[1] - labels_tokenized.shape[1]
    zeros = torch.zeros((1, shape), dtype=labels_tokenized.dtype, device=labels_tokenized.device)
    zeros.fill_(-100) # acc to llama docs
    labels_left_padded = torch.cat([zeros, labels_tokenized], dim=1)

    eos_n = input_ids_tokenized.shape[1] - labels_left_padded.shape[1]
    eos_n_tensor = torch.zeros((1, eos_n), dtype=labels_tokenized.dtype, device=labels_tokenized.device)
    print("FILLING PAD WITH")
    print(tokenizer.encode(tokenizer.pad_token, add_special_tokens=False)[0])
    eos_n_tensor.fill_(tokenizer.encode(tokenizer.pad_token, add_special_tokens=False)[0])
    labels_padded = torch.cat([labels_left_padded, eos_n_tensor], dim=1)

    # print(labels_padded.shape == input_ids_tokenized.shape)

    # shifting because we dont predict the first token
    input_ids_tokenized_left_shifted = input_ids_tokenized[:, :-1]
    labels_tokenized_right_shifted = labels_padded[:, 1:]

    attention_mask = input_ids_tokenized_left_shifted != tokenizer.pad_token_id
    
    return {
        "input_ids": input_ids_tokenized_left_shifted,
        "labels": labels_tokenized_right_shifted,
        "attention_mask": attention_mask
    }


In [28]:
def loss_f(logits, labels):

    loss_fn = CrossEntropyLoss(reduce=False)
    loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))
    
    return loss


In [29]:
   

#### Attaching the prompt to the clean post

df["formatted_prompt"] = df["clean_post"].apply(format_prompt)
df["label"] = df["class"].apply(translate_class_to_label)

# ### Turning the Df into a DatasetDict

t_1 = []
t_2 = []

for split in df["split"].unique():

    split_df_1 = df[(df["split"] == split) & (df["time"] == 1)]
    split_df_2 = df[(df["split"] == split) & (df["time"] == 2)]

    hf_split_1 = Dataset.from_pandas(split_df_1)
    hf_split_2 = Dataset.from_pandas(split_df_2)
    
    t_1.append(hf_split_1)
    t_2.append(hf_split_2)

hf_time_1 = DatasetDict({t_1[0]["split"][0]: t_1[0], 
                        t_1[1]["split"][0]: t_1[1],
                        t_1[2]["split"][0]: t_1[2]})

hf_time_2 = DatasetDict({t_2[0]["split"][0]: t_2[0], 
                        t_2[1]["split"][0]: t_2[1],
                        t_2[2]["split"][0]: t_2[2]})




In [34]:
tokenizer.pad_token = tokenizer.eos_token

In [None]:
ex = hf_time_1["train"][0]
input_model = preprocess_and_tokenize(ex["clean_post"], ex["label"], base_prompt=base_prompt, max_length=312)
input_model["labels"]
input_model["input_ids"]

In [40]:
output_model = model(**input_model)

In [59]:
output_model.loss

tensor(9.3625, grad_fn=<ToCopyBackward0>)

In [44]:
output_model.logits.shape

torch.Size([1, 311, 128256])

In [46]:
input_model["labels"].shape

torch.Size([1, 311])

In [66]:
logits = output_model.logits
labels = input_model["labels"]

flat_logits = logits.view(-1, logits.size(-1))
flat_labels = labels.view(-1)

In [71]:
flat_logits.shape

torch.Size([311, 128256])

In [72]:
flat_labels.shape

torch.Size([311])

In [75]:
loss_fn = CrossEntropyLoss(ignore_index=-100)  
loss = loss_fn(flat_logits, flat_labels)          
loss

tensor(8.9375, dtype=torch.bfloat16, grad_fn=<NllLossBackward0>)

In [58]:
labels.view(-1).shape

torch.Size([311])

Checking that the -100 is not being computed!!

In [78]:
mask = (flat_labels != -100)
mask.sum().item()

207

In [79]:
valid_logits = flat_logits[mask]   
valid_labels = flat_labels[mask]        

In [81]:
manual_loss = loss_fn(valid_logits, valid_labels)
manual_loss

tensor(8.9375, dtype=torch.bfloat16, grad_fn=<NllLossBackward0>)

In [None]:
def loss_f(logits, labels):

    loss_fn = CrossEntropyLoss(reduce=False)
    loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))
    
    return loss


In [11]:
########################################################## TOKENIZER WORK

hf_time_1 = hf_time_1.map(preprocess_and_tokenize, input_columns=["formatted_prompt", "label"], batched=False)
hf_time_2 = hf_time_2.map(preprocess_and_tokenize, input_columns=["formatted_prompt", "label"], batched=False)



Map: 100%|██████████| 718/718 [00:00<00:00, 1300.52 examples/s]
Map: 100%|██████████| 5752/5752 [00:04<00:00, 1307.97 examples/s]
Map: 100%|██████████| 720/720 [00:00<00:00, 1364.92 examples/s]
Map: 100%|██████████| 1019/1019 [00:00<00:00, 1268.26 examples/s]
Map: 100%|██████████| 8155/8155 [00:06<00:00, 1311.17 examples/s]
Map: 100%|██████████| 1020/1020 [00:00<00:00, 1156.90 examples/s]


In [27]:
hf_time_1["train"][0]["label"]

'NOT HATEFUL'

In [28]:
tokenizer.eos_token

'<|endoftext|>'

In [30]:
tokenizer.pad_token_id

151643

In [33]:
tokenizer.decode(151643)
end_of_text_token = 151643

In [41]:
hate_encoding = tokenizer.encode("HATEFUL")


In [42]:
not_hate_encoding = tokenizer.encode("NOT HATEFUL")


In [59]:
example = hf_time_1["train"][0]
example.keys()

dict_keys(['clean_post', 'post', 'class', 'implicit_class', 'extra_implicit_class', 'target', 'implied_statement', 'split', 'time', 'formatted_prompt', 'label', '__index_level_0__', 'input_ids', 'attention_mask'])

In [61]:
end_prompt = example["input_ids"][0].index(end_of_text_token)
end_prompt

90

In [71]:
label_encoding = tokenizer.encode(example["label"] + tokenizer.eos_token)
label_encoding

[14065, 472, 2336, 49636, 151643]

In [84]:
tokenizer.decode(14065)

'NOT'

In [87]:
start_answer = end_prompt-(len(label_encoding)+1)

In [None]:
tokenizer.decode(example["input_ids"][0][: start_answer])

In [88]:
tokenizer.decode(example["input_ids"][0][start_answer : end_prompt - 1])

'NOT HATEFUL<|im_end|>'

In [138]:
example.keys()

dict_keys(['clean_post', 'post', 'class', 'implicit_class', 'extra_implicit_class', 'target', 'implied_statement', 'split', 'time', 'formatted_prompt', 'label', '__index_level_0__', 'input_ids', 'attention_mask'])

In [None]:
# base_prompt



In [190]:
tokenizer.decode(198)

'\n'

In [None]:
if tokenizer.pad_token is None: tokenizer.pad_token = '<|finetune_right_pad_id|>'

def preprocess_and_tokenize(clean_post, label, base_prompt=base_prompt, max_length=312):

    # if type(label) != list:
    #     label = [label]
    # if type(clean_post) != list:
    #     clean_post = [clean_post]
    
    prompt_plus_messages = base_prompt.format(clean_post)
    # pp(prompt_plus_messages)
    # pp(label)
    messages = [
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": prompt_plus_messages},
            {"role": "assistant", "content": label.strip("\n")}
        ]

    # print(messages)
    chat_template = tokenizer.apply_chat_template(messages, tokenize=False, continue_final_message=False, add_special_tokens=False).rstrip()
    # print(chat_template)

    # why is the chat template putting a new line at the end of the end of sequence
    # pp(chat_template)
    input_ids_tokenized = tokenizer(chat_template, return_tensors="pt", add_special_tokens=False, padding="max_length", max_length=max_length)["input_ids"]

    # getting the normal text just to know how much we need to add to the left as -100 and right as pad token
    input_ids_shape = tokenizer(chat_template, return_tensors="pt", add_special_tokens=False, padding=False)["input_ids"]
    # print(input_ids_tokenized)

    # getting the label target to only predict the actual label and ignore the prompt
    labels_tokenized = tokenizer(label + tokenizer.eos_token, add_special_tokens=True, return_tensors="pt")["input_ids"]
    shape = input_ids_shape.shape[1] - labels_tokenized.shape[1]
    zeros = torch.zeros((1, shape), dtype=labels_tokenized.dtype, device=labels_tokenized.device)
    zeros.fill_(-100) # acc to llama docs
    labels_left_padded = torch.cat([zeros, labels_tokenized], dim=1)

    eos_n = input_ids_tokenized.shape[1] - labels_left_padded.shape[1]
    eos_n_tensor = torch.zeros((1, eos_n), dtype=labels_tokenized.dtype, device=labels_tokenized.device)
    eos_n_tensor.fill_(tokenizer.eos_token_id)
    labels_padded = torch.cat([labels_left_padded, eos_n_tensor], dim=1)

    # print(labels_padded.shape == input_ids_tokenized.shape)

    # shifting because we dont predict the first token
    input_ids_tokenized_left_shifted = input_ids_tokenized[:, :-1]
    labels_tokenized_right_shifted = labels_padded[:, 1:]

    attention_mask = input_ids_tokenized_left_shifted != tokenizer.pad_token_id
    
    return {
        "input_ids": input_ids_tokenized_left_shifted,
        "labels": labels_tokenized_right_shifted,
        "attention_mask": attention_mask
    }

In [214]:
hf_time_1 = hf_time_1.map(preprocess_and_tokenize, input_columns=["clean_post", "label"], batched=False)


Map: 100%|██████████| 718/718 [00:00<00:00, 821.43 examples/s]
Map: 100%|██████████| 5752/5752 [00:07<00:00, 818.96 examples/s]
Map: 100%|██████████| 720/720 [00:01<00:00, 622.72 examples/s]


In [None]:
hf_time_1.set_format("torch")
hf_time_2.set_format("torch")

cols_to_remove = ["clean_post", "post", "class", "implicit_class", "extra_implicit_class", "target", "implied_statement", "split", "time", "formatted_prompt", "label", "__index_level_0__"]

for split in hf_time_1:
    if split != "test":
        hf_time_1[split] = hf_time_1[split].remove_columns(cols_to_remove)
        hf_time_2[split] = hf_time_2[split].remove_columns(cols_to_remove)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


hf_time_1_train_loader = DataLoader(hf_time_1["train"], collate_fn=data_collator, batch_size=batch_size)
hf_time_1_validation_loader = DataLoader(hf_time_1["validation"], collate_fn=data_collator, batch_size=batch_size)
hf_time_1_test_loader = DataLoader(hf_time_1["test"], collate_fn=data_collator, batch_size=batch_size)

hf_time_2_train_loader = DataLoader(hf_time_2["train"], collate_fn=data_collator, batch_size=batch_size)
hf_time_2_validation_loader = DataLoader(hf_time_2["validation"], collate_fn=data_collator, batch_size=batch_size)
hf_time_2_test_loader = DataLoader(hf_time_2["test"], collate_fn=data_collator, batch_size=batch_size)

# ### So far, created the prompt, did the messages with the prompt and answer in place. Applied to chat template and tokenized 


In [8]:
########################3#################### MODEL WORK

print("_________________________________")
print("Loading the model and model config")

bnb_config = BitsAndBytesConfig(  
                                load_in_4bit= True,
                                bnb_4bit_quant_type= "nf4",
                                bnb_4bit_compute_dtype= torch.bfloat16,
                                bnb_4bit_use_double_quant= True,
                            )

model = AutoModelForCausalLM.from_pretrained(model_id,
                                            torch_dtype=torch.bfloat16,
                                            device_map="auto",
                                            quantization_config=bnb_config
                                            )

# to deal with the fact that we dont make the first token prediction??


model_size_before = sum(t.numel() for t in model.parameters())
print("Model Size before LoRA", model_size_before)
print(model)
print()

lora_alpha = lora_r*2
config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type="CAUSAL_LM",
    lora_dropout=0.1,
    bias="none",
)

model = get_peft_model(model, config)
print("Model After LoRA")
model.print_trainable_parameters()

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
model.to(device)


loss_fn = CrossEntropyLoss()
optimizer = AdamW((param for param in model.parameters() if param.requires_grad), lr=lr)


_________________________________
Loading the model and model config
Model Size before LoRA 315119488
Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear4bit(in_features=896, out_features=896, bias=True)
          (k_proj): Linear4bit(in_features=896, out_features=128, bias=True)
          (v_proj): Linear4bit(in_features=896, out_features=128, bias=True)
          (o_proj): Linear4bit(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear4bit(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear4bit(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear4bit(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layer

In [None]:
print("_________________________________")
print("Training the model")
print()

for epoch in range(n_epochs):

    torch.cuda.empty_cache()
    gc.collect()
    model.train()

    print("Epoch: ", epoch)
    losses = []

    for i, batch in enumerate(hf_time_1_train_loader):
        if i > 0:
            continue

        torch.cuda.empty_cache()
        gc.collect()

        print("\tBatch: ", i)
        # print(batch)
        batch.to(device)
        # print(batch.keys())
        # print(batch["input_ids"].shape)
        # print(batch["attention_mask"].shape)
        # print(batch["labels"].shape)


        batch = {k:torch.squeeze(v) for k,v in batch.items()}

        # print(batch["input_ids"].shape)
        # print(batch["attention_mask"].shape)
        # print(batch["labels"].shape)


        output = model(**batch)
        logits = output.logits
        loss = loss_fn(logits, batch["labels"])

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        losses.append(loss.detach().item())

        print(batch.keys())
        print(loss.detach().item())
        print(output.logits.shape)
        print(output.probas)

        if i > 3:
            continue

    epoch_loss = sum(losses)/len(hf_time_1_train_loader)
    print(f"Epoch {epoch} Loss: {epoch_loss}")

    model.eval()
    with torch.no_grad():  

        torch.cuda.empty_cache()
        gc.collect()

        val_losses = []

        for i, batch in enumerate(hf_time_1_validation_loader):
            if i > 0:
                continue
            batch.to(device)
            batch = {k:torch.squeeze(v) for k,v in batch.items()}

            output = model(**batch)
            logits = output.logits
            val_loss = loss_fn(logits, batch["labels"])

            val_losses.append(val_loss.detach().item())

        val_loss_epoch = sum(val_losses)/len(hf_time_1_validation_loader)
        print(f"Epoch {epoch} Validation Loss: {val_loss_epoch}")
print()


In [None]:
print("_________________________________")
print("Testing the model")
for i, test_batch in enumerate(hf_time_1["test"]):

    if i > 0:
        break
    
    text = test_batch["formatted_prompt"]
    tokenized_chat_template, messages_list = preprocess_and_tokenize(text, label=False, add_generation_prompt=True, output_messages_list=True)
    output = model.generate(**tokenized_chat_template.to(device))
    pred = tokenizer.decode(output[0], skip_special_tokens=True)
    
    print(text)
    print(tokenized_chat_template)
    print(output)
    print(pred)

print("CHECKING GENERATION")
print(messages_list)

print(tokenized_chat_template)
print(output)

print(type(output))
print(output.shape)

print("_________________________________")
print("Saving the model and Tokenizer")
model_name = model_id.split("/")[-1]
model.save_pretrained(f"alberto-lorente/{model_name}_test")
tokenizer.save_pretrained(f"alberto-lorente/{model_name}_test")

print("RUN SUCCESSFULLY")
