In [1]:
%%capture
# !pip install datasets
!pip install unsloth
!pip install datasets

In [2]:
!gdown --id 1yawwxvPEJYRVu6NtTnLUOgmYwJlxHxh_

Downloading...
From: https://drive.google.com/uc?id=1yawwxvPEJYRVu6NtTnLUOgmYwJlxHxh_
To: /content/final_training_data.csv
100% 125k/125k [00:00<00:00, 6.62MB/s]


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset, Dataset, DatasetDict

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [4]:
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    # model_name = "unsloth/Meta-Llama-3.1-8B",
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.10.2: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

Unsloth: We fixed a gradient accumulation bug, but it seems like you don't have the latest transformers version!
Please update transformers via:
`pip uninstall transformers -y && pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git"`


In [6]:
df = pd.read_csv(r'/content/final_training_data.csv')

In [7]:
df.head()

Unnamed: 0,coded_word,coded_usage,non_coded_usage,dataset
0,penguin,Bhai sorry say this but penguin kise pasand ha...,You can start with David Mcduff translation by...,coded
1,penguin,,Tell that to baby Penguin,coded
2,penguin,,"Even in your haul from yesterday, I especially...",coded
3,penguin,,Mujhe cake nah🖐️i penguin 🐧 chahiye,coded
4,penguin,,Why did Bruce Timm give us Dommy Mommy Penguin...,coded


In [8]:
print(df.shape)
print(df['dataset'].value_counts())

(513, 4)
dataset
slur     337
coded    176
Name: count, dtype: int64


In [9]:
df['nan_count'] = df[['coded_usage', 'non_coded_usage']].isnull().sum(axis=1)

# Define stratify column for equal distribution of 'dataset' and 'nan_count'
df['stratify_col'] = df['dataset'] + "_" + df['nan_count'].astype(str)

# Step 1: Split the data into train and temp (val + test)
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df['stratify_col'], random_state=42)

# Step 2: Split temp into validation and test sets
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['stratify_col'], random_state=42)

# Drop the helper columns
train_df = train_df.drop(columns=['nan_count', 'stratify_col'])
val_df = val_df.drop(columns=['nan_count', 'stratify_col'])
test_df = test_df.drop(columns=['nan_count', 'stratify_col'])

# Check the distribution of 'dataset' column and NaN values across the splits
print('Train Data', train_df['dataset'].value_counts(), train_df[['coded_usage', 'non_coded_usage']].isnull().sum())
print('Val Data', val_df['dataset'].value_counts(), val_df[['coded_usage', 'non_coded_usage']].isnull().sum())
print('Test Data', test_df['dataset'].value_counts(), test_df[['coded_usage', 'non_coded_usage']].isnull().sum())

Train Data dataset
slur     270
coded    140
Name: count, dtype: int64 coded_usage         88
non_coded_usage    234
dtype: int64
Val Data dataset
slur     33
coded    18
Name: count, dtype: int64 coded_usage        15
non_coded_usage    25
dtype: int64
Test Data dataset
slur     34
coded    18
Name: count, dtype: int64 coded_usage        14
non_coded_usage    27
dtype: int64


In [10]:
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

In [11]:
train_dataset = train_dataset.remove_columns(['__index_level_0__'])
val_dataset = val_dataset.remove_columns(['__index_level_0__'])
test_dataset = test_dataset.remove_columns(['__index_level_0__'])

In [12]:
dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

In [13]:
dataset

DatasetDict({
    train: Dataset({
        features: ['coded_word', 'coded_usage', 'non_coded_usage', 'dataset'],
        num_rows: 410
    })
    validation: Dataset({
        features: ['coded_word', 'coded_usage', 'non_coded_usage', 'dataset'],
        num_rows: 51
    })
    test: Dataset({
        features: ['coded_word', 'coded_usage', 'non_coded_usage', 'dataset'],
        num_rows: 52
    })
})

In [16]:
def generate_coded_prompt(slur_word, harmful_usage=None, non_harmful_usage=None):

    prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
    The system is finetuning the model to detect both overt and coded (dog whistle) uses of slur words in various contexts. Coded language, or "dog whistle" language, refers to suggestive or implicit phrases that may appear harmless but are used with harmful intentions.
    Use the following examples to learn how to distinguish between overtly coded, and non-coded uses of language.

    <|start_header_id|>user<|end_header_id|>
    Here is information about a potentially coded word (with examples):
    - Coded Word: {slur_word}
    """

    # Add harmful usage if provided
    if harmful_usage:
        prompt += f"\n\n**Harmful usage:**\n{harmful_usage}"

    # Add non-harmful usage if provided
    if non_harmful_usage:
        prompt += f"\n\n**Non-harmful usage:**\n{non_harmful_usage}"

    prompt += f"""

    <|start_header_id|>assistant<|end_header_id|>
    I understand. When analyzing this word and its usage examples, I will consider the context, intent, and potential impact.
    The harmful usage is offensive or damaging, while the non-harmful might not express hostility.

    <|eot_id|><|end_of_text|>"""

    return prompt

def generate_hateful_prompt(slur_word, harmful_usage=None, non_harmful_usage=None):

    prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
    The system is finetuning the model to detect both hateful uses of slur words in various contexts. Hate Speech is abusive or threatening speech or writing that expresses prejudice on the basis of ethnicity, religion, sexual orientation, or similar grounds.
    Use the following examples to learn how to distinguish between hateful, and non-hateful uses of language.

    <|start_header_id|>user<|end_header_id|>
    Here is information about a hateful word (with examples):
    - Hateful Word: {slur_word}
    """

    # Add harmful usage if provided
    if harmful_usage:
        prompt += f"\n\n**Hateful usage:**\n{harmful_usage}"

    # Add non-harmful usage if provided
    if non_harmful_usage:
        prompt += f"\n\n**Non-hateful usage:**\n{non_harmful_usage}"

    prompt += f"""

    <|start_header_id|>assistant<|end_header_id|>
    I understand. When analyzing this word and its usage examples, I will consider the context, intent, and potential impact.
    The harmful usage is offensive or damaging, while the non-harmful might not express hostility.

    <|eot_id|><|end_of_text|>"""

    return prompt

In [17]:
def apply_generate_prompt(row):
    if row['dataset'] == 'coded':
        return generate_coded_prompt(row['coded_word'], row['coded_usage'], row['non_coded_usage'])
    elif row['dataset'] == 'slur':
        return generate_hateful_prompt(row['coded_word'], row['coded_usage'], row['non_coded_usage'])
    else:
        return None

# Apply the function to each dataframe
train_df['prompt'] = train_df.apply(apply_generate_prompt, axis=1)
val_df['prompt'] = val_df.apply(apply_generate_prompt, axis=1)
test_df['prompt'] = test_df.apply(apply_generate_prompt, axis=1)

In [18]:
# %%time
# def tokenize_function(examples):
#     return tokenizer(examples['prompt'], truncation=True, padding='max_length', max_length=max_seq_length)

train_dataset = Dataset.from_pandas(train_df[['prompt']])
val_dataset = Dataset.from_pandas(val_df[['prompt']])
test_dataset = Dataset.from_pandas(test_df[['prompt']])

train_dataset = train_dataset.remove_columns(['__index_level_0__'])
val_dataset = val_dataset.remove_columns(['__index_level_0__'])
test_dataset = test_dataset.remove_columns(['__index_level_0__'])

dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

In [19]:
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt'],
        num_rows: 410
    })
    validation: Dataset({
        features: ['prompt'],
        num_rows: 51
    })
    test: Dataset({
        features: ['prompt'],
        num_rows: 52
    })
})

In [20]:
%%time
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.10.2 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


CPU times: user 4.94 s, sys: 24.9 ms, total: 4.97 s
Wall time: 4.99 s


In [21]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset['train'],
    eval_dataset = dataset['validation'],
    dataset_text_field = "prompt",  # This tells the trainer where the raw text is
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,  # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 4,
        # warmup_steps = 30,
        num_train_epochs = 10,
        # max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 10,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",  # Use this for WandB, TensorBoard, etc.
    ),
)

Map (num_proc=2):   0%|          | 0/410 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/51 [00:00<?, ? examples/s]

In [22]:
%%time
trainer_stats = trainer.train()

**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers and Unsloth!


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 410 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 250
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
10,2.4128
20,0.9955
30,1.0154
40,0.9133
50,0.9159
60,0.7632
70,0.8096
80,0.8436
90,0.6963
100,0.7544


CPU times: user 37min 22s, sys: 27min 45s, total: 1h 5min 8s
Wall time: 1h 5min 45s


In [23]:
%%time
# Save the fine-tuned model and tokenizer
output_dir = "./final_trained_model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

CPU times: user 403 ms, sys: 195 ms, total: 598 ms
Wall time: 4 s


('./final_trained_model/tokenizer_config.json',
 './final_trained_model/special_tokens_map.json',
 './final_trained_model/tokenizer.json')

In [24]:
import shutil
import os
zip_filename = r"final_trained_model"

shutil.make_archive(zip_filename, 'zip', output_dir)
zip_file_size = os.path.getsize(zip_filename)

print(f"Model zipped as {zip_filename}")
print(f"Size of the zip file: {zip_file_size / (1024 * 1024):.2f} MB")

Model zipped as final_trained_model
Size of the zip file: 0.00 MB


## we now generate the test file again

In [25]:
df['nan_count'] = df[['coded_usage', 'non_coded_usage']].isnull().sum(axis=1)

# Define stratify column for equal distribution of 'dataset' and 'nan_count'
df['stratify_col'] = df['dataset'] + "_" + df['nan_count'].astype(str)

# Step 1: Split the data into train and temp (val + test)
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df['stratify_col'], random_state=42)

# Step 2: Split temp into validation and test sets
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['stratify_col'], random_state=42)

# Drop the helper columns
train_df = train_df.drop(columns=['nan_count', 'stratify_col'])
val_df = val_df.drop(columns=['nan_count', 'stratify_col'])
test_df = test_df.drop(columns=['nan_count', 'stratify_col'])

# Check the distribution of 'dataset' column and NaN values across the splits
print('Train Data', train_df['dataset'].value_counts(), train_df[['coded_usage', 'non_coded_usage']].isnull().sum())
print('Val Data', val_df['dataset'].value_counts(), val_df[['coded_usage', 'non_coded_usage']].isnull().sum())
print('Test Data', test_df['dataset'].value_counts(), test_df[['coded_usage', 'non_coded_usage']].isnull().sum())

Train Data dataset
slur     270
coded    140
Name: count, dtype: int64 coded_usage         88
non_coded_usage    234
dtype: int64
Val Data dataset
slur     33
coded    18
Name: count, dtype: int64 coded_usage        15
non_coded_usage    25
dtype: int64
Test Data dataset
slur     34
coded    18
Name: count, dtype: int64 coded_usage        14
non_coded_usage    27
dtype: int64


In [27]:
test_df.shape

(52, 4)

In [32]:
df_test_new = pd.DataFrame(columns=["word", "text", "label"])

# Process coded_usage
coded_usage_df = test_df[['coded_word', 'coded_usage']].dropna()
coded_usage_df['label'] = 'unsafe'
coded_usage_df.columns = ['word', 'text', 'label']

# Process non_coded_usage
non_coded_usage_df = test_df[['coded_word', 'non_coded_usage']].dropna()
non_coded_usage_df['label'] = 'safe'
non_coded_usage_df.columns = ['word', 'text', 'label']

# Combine both dataframes
df_test_new = pd.concat([coded_usage_df, non_coded_usage_df], ignore_index=True)

In [34]:
df_test_new['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
unsafe,38
safe,25


In [37]:
# df_test_new.to_csv('test-final.csv', index=False)

In [44]:
model = FastLanguageModel.for_inference(model)
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit(
      

In [96]:
# # Helper function to generate prompts for inference
# def generate_inference_prompt(text):
#     prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
#     Please analyze the following text and determine whether it is harmful or not, only respond in one word, "harmful" or "unharmful":

#     <|start_header_id|>user<|end_header_id|>
#     Text: {text}

#     <|start_header_id|>assistant<|end_header_id|>
#     The text is"""
#     return prompt

# def generate_inference_prompt(text):
#     prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
#     Please analyze the following text and determine whether it is harmful or not, only respond in one word, "harmful" or "unharmful"
#     reply in one word only, that is a must

#     <|start_header_id|>user<|end_header_id|>
#     Text: {text}

#     <|eot_id|><|end_of_text|>"""

#     return prompt

def generate_inference_prompt(text):
    prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
    You must respond with a single word, either "harmful" or "not-harmful". Do not use any other words.

    <|start_header_id|>user<|end_header_id|>
    Classify the following text as either harmful or not-harmful: "{text}"

    <|start_header_id|>assistant<|end_header_id|>"""

    return prompt

In [108]:
def get_prediction(text):
    prompt = generate_inference_prompt(text)

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=max_seq_length)

    if torch.cuda.is_available():
        inputs = {key: value.cuda() for key, value in inputs.items()}
        model.cuda()

    # print(f"Input sequence length: {inputs['input_ids'].shape[1]}")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=5,  # Reduced to focus on shorter outputs
            do_sample=True,    # Keep some randomness
            num_beams=1,       # No beam search
            temperature=0.7,   # Slightly lower temperature for more focused output
            top_p=0.9,         # Nucleus sampling
            no_repeat_ngram_size=2
        )

    input_length = inputs['input_ids'].shape[1]

    # print(f"Total output sequence length: {outputs.shape[1]}")

    full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    new_tokens = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)

    # print(f"Full output: {full_output}")
    # print(f"New tokens: {new_tokens}")

    # Post-process the prediction
    prediction = new_tokens.strip().lower()
    harmful_words = ["harmful", "hateful", "offensive"]
    if any(word in prediction for word in harmful_words):
        return "unsafe"
    else:
        return "safe"

# Test the function
test_text = "I do like you very very much"
result = get_prediction(test_text)
print(f"Final prediction: {result}")

Final prediction: safe


In [109]:
%%time
df_test_new['prediction'] = df_test_new['text'].apply(get_prediction)

CPU times: user 34.5 s, sys: 2.35 s, total: 36.8 s
Wall time: 37.1 s


In [110]:
df_test_new

Unnamed: 0,word,text,label,prediction
0,Mulli,"Mulli ka kaam hi yhi hota hai, beech raste m k...",unsafe,safe
1,Ugly,It's only creepy if you're ugly,unsafe,unsafe
2,abdul,Just remember 90% of people were educated as p...,unsafe,unsafe
3,mutth,tu sirf mutth maar sakta hai woh bhi porn sub ...,unsafe,unsafe
4,burnol,Bro Kalki will never cross Jawan's collection....,unsafe,safe
...,...,...,...,...
58,nymph,"And then Goomerus arrives, with his stack of n...",safe,unsafe
59,abdul,"My take -\n\nHonest approach, good performance...",safe,unsafe
60,librandu,When did this sub become r/librandu,safe,unsafe
61,talwar,Sorry I meant Talwar. But even that was writte...,safe,safe


In [111]:
df_test_new['prediction'][0]

'safe'

In [112]:
df_test_new = df_test_new[['text', 'label', 'prediction']]

In [113]:
df_test_new

Unnamed: 0,text,label,prediction
0,"Mulli ka kaam hi yhi hota hai, beech raste m k...",unsafe,safe
1,It's only creepy if you're ugly,unsafe,unsafe
2,Just remember 90% of people were educated as p...,unsafe,unsafe
3,tu sirf mutth maar sakta hai woh bhi porn sub ...,unsafe,unsafe
4,Bro Kalki will never cross Jawan's collection....,unsafe,safe
...,...,...,...
58,"And then Goomerus arrives, with his stack of n...",safe,unsafe
59,"My take -\n\nHonest approach, good performance...",safe,unsafe
60,When did this sub become r/librandu,safe,unsafe
61,Sorry I meant Talwar. But even that was writte...,safe,safe


In [114]:
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import pandas as pd

# Assuming your dataframe is df_test_new and the columns are 'label' and 'prediction'

# 1. Calculate accuracy score
accuracy = accuracy_score(df_test_new['label'], df_test_new['prediction'])
print(f"Accuracy: {accuracy:.4f}")

# 2. Calculate F1-macro score
f1_macro = f1_score(df_test_new['label'], df_test_new['prediction'], average='macro')
print(f"F1-macro Score: {f1_macro:.4f}")

# 3. Generate classification report
report = classification_report(df_test_new['label'], df_test_new['prediction'])
print("Classification Report:\n", report)

# 4. Generate confusion matrix
conf_matrix = confusion_matrix(df_test_new['label'], df_test_new['prediction'])
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.4921
F1-macro Score: 0.4815
Classification Report:
               precision    recall  f1-score   support

        safe       0.38      0.44      0.41        25
      unsafe       0.59      0.53      0.56        38

    accuracy                           0.49        63
   macro avg       0.48      0.48      0.48        63
weighted avg       0.51      0.49      0.50        63

Confusion Matrix:
 [[11 14]
 [18 20]]
