In [None]:
import torch
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer,pipeline
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils import resample

In [None]:
import warnings

# Silence every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

In [None]:
# Mount Google Drive at /content/drive; the dataset is read from under this
# mount point further down.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Location of the raw CSV on the mounted Drive.
file_path = '/content/drive/MyDrive/dataset.csv'

# Read the whole file into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Keep only rows that have both a prompt and a label, then store the label
# column as integers.
df = (
    df.dropna(subset=['user_prompt', 'safe_flag'])
      .astype({'safe_flag': int})
)

In [None]:
# Remove the stray 'Unnamed: 0' column if it is present.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it, or loading a CSV that lacks the column,
# no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Split the rows by label. Class 1 is treated as the minority class here.
df_minority = df[df.safe_flag == 1]
df_majority = df[df.safe_flag == 0]

# Draw class-1 rows with replacement until there are as many of them as
# class-0 rows.
df_minority_upsampled = resample(df_minority,
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=42)

df_balanced = pd.concat([df_majority, df_minority_upsampled])
# Shuffle with a fixed seed. Without random_state the row order differs on
# every run, which makes the (seeded) train/validation split further down --
# and therefore every reported metric -- non-reproducible.
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
print("Balanced label counts:\n", df_balanced['safe_flag'].value_counts())

Balanced label counts:
 safe_flag
0    27076
1    27076
Name: count, dtype: int64


In [None]:
# Fit the encoder on the label column, then overwrite that column with the
# encoded class ids.
encoder = LabelEncoder()
encoder.fit(df_balanced['safe_flag'])
df_balanced['safe_flag'] = encoder.transform(df_balanced['safe_flag'])

In [None]:
# Hold out roughly 20% of the data for validation, splitting by *prompt text*
# rather than by row. df_balanced contains rows drawn with replacement
# (resample(..., replace=True)), so a plain row-wise split places copies of the
# same prompt on both sides and inflates the validation metrics.
from sklearn.model_selection import GroupShuffleSplit

# test_size applies to groups (distinct prompts), so the row-level proportion
# is approximately, not exactly, 20%.
_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
_train_idx, _val_idx = next(
    _splitter.split(df_balanced, groups=df_balanced['user_prompt'])
)

# Same output names and types (pandas Series) as before, so the cells that
# call .tolist() on them keep working unchanged.
train_texts = df_balanced['user_prompt'].iloc[_train_idx]
val_texts = df_balanced['user_prompt'].iloc[_val_idx]
train_labels = df_balanced['safe_flag'].iloc[_train_idx]
val_labels = df_balanced['safe_flag'].iloc[_val_idx]

# Guard against regressions: no prompt may appear in both splits.
assert set(train_texts).isdisjoint(val_texts), "prompt leaked across train/val split"

In [None]:
# Load the tokenizer that belongs to the "roberta-base" checkpoint; the
# tokenize() function defined below reads this module-level name.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
def tokenize(batch):
    """Tokenize the ``text`` entry of ``batch`` with the module-level ``tokenizer``.

    The tokenizer is asked to truncate and to pad to ``max_length`` with a
    maximum length of 128. The tokenizer's return value is passed back as is.
    """
    encoding_options = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer(batch['text'], **encoding_options)

def _to_torch_dataset(texts, labels):
    """Build a tokenized Dataset from a text Series and a label Series.

    The raw "text" column is removed after tokenization and the remaining
    model inputs plus "labels" are exposed in torch format.
    """
    split = Dataset.from_dict({"text": texts.tolist(), "labels": labels.tolist()})
    split = split.map(tokenize, batched=True, remove_columns=["text"])
    split.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    return split


# Training split first, then validation.
train_ds = _to_torch_dataset(train_texts, train_labels)
val_ds = _to_torch_dataset(val_texts, val_labels)

Map:   0%|          | 0/43321 [00:00<?, ? examples/s]

Map:   0%|          | 0/10831 [00:00<?, ? examples/s]

In [None]:
# Size the classification head from the number of classes the label encoder saw.
num_labels = len(encoder.classes_)
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=num_labels)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.utils.class_weight import compute_class_weight

In [None]:
from transformers import Trainer, TrainingArguments

# Hyper-parameters and bookkeeping for the fine-tuning run.
# NOTE(review): the recorded run stopped to ask for a wandb API key; pass
# report_to="none" here if the notebook must run unattended.
training_args = TrainingArguments(
    # Checkpoints go to the Drive mounted at /content/drive. The previous value,
    # '.drive/MyDrive/results', named a hidden ".drive" folder relative to the
    # working directory instead of the mounted Drive.
    output_dir='/content/drive/MyDrive/results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    # Keep only the most recent checkpoints so that saving every 500 steps
    # does not accumulate an unbounded number of checkpoints on Drive.
    save_total_limit=2,
)


In [None]:
def compute_metrics(eval_pred):
    """Return accuracy and F1 for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is the
    argmax of the logits over the last axis.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(gold, predicted)
    metrics["f1"] = f1_score(gold, predicted)
    return metrics

# Wire the model, training arguments, data splits, tokenizer and metric
# function into a Trainer.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning; being the last expression, the returned value is shown as
# the cell output.
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mavipsa2004swain[0m ([33mavipsa2004swain-iit-bombay[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,0.6969
20,0.6888
30,0.6654
40,0.5601
50,0.6176
60,0.6268
70,0.6813
80,0.6547
90,0.4866
100,0.5762


TrainOutput(global_step=8124, training_loss=0.29961795012449643, metrics={'train_runtime': 3189.4922, 'train_samples_per_second': 40.747, 'train_steps_per_second': 2.547, 'total_flos': 8548675521937920.0, 'train_loss': 0.29961795012449643, 'epoch': 3.0})

In [None]:
# Evaluate on the eval_dataset the Trainer was constructed with; the returned
# metrics dict is displayed as the cell output.
trainer.evaluate()

{'eval_loss': 0.33192360401153564,
 'eval_accuracy': 0.9028713876835011,
 'eval_f1': 0.901424287856072,
 'eval_runtime': 74.568,
 'eval_samples_per_second': 145.25,
 'eval_steps_per_second': 9.079,
 'epoch': 3.0}

In [None]:
# Run the evaluation again, keep the metrics in `results`, and print them.
# NOTE(review): the output recorded for this cell and for the earlier
# trainer.evaluate() cell disagree (eval_accuracy ~0.903 vs ~0.911), which
# suggests the notebook state changed between the two runs -- confirm with a
# clean Restart & Run All.
results = trainer.evaluate()
print(results)

{'eval_loss': 0.34027019143104553, 'eval_accuracy': 0.9105345766780537, 'eval_f1': 0.9122044033704811, 'eval_runtime': 73.4259, 'eval_samples_per_second': 147.509, 'eval_steps_per_second': 9.22, 'epoch': 3.0}


In [None]:
# Write the fine-tuned model and its tokenizer into the same folder so they
# can be reloaded together.
export_dir = 'drive/MyDrive/results'
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)

('drive/MyDrive/results/tokenizer_config.json',
 'drive/MyDrive/results/special_tokens_map.json',
 'drive/MyDrive/results/vocab.json',
 'drive/MyDrive/results/merges.txt',
 'drive/MyDrive/results/added_tokens.json')

In [None]:
# Reload the model and tokenizer from the folder they were saved to,
# replacing the in-memory `model` and `tokenizer`.
checkpoint_dir = "drive/MyDrive/results"
model = RobertaForSequenceClassification.from_pretrained(checkpoint_dir)
tokenizer = RobertaTokenizer.from_pretrained(checkpoint_dir)

In [None]:
# Wrap the reloaded model and tokenizer in a text-classification pipeline.
clf = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Spot-check a few prompts, printing one prediction per line.
for prompt in ("how to make a bomb", "what is a bath bomb", "recipe for chocolate cake"):
    print(clf(prompt))

Device set to use cuda:0


[{'label': 'LABEL_0', 'score': 0.9873774647712708}]
[{'label': 'LABEL_0', 'score': 0.9867686629295349}]
[{'label': 'LABEL_1', 'score': 0.9969052672386169}]


In [None]:
# Two further spot-checks: a nonsense token and a short harmful-sounding phrase.
for prompt in ("popp", "instructions to kill"):
    print(clf(prompt))

[{'label': 'LABEL_1', 'score': 0.9579933881759644}]
[{'label': 'LABEL_0', 'score': 0.9814915657043457}]
