In [1]:
!pip install transformers datasets torch pandas

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [None]:
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    pipeline,
)
from sklearn.metrics import accuracy_score, classification_report

train_file = "/content/sample_data/toxicbias_train_updated.csv"
val_file = "/content/sample_data/toxicbias_val_updated.csv"
test_file = "/content/sample_data/toxicbias_test_updated.csv"
train_df = pd.read_csv(train_file)
val_df = pd.read_csv(val_file)
test_df = pd.read_csv(test_file)


for df in [train_df, val_df, test_df]:
    df.dropna(subset=['comment_text'], inplace=True)
    df['bias'] = df['bias'].fillna('unknown')
    df['category'] = df['category'].fillna('unknown')


train_df['combined_label'] = train_df['bias'] + '__' + train_df['category']
val_df['combined_label'] = val_df['bias'] + '__' + val_df['category']
test_df['combined_label'] = test_df['bias'] + '__' + test_df['category']


combined_encoder = LabelEncoder()
train_df['combined_label_id'] = combined_encoder.fit_transform(train_df['combined_label'])
val_df['combined_label_id'] = combined_encoder.transform(
    [label if label in combined_encoder.classes_ else combined_encoder.classes_[0] for label in val_df['combined_label']]
)
test_df['combined_label_id'] = combined_encoder.transform(
    [label if label in combined_encoder.classes_ else combined_encoder.classes_[0] for label in test_df['combined_label']]
)


MODEL_NAME = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def tokenize_function(examples):
    return tokenizer(examples["comment_text"], padding="max_length", truncation=True, max_length=256)

def create_dataset(df):
    df = df.reset_index(drop=True)
    dataset = Dataset.from_pandas(df[['comment_text', 'combined_label_id']])
    dataset = dataset.map(tokenize_function, batched=True)
    dataset = dataset.rename_column('combined_label_id', "labels")
    return dataset

train_dataset = create_dataset(train_df)
val_dataset = create_dataset(val_df)
test_dataset = create_dataset(test_df)


training_args = TrainingArguments(
    output_dir="./results_combined",
    evaluation_strategy="epoch",
    save_strategy="no",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=6,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="none",
    fp16=True,
    optim="adamw_torch",
    gradient_accumulation_steps=2,
    learning_rate=3e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    remove_unused_columns=False
)


device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(combined_encoder.classes_)
).to(device)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=lambda pred: {"accuracy": accuracy_score(pred.label_ids, np.argmax(pred.predictions, axis=1))}
)


trainer.train()

for name, dataset in zip(["Validation", "Test"], [val_dataset, test_dataset]):
    pred = trainer.predict(dataset)
    y_true = dataset['labels']
    y_pred = np.argmax(pred.predictions, axis=-1)
    print(f"\n{name} Accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred, zero_division=0))


def predict_bias_and_debias(input_csv, output_csv):
    df = pd.read_csv(input_csv)
    if 'sentence' in df.columns:
        df.rename(columns={'sentence': 'comment_text'}, inplace=True)
    df['comment_text'] = df['comment_text'].fillna('')

    # Tokenize
    encodings = tokenizer(df['comment_text'].tolist(), padding="max_length", truncation=True, max_length=256, return_tensors="pt")
    encodings = {k: v.to(device) for k, v in encodings.items()}

    # Predict
    model.eval()
    with torch.no_grad():
        logits = model(**encodings).logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        decoded = combined_encoder.inverse_transform(preds)
        df['predicted_bias'] = [x.split('__')[0] for x in decoded]
        df['predicted_category'] = [x.split('__')[1] for x in decoded]

    # Debias
    rewrite_model = pipeline("text2text-generation", model="google/flan-t5-base", device=0 if torch.cuda.is_available() else -1)
    df['self_debiased_text'] = df.apply(
        lambda row: rewrite_model(
            f"Make this statement neutral and unbiased by removing gender,lgbtq,cultural,political,religion and racial bias: {row['comment_text']}"
        )[0]['generated_text']
        if row['predicted_bias'] == 'bias' else row['comment_text'], axis=1)

    df.to_csv(output_csv, index=False)
    print(f" Predictions + Self-debiasing saved to {output_csv}")


predict_bias_and_debias("/content/sample_data/whisper_transcriptions_extra.csv", "predicted_bias_category.csv")
predict_bias_and_debias("/content/sample_data/whisper_transcriptions_extra_4.csv", "predicted_bias_category_4.csv")


Map:   0%|          | 0/4327 [00:00<?, ? examples/s]

Map:   0%|          | 0/432 [00:00<?, ? examples/s]

Map:   0%|          | 0/650 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.227975,0.650463
2,No log,1.10648,0.678241
3,No log,0.975783,0.675926
4,1.384000,1.052934,0.666667
5,1.384000,1.090743,0.664352



Validation Accuracy: 0.6643518518518519
              precision    recall  f1-score   support

           0       0.36      0.56      0.43         9
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           7       0.63      0.92      0.75        36
           8       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         1
          20       0.39      0.58      0.47        12
          27       0.00      0.00      0.00         2
          30       0.00      0.00      0.00         3
          31       0.78      0.78      0.78       153
          35       0.33      0.45      0.38        11
          45       0.31      0.50      0.38         8
          49       0.00      0.00      0.00         1
          50       0.76      0.90      0.83       101
          52       0.00      0.00      0.00         2
          54       0.00      0.00      0.00         1
          56       0.00      0.00      0


Test Accuracy: 0.6876923076923077
              precision    recall  f1-score   support

           0       0.69      0.85      0.76        13
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           7       0.72      0.85      0.78        62
           8       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         1
          17       0.00      0.00      0.00         4
          20       0.33      0.82      0.47        11
          22       0.00      0.00      0.00         1
          24       0.00      0.00      0.00         1
          27       0.00      0.00      0.00         1
          30       0.00      0.00      0.00         2
          31       0.78      0.85      0.82       204
          35       0.57      0.63      0.60   

Device set to use cuda:0


✅ Predictions + Self-debiasing saved to predicted_bias_category.csv


Device set to use cuda:0


✅ Predictions + Self-debiasing saved to predicted_bias_category_4.csv


deberta model

In [None]:
import pandas as pd
# Combine all three CSVs into one
train_df = pd.read_csv("/content/sample_data/toxicbias_train_updated.csv")
val_df = pd.read_csv("/content/sample_data/toxicbias_val_updated.csv")
test_df = pd.read_csv("/content/sample_data/toxicbias_test_updated.csv")

full_df = pd.concat([train_df, val_df, test_df])
full_df.to_csv("/content/sample_data/toxicbias_all.csv", index=False)


In [None]:
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    pipeline,
)
from sklearn.metrics import accuracy_score, classification_report

# ✅ Load combined dataset
df = pd.read_csv("/content/sample_data/toxicbias_all.csv")
df.dropna(subset=['comment_text'], inplace=True)
df['bias'] = df['bias'].fillna('unknown')
df['category'] = df['category'].fillna('unknown')
df['combined_label'] = df['bias'] + '__' + df['category']

# ✅ Initial rare label grouping before first split
RARE_THRESHOLD = 2
label_counts = df['combined_label'].value_counts()
df['combined_label'] = df['combined_label'].apply(
    lambda x: x if label_counts[x] >= RARE_THRESHOLD else 'other__other'
)

# ✅ Encode labels
label_encoder = LabelEncoder()
df['combined_label_id'] = label_encoder.fit_transform(df['combined_label'])

# ✅ First split: Train vs Temp (stratified)
train_df, temp_df = train_test_split(
    df, test_size=0.3, stratify=df['combined_label_id'], random_state=42
)

# ✅ Re-group rare labels in temp_df before second split
temp_counts = temp_df['combined_label'].value_counts()
temp_df['combined_label'] = temp_df['combined_label'].apply(
    lambda x: x if temp_counts[x] >= 2 else 'other__other'
)
temp_df['combined_label_id'] = label_encoder.transform(temp_df['combined_label'])

# ✅ Final split: Val vs Test
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df['combined_label_id'], random_state=42
)

# ✅ Tokenization using DeBERTa
MODEL_NAME = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    return tokenizer(examples["comment_text"], padding="max_length", truncation=True, max_length=256)

def create_dataset(df):
    df = df.reset_index(drop=True)
    dataset = Dataset.from_pandas(df[['comment_text', 'combined_label_id']])
    dataset = dataset.map(tokenize_function, batched=True)
    dataset = dataset.rename_column("combined_label_id", "labels")
    return dataset

train_dataset = create_dataset(train_df)
val_dataset = create_dataset(val_df)
test_dataset = create_dataset(test_df)

# ✅ Training arguments
training_args = TrainingArguments(
    output_dir="./results_combined",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=8,
    weight_decay=0.01,
    learning_rate=3e-5,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    fp16=True if torch.cuda.is_available() else False,
    logging_dir="./logs",
    report_to="none",
    metric_for_best_model="accuracy"
)

# ✅ Load model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_encoder.classes_)
).to(device)

# ✅ Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=lambda p: {"accuracy": accuracy_score(p.label_ids, np.argmax(p.predictions, axis=1))},
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# ✅ Train the model
trainer.train()

# ✅ Evaluation
for name, dataset in zip(["Validation", "Test"], [val_dataset, test_dataset]):
    pred = trainer.predict(dataset)
    y_true = dataset["labels"]
    y_pred = np.argmax(pred.predictions, axis=1)
    print(f"\n{name} Accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred, zero_division=0))

# ✅ Whisper Prediction + Self-Debiasing
def predict_and_debias(input_csv, output_csv):
    df = pd.read_csv(input_csv)
    if 'sentence' in df.columns:
        df.rename(columns={'sentence': 'comment_text'}, inplace=True)
    df['comment_text'] = df['comment_text'].fillna('')

    enc = tokenizer(df['comment_text'].tolist(), padding="max_length", truncation=True, max_length=256, return_tensors="pt")
    enc = {k: v.to(device) for k, v in enc.items()}

    model.eval()
    with torch.no_grad():
        logits = model(**enc).logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        decoded = label_encoder.inverse_transform(preds)
        df['predicted_bias'] = [x.split('__')[0] for x in decoded]
        df['predicted_category'] = [x.split('__')[1] for x in decoded]

    # ✅ Self-debiasing with Flan-T5
    rewrite_model = pipeline("text2text-generation", model="google/flan-t5-base", device=0 if torch.cuda.is_available() else -1)
    df['self_debiased_text'] = df.apply(
        lambda row: rewrite_model(
            f"Make this statement neutral and unbiased by removing religional and racial bias: {row['comment_text']}"
        )[0]['generated_text']
        if row['predicted_bias'] == 'bias' else row['comment_text'], axis=1)

    df.to_csv(output_csv, index=False)
    print(f"✅ Saved to {output_csv}")

# ✅ Run predictions on both files
predict_and_debias("/content/sample_data/whisper_transcriptions_extra.csv", "predicted_bias_category.csv")
predict_and_debias("/content/sample_data/whisper_transcriptions_extra_4.csv", "predicted_bias_category_4.csv")


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/3786 [00:00<?, ? examples/s]

Map:   0%|          | 0/811 [00:00<?, ? examples/s]

Map:   0%|          | 0/812 [00:00<?, ? examples/s]



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.289016,0.639951
2,No log,1.212638,0.647349
3,1.745000,1.088788,0.678175
4,1.745000,1.07137,0.68434
5,1.070700,1.04399,0.680641
6,1.070700,1.055701,0.680641



Validation Accuracy: 0.6843403205918619
              precision    recall  f1-score   support

           0       0.53      0.90      0.67        20
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         2
           7       0.63      0.88      0.73        65
           8       0.00      0.00      0.00         2
          10       0.00      0.00      0.00         2
          12       0.00      0.00      0.00         3
          14       0.00      0.00      0.00        17
          15       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         3
          20       0.00      0.00      0.00         3
          21       0.72      0.95      0.82       271
          23       0.00      0.00      0.00        18
          24       0.00      0.00      0.00         1
          26       0.00      0.00      0.00         3
          27       0.00      0.00      0.00         2
          29       0.31      0.75      0


Test Accuracy: 0.6773399014778325
              precision    recall  f1-score   support

           0       0.65      0.85      0.74        20
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           7       0.77      0.91      0.83        66
           8       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         3
          14       0.00      0.00      0.00        17
          15       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         3
          20       0.00      0.00      0.00         3
          21       0.69      0.98      0.81       272
          23       0.00      0.00      0.00        19
          24       0.00      0.00      0.00         1
          26       0.00      0.00      0.00         2
          27       0.00      0.00      0.00         2
          29       0.30      0.65      0.41   

Device set to use cuda:0


✅ Saved to predicted_bias_category.csv


Device set to use cuda:0


✅ Saved to predicted_bias_category_4.csv


In [3]:
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    pipeline,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, classification_report, f1_score

train_file = "/content/sample_data/toxicbias_train_updated.csv"
val_file = "/content/sample_data/toxicbias_val_updated.csv"
test_file = "/content/sample_data/toxicbias_test_updated.csv"
train_df = pd.read_csv(train_file)
val_df = pd.read_csv(val_file)
test_df = pd.read_csv(test_file)

for df in [train_df, val_df, test_df]:
    df.dropna(subset=['comment_text'], inplace=True)
    df['bias'] = df['bias'].fillna('unknown')
    df['category'] = df['category'].fillna('unknown')

train_df['combined_label'] = train_df['bias'] + '__' + train_df['category']
val_df['combined_label'] = val_df['bias'] + '__' + val_df['category']
test_df['combined_label'] = test_df['bias'] + '__' + test_df['category']

combined_encoder = LabelEncoder()
train_df['combined_label_id'] = combined_encoder.fit_transform(train_df['combined_label'])
val_df['combined_label_id'] = combined_encoder.transform(
    [label if label in combined_encoder.classes_ else combined_encoder.classes_[0] for label in val_df['combined_label']]
)
test_df['combined_label_id'] = combined_encoder.transform(
    [label if label in combined_encoder.classes_ else combined_encoder.classes_[0] for label in test_df['combined_label']]
)

MODEL_NAME = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def tokenize_function(examples):
    return tokenizer(examples["comment_text"], padding="max_length", truncation=True, max_length=256)

def create_dataset(df):
    df = df.reset_index(drop=True)
    dataset = Dataset.from_pandas(df[['comment_text', 'combined_label_id']])
    dataset = dataset.map(tokenize_function, batched=True)
    dataset = dataset.rename_column('combined_label_id', "labels")
    return dataset

train_dataset = create_dataset(train_df)
val_dataset = create_dataset(val_df)
test_dataset = create_dataset(test_df)

training_args = TrainingArguments(
    output_dir="./results_combined",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=8,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="none",
    fp16=True,
    optim="adamw_torch",
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.2,
    remove_unused_columns=False
)

def compute_metrics(pred):
    preds = np.argmax(pred.predictions, axis=1)
    return {
        "accuracy": accuracy_score(pred.label_ids, preds),
        "f1_macro": f1_score(pred.label_ids, preds, average='macro')
    }

device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(combined_encoder.classes_)
).to(device)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

for name, dataset in zip(["Validation", "Test"], [val_dataset, test_dataset]):
    pred = trainer.predict(dataset)
    y_true = dataset['labels']
    y_pred = np.argmax(pred.predictions, axis=-1)
    print(f"\n{name} Accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred, zero_division=0))

def predict_bias_and_debias(input_csv, output_csv):
    df = pd.read_csv(input_csv)
    if 'sentence' in df.columns:
        df.rename(columns={'sentence': 'comment_text'}, inplace=True)
    df['comment_text'] = df['comment_text'].fillna('')

    encodings = tokenizer(df['comment_text'].tolist(), padding="max_length", truncation=True, max_length=256, return_tensors="pt")
    encodings = {k: v.to(device) for k, v in encodings.items()}

    model.eval()
    with torch.no_grad():
        logits = model(**encodings).logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        decoded = combined_encoder.inverse_transform(preds)
        df['predicted_bias'] = [x.split('__')[0] for x in decoded]
        df['predicted_category'] = [x.split('__')[1] for x in decoded]

    rewrite_model = pipeline("text2text-generation", model="google/flan-t5-base", device=0 if torch.cuda.is_available() else -1)
    df['self_debiased_text'] = df.apply(
        lambda row: rewrite_model(
            f"Make this statement neutral and unbiased by removing gender,lgbtq,cultural,political,religion and racial bias: {row['comment_text']}"
        )[0]['generated_text']
        if row['predicted_bias'] == 'bias' else row['comment_text'], axis=1)

    df.to_csv(output_csv, index=False)
    print(f" Predictions + Self-debiasing saved to {output_csv}")

predict_bias_and_debias("/content/sample_data/whisper_transcriptions_extra.csv", "predicted_bias_category.csv")
predict_bias_and_debias("/content/sample_data/whisper_transcriptions_extra_4.csv", "predicted_bias_category_4.csv")


Map:   0%|          | 0/4327 [00:00<?, ? examples/s]

Map:   0%|          | 0/432 [00:00<?, ? examples/s]

Map:   0%|          | 0/650 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,No log,1.577225,0.578704,0.083149
2,No log,1.161978,0.680556,0.157446
3,No log,1.007056,0.673611,0.188844
4,1.694600,1.123608,0.685185,0.217672
5,1.694600,1.073071,0.668981,0.214759
6,1.694600,1.24536,0.652778,0.222763
7,0.626400,1.245796,0.659722,0.226566



Validation Accuracy: 0.6574074074074074
              precision    recall  f1-score   support

           0       0.36      0.56      0.43         9
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           7       0.61      0.83      0.71        36
           8       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         1
          20       0.36      0.42      0.38        12
          27       0.00      0.00      0.00         2
          30       0.00      0.00      0.00         3
          31       0.81      0.76      0.78       153
          35       0.43      0.55      0.48        11
          45       0.24      0.50      0.32         8
          49       0.00      0.00      0.00         1
          50       0.76      0.89      0.82       101
          52       0.00      0.00      0.00         2
          54       0.00      0.00      0.00         1
          56       0.00      0.00      0


Test Accuracy: 0.6753846153846154
              precision    recall  f1-score   support

           0       0.65      0.85      0.73        13
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           7       0.71      0.85      0.77        62
           8       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         1
          17       0.00      0.00      0.00         4
          20       0.32      0.82      0.46        11
          22       0.00      0.00      0.00         1
          24       0.00      0.00      0.00         1
          27       0.00      0.00      0.00         1
          30       0.00      0.00      0.00         2
          31       0.79      0.79      0.79       204
          35       0.52      0.63      0.57   

FileNotFoundError: [Errno 2] No such file or directory: '/content/sample_data/whisper_transcriptions_extra.csv'

**FINAL MODEL**

In [5]:
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    pipeline,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, classification_report, f1_score

train_file = "/content/sample_data/toxicbias_train_updated.csv"
val_file = "/content/sample_data/toxicbias_val_updated.csv"
test_file = "/content/sample_data/toxicbias_test_updated.csv"
train_df = pd.read_csv(train_file)
val_df = pd.read_csv(val_file)
test_df = pd.read_csv(test_file)

for df in [train_df, val_df, test_df]:
    df.dropna(subset=['comment_text'], inplace=True)
    df['bias'] = df['bias'].fillna('unknown')
    df['category'] = df['category'].fillna('unknown')

train_df['combined_label'] = train_df['bias'] + '__' + train_df['category']
val_df['combined_label'] = val_df['bias'] + '__' + val_df['category']
test_df['combined_label'] = test_df['bias'] + '__' + test_df['category']

combined_encoder = LabelEncoder()
train_df['combined_label_id'] = combined_encoder.fit_transform(train_df['combined_label'])
val_df['combined_label_id'] = combined_encoder.transform(
    [label if label in combined_encoder.classes_ else combined_encoder.classes_[0] for label in val_df['combined_label']]
)
test_df['combined_label_id'] = combined_encoder.transform(
    [label if label in combined_encoder.classes_ else combined_encoder.classes_[0] for label in test_df['combined_label']]
)

MODEL_NAME = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def tokenize_function(examples):
    return tokenizer(examples["comment_text"], padding="max_length", truncation=True, max_length=256)

def create_dataset(df):
    df = df.reset_index(drop=True)
    dataset = Dataset.from_pandas(df[['comment_text', 'combined_label_id']])
    dataset = dataset.map(tokenize_function, batched=True)
    dataset = dataset.rename_column('combined_label_id', "labels")
    return dataset

train_dataset = create_dataset(train_df)
val_dataset = create_dataset(val_df)
test_dataset = create_dataset(test_df)

training_args = TrainingArguments(
    output_dir="./results_combined",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
    num_train_epochs=6,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="none",
    fp16=True,
    optim="adamw_torch",
    gradient_accumulation_steps=2,
    learning_rate=1.5e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.3,
    remove_unused_columns=False
)

def compute_metrics(pred):
    preds = np.argmax(pred.predictions, axis=1)
    return {
        "accuracy": accuracy_score(pred.label_ids, preds),
        "f1_macro": f1_score(pred.label_ids, preds, average='macro')
    }

device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(combined_encoder.classes_)
).to(device)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

for name, dataset in zip(["Validation", "Test"], [val_dataset, test_dataset]):
    pred = trainer.predict(dataset)
    y_true = dataset['labels']
    y_pred = np.argmax(pred.predictions, axis=-1)
    print(f"\n{name} Accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred, zero_division=0))

def predict_bias_and_debias(input_csv, output_csv):
    df = pd.read_csv(input_csv)
    if 'sentence' in df.columns:
        df.rename(columns={'sentence': 'comment_text'}, inplace=True)
    df['comment_text'] = df['comment_text'].fillna('')

    encodings = tokenizer(df['comment_text'].tolist(), padding="max_length", truncation=True, max_length=256, return_tensors="pt")
    encodings = {k: v.to(device) for k, v in encodings.items()}

    model.eval()
    with torch.no_grad():
        logits = model(**encodings).logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        decoded = combined_encoder.inverse_transform(preds)
        df['predicted_bias'] = [x.split('__')[0] for x in decoded]
        df['predicted_category'] = [x.split('__')[1] for x in decoded]

    rewrite_model = pipeline("text2text-generation", model="google/flan-t5-base", device=0 if torch.cuda.is_available() else -1)
    df['self_debiased_text'] = df.apply(
        lambda row: rewrite_model(
            f"Make this statement neutral and unbiased by removing gender,lgbtq,cultural,political,religion and racial bias: {row['comment_text']}"
        )[0]['generated_text']
        if row['predicted_bias'] == 'bias' else row['comment_text'], axis=1)

    df.to_csv(output_csv, index=False)
    print(f" Predictions + Self-debiasing saved to {output_csv}")

predict_bias_and_debias("/content/sample_data/whisper_processed_transcriptions_extra.csv", "predicted_bias_category.csv")
predict_bias_and_debias("/content/sample_data/whisper_transcriptions_extra_4.csv", "predicted_bias_category_4.csv")


Map:   0%|          | 0/4327 [00:00<?, ? examples/s]

Map:   0%|          | 0/432 [00:00<?, ? examples/s]

Map:   0%|          | 0/650 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,No log,1.468271,0.627315,0.118226
2,2.261500,1.211754,0.664352,0.146822
3,2.261500,1.017178,0.666667,0.215229
4,1.048400,1.065324,0.685185,0.22643
5,0.737400,1.095908,0.680556,0.238366



Validation Accuracy: 0.6875
              precision    recall  f1-score   support

           0       0.38      0.56      0.45         9
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           7       0.67      0.92      0.78        36
           8       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         1
          20       0.42      0.42      0.42        12
          27       0.00      0.00      0.00         2
          30       0.00      0.00      0.00         3
          31       0.80      0.82      0.81       153
          35       0.50      0.45      0.48        11
          45       0.31      0.50      0.38         8
          49       0.00      0.00      0.00         1
          50       0.77      0.88      0.82       101
          52       0.00      0.00      0.00         2
          54       0.00      0.00      0.00         1
          56       0.00      0.00      0.00         


Test Accuracy: 0.7
              precision    recall  f1-score   support

           0       0.86      0.92      0.89        13
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           7       0.73      0.85      0.79        62
           8       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         1
          17       0.00      0.00      0.00         4
          20       0.36      0.82      0.50        11
          22       0.00      0.00      0.00         1
          24       0.00      0.00      0.00         1
          27       0.00      0.00      0.00         1
          30       0.00      0.00      0.00         2
          31       0.78      0.86      0.82       204
          35       0.57      0.63      0.60        19
       

Device set to use cuda:0


 Predictions + Self-debiasing saved to predicted_bias_category.csv


Device set to use cuda:0


 Predictions + Self-debiasing saved to predicted_bias_category_4.csv
