In [8]:
import json
import re
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load JSON training dataset (JSON Lines: one JSON object per line)
train_file_path = "/content/z639_assignment1_training.json"
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# locale default, and skip blank lines, which json.loads() would reject.
with open(train_file_path, "r", encoding="utf-8") as file:
    json_train_data = [json.loads(line) for line in file if line.strip()]

# Fail with a clear message instead of an IndexError on the lookup below.
if not json_train_data:
    raise ValueError(f"No records found in {train_file_path}")

# Inspect first record to check available keys
print("Sample Record Keys:", json_train_data[0].keys())

# Function for text cleaning
def clean_text(text):
    """Normalise a raw comment into lowercase, space-separated word tokens.

    Steps, in order:
      1. lowercase the text;
      2. replace every non-word character (regex ``\\W``) with a space;
      3. delete every run of digits;
      4. collapse each run of whitespace into a single space;
      5. strip leading and trailing whitespace.

    Whitespace is collapsed *after* punctuation and digits are handled, because
    those steps introduce new spaces; collapsing first would leave runs of
    several spaces in the result.

    Args:
        text: The raw string to clean. ``None`` is treated as an empty string.

    Returns:
        The cleaned string; may be empty.
    """
    if text is None:
        return ""
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d+', '', text)
    # Must run last: the two substitutions above can create consecutive spaces.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Generate majority vote toxicity label
for entry in json_train_data:
    # The first element of each annotation is read as that annotator's vote.
    # A missing key, a null value and an empty list all mean "no annotations".
    toxicity_votes = [annotation[0] for annotation in (entry.get("composite_toxic") or [])]

    if toxicity_votes:
        toxic_count = sum(toxicity_votes)
        # Toxic when at least half of the annotators voted toxic
        # (an exact tie therefore counts as toxic).
        entry["toxic_label"] = toxic_count >= (len(toxicity_votes) / 2)
    else:
        # Default to non-toxic if no annotations exist. Without this guard an
        # empty list would evaluate 0 >= 0 and be labelled toxic.
        entry["toxic_label"] = False

    entry['clean_text'] = clean_text(entry['text'])

# Extract text and labels
texts = [entry['clean_text'] for entry in json_train_data]
labels = [entry['toxic_label'] for entry in json_train_data]

# Hold out 20% of the examples for validation, stratified on the label
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

# TF-IDF features over unigrams and bigrams (vocabulary capped at 5000 terms);
# the vocabulary is fitted on the training split only
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train = vectorizer.fit_transform(train_texts)
X_val = vectorizer.transform(val_texts)

# Fit the logistic-regression baseline and predict the validation split
log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X_train, train_labels)
val_preds_lr = log_reg.predict(X_val)

# Score the baseline: metric display name -> scoring function
lr_metric_functions = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1-Score": f1_score,
}
lr_metrics = {
    metric_name: metric_fn(val_labels, val_preds_lr)
    for metric_name, metric_fn in lr_metric_functions.items()
}
print("Logistic Regression Performance:", lr_metrics)

# Tokenizer for the uncased BERT base checkpoint
bert_checkpoint_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint_name)

# Define custom dataset class
class ToxicCommentDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Items are returned as CPU tensors. Device placement is deliberately left
    to the consumer (the Hugging Face ``Trainer`` moves every batch to its own
    device): creating CUDA tensors inside ``__getitem__`` ties the class to a
    module-level global, prevents pinned-memory transfers and is unsafe with
    multi-process data loading.

    Args:
        texts: Sequence of strings, one per example.
        labels: Sequence of class labels aligned with ``texts``; each value
            must be convertible with ``int()`` (e.g. ``bool`` or ``int``).
        tokenizer: Callable used as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")`` and
            returning a mapping with ``"input_ids"`` and ``"attention_mask"``
            tensors that have a leading batch axis of size 1.
        max_length: Length every example is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (the tokenizer
            output with the batch axis removed) and ``"labels"`` (a scalar
            ``torch.long`` tensor).
        """
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            # squeeze(0) removes only the batch axis; a bare squeeze() would
            # also drop the sequence axis when max_length == 1.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(int(self.labels[idx]), dtype=torch.long),
        }

# Wrap the train and validation splits for the Trainer
train_dataset = ToxicCommentDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = ToxicCommentDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)

# Pre-trained BERT encoder with a 2-class classification head, placed on `device`
bert_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)
bert_model = bert_model.to(device)

# Fine-tuning configuration: evaluate and checkpoint once per epoch
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,  # small batches to avoid memory issues
    per_device_eval_batch_size=8,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    dataloader_pin_memory=False,  # memory pinning disabled
)

# Fine-tune BERT
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()

Using device: cuda
Sample Record Keys: dict_keys(['text', 'parent_comment', 'article_title', 'article_url', 'platform', 'platform_id', 'composite_toxic'])
Logistic Regression Performance: {'Accuracy': 0.77, 'Precision': 0.723404255319149, 'Recall': 0.16585365853658537, 'F1-Score': 0.2698412698412698}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,0.4635,0.426922
2,0.4615,0.45483
3,0.1906,0.749847


TrainOutput(global_step=1200, training_loss=0.35891118526458743, metrics={'train_runtime': 162.6892, 'train_samples_per_second': 59.008, 'train_steps_per_second': 7.376, 'total_flos': 631466532864000.0, 'train_loss': 0.35891118526458743, 'epoch': 3.0})

In [9]:
def predict_with_bert(texts, model, tokenizer, batch_size=32):
    """Predict a class index for each text with a sequence-classification model.

    The texts are processed in chunks of ``batch_size`` so that memory use is
    bounded by the chunk size rather than by the total number of texts.

    Args:
        texts: A list of strings (a single string is treated as a one-item
            list). May be empty.
        model: Model called as ``model(**inputs)`` whose output exposes a
            ``.logits`` tensor with one row per input text. It is switched to
            eval mode and moved to the GPU when one is available.
        tokenizer: Callable used as ``tokenizer(batch, padding=True,
            truncation=True, max_length=128, return_tensors="pt")`` and
            returning a mapping of tensors.
        batch_size: Maximum number of texts per forward pass (must be >= 1).

    Returns:
        1-D NumPy integer array with the arg-max class index of every text, in
        input order; an empty array when ``texts`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if isinstance(texts, str):
        # Slicing a bare string would split it into characters.
        texts = [texts]
    texts = list(texts)

    # Move model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    if not texts:
        # Nothing to tokenize: return an empty integer array.
        return torch.empty(0, dtype=torch.long).numpy()

    batch_predictions = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            inputs = tokenizer(
                texts[start:start + batch_size],
                padding=True,
                truncation=True,
                max_length=128,
                return_tensors="pt",
            )
            # Ensure inputs live on the same device as the model
            inputs = {key: value.to(device) for key, value in inputs.items()}
            outputs = model(**inputs)
            # Move each chunk's predictions back to CPU right away so device
            # memory is released before the next chunk.
            batch_predictions.append(torch.argmax(outputs.logits, dim=1).cpu())

    return torch.cat(batch_predictions).numpy()

def _predict_in_batches(texts, batch_size=32):
    """Run predict_with_bert over `texts` in fixed-size chunks.

    One forward pass per chunk replaces one forward pass per text, while the
    chunk size keeps memory use bounded. Returns a flat list with one
    predicted class per text, in input order.
    """
    predictions = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        predictions.extend(predict_with_bert(chunk, bert_model, tokenizer))
    return predictions

# Evaluate BERT Model
bert_preds = _predict_in_batches(val_texts)
bert_accuracy = accuracy_score(val_labels, bert_preds)
bert_precision = precision_score(val_labels, bert_preds)
bert_recall = recall_score(val_labels, bert_preds)
bert_f1 = f1_score(val_labels, bert_preds)
print("BERT Model Performance:", {
    "Accuracy": bert_accuracy,
    "Precision": bert_precision,
    "Recall": bert_recall,
    "F1-Score": bert_f1
})

# Load Test Dataset (JSON Lines). Decode explicitly as UTF-8 and skip blank
# lines, which json.loads() would reject.
test_file_path = "/content/z639_assignment1_test.json"
with open(test_file_path, "r", encoding="utf-8") as file:
    json_test_data = [json.loads(line) for line in file if line.strip()]

test_texts = [clean_text(entry['text']) for entry in json_test_data]
platform_ids = [entry['platform_id'] for entry in json_test_data]
# TF-IDF features of the test set; not used by the BERT predictions below.
X_test = vectorizer.transform(test_texts)

# Predict using BERT
test_preds_bert = _predict_in_batches(test_texts)

# Save Predictions
submission_df = pd.DataFrame({"platform_id": platform_ids, "prediction": test_preds_bert})
submission_df["prediction"] = submission_df["prediction"].astype(bool)
submission_df.to_csv("/content/Abhir_Iyer-assignment1-prediction.csv", index=False)

print("Predictions saved to Abhir_Iyer-assignment1-prediction.csv")

BERT Model Performance: {'Accuracy': 0.80625, 'Precision': 0.631578947368421, 'Recall': 0.5853658536585366, 'F1-Score': 0.6075949367088608}
Predictions saved to Abhir_Iyer-assignment1-prediction.csv
