## 1- BERT Algo

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
from transformers import BertTokenizer, BertModel, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from torch.nn import BCEWithLogitsLoss
from tqdm import tqdm
import os

# 1. Load the Data
train_data = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv')
test_data = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv')
test_labels = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv')

In [3]:
train_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
train_data['comment_text'] = train_data['comment_text'].fillna("")

label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

y_train = train_data[label_columns].values

# 2. Tokenize the text using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [5]:
class ToxicCommentsDataset(Dataset):
    def __init__(self, comments, labels, tokenizer, max_len):
        self.comments = comments
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.comments)

    def __getitem__(self, idx):
        comment = self.comments[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            comment,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'comment_text': comment,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.float)
        }


In [6]:

# Split the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    train_data['comment_text'].values,
    y_train,
    test_size=0.2,
    random_state=42
)

train_dataset = ToxicCommentsDataset(
    comments=X_train,
    labels=y_train,
    tokenizer=tokenizer,
    max_len=128
)

val_dataset = ToxicCommentsDataset(
    comments=X_val,
    labels=y_val,
    tokenizer=tokenizer,
    max_len=128
)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)



In [7]:
# 3. Define the BERT-based model for multi-label classification
class ToxicCommentClassifier(nn.Module):
    def __init__(self):
        super(ToxicCommentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, len(label_columns))

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        dropout_output = self.dropout(pooled_output)
        return self.classifier(dropout_output)



In [8]:
model = ToxicCommentClassifier()
model = model.to('cuda') if torch.cuda.is_available() else model

# 4. Set up the optimizer, loss function, and training loop
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
criterion = BCEWithLogitsLoss()



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



In [9]:
def train_epoch(model, data_loader, optimizer, criterion, device):
    model = model.train()
    losses = []
    correct_predictions = 0
    total_predictions = 0

    for d in tqdm(data_loader):
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        labels = d['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        loss = criterion(outputs, labels)
        losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return sum(losses) / len(losses)



In [10]:
def eval_model(model, data_loader, criterion, device):
    model = model.eval()
    losses = []
    preds = []
    labels_list = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            labels = d['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            loss = criterion(outputs, labels)
            losses.append(loss.item())

            preds.append(outputs.sigmoid().cpu().numpy())
            labels_list.append(labels.cpu().numpy())

    # Flatten predictions and true labels
    preds = np.vstack(preds)
    labels_list = np.vstack(labels_list)

    return sum(losses) / len(losses), preds, labels_list



In [11]:
# 5. Training the Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 3

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    print(f'Training loss: {train_loss}')

    val_loss, preds, true_labels = eval_model(model, val_loader, criterion, device)
    print(f'Validation loss: {val_loss}')
    roc_auc = roc_auc_score(true_labels, preds, average='macro')
    print(f'Validation ROC AUC: {roc_auc}')

# 6. Testing on test.csv 
test_dataset = ToxicCommentsDataset(
    comments=test_data['comment_text'].values,
    labels=test_labels[label_columns].values,
    tokenizer=tokenizer,
    max_len=128
)

test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
test_loss, test_preds, test_true_labels = eval_model(model, test_loader, criterion, device)
test_roc_auc = roc_auc_score(test_true_labels, test_preds, average='macro')

print(f'Test ROC AUC: {test_roc_auc}')


Epoch 1/3


100%|██████████| 7979/7979 [47:16<00:00,  2.81it/s]


Training loss: 0.046857573849172156
Validation loss: 0.0395107005447392
Validation ROC AUC: 0.9899252914190549
Epoch 2/3


100%|██████████| 7979/7979 [46:54<00:00,  2.83it/s]


Training loss: 0.03456278267795925
Validation loss: 0.03864695149338757
Validation ROC AUC: 0.9908482168514022
Epoch 3/3


100%|██████████| 7979/7979 [46:51<00:00,  2.84it/s]


Training loss: 0.02772465908771305
Validation loss: 0.04304604033522604
Validation ROC AUC: 0.9875961618719756


ValueError: multiclass-multioutput format is not supported

# 2-Logistic Regression

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

sample_submission = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv')

target_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

X = train_data['comment_text']
y = train_data[target_cols]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

tfidf = TfidfVectorizer(max_features=10000, stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)

logreg = OneVsRestClassifier(LogisticRegression(max_iter=1000))
logreg.fit(X_train_tfidf, y_train)

y_val_pred_proba = logreg.predict_proba(X_val_tfidf)
y_val_pred = logreg.predict(X_val_tfidf)

accuracy = accuracy_score(y_val, (y_val_pred > 0.5).astype(int))
roc_auc = roc_auc_score(y_val, y_val_pred_proba, average='macro')

print(f'Validation Accuracy: {accuracy}')
print(f'Validation ROC-AUC: {roc_auc}')

X_test = test_data['comment_text']
X_test_tfidf = tfidf.transform(X_test)
y_test_pred_proba = logreg.predict_proba(X_test_tfidf)

sample_submission[target_cols] = y_test_pred_proba

sample_submission.to_csv('logreg_submission.csv', index=False)


Validation Accuracy: 0.918972270092433
Validation ROC-AUC: 0.9765039152071612


# 3- Random Forest Classifier

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score


target_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

X = train_data['comment_text']
y = train_data[target_cols]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

tfidf = TfidfVectorizer(max_features=10000, stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)

rf = OneVsRestClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
rf.fit(X_train_tfidf, y_train)

y_val_pred_proba = rf.predict_proba(X_val_tfidf)
y_val_pred = rf.predict(X_val_tfidf)

accuracy = accuracy_score(y_val, (y_val_pred > 0.5).astype(int))
roc_auc = roc_auc_score(y_val, y_val_pred_proba, average='macro')

print(f'Validation Accuracy: {accuracy}')
print(f'Validation ROC-AUC: {roc_auc}')

X_test = test_data['comment_text']
X_test_tfidf = tfidf.transform(X_test)
y_test_pred_proba = rf.predict_proba(X_test_tfidf)


Validation Accuracy: 0.9162462791790694
Validation ROC-AUC: 0.9545966884944419


FileNotFoundError: [Errno 2] No such file or directory: 'sample_submission.csv'

In [10]:
sample_submission[target_cols] = y_test_pred_proba

sample_submission.to_csv('rf_binary_relevance_submission.csv', index=False)
