In [1]:
!pip install -q watermark

[0m

In [2]:
%load_ext watermark
%watermark -p torch,transformers,pandas

torch       : 1.13.0+cpu
transformers: 4.27.4
pandas      : 1.3.5



In [3]:
import re
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
# set seed
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7973de20b0d0>

In [5]:
# set device
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print("Device:", DEVICE)

Device: cpu


In [6]:
# Model
MODEL_CKPT = 'roberta-base'
MAX_LEN = 320
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = TRAIN_BATCH_SIZE * 2
EPOCHS = 5
LEARNING_RATE = 2e-5
THRESHOLD = 0.7

In [7]:
FOR_SUBMISSION = True

In [8]:
train_data = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv')
print("Num. samples:", len(train_data))

Num. samples: 159571


In [9]:
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_data['labels'] = train_data[label_columns].apply(lambda x: list(x), axis=1)

train_data.drop(['id'], inplace=True, axis=1)
train_data.drop(label_columns, inplace=True, axis=1)

train_data.head(3)

Unnamed: 0,comment_text,labels
0,Explanation\nWhy the edits made under my usern...,"[0, 0, 0, 0, 0, 0]"
1,D'aww! He matches this background colour I'm s...,"[0, 0, 0, 0, 0, 0]"
2,"Hey man, I'm really not trying to edit war. It...","[0, 0, 0, 0, 0, 0]"


In [10]:
def clean_text(txt):
    txt = txt.lower()  # convert to lowercase
    txt = re.sub(r'\d+', '', txt)  # remove numbers
    txt = re.sub(r'[^\w\s]', '', txt)  # remove punctuation
    txt = re.sub(r'\s+', ' ', txt)  # remove extra spaces
    return txt.strip()

train_data['comment_text'] = train_data['comment_text'].apply(lambda x: clean_text(x))

In [11]:
# split data
train_size = 0.85
train_df = train_data.sample(frac=train_size, random_state=123)
val_df = train_data.drop(train_df.index).reset_index(drop=True)
train_df = train_df.reset_index(drop=True)

print("Orig Dataset: {}".format(train_data.shape))
print("Training Dataset: {}".format(train_df.shape))
print("Validation Dataset: {}".format(val_df.shape))

Orig Dataset: (159571, 2)
Training Dataset: (135635, 2)
Validation Dataset: (23936, 2)


In [12]:
# create dataset and dataloaders
tokenizer = AutoTokenizer.from_pretrained(MODEL_CKPT)

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [13]:
class MultiLabelDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len, new_data=False):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.text = dataframe.comment_text
        self.new_data = new_data
        self.max_len = max_len
        
        if not new_data:
            self.targets = self.data.labels

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())
        text = clean_text(text)

        inputs = self.tokenizer(
            text, 
            truncation=True, 
            padding='max_length' if self.new_data else False,
            max_length=self.max_len, 
            return_tensors="pt"
        )
        inputs = {k: v.squeeze() for k, v in inputs.items()}
        
        if not self.new_data:
            labels = torch.tensor(self.targets[index], dtype=torch.float)
            return inputs, labels

        return inputs



In [14]:
train_dataset = MultiLabelDataset(train_df, tokenizer, MAX_LEN)
val_dataset = MultiLabelDataset(val_df, tokenizer, MAX_LEN)

train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=TRAIN_BATCH_SIZE, num_workers=4, pin_memory=True)

val_sampler = SequentialSampler(val_dataset)
val_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=VALID_BATCH_SIZE, num_workers=4, pin_memory=True)

print("Train Dataset:", len(train_dataset))
print("Validation Dataset:", len(val_dataset))

Train Dataset: 135635
Validation Dataset: 23936


In [15]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CKPT, do_lower_case=True)

train_set = MultiLabelDataset(train_df, tokenizer, MAX_LEN)
val_set = MultiLabelDataset(val_df, tokenizer, MAX_LEN)

In [16]:
def dynamic_collate(data):
    """Custom data collator for dynamic padding."""
    inputs = [d for d,l in data]
    labels = torch.stack([l for d,l in data], dim=0)
    inputs = tokenizer.pad(inputs, return_tensors='pt')
    return inputs, labels

In [17]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 2, 
                'collate_fn': dynamic_collate}

val_params = {'batch_size': VALID_BATCH_SIZE,
              'shuffle': False,
              'num_workers': 2, 
              'collate_fn': dynamic_collate}

train_loader = DataLoader(train_set, **train_params)
val_loader = None if FOR_SUBMISSION else DataLoader(val_set, **val_params)

In [18]:
class TransformerModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.roberta = AutoModel.from_pretrained(MODEL_CKPT)
        self.classifier = nn.Sequential(
            nn.Linear(768, 768),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(768, 6)
        )

    def forward(self, inputs):
        roberta_output = self.roberta(**inputs)
        hidden_state = roberta_output.last_hidden_state
        pooled_out = hidden_state[:, 0]
        logits = self.classifier(pooled_out)
        return logits



model = TransformerModel()
model.to(DEVICE);

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_func = nn.BCEWithLogitsLoss()

In [20]:
lr_sched = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.2)
test_df = pd.read_csv('/kaggle/input/tf2chatsunlabelled/chatlog.csv')
test_df.head(80)

Unnamed: 0,steamid,name,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,[U:1:158517868],Orange-Juice,bad spy,,,,,,
1,[U:1:158517868],Orange-Juice,:(,,,,,,
2,[U:1:236710169],CUNNY PLEASE,loll,,,,,,
3,[U:1:174480808],kris,gg,,,,,,
4,[U:1:236710169],CUNNY PLEASE,insane,,,,,,
...,...,...,...,...,...,...,...,...,...
75,[U:1:378260322],AG,sb,,,,,,
76,[U:1:378260322],AG,sb,,,,,,
77,[U:1:1210179367],小灰灰本人,6,,,,,,
78,[U:1:1210179367],小灰灰本人,.SS,,,,,,


In [21]:
def accuracy_multi(inp, targ, thresh=0.5, sigmoid=True):
    """An accuracy metric for multi-label problems."""
    if sigmoid: 
        inp = inp.sigmoid()
    return ((inp > thresh) == targ.bool()).float().mean()

In [22]:
def train_one_epoch(train_loader, model, loss_func, optimizer, progress_bar=None):
    """Train model over one epoch."""
    model.train()
    size = len(train_loader.dataset)  # Train set size
    
    for i, (data, targets) in enumerate(train_loader):
        # Put inputs and target on DEVICE
        data = {k: v.to(DEVICE) for k, v in data.items()}
        targets = targets.to(DEVICE)
        
        outputs = model(data)
        loss = loss_func(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if progress_bar is not None:
            progress_bar.update(1)
        
        if i % 1000 == 0:
            loss, step = loss.item(), i * len(targets)
            print(f"Loss: {loss:>4f}  [{step:>6d}/{size:>6d}]")
        elif i == len(train_loader) - 1:
            loss = loss.item()
            print(f"Loss: {loss:>4f}  [{size:>6d}/{size:>6d}]")

In [23]:
def validate_one_epoch(val_loader, model, loss_func):
    """Validate model over one epoch."""
    model.eval()
    num_batches = len(val_loader)
    
    valid_loss, acc_multi = 0, 0

    with torch.no_grad():
        for _, (data, targets) in enumerate(val_loader):
            data = {k: v.to(DEVICE) for k, v in data.items()}
            targets = targets.to(DEVICE)

            outputs = model(data)
            valid_loss += loss_func(outputs, targets).item()
            acc_multi += accuracy_multi(outputs, targets)

    valid_loss /= num_batches  # Avg. loss
    acc_multi /= num_batches   # Avg. acc. multi
    print(f"Avg. valid. loss: {valid_loss:>4f}, Acc. multi: {acc_multi:>4f}\n")

In [None]:
num_train_steps = EPOCHS * len(train_loader)
progress_bar = tqdm(range(num_train_steps))

for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1} (lr = {lr_sched.get_last_lr()[0]:.2e})\n-------------------------------")
    train_one_epoch(train_loader, model, loss_func, optimizer, progress_bar)
    if not FOR_SUBMISSION:
        validate_one_epoch(val_loader, model, loss_func)
    lr_sched.step()

  0%|          | 0/21195 [00:00<?, ?it/s]

Epoch 1 (lr = 2.00e-05)
-------------------------------


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Loss: 0.706802  [     0/135635]


In [None]:
test_df = pd.read_csv('/kaggle/input/tf2chatsunlabelled/chatlog.csv')

test_df.head(3)

In [None]:
test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': False,
               'num_workers': 2}

test_set = MultiLabelDataset(test_df, tokenizer, MAX_LEN, new_data=True)
test_loader = DataLoader(test_set, **test_params)

In [None]:
def predict(test_loader, model):
    """Make predictions on test set."""
    model.eval()
    all_preds = []
    
    with torch.inference_mode():
        for data in tqdm(test_loader):
            data = {k: v.to(DEVICE) for k, v in data.items()}

            outputs = model(data)
            probas = torch.sigmoid(outputs)

            all_preds.append(probas)
            
        all_preds = torch.cat(all_preds)
    return all_preds.cpu()

In [None]:
all_test_pred = predict(test_loader, model)

In [None]:
submit_df = test_df.copy()

In [None]:
for i, name in enumerate(label_columns):
    submit_df[name] = all_test_pred[:, i]

submit_df.head()

In [None]:




submit_df.to_csv('Chatslabelled.csv', index=False)