In [0]:
!pip install transformers
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |▋                               | 10kB 25.7MB/s eta 0:00:01[K     |█▏                              | 20kB 5.9MB/s eta 0:00:01[K     |█▊                              | 30kB 8.2MB/s eta 0:00:01[K     |██▎                             | 40kB 10.1MB/s eta 0:00:01[K     |███                             | 51kB 7.1MB/s eta 0:00:01[K     |███▌                            | 61kB 8.2MB/s eta 0:00:01[K     |████                            | 71kB 9.3MB/s eta 0:00:01[K     |████▋                           | 81kB 10.4MB/s eta 0:00:01[K     |█████▎                          | 92kB 8.5MB/s eta 0:00:01[K     |█████▉                          | 102kB 9.2MB/s eta 0:00:01[K     |██████▍                         | 112kB 9.2MB/s eta 0:00:01[K     |███████                         | 122kB 9.

In [0]:
## hyperparameters

# • Batch size: 16, 32
# • Learning rate (Adam): 5e-5, 3e-5, 2e-5
# • Number of epochs: 2, 3, 4

### using batch size 16
# other configs

# Epoch 1 | Loss - 225.45395208522677 | Time Taken - 6.68 min
# Macro F1 Score 0.9066872060512802
# Macro F1 Score 0.9067430138165765

In [0]:
import time
import torch
import logging
import transformers
import pandas as pd
import torch.nn as nn
from tqdm import tqdm
from sklearn.metrics import f1_score
from torch.utils.data import Dataset, DataLoader

model_class = transformers.BertModel
tokenizer_class = transformers.BertTokenizer
pretrained_weights='bert-base-uncased'
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
bert_model = model_class.from_pretrained(pretrained_weights)

logging.getLogger("transformers").setLevel(logging.ERROR)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

data_path = '/content/drive/My Drive/Colab Notebooks/jigsaw/train/'
df = pd.read_csv(data_path + 'train.csv')
df = df.drop(['id'], axis = 1)
df = df.sample(frac=1, random_state = 42)

toxic = df[df.toxic == 1]
non_toxic = df[df.toxic != 1]
non_toxic = non_toxic.sample(n = 15000)
df = pd.concat([toxic, non_toxic])
df = df.sample(frac=1, random_state = 42)

train, val, test = df[:20000].values, df[20000:25000].values, df[25000:].values
print('Train Size', train.shape)
print('Val Size', val.shape)
print('Test Size', test.shape)

class ToxicDataset(Dataset):
    def __init__(self, dataframe, max_len):
        self.dataframe = dataframe
        self.max_len = max_len
        self.sep_id = tokenizer.encode(['[SEP]'], add_special_tokens=False)
        self.pad_id = tokenizer.encode(['[PAD]'], add_special_tokens=False)

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe[idx]
        text = row[0]
        #targets = torch.tensor(list(row[1:]))
        encoded = tokenizer.encode(text, add_special_tokens=True)[:self.max_len-1]
        if encoded[-1] != self.sep_id[0]:
            encoded = encoded + self.sep_id
        padded = encoded + self.pad_id * (self.max_len - len(encoded))
        padded = torch.tensor(padded)
        labels = torch.Tensor(list(row[1:]))
        return padded, labels


train_dataset = ToxicDataset(train, 84) # why set max-length to 84? longer better or worse?
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)

val_dataset = ToxicDataset(val, 84)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)

test_dataset = ToxicDataset(test, 84)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4)

def evaluate(model, data):
    actual, predictions = [], []
    with torch.no_grad():
        for features, targets in data:
            features = features.to(device)
            targets = targets.to(device)
            scores, attentions = model(features)
            sigmoid_out = torch.sigmoid(scores)
            prediction = torch.as_tensor(sigmoid_out > 0.6, dtype=torch.int32) #changing to 0.6 to get more confident predictions to be "1" -- not much diff
            predictions.extend(prediction.view(-1).tolist())
            actual.extend(targets.long().view(-1).tolist())
    assert len(actual) == len(predictions)
    print('Macro F1 Score', f1_score(actual, predictions, average = 'macro'))

class BertNN(nn.Module):
    def __init__(self, hidden_size):
        super(BertNN, self).__init__()
        self.bert_model = transformers.BertModel.from_pretrained(pretrained_weights, output_attentions = True)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(hidden_size, 6)

    def forward(self, ex):
        _, pooled_output, attentions = self.bert_model(ex)
        #pooled_output = self.dropout(pooled_output) #why dropout after model output? -- lower scores without dropout of 0.1
        fc_out = self.fc(pooled_output)
        return fc_out, attentions


model = BertNN(768)
model = model.to(device)
loss_function = nn.BCEWithLogitsLoss()
loss_function = loss_function.to(device)




HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=361, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=440473133, style=ProgressStyle(description_…


Train Size (20000, 7)
Val Size (5000, 7)
Test Size (5294, 7)


In [0]:
# lrs = [5e-5, 3e-5, 2e-5]
# epochs = [2,3,4]

# for lrate in lrs:
#   for ep in epochs:

#     print("------------------------------")
#     print("---learning_rate:",lrate,"---epoch number:",ep)

#     optimizer = transformers.AdamW(model.parameters(), lr=lrate, correct_bias=False)
#     MAX_EPOCHS = ep

#     max_grad_norm = 1.0
#     warmup_proportion = 0.1
#     num_training_steps  = len(train_dataloader) * MAX_EPOCHS
#     num_warmup_steps = num_training_steps * warmup_proportion
#     scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

#     for epoch in range(MAX_EPOCHS):
#         epoch_loss = 0
#         start_time = time.time()
#         for idx, (features, targets) in enumerate(train_dataloader):
#             model.zero_grad()
#             features = features.to(device)
#             targets = targets.to(device)
#             scores, attentions = model(features)
#             loss = loss_function(scores, targets)
#             loss.backward()
#             torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
#             optimizer.step()
#             scheduler.step()
#             epoch_loss += loss.item()
#         if device == 'cuda':
#             torch.cuda.empty_cache()
#         time_taken = round((time.time() - start_time)/60, 2)
#         print(f'Epoch {epoch + 1} | Loss - {epoch_loss} | Time Taken - {time_taken} min')
#         evaluate(model, val_dataloader)

#     evaluate(model, test_dataloader)

In [0]:
# #optimize on other hyperparameters with best config (Batch_size 32, Num_epochs 2 and learning rate 2e-5) at 0.9065)

# grad_caps = [0.7,0.8,0.9,1.0,1.1]
# warmup_props = [0.05,0.1,0.15,0.20]

# for grad_cap in grad_caps:
#   for wp in warmup_props:

#     print("------------------------------")
#     print("grad_cap: ",grad_cap,"warmup ratio:",wp)

#     optimizer = transformers.AdamW(model.parameters(), lr=2e-5, correct_bias=False)
#     MAX_EPOCHS = 2

#     max_grad_norm = grad_cap
#     warmup_proportion = wp
#     num_training_steps  = len(train_dataloader) * MAX_EPOCHS
#     num_warmup_steps = num_training_steps * warmup_proportion
#     scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

#     for epoch in range(MAX_EPOCHS):
#         epoch_loss = 0
#         start_time = time.time()
#         for idx, (features, targets) in enumerate(train_dataloader):
#             model.zero_grad()
#             features = features.to(device)
#             targets = targets.to(device)
#             scores, attentions = model(features)
#             loss = loss_function(scores, targets)
#             loss.backward()
#             torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
#             optimizer.step()
#             scheduler.step()
#             epoch_loss += loss.item()
#         if device == 'cuda':
#             torch.cuda.empty_cache()
#         time_taken = round((time.time() - start_time)/60, 2)
#         print(f'Epoch {epoch + 1} | Loss - {epoch_loss} | Time Taken - {time_taken} min')
#         evaluate(model, val_dataloader)

#     evaluate(model, test_dataloader)



------------------------------
---learning_rate: 5e-05 ---epoch number: 2
Epoch 1 | Loss - 185.50375414825976 | Time Taken - 6.71 min
Macro F1 Score 0.9021034695669485
Epoch 2 | Loss - 114.24120777100325 | Time Taken - 6.74 min
Macro F1 Score 0.906154007852682
Macro F1 Score 0.9039604591274064
------------------------------
---learning_rate: 5e-05 ---epoch number: 3
Epoch 1 | Loss - 113.86767609696835 | Time Taken - 6.76 min
Macro F1 Score 0.8980777158201751
Epoch 2 | Loss - 68.56986253336072 | Time Taken - 6.76 min
Macro F1 Score 0.9024997447787437
Epoch 3 | Loss - 25.431545083178207 | Time Taken - 6.69 min
Macro F1 Score 0.9007347184115244
Macro F1 Score 0.9004183493704224
------------------------------
---learning_rate: 5e-05 ---epoch number: 4
Epoch 1 | Loss - 48.0631898221327 | Time Taken - 6.76 min
Macro F1 Score 0.8937235058912867
Epoch 2 | Loss - 38.893465779023245 | Time Taken - 6.76 min
Macro F1 Score 0.8974896086979957
Epoch 3 | Loss - 15.807909779017791 | Time Taken - 6.7 min
Macro F1 Score 0.899546218039083
Epoch 4 | Loss - 6.546204233251046 | Time Taken - 6.69 min
Macro F1 Score 0.8987044518742549
Macro F1 Score 0.898889561924533
------------------------------
---learning_rate: 3e-05 ---epoch number: 2
Epoch 1 | Loss - 17.139940027467674 | Time Taken - 6.73 min
Macro F1 Score 0.8979094940390212
Epoch 2 | Loss - 9.797589176683687 | Time Taken - 6.7 min
Macro F1 Score 0.8999239356644051
Macro F1 Score 0.8986270303938924
------------------------------
---learning_rate: 3e-05 ---epoch number: 3
Epoch 1 | Loss - 15.056858391268179 | Time Taken - 6.72 min
Macro F1 Score 0.8969136549962933
Epoch 2 | Loss - 9.885369804294896 | Time Taken - 6.7 min
Macro F1 Score 0.8971041623284735
Epoch 3 | Loss - 3.5665759591938695 | Time Taken - 6.67 min
Macro F1 Score 0.898433945807134
Macro F1 Score 0.895936492822745
------------------------------
---learning_rate: 3e-05 ---epoch number: 4
Epoch 1 | Loss - 11.83489630351687 | Time Taken - 6.69 min
Macro F1 Score 0.8945012660625431
Epoch 2 | Loss - 11.014763164821488 | Time Taken - 6.7 min
Macro F1 Score 0.8944186703686314
Epoch 3 | Loss - 4.102971798660292 | Time Taken - 6.68 min
Macro F1 Score 0.8956924293696304
Epoch 4 | Loss - 1.98880013687085 | Time Taken - 6.66 min
Macro F1 Score 0.8967334020570794
Macro F1 Score 0.8978343413588961
------------------------------
---learning_rate: 2e-05 ---epoch number: 2
Epoch 1 | Loss - 5.756832868690253 | Time Taken - 6.68 min
Macro F1 Score 0.8938565575963329
Epoch 2 | Loss - 3.615225807436218 | Time Taken - 6.7 min
Macro F1 Score 0.8972600945110007
Macro F1 Score 0.8995537473728882
------------------------------
---learning_rate: 2e-05 ---epoch number: 3
Epoch 1 | Loss - 6.122836466758599 | Time Taken - 6.71 min
Macro F1 Score 0.8950945192781365
Epoch 2 | Loss - 4.057592681296228 | Time Taken - 6.7 min
Macro F1 Score 0.8981179267253738
Epoch 3 | Loss - 1.6067845029756427 | Time Taken - 6.69 min
Macro F1 Score 0.8990145960456842
Macro F1 Score 0.8965980383958093
------------------------------
---learning_rate: 2e-05 ---epoch number: 4
Epoch 1 | Loss - 5.862004778915434 | Time Taken - 6.69 min
Macro F1 Score 0.8948865503358773
Epoch 2 | Loss - 3.9814947778468195 | Time Taken - 6.69 min
Macro F1 Score 0.8974067506347232
Epoch 3 | Loss - 1.9183375269894896 | Time Taken - 6.68 min
Macro F1 Score 0.8980227122152622
Epoch 4 | Loss - 1.2592462805005198 | Time Taken - 6.67 min
Macro F1 Score 0.8988635298473914
Macro F1 Score 0.8941328374758069

In [0]:
# ##finetuning result on 2 other hyperparameter

# ------------------------------
# grad_cap:  0.7 warmup ratio: 0.05
# Epoch 1 | Loss - 107.26633888110518 | Time Taken - 5.68 min
# Macro F1 Score 0.8936619260990974
# Epoch 2 | Loss - 67.15605465695262 | Time Taken - 5.74 min
# Macro F1 Score 0.9034971220810166
# Macro F1 Score 0.9050485248498495
# ------------------------------
# grad_cap:  0.7 warmup ratio: 0.1
# Epoch 1 | Loss - 64.63833036273718 | Time Taken - 5.76 min
# Macro F1 Score 0.8929998575593225
# Epoch 2 | Loss - 37.07104470767081 | Time Taken - 5.74 min
# Macro F1 Score 0.9043445927002802
# Macro F1 Score 0.9031651974293001
# ------------------------------
# grad_cap:  0.7 warmup ratio: 0.15
# Epoch 1 | Loss - 34.45320533961058 | Time Taken - 5.76 min
# Macro F1 Score 0.901797099109847
# Epoch 2 | Loss - 19.692186129279435 | Time Taken - 5.74 min
# Macro F1 Score 0.9037087511495988
# Macro F1 Score 0.9014698830043779
# ------------------------------
# grad_cap:  0.7 warmup ratio: 0.2
# Epoch 1 | Loss - 18.475861214334145 | Time Taken - 5.74 min
# Macro F1 Score 0.8971559825114002
# Epoch 2 | Loss - 10.991576835745946 | Time Taken - 5.73 min
# Macro F1 Score 0.902774639754621
# Macro F1 Score 0.8991474259894763
# ------------------------------
# grad_cap:  0.8 warmup ratio: 0.05
# Epoch 1 | Loss - 15.10654376912862 | Time Taken - 5.75 min
# Macro F1 Score 0.9001843533731848
# Epoch 2 | Loss - 7.48791551287286 | Time Taken - 5.72 min
# Macro F1 Score 0.9028398845845602
# Macro F1 Score 0.9009134450233417
# ------------------------------
# grad_cap:  0.8 warmup ratio: 0.1
# Epoch 1 | Loss - 10.374420619104058 | Time Taken - 5.74 min
# Macro F1 Score 0.9030680250145007
# Epoch 2 | Loss - 6.229179871734232 | Time Taken - 5.72 min
# Macro F1 Score 0.9047617384038483
# Macro F1 Score 0.899292643921745
# ------------------------------
# grad_cap:  0.8 warmup ratio: 0.15
# Epoch 1 | Loss - 8.383389445574721 | Time Taken - 5.74 min
# Macro F1 Score 0.9013340069201947
# Epoch 2 | Loss - 5.001985712442547 | Time Taken - 5.72 min
# Macro F1 Score 0.9020593471919256
# Macro F1 Score 0.8999045018715068
# ------------------------------
# grad_cap:  0.8 warmup ratio: 0.2
# Epoch 1 | Loss - 6.650678715726826 | Time Taken - 5.73 min
# Macro F1 Score 0.8994794706569655
# Epoch 2 | Loss - 4.880266388820019 | Time Taken - 5.72 min
# Macro F1 Score 0.9011177477805938
# Macro F1 Score 0.8997392516797682
# ------------------------------
# grad_cap:  0.9 warmup ratio: 0.05
# Epoch 1 | Loss - 8.720274956838693 | Time Taken - 5.73 min
# Macro F1 Score 0.9018977326508155
# Epoch 2 | Loss - 4.395385441632243 | Time Taken - 5.71 min
# Macro F1 Score 0.9040334973922209
# Macro F1 Score 0.9012246193274818
# ------------------------------
# grad_cap:  0.9 warmup ratio: 0.1
# Epoch 1 | Loss - 7.5400077268714085 | Time Taken - 5.72 min
# Macro F1 Score 0.9009305656784353
# Epoch 2 | Loss - 3.6871492758218665 | Time Taken - 5.71 min
# Macro F1 Score 0.9019291870695412
# Macro F1 Score 0.8985438406056665
# ------------------------------
# grad_cap:  0.9 warmup ratio: 0.15
# Epoch 1 | Loss - 5.497639556037029 | Time Taken - 5.72 min
# Macro F1 Score 0.8982489473866462
# Epoch 2 | Loss - 2.7703927679976914 | Time Taken - 5.71 min
# Macro F1 Score 0.8992530032252194
# Macro F1 Score 0.8982796102199433
# ------------------------------
# grad_cap:  0.9 warmup ratio: 0.2
# Epoch 1 | Loss - 4.724318902663072 | Time Taken - 5.73 min
# Macro F1 Score 0.9006461370985697
# Epoch 2 | Loss - 3.402612740181212 | Time Taken - 5.72 min
# Macro F1 Score 0.9031339112406707
# Macro F1 Score 0.8995456641791317
# ------------------------------
# grad_cap:  1.0 warmup ratio: 0.05
# Epoch 1 | Loss - 7.131069630733691 | Time Taken - 5.74 min
# Macro F1 Score 0.9000607861865744
# Epoch 2 | Loss - 2.9061441709054634 | Time Taken - 5.72 min
# Macro F1 Score 0.9021898976006097
# Macro F1 Score 0.9006945421406302
# ------------------------------
# grad_cap:  1.0 warmup ratio: 0.1
# Epoch 1 | Loss - 5.95309567694494 | Time Taken - 5.73 min
# Macro F1 Score 0.8998067655583233
# Epoch 2 | Loss - 2.511273366457317 | Time Taken - 5.72 min
# Macro F1 Score 0.903417640991338
# Macro F1 Score 0.9008547674938299
# ------------------------------
# grad_cap:  1.0 warmup ratio: 0.15
# Epoch 1 | Loss - 4.612910380543326 | Time Taken - 5.73 min
# Macro F1 Score 0.8989332171786292
# Epoch 2 | Loss - 2.9027357136656065 | Time Taken - 5.73 min
# Macro F1 Score 0.9021860735986387
# Macro F1 Score 0.900064222790183
# ------------------------------
# grad_cap:  1.0 warmup ratio: 0.2
# Epoch 1 | Loss - 4.000405219419918 | Time Taken - 5.75 min
# Macro F1 Score 0.8978901432459965
# Epoch 2 | Loss - 2.5934266846670653 | Time Taken - 5.74 min
# Macro F1 Score 0.9005788494435067
# Macro F1 Score 0.8997090852576312
# ------------------------------
# grad_cap:  1.1 warmup ratio: 0.05
# Epoch 1 | Loss - 6.177263775025494 | Time Taken - 5.76 min
# Macro F1 Score 0.8990900582253667
# Epoch 2 | Loss - 2.3012431833485607 | Time Taken - 5.73 min
# Macro F1 Score 0.9009714978994641
# Macro F1 Score 0.8997612023395669
# ------------------------------
# grad_cap:  1.1 warmup ratio: 0.1
# Epoch 1 | Loss - 4.6941000642837025 | Time Taken - 5.75 min
# Macro F1 Score 0.9004683030924514
# Epoch 2 | Loss - 2.2402445025072666 | Time Taken - 5.73 min
# Macro F1 Score 0.9016907040025756
# Macro F1 Score 0.8979497807415765
# ------------------------------
# grad_cap:  1.1 warmup ratio: 0.15
# Epoch 1 | Loss - 4.88284892111551 | Time Taken - 5.74 min
# Macro F1 Score 0.8997517563870083
# Epoch 2 | Loss - 2.217922415533394 | Time Taken - 5.72 min
# Macro F1 Score 0.9007822324817372
# Macro F1 Score 0.897922467059006
# ------------------------------
# grad_cap:  1.1 warmup ratio: 0.2
# Epoch 1 | Loss - 3.752467484206136 | Time Taken - 5.72 min
# Macro F1 Score 0.8996026403701827
# Epoch 2 | Loss - 1.9224082648433978 | Time Taken - 5.71 min
# Macro F1 Score 0.8997154477950396
# Macro F1 Score 0.8987948909936838

In [0]:

# best model
# with grad_cap:  0.7 warmup ratio: 0.05 from best test acc  ==>testset Macro F1 Score 0.9050637842441922
## with original grad_cap and warmup ratio-- test score 0.9008

## best result after bestcombo -- test Macro F1 Score 0.9063973462017885

optimizer = transformers.AdamW(model.parameters(), lr=2e-5, correct_bias=False)
MAX_EPOCHS = 2

max_grad_norm = 0.7
warmup_proportion = 0.05
num_training_steps  = len(train_dataloader) * MAX_EPOCHS
num_warmup_steps = num_training_steps * warmup_proportion
scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

for epoch in range(MAX_EPOCHS):
    epoch_loss = 0
    start_time = time.time()
    for idx, (features, targets) in enumerate(train_dataloader):
        model.zero_grad()
        features = features.to(device)
        targets = targets.to(device)
        scores, attentions = model(features)
        loss = loss_function(scores, targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()
        epoch_loss += loss.item()
    if device == 'cuda':
        torch.cuda.empty_cache()
    time_taken = round((time.time() - start_time)/60, 2)
    print(f'Epoch {epoch + 1} | Loss - {epoch_loss} | Time Taken - {time_taken} min')
    evaluate(model, val_dataloader)

    evaluate(model, test_dataloader)

Epoch 1 | Loss - 116.66352342069149 | Time Taken - 3.0 min
Macro F1 Score 0.9005833317962711
Macro F1 Score 0.9056820254954276
Epoch 2 | Loss - 68.99307287856936 | Time Taken - 3.0 min
Macro F1 Score 0.9025019635208984
Macro F1 Score 0.9063973462017885


In [0]:
data = test_dataloader

actual, predictions = [], []
with torch.no_grad():
    for features, targets in data:
        features = features.to(device)
        targets = targets.to(device)
        scores, attentions = model(features)
        sigmoid_out = torch.sigmoid(scores)
        prediction = torch.as_tensor(sigmoid_out > 0.5, dtype=torch.int32)

        print(features)
        print(targets)
        print(prediction.view(-1).tolist())
        break
#         predictions.extend(prediction.view(-1).tolist())
#         actual.extend(targets.long().view(-1).tolist())
# assert len(actual) == len(predictions)
# print('Macro F1 Score', f1_score(actual, predictions, average = 'macro'))

tensor([[  101,  3531,  5607,  ...,     0,     0,     0],
        [  101,  2036,  1010,  ...,     0,     0,     0],
        [  101,  2017, 11891,  ...,     0,     0,     0],
        ...,
        [  101,  1000,  3241,  ...,  2033,  2065,   102],
        [  101,  1005,  1005,  ...,  1005,  1005,   102],
        [  101,  2026, 12997,  ...,     0,     0,     0]], device='cuda:0')
tensor([[1., 0., 1., 1., 1., 0.],
        [1., 0., 0., 0., 1., 0.],
        [1., 0., 1., 0., 1., 0.],
        [1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        [1., 0., 1., 0., 1., 0.],
        [1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        [1., 0., 1., 0., 1., 0.],
        [1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        [1., 0., 1., 0., 1., 1.],
        [0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [

In [0]:
len(prediction)
prediction[0]

tensor([1, 0, 0, 1, 0, 0], dtype=torch.int32)

In [0]:
for i, feat in enumerate(features):
  #print(tokenizer.convert_ids_to_tokens(feat))
  full_strs = tokenizer.convert_ids_to_tokens(feat)

  full_str = ''
  for strs in full_strs:
    if strs not in ['[PAD]','[SEP]','[CLS]']:
      full_str += strs + " "
  print(full_str)

  print(prediction[i])
  print(targets[i])


please shoot yourself please put yourself out of your misery . either shoot or hang yourself you depressed freak . 
tensor([1, 0, 0, 1, 0, 0], dtype=torch.int32)
tensor([1., 0., 1., 1., 1., 0.], device='cuda:0')
also , my mother is a prostitute who loves her black pi ##mp . 
tensor([1, 0, 0, 0, 1, 1], dtype=torch.int32)
tensor([1., 0., 0., 0., 1., 0.], device='cuda:0')
you suck hardcore yep you heard it 
tensor([1, 0, 1, 0, 1, 0], dtype=torch.int32)
tensor([1., 0., 1., 0., 1., 0.], device='cuda:0')
" honestly ? you all get your pages to talk about your railroad fe ##tish and your " " awards " " , but when so ##mon ##e actually does something real , and someone wants to bring attention to that persons accomplishments , you ne ##rds attempt to block it ? this is communist . " 
tensor([1, 0, 0, 0, 0, 0], dtype=torch.int32)
tensor([1., 0., 0., 0., 0., 0.], device='cuda:0')
i am a bad egg i am a bad egg i am a bad egg i am a bad egg i am a bad egg i am a bad egg i am a bad egg i am a bad eg

In [0]:
print(len(features))
print(len(prediction))


32
32


In [0]:
## test on real kaggle test set

testdata_path = '/content/drive/My Drive/Colab Notebooks/jigsaw/test/'
ori_testdf = pd.read_csv(testdata_path + 'test.csv')



print(len(ori_testdf)) #153164


153164


In [0]:
label_path = '/content/drive/My Drive/Colab Notebooks/jigsaw/test_labels/'
labeldf = pd.read_csv(label_path + 'test_labels.csv')



In [0]:
len(labeldf)

153164

In [0]:
correct_index = labeldf[labeldf['toxic']!=-1].index.tolist()

data_for_pred = ori_testdf.iloc[correct_index]

In [0]:
testdf = data_for_pred.drop(['id'], axis = 1)

In [0]:
split1, split2, split3 = testdf[:5000].values, testdf[20000:25000].values, testdf[25000:30000].values

split1_dataset = ToxicDataset(split1, 84) 
split1_dataloader = DataLoader(split1_dataset, batch_size=32, shuffle=False, num_workers=4)

split2_dataset = ToxicDataset(split2, 84)
split2_dataloader = DataLoader(split2_dataset, batch_size=32, shuffle=False, num_workers=4)

split3_dataset = ToxicDataset(split3, 84)
split3_dataloader = DataLoader(split3_dataset, batch_size=32, shuffle=False, num_workers=4)



In [0]:
split1

array([['Thank you for understanding. I think very highly of you and would not revert without discussion.'],
       [':Dear god this site is horrible.'],
       ['"::: Somebody will invariably try to add Religion?  Really??  You mean, the way people have invariably kept adding ""Religion"" to the Samuel Beckett infobox?  And why do you bother bringing up the long-dead completely non-existent ""Influences"" issue?  You\'re just flailing, making up crap on the fly. \n ::: For comparison, the only explicit acknowledgement in the entire Amos Oz article that he is personally Jewish is in the categories!    \n\n "'],
       ...,
       ['" \n\n :Casaforra... WHO HAS EVER DENIED VALENCIAN OR MAJORCAN TO BE DIFFERENT FROM CATALAN???? WHO???? NO ONE! You\'re so obsessed with your catalan-countries that you don\'t take the time to read others comments. What we only say is that people from valencia call what they speak valencian and people from majorca call what they speak majorcan. You agreed on

In [0]:
#check the first 5000 comments

data = split1_dataloader

actual, predictions = [], []
with torch.no_grad():
    for features, targets in data:
        features = features.to(device)
        #targets = targets.to(device)
        scores, attentions = model(features)
        sigmoid_out = torch.sigmoid(scores)
        prediction = torch.as_tensor(sigmoid_out > 0.5, dtype=torch.int32)

        #print(features)
        #print(targets)
        for i, feat in enumerate(features):
          #print(features[i])
          #print(prediction[i])
          #print(prediction[i].view(-1).tolist())
          predictions.append(prediction[i].view(-1).tolist())


        #print(prediction.view(-1).tolist())
        #predictions.extend(prediction.view(-1).tolist())
        #actual.extend(targets.long().view(-1).tolist())

In [0]:
len(predictions)

5000

In [0]:
len(correct_index)
label_5000 = [ind for ind in correct_index if ind < 5000]
len(label_5000)

2069

In [0]:
pred5000_df=pd.DataFrame(predictions)
pred5000_df

pred5000_for_f1 = pred5000_df.iloc[label_5000]

preds = []
for i in range(len(pred5000_for_f1)):
  preds.extend(pred5000_for_f1.iloc[i].tolist())

len(preds)

12414

In [0]:
len(pred5000_for_f1) #2069

actual_5000=labeldf[labeldf['toxic']!=-1].iloc[label_5000]
actual_5000_noid = actual_5000.drop(columns=["id"])

actual_f1 = []
for i in range(len(actual_5000_noid)):
  actual_f1.extend(actual_5000_noid.iloc[i].tolist())

len(actual_f1) #2069×6 - 12414


12414

In [0]:
actual_5000

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
16,000663aff0fffc80,0,0,0,0,0,0
19,000844b52dee5f3f,0,0,0,0,0,0
26,000a02d807ae0254,0,0,0,0,0,0
29,000bf0a9894b2807,0,0,0,0,0,0
31,000c9b92318552d1,0,0,0,0,0,0
...,...,...,...,...,...,...,...
12029,1435bb7803d32a30,0,0,0,0,0,0
12035,14384ee62f85e67a,0,0,0,0,0,0
12036,143866f454ded075,0,0,0,0,0,0
12040,14397283843e28c0,0,0,0,0,0,0


In [0]:
print('Macro F1 Score', f1_score(actual_f1, preds, average = 'macro'))

Macro F1 Score 0.7846581091598825


In [0]:
# check full test data

fulltest_dataset = ToxicDataset(testdf.values, 84) 
fulltest_dataloader = DataLoader(fulltest_dataset, batch_size=32, shuffle=False, num_workers=4)

data = fulltest_dataloader

actual, predictions = [], []
with torch.no_grad():
    for features, targets in data:
        features = features.to(device)
        #targets = targets.to(device)
        scores, attentions = model(features)
        sigmoid_out = torch.sigmoid(scores)
        prediction = torch.as_tensor(sigmoid_out > 0.5, dtype=torch.int32)

        #print(features)
        #print(targets)
        for i, feat in enumerate(features):
          #print(features[i])
          #print(prediction[i])
          #print(prediction[i].view(-1).tolist())
          predictions.append(prediction[i].view(-1).tolist())

print(len(predictions))

pred_df=pd.DataFrame(predictions)

preds = []
for i in range(len(pred_df)):
  preds.extend(pred_df.iloc[i].tolist())

print(len(preds))


63978
383868


In [0]:
actual_df=labeldf[labeldf['toxic']!=-1]
actual_df_noid = actual_df.drop(columns=["id"])

actual_f1 = []
for i in range(len(actual_df_noid)):
  actual_f1.extend(actual_df_noid.iloc[i].tolist())

len(actual_f1)

383868

In [0]:
print('Macro F1 Score', f1_score(actual_f1, preds, average = 'macro'))

Macro F1 Score 0.7881645102550523


In [0]:
ori_testdf

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.
...,...,...
153159,fffcd0960ee309b5,". \n i totally agree, this stuff is nothing bu..."
153160,fffd7a9a6eb32c16,== Throw from out field to home plate. == \n\n...
153161,fffda9e8d6fafa9e,""" \n\n == Okinotorishima categories == \n\n I ..."
153162,fffe8f1340a79fc2,""" \n\n == """"One of the founding nations of the..."


In [0]:
## for kaggle submission -- only sigmoid values are needed 
## So instead of the 1 or 0 you have to just use the sigmoid values which are essentially probabilities for every label


testdf = ori_testdf.drop(['id'], axis = 1)

fulltest_dataset = ToxicDataset(testdf.values, 84) 
fulltest_dataloader = DataLoader(fulltest_dataset, batch_size=32, shuffle=False, num_workers=4)

data = fulltest_dataloader

actual, predictions = [], []
with torch.no_grad():
    for features, targets in data:
        features = features.to(device)
        #targets = targets.to(device)
        scores, attentions = model(features)
        sigmoid_out = torch.sigmoid(scores)
        #print(sigmoid_out)
        #prediction = torch.as_tensor(sigmoid_out > 0.5, dtype=torch.int32)

        #print(features)
        #print(targets)
        for i, feat in enumerate(features):
          #print(features[i])
          #print(prediction[i])
          #print(prediction[i].view(-1).tolist())
          predictions.append(sigmoid_out[i].view(-1).tolist())

print(len(predictions))

pred_df=pd.DataFrame(predictions)

preds = []
for i in range(len(pred_df)):
  preds.extend(pred_df.iloc[i].tolist())

print(len(preds))


153164
918984


In [0]:
len(pred_df)


153164

In [0]:
pred_df['id']= ori_testdf['id']

In [0]:
pred_df = pred_df[['id',0,1,2,3,4,5]]

In [0]:
pred_df.head()

Unnamed: 0,id,0,1,2,3,4,5
0,00001cee341fdb12,0.998539,0.424346,0.985993,0.054958,0.965682,0.118837
1,0000247867823ef7,0.005951,0.003443,0.003635,0.003572,0.002988,0.003278
2,00013b17ad220c46,0.01053,0.002916,0.004239,0.002599,0.003059,0.002375
3,00017563c3f7919a,0.004761,0.003854,0.00354,0.004133,0.003304,0.004066
4,00017695ad8997eb,0.009394,0.002896,0.003643,0.003736,0.002846,0.002803


In [0]:
pred_df.columns = ["id","toxic","severe_toxic","obscene","threat","insult","identity_hate"]

In [0]:
pred_df.to_csv('kaggle_submission.csv',index=False)

In [0]:
pred_df.shape

(153164, 7)

In [0]:
pd.read_csv('kaggle_submission.csv').head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.998539,0.424346,0.985993,0.054958,0.965682,0.118837
1,0000247867823ef7,0.005951,0.003443,0.003635,0.003572,0.002988,0.003278
2,00013b17ad220c46,0.01053,0.002916,0.004239,0.002599,0.003059,0.002375
3,00017563c3f7919a,0.004761,0.003854,0.00354,0.004133,0.003304,0.004066
4,00017695ad8997eb,0.009394,0.002896,0.003643,0.003736,0.002846,0.002803
