In [2]:
import numpy as np # to handle matrix and data operation
import pandas as pd # to read csv and handle dataframe

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
from torch.autograd import Variable

from sklearn.model_selection import train_test_split

from torchnlp.word_to_vector import FastText

from torch.nn.utils.rnn import pad_sequence

In [3]:
device = torch.device("cuda:0")

In [4]:
df = pd.read_csv("toxic-train-kaggle-clean.csv")
df["word_splits"] = df["word_splits"].apply(eval)
df = df[(df["word_splits"].apply(len) > 0) & (df["word_splits"].apply(len) <= 560)]

In [5]:
PAD_LENGTH = max(df["word_splits"].apply(len))
PAD_LENGTH

560

In [18]:
X_train, X_test, y_train, y_test = train_test_split(df["word_splits"], df.drop("word_splits", axis=1), test_size=0.15)

In [7]:
vectors = FastText()

In [8]:
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS

class ToxicClassifierModel(nn.Module):
    def __init__(self):
        super(ToxicClassifierModel, self).__init__()
        self.BiGRU = nn.GRU(300, hidden_size = LSTM_UNITS, bidirectional=True, num_layers=1)
        self.BiRNN = nn.RNN(input_size = 2 * LSTM_UNITS, hidden_size = LSTM_UNITS, bidirectional=True)
        self.hidden1 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)
        self.hidden2 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)
        self.hidden3 = nn.Linear(DENSE_HIDDEN_UNITS, 6)
        self.vectors = FastText()
    
    def forward(self, X):
        depth = X.size()[0]
        word_num = X.shape[1]
        word_emb = X.shape[2]
        
        #print("input:", X.shape)
        
        X = X.permute(0, 2, 1)
        X = F.dropout2d(X, 0.2, training=self.training)
        X = X.permute(0, 2, 1)
        
        #print("Spacial:", X.shape)
        
        X = self.BiGRU(X)
        #print("GRU0:", X[0].shape)
        # print("GRU1:", X[1].shape)
        
        X = self.BiRNN(X[0])
        #print("RNN0:", X[0].shape)
        # print("RNN1:", X[1].shape)
        
        X = X[0]
        # X = X[0].permute(0, 2, 1)
        
        # print(torch.max(X, 1))
        
        # print("Max pool:", torch.max(X, 1).values.shape)
        # print("Avg pool:", torch.mean(X, 1).shape)
        
        X = torch.cat((torch.max(X, 1).values, torch.mean(X, 1)), 1)
        
        #print("cat:", X.shape)
        
        X = X.add(F.relu(self.hidden1(X)))
        
        #print("Dense1:", X.shape)
        
        X = X.add(F.relu(self.hidden2(X)))
        
        #print("Dense2:", X.shape)
        
        X = torch.sigmoid(self.hidden3(X))
        
        #print("Out:", X.shape)
        
        return X

In [9]:
TCM = ToxicClassifierModel()

In [10]:
X_train = X_train.values
y_train = y_train.values

X_test = X_test.values
y_test = y_test.values

In [11]:
BATCH_SIZE = 64

batched_X_train = []
batched_y_train = []

i=0
while (i+1) * BATCH_SIZE < len(X_train):
    batched_X_train.append(X_train[i*BATCH_SIZE:(i+1)*BATCH_SIZE])
    batched_y_train.append(y_train[i*BATCH_SIZE:(i+1)*BATCH_SIZE])
    i+=1
batched_X_train.append(X_train[i*BATCH_SIZE:])
batched_y_train.append(y_train[i*BATCH_SIZE:])

batched_X_test = []
batched_y_test = []

del X_train
del y_train

i=0
while (i+1) * BATCH_SIZE < len(X_test):
    batched_X_test.append(X_test[i*BATCH_SIZE:(i+1)*BATCH_SIZE])
    batched_y_test.append(y_test[i*BATCH_SIZE:(i+1)*BATCH_SIZE])
    i+=1
batched_X_test.append(X_test[i*BATCH_SIZE:])
batched_y_test.append(y_test[i*BATCH_SIZE:])

del X_test
del y_test

In [None]:
model = TCM
optimizer = torch.optim.Adam(model.parameters())#,lr=0.001, betas=(0.9,0.999))
batch_idx, (X_batch, y_batch) = list(enumerate(zip(batched_X_train, batched_y_train)))[0]
var_X_batch = Variable(torch.nn.utils.rnn.pad_sequence([ vectors[X] for X in X_batch]).permute(1,0,2)).float()
var_y_batch = Variable(torch.from_numpy(y_batch))
optimizer.zero_grad()
output = model(var_X_batch)

In [53]:
output.size()

torch.Size([128, 6])

In [54]:
output[:,5].size()

torch.Size([128])

In [59]:
error = nn.BCELoss()
loss = error(output, var_y_batch.float())

In [29]:
def fit(model, batched_X_train, batched_y_train):
    global acc
    optimizer = torch.optim.Adam(model.parameters())#,lr=0.001, betas=(0.9,0.999))
    error = nn.BCELoss()
    EPOCHS = 5
    model.train()
    for epoch in range(EPOCHS):
        acc2 = acc
        correct = 0
        for batch_idx, (X_batch, y_batch) in enumerate(zip(batched_X_train, batched_y_train)):
            var_X_batch = Variable(torch.nn.utils.rnn.pad_sequence([ vectors[X] for X in X_batch]).permute(1,0,2)).float().to(device)
            var_y_batch = Variable(torch.from_numpy(y_batch)).float().to(device)
            optimizer.zero_grad()
            output = model(var_X_batch)
            loss = error(output, var_y_batch)
            loss.backward()
            optimizer.step()

            # Total correct predictions
            predicted = output.data.round()
            correct += (predicted == var_y_batch).sum()
            #print(correct)
            if batch_idx % 50 == 0:
                print('Epoch : {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\t Accuracy:{:.3f}%'.format(
                    epoch, batch_idx*len(X_batch), len(batched_X_train), 100.*batch_idx / len(batched_X_train), loss.data, float(correct*100) / float(6 * BATCH_SIZE*(batch_idx+1))))
            del var_X_batch
            del var_y_batch
            del loss
            del output
            del predicted
            torch.cuda.empty_cache()
        
        correct = 0
        for batch_idx, (X_batch, y_batch) in enumerate(zip(batched_X_test, batched_y_test)):
            var_X_batch = Variable(torch.nn.utils.rnn.pad_sequence([ vectors[X] for X in X_batch]).permute(1,0,2)).float().to(device)
            var_y_batch = Variable(torch.from_numpy(y_batch)).float().to(device)
            output = TCM(var_X_batch)

            # Total correct predictions
            predicted = output.data.round()
            correct += (predicted == var_y_batch).sum()
            del var_X_batch
            del var_y_batch
            del output
            del predicted
            torch.cuda.empty_cache()
        
        acc = float(correct*100) / float(6 * BATCH_SIZE * len(batched_X_test))
        print("Validation Accuracy:", acc)
        del correct

        if acc > acc2:
            torch.save(TCM.state_dict(), "TCM_2.pt")

In [37]:
torch.cuda.empty_cache()

In [13]:
TCM.to(device)

ToxicClassifierModel(
  (BiGRU): GRU(300, 128, bidirectional=True)
  (BiRNN): RNN(256, 128, bidirectional=True)
  (hidden1): Linear(in_features=512, out_features=512, bias=True)
  (hidden2): Linear(in_features=512, out_features=512, bias=True)
  (hidden3): Linear(in_features=512, out_features=6, bias=True)
)

In [30]:
fit(TCM, batched_X_train, batched_y_train)

Validation Accuracy: 97.92722409909909
Validation Accuracy: 97.97226914414415
Validation Accuracy: 97.92159346846847
Validation Accuracy: 97.93848536036036
Validation Accuracy: 97.92229729729729


In [32]:
torch.save(TCM.state_dict(), "TCM_1.pt")

In [34]:
model = ToxicClassifierModel()
model.load_state_dict(torch.load("TCM_2.pt"))
model.eval()

ToxicClassifierModel(
  (BiGRU): GRU(300, 128, bidirectional=True)
  (BiRNN): RNN(256, 128, bidirectional=True)
  (hidden1): Linear(in_features=512, out_features=512, bias=True)
  (hidden2): Linear(in_features=512, out_features=512, bias=True)
  (hidden3): Linear(in_features=512, out_features=6, bias=True)
)

In [36]:
del TCM

In [31]:
from sklearn.metrics import roc_auc_score, f1_score

In [38]:
model.to(device)

ToxicClassifierModel(
  (BiGRU): GRU(300, 128, bidirectional=True)
  (BiRNN): RNN(256, 128, bidirectional=True)
  (hidden1): Linear(in_features=512, out_features=512, bias=True)
  (hidden2): Linear(in_features=512, out_features=512, bias=True)
  (hidden3): Linear(in_features=512, out_features=6, bias=True)
)

In [40]:
predProb = []
truePreds = []
for batch_idx, (X_batch, y_batch) in enumerate(zip(batched_X_test, batched_y_test)):
    var_X_batch = Variable(torch.nn.utils.rnn.pad_sequence([ vectors[X] for X in X_batch]).permute(1,0,2)).float().to(device)
    var_y_batch = Variable(torch.from_numpy(y_batch)).float().to(device)
    output = model(var_X_batch)

    predProb = predProb + [ x for X in output.data for x in X ]
    truePreds = truePreds + [ x for X in var_y_batch for x in X ]

In [47]:
predProb = [ float(p) for p in predProb ]

In [49]:
preds = [ round(p) for p in predProb ]

In [48]:
truePreds = [ int(p) for p in truePreds ]

In [52]:
print(roc_auc_score(truePreds, predProb))
print(f1_score(truePreds, preds))

0.9871023395825137
0.7533161561217673
