In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
from sklearn.model_selection import train_test_split

In [2]:
# Number of samples per mini-batch; passed as batch_size to both DataLoaders.
BATCH_SIZE = 32
# NOTE(review): presumably the intended number of training epochs -- confirm.
# The spelling is kept as-is because other cells may reference this name.
EPOCHES = 10

In [3]:
# Load the train / test / sample-submission CSVs. Paths are relative to the
# notebook's working directory.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
sample = pd.read_csv('./data/sample_submission.csv')

# Special tokens that occupy the first four vocabulary slots.
START = '<START>'
END = '<END>'
UNKNOWN = '<UNKNOWN>'
PAD = '<PAD>'

# The vocabulary is built over both train and test text, so every
# whitespace-separated token in either file receives an index.
texts = pd.concat([train.text, test.text], axis=0)

# word -> integer index. The keys come from the constants above so they cannot
# drift from the strings used for lookups: the previous literal '<UNKNOWN' was
# missing its closing '>', which made word2idx.get('<UNKNOWN>') return None.
word2idx = {START: 0, END: 1, UNKNOWN: 2, PAD: 3}
sentence_max_length = 0

for text in texts:
    words = text.strip().split()
    sentence_max_length = max(sentence_max_length, len(words))
    for word in words:
        # Test membership, not the truthiness of the stored index: index 0 is
        # falsy, so a literal '<START>' in a text would otherwise be re-indexed.
        if word not in word2idx:
            word2idx[word] = len(word2idx)

vocab_length = len(word2idx)
# Reserve room for the <START> and <END> tokens wrapped around every sentence.
sentence_max_length += 2

In [None]:
def collate_fn(data, pad_idx=3):
    """Collate variable-length samples into one right-padded batch.

    Args:
        data: list of ``(sequence, label)`` pairs, where ``sequence`` is a 1-D
            tensor of token indices and ``label`` is a scalar (Python number
            or 0-d tensor). The list is not modified.
        pad_idx: value used to pad the shorter sequences. Defaults to 3, the
            index assigned to '<PAD>' in ``word2idx``.

    Returns:
        ``(inputs, inputs_len, labels)`` where ``inputs`` is a
        ``(batch, max_len)`` tensor ordered by decreasing sequence length,
        ``inputs_len`` is the list of original lengths in that same order, and
        ``labels`` is a 1-D ``torch.long`` tensor in that same order.
    """
    # Longest-first ordering is what pack_padded_sequence expects by default.
    # sorted() is used so the caller's list is left untouched.
    ordered = sorted(data, key=lambda pair: len(pair[0]), reverse=True)
    inputs, y = zip(*ordered)
    inputs_len = [len(item) for item in inputs]
    inputs = rnn_utils.pad_sequence(list(inputs), batch_first=True,
                                    padding_value=pad_idx)
    # int() accepts both plain numbers and 0-d tensors.
    labels = torch.tensor([int(label) for label in y], dtype=torch.long)
    return inputs, inputs_len, labels

class myDataSet():
    """Map-style dataset that turns raw sentences into fixed-length index tensors.

    Each item is encoded with the module-level ``word2idx`` mapping, wrapped in
    '<START>' / '<END>' tokens and left-padded with '<PAD>' up to the
    module-level ``sentence_max_length``.

    Args:
        inputs: iterable of sentence strings.
        labels: iterable of numeric labels, aligned with ``inputs``.
    """

    def __init__(self, inputs, labels):
        # Materialise as lists so positional indexing works even when pandas
        # Series with a non-contiguous index are passed in.
        self.inputs = list(inputs)
        self.labels = list(labels)

    @staticmethod
    def to_categorical(y, num_classes):
        """One-hot encode integer class label(s).

        Args:
            y: an int or a sequence/array of ints in ``[0, num_classes)``.
            num_classes: total number of classes (width of the encoding).

        Returns:
            A ``uint8`` NumPy array: shape ``(num_classes,)`` for a scalar
            ``y``, or ``(len(y), num_classes)`` for a sequence.
        """
        return np.eye(num_classes, dtype='uint8')[y]

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return ``(token_indices, label)`` for the sample at ``index``.

        ``token_indices`` is an integer tensor and ``label`` is a 0-d float
        tensor.
        """
        sentence = self.inputs[index]
        unknown_idx = word2idx.get('<UNKNOWN>')
        tokens = [word2idx.get(word, unknown_idx) for word in sentence.split()]
        tokens = [word2idx.get('<START>')] + tokens + [word2idx.get('<END>')]

        # Left-pad in one step (the previous repeated insert(0, ...) was
        # quadratic). Padding on the left keeps '<END>' in the last position.
        pad_count = max(0, sentence_max_length - len(tokens))
        tokens = [word2idx.get('<PAD>')] * pad_count + tokens

        label = self.labels[index]
        return torch.tensor(tokens), torch.tensor(label, dtype=torch.float)

In [None]:
# Shuffle with a fixed seed: an unseeded shuffle here would make the seeded
# split below differ on every kernel restart.
train_data = train.loc[:, ['text', 'target']].sample(frac=1, random_state=0)
train_x, val_x, train_y, val_y = train_test_split(train_data.text, train_data.target,
                                                  test_size=0.3, random_state=0)

train_dataset = myDataSet(train_x, train_y)
val_dataset = myDataSet(val_x, val_y)

# No custom collate_fn is passed: myDataSet pads every sample to
# sentence_max_length, so the default collation can stack them directly.
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True,
                                           batch_size=BATCH_SIZE)
# Validation order does not need to be random; a fixed order keeps the
# per-batch composition (and thus the averaged loss) deterministic.
val_loader = torch.utils.data.DataLoader(val_dataset, shuffle=False,
                                         batch_size=BATCH_SIZE)

In [None]:
class LSTM(nn.Module):
    """Embedding -> (optionally bidirectional) LSTM -> linear -> sigmoid classifier.

    The vocabulary size is read from the module-level ``word2idx`` and the
    input width of ``fc2`` from the module-level ``sentence_max_length``.

    Args:
        input_size: embedding dimension, also the LSTM input feature size.
        hidden_size: LSTM hidden state size per direction.
        num_layers: number of stacked LSTM layers.
        drop_p: dropout probability for the LSTM and for ``self.dropout``.
        batch_first: forwarded to ``nn.LSTM``.
        bidirectional: forwarded to ``nn.LSTM``; doubles the ``fc1`` input width.
        output_size: accepted for signature compatibility; not used, since
            ``fc1`` always produces a single output feature.
    """

    def __init__(self, input_size=300,
                 hidden_size=128,
                 num_layers=2,
                 drop_p=0.5,
                 batch_first=True,
                 bidirectional=False,
                 output_size=3):
        super().__init__()

        self.num_layers = num_layers
        self.hidden_size = hidden_size
        # 1 for a unidirectional LSTM, 2 when both directions are concatenated.
        self.direction = 2 if bidirectional else 1

        self.embedding = nn.Embedding(len(word2idx), input_size)
        self.LSTM = nn.LSTM(input_size=input_size,
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            batch_first=batch_first,
                            dropout=drop_p,
                            bidirectional=bidirectional)

        self.fc1 = nn.Linear(hidden_size * self.direction, 1)
        # fc2 and dropout are registered but not used by forward().
        self.fc2 = nn.Linear(sentence_max_length, 1)
        self.dropout = nn.Dropout(drop_p)

        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one sigmoid-activated score per sample.

        Args:
            x: integer tensor of token indices -- assumed to be
                ``(batch, seq_len)`` with ``batch_first=True``; TODO confirm
                for other configurations.

        Returns:
            Tensor obtained by applying ``fc1`` to every LSTM output, taking
            index -1 along dimension 1, and applying the sigmoid; with the
            layout assumed above this has shape ``(batch, 1)``.
        """
        embedded = self.embedding(x)
        sequence_out, _ = self.LSTM(embedded)
        scores = self.fc1(sequence_out)
        last_score = scores[:, -1]
        return self.sigmoid(last_score)

In [None]:
# Hyperparameters passed to the LSTM constructor when the model is built.
# Embedding dimension, also used as the LSTM input feature size.
input_size = 300
# LSTM hidden state size (per direction).
hidden_size = 128
# Number of stacked LSTM layers.
num_layers = 2
# Dropout probability handed to nn.LSTM and nn.Dropout inside the model.
drop_p = 0.5
# Accepted by LSTM.__init__ but not referenced in its body (fc1 has 1 output).
output_size = 1
# Run the LSTM in both directions; doubles the input width of fc1.
bidirectional = True
# Forwarded to nn.LSTM's batch_first argument.
batch_first = True

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Build the classifier from the hyperparameters defined earlier and move its
# parameters to the selected device. Keyword arguments make the mapping
# between each value and the constructor parameter explicit.
net = LSTM(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    drop_p=drop_p,
    batch_first=batch_first,
    bidirectional=bidirectional,
    output_size=output_size,
).to(device)

In [None]:
# Binary cross-entropy on probabilities (the model ends with a sigmoid).
criterion = nn.BCELoss()

In [None]:
# Training loop: optimise `net` on train_loader, run a full validation pass
# every 10 optimisation steps, and checkpoint whenever validation loss improves.
# NOTE(review): accuracy_score is imported but not used yet.
from sklearn.metrics import accuracy_score

count = 0               # optimisation steps taken so far, across epochs
lr = 0.005              # initial learning rate
valid_loss_min = np.inf  # np.Inf was removed in NumPy 2.0

# Use the configured epoch count: with range(1) the learning-rate schedule
# below was a no-op and EPOCHES was never used.
for item in range(EPOCHES):
    train_losses = []
    # Divide the learning rate by 10 at every odd epoch.
    lr = lr - 0.9 * (item % 2) * lr
    # NOTE(review): re-creating Adam each epoch resets its running moment
    # estimates; kept as in the original schedule.
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    for inputs, y in train_loader:
        inputs = inputs.to(device)
        y = y.to(device)
        count += 1
        net.zero_grad()
        # The model emits one probability per sample with a trailing singleton
        # dimension while the labels are 1-D; BCELoss needs matching shapes.
        out = net(inputs).reshape(-1)
        loss = criterion(out, y)
        train_losses.append(loss.item())

        loss.backward()
        optimizer.step()
        if count % 10 == 0:
            val_losses = []
            net.eval()
            # No gradients are needed for validation; this avoids building
            # autograd graphs for every validation batch.
            with torch.no_grad():
                # Distinct names so the val_x / val_y split Series created in
                # the data-split cell are not overwritten by tensors.
                for val_inputs, val_targets in val_loader:
                    val_inputs = val_inputs.to(device)
                    val_targets = val_targets.to(device)
                    val_out = net(val_inputs).reshape(-1)
                    val_loss = criterion(val_out, val_targets)
                    val_losses.append(val_loss.item())
            net.train()

            mean_val_loss = np.mean(val_losses)
            print("EPOCH: {:d} Train_loss: {:.6f} Val loss: {:.6f}".format(
                item, np.mean(train_losses), mean_val_loss))

            if mean_val_loss < valid_loss_min:
                # Format the message before updating the running minimum so
                # it reports "old --> new" rather than the same value twice.
                print("Validation loss decreased({: .6f} --> {: .6f}) \
                      saving model ...".format(valid_loss_min, mean_val_loss))
                valid_loss_min = mean_val_loss
                torch.save(net.state_dict(), 'model')
