In [6]:
%reload_ext autoreload
%autoreload 2

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd

from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings('ignore')
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
#device = torch.device('cpu')
device

device(type='cuda')

In [None]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

In [None]:
train = pd.read_csv('drive/MyDrive/train_preproc_4.csv', engine='python')
val = pd.read_csv('drive/MyDrive/val_preproc_3.csv', lineterminator='\n', encoding='utf-8')

In [8]:
import json

class Vocabulary:
    def __init__(self, max_size=5000):
        self.string_to_id = {}
        self.id_to_string = {}
    
    def __len__(self):
        return len(self.string_to_id)
    
    def build_vocabulary(self):
        with open('most_common_tokens.json', 'r+') as f:
            self.string_to_id = json.load(f)
        self.id_to_string = {v:k for k,v in self.string_to_id.items()} 


In [9]:
vocab = Vocabulary()
vocab.build_vocabulary()

In [10]:
train = train.dropna(subset=['normalized_text'])
val = val.dropna(subset=['normalized_text'])

In [11]:
from sklearn.model_selection import train_test_split
train, train_val = train_test_split(train, test_size=0.2, random_state=42)

In [12]:
train['tokens'] = train.normalized_text.str.split(' ')
train_val['tokens'] = train_val.normalized_text.str.split(' ')
val['tokens'] = val.normalized_text.str.split(' ')

train['tokens_id'] = train.tokens.map(lambda x: [vocab.string_to_id.get(v, -1) + 1 for v in x])
train_val['tokens_id'] = train_val.tokens.map(lambda x: [vocab.string_to_id.get(v, -1) + 1 for v in x])
val['tokens_id'] = val.tokens.map(lambda x: [vocab.string_to_id.get(v, -1) + 1 for v in x])

In [13]:
class CustomDataset(Dataset):
    def __init__(self, tokens, target):
        self.tokens = tokens
        self.target = target
        
    def __len__(self):
        return len(self.tokens)
    
    def __getitem__(self, index):
        source_tokens = self.tokens[index]
        target_text = self.target[index]
        return torch.tensor(source_tokens), torch.tensor(target_text) 
    
def collate_fn(batch):
    source = [x[0] for x in batch]
    target = [x[1] for x in batch]
    source_len = [len(x) for x in source]
    
    source = [x for x, _ in sorted(zip(source, source_len), key=lambda pair: pair[1], reverse=True)]
    target = [x for x, _ in sorted(zip(target, source_len), key=lambda pair: pair[1], reverse=True)]
    source_len = sorted(source_len, reverse=True)
    
    source = pad_sequence(source, batch_first=True, padding_value = 0) 
    return torch.tensor(source, device=device), torch.tensor(target, device=device), torch.tensor(source_len)

In [14]:
BATCH_SIZE_TRAIN = 256
BATCH_SIZE_TEST = 16

train_dataset = CustomDataset(train.tokens_id.values, train.is_bad.values)
train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE_TRAIN, shuffle=True, collate_fn = collate_fn)

val_dataset = CustomDataset(train_val.tokens_id.values, train_val.is_bad.values)
val_loader = DataLoader(val_dataset, batch_size = BATCH_SIZE_TEST, shuffle=False, collate_fn = collate_fn)

test_dataset = CustomDataset(val.tokens_id.values, val.is_bad.values)
test_loader = DataLoader(test_dataset, batch_size = BATCH_SIZE_TEST, shuffle=False, collate_fn = collate_fn)

In [15]:
class LSTMModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, layer_dim, vocab_size):
        super(LSTMModel, self).__init__()
        
        self.word_embeddings = nn.Embedding(vocab_size+1, embedding_dim)
        
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, layer_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(2*hidden_dim, 1)

    def forward(self, x, input_lens):
        
        embeds = self.word_embeddings(x)
        packed = nn.utils.rnn.pack_padded_sequence(embeds, input_lens, batch_first=True)
        out, (hn, cn) = self.lstm(packed)
        hidden_state = torch.cat((hn[0], hn[1]),dim=1)
        out = self.fc(hidden_state)
        out = F.sigmoid(out)
        return out

In [16]:
import torch.optim as optim
from tqdm import tqdm_notebook as tqdm

In [20]:
model = LSTMModel(embedding_dim=6, hidden_dim=64, layer_dim=3, vocab_size=len(vocab)).to(device).train()

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.05)
NUM_EPOCHS = 3

for epoch in range(NUM_EPOCHS):
    model.train()
    train_losses = []
    val_losses = []
    train_targets = []
    val_targets = []
    train_preds = []
    val_preds = []
    for X_batch, y_batch, seq_len in tqdm(train_loader):
        optimizer.zero_grad()
        ypred_batch = model(X_batch, seq_len)

        # flatten y_batch and ypred_batch
        loss = loss_fn(ypred_batch.squeeze(), y_batch)
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())
        train_preds.append(ypred_batch)
        train_targets.append(y_batch)

    y = torch.cat(train_targets).cpu().detach().numpy()
    pred = torch.cat(train_preds).cpu().detach().numpy()

    print(f'EPOCH {epoch}')
    print('Train AUC:', roc_auc_score(y, pred))

    with torch.no_grad():
        for X_val, y_val, seq_len in val_loader:
            ypred_val = model(X_val, seq_len)

            # flatten first
            val_loss = loss_fn(ypred_val.squeeze(), y_val)
            
            val_losses.append(val_loss.item())
            val_losses.append(loss.item())
            val_preds.append(ypred_val)
            val_targets.append(y_val)

        y = torch.cat(val_targets).cpu().detach().numpy()
        pred = torch.cat(val_preds).cpu().detach().numpy()
    
    print('Val AUC:', roc_auc_score(y, pred))


val_losses = []
val_targets = []
val_preds = []
with torch.no_grad():
    for X_val, y_val, seq_len in test_loader:
        ypred_val = model(X_val, seq_len)

        # flatten first
        val_loss = loss_fn(ypred_val.squeeze(), y_val)
        
        val_losses.append(val_loss.item())
        val_losses.append(loss.item())
        val_preds.append(ypred_val)
        val_targets.append(y_val)

    y = torch.cat(val_targets).cpu().detach().numpy()
    pred = torch.cat(val_preds).cpu().detach().numpy()
    print('\nFINAL TEST AUC:', roc_auc_score(y, pred))

  0%|          | 0/2461 [00:00<?, ?it/s]

EPOCH 0
Train AUC: 0.8970353861293481
Val AUC: 0.8659175897719298


  0%|          | 0/2461 [00:00<?, ?it/s]

EPOCH 1
Train AUC: 0.8019202103029318
Val AUC: 0.8009745935668179


  0%|          | 0/2461 [00:00<?, ?it/s]

EPOCH 2


ValueError: ignored