In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell is executed, so edits to imported .py files are picked up automatically.
# NOTE(review): none of the cells shown here import a project-local module, so
# this may be unnecessary — confirm before keeping it.
%reload_ext autoreload
%autoreload 2

In [None]:
import os
# Debug aid: asks the CUDA runtime to launch kernels synchronously so that a
# failing kernel is reported at the Python line that triggered it. It is set
# here, before `import torch`, so it is in place before any CUDA work starts.
# NOTE(review): synchronous launches slow GPU training — remove once debugging
# is finished.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd

from sklearn.metrics import roc_auc_score

import warnings
# Silence every Python warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# (Pass 'cpu' directly to torch.device to force CPU execution.)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [None]:
# Labelled training data, parsed with pandas' pure-Python CSV engine.
train = pd.read_csv(
    'drive/MyDrive/train_preproc_4.csv',
    engine='python',
)
# Second labelled file (used as the final test set further down), parsed with
# an explicit UTF-8 encoding and '\n' as the only line terminator.
# NOTE(review): the two files carry different preprocessing suffixes
# (_4 vs _3) — confirm both were produced by the same pipeline version.
val = pd.read_csv(
    'drive/MyDrive/val_preproc_3.csv',
    encoding='utf-8',
    lineterminator='\n',
)

In [None]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

In [None]:
import json

class Vocabulary:
    """Bidirectional token <-> integer-id lookup loaded from a JSON file.

    The JSON file must contain a single object mapping token strings to
    integer ids.
    """

    def __init__(self, max_size=5000):
        """Create an empty vocabulary.

        Args:
            max_size: Kept for backward compatibility and stored on the
                instance. It is NOT enforced: ``build_vocabulary`` loads every
                entry found in the JSON file.
        """
        self.max_size = max_size
        # token string -> integer id
        self.string_to_id = {}
        # integer id -> token string (inverse of string_to_id)
        self.id_to_string = {}

    def __len__(self):
        """Return the number of tokens currently loaded."""
        return len(self.string_to_id)

    def build_vocabulary(self, path='most_common_tokens.json'):
        """Load the token -> id table from ``path`` and build its inverse.

        Args:
            path: JSON file holding a ``{token: id}`` object. Defaults to
                ``most_common_tokens.json`` in the current working directory.

        Raises:
            OSError: If the file cannot be opened for reading.
            json.JSONDecodeError: If the file is not valid JSON.
            TypeError: If the top-level JSON value is not an object.
        """
        # Read-only access is sufficient; JSON text is UTF-8 by specification,
        # so do not rely on the platform's default encoding.
        with open(path, 'r', encoding='utf-8') as f:
            loaded = json.load(f)
        if not isinstance(loaded, dict):
            raise TypeError(
                f"expected a JSON object mapping tokens to ids, got {type(loaded).__name__}"
            )
        self.string_to_id = loaded
        self.id_to_string = {v: k for k, v in loaded.items()}


In [None]:
# Create the vocabulary and load its token -> id table from
# 'most_common_tokens.json' (resolved against the current working directory).
vocab = Vocabulary()
vocab.build_vocabulary()

In [None]:
# Discard rows whose normalized_text is missing, in both frames; the
# tokenisation step below calls .str.split on that column.
train, val = (
    frame.dropna(subset=['normalized_text'])
    for frame in (train, val)
)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled training rows (fixed random_state) as
# `train_val`, which the training loop uses for per-epoch validation; the
# separately loaded `val` frame serves as the final test set.
# NOTE(review): `train` is overwritten with its own 80% share, so re-running
# this cell without reloading the data shrinks it again.
train, train_val = train_test_split(train, test_size=0.2, random_state=42)

In [None]:
# Token-id layout consumed by the embedding layer:
#   0      -> padding (inserted by collate_fn)
#   1      -> out-of-vocabulary token
#   k + 2  -> vocabulary entry with id k
ID_OFFSET = 2
MISSING_ID = -1  # lookup default for unknown tokens; becomes 1 after ID_OFFSET


def add_token_columns(frame):
    """Add 'tokens' and 'tokens_id' columns to ``frame`` in place.

    'tokens' is ``normalized_text`` split on single spaces; 'tokens_id' holds
    the matching vocabulary ids shifted by ID_OFFSET, with unknown tokens
    mapped to 1.
    """
    lookup = vocab.string_to_id
    frame['tokens'] = frame['normalized_text'].str.split(' ')
    frame['tokens_id'] = frame['tokens'].map(
        lambda toks: [lookup.get(tok, MISSING_ID) + ID_OFFSET for tok in toks]
    )


# Same encoding for every split, so the three frames cannot drift apart.
for frame in (train, val, train_val):
    add_token_columns(frame)

In [None]:
import numpy as np

class CustomDataset(Dataset):
    """Map-style dataset pairing each token-id sequence with its label.

    Both containers are indexed positionally with the same integer index and
    are converted to tensors only when an item is requested.
    """

    def __init__(self, tokens, target):
        # tokens[i] is a sequence of ids, target[i] the label of sample i.
        self.tokens, self.target = tokens, target

    def __len__(self):
        """Number of samples, taken from the token container."""
        return len(self.tokens)

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` for sample ``index`` as tensors."""
        sequence = torch.tensor(self.tokens[index])
        label = torch.tensor(self.target[index])
        return sequence, label
    
def collate_fn(batch, max_len=400):
    """Collate ``(token_ids, label)`` pairs into fixed-width batch tensors.

    Every sequence is cut to its first ``max_len`` ids and right-padded with 0
    up to exactly ``max_len``. Without the cut, a sequence longer than
    ``max_len`` keeps its length and the batch cannot be assembled.

    Args:
        batch: List of ``(token_ids, label)`` pairs, where ``token_ids`` is a
            1-D tensor and ``label`` a scalar tensor (as produced by
            ``CustomDataset.__getitem__``).
        max_len: Width of the padded token matrix. Defaults to 400.

    Returns:
        Tuple ``(tokens, labels)`` on the module-level ``device``:
        ``tokens`` is an int64 tensor of shape ``(len(batch), max_len)`` and
        ``labels`` has shape ``(len(batch),)``.
    """
    sequences = [tokens for tokens, _ in batch]
    labels = [torch.as_tensor(label) for _, label in batch]

    padded = []
    for seq in sequences:
        # .long() keeps the dtype uniform even for an empty sequence, which
        # torch.tensor([]) would otherwise create as float32.
        seq = seq[:max_len].long()
        padded.append(F.pad(seq, (0, max_len - len(seq)), "constant", 0))

    return torch.stack(padded).to(device), torch.stack(labels).to(device)

In [None]:
# Seed torch's global RNG before anything that draws from it.
torch.manual_seed(42)

BATCH_SIZE_TRAIN, BATCH_SIZE_TEST = 256, 16

# Training split: large, shuffled batches.
train_dataset = CustomDataset(train['tokens_id'].values, train['is_bad'].values)
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE_TRAIN,
    shuffle=True,
    collate_fn=collate_fn,
)

# Validation split (the hold-out carved from the training frame): fixed order.
val_dataset = CustomDataset(train_val['tokens_id'].values, train_val['is_bad'].values)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE_TEST,
    shuffle=False,
    collate_fn=collate_fn,
)

# Test split (the separately loaded `val` frame): fixed order.
test_dataset = CustomDataset(val['tokens_id'].values, val['is_bad'].values)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE_TEST,
    shuffle=False,
    collate_fn=collate_fn,
)

In [None]:
class CNNModel(nn.Module):
    """1-D convolutional binary classifier over token-id sequences.

    Pipeline: embedding -> stack of [Conv1d -> ReLU -> max-pool] -> linear ->
    sigmoid. Every conv layer except the last is followed by a max-pool of
    width 2; the last one is pooled over its whole remaining length, leaving
    one feature vector per sample.

    Args:
        embedding_dim: Size of each token embedding.
        vocab_size: Number of vocabulary entries. Two extra embedding rows are
            allocated on top of it (ids are shifted by 2 upstream).
        filter_sizes: Kernel size of each conv layer.
        num_filters: Output channels of each conv layer; must have the same
            length as ``filter_sizes``.

    Raises:
        ValueError: If ``filter_sizes`` and ``num_filters`` are empty or differ
            in length.
    """

    def __init__(self, embedding_dim, vocab_size, filter_sizes=[3, 3], num_filters=[16, 32]):
        super(CNNModel, self).__init__()

        # Copy so the (shared) default lists are never aliased or mutated.
        filter_sizes = list(filter_sizes)
        num_filters = list(num_filters)
        if not filter_sizes or len(filter_sizes) != len(num_filters):
            raise ValueError(
                "filter_sizes and num_filters must be non-empty and of equal length, "
                f"got {len(filter_sizes)} and {len(num_filters)}"
            )

        self.embedding_dim = embedding_dim
        self.word_embeddings = nn.Embedding(vocab_size + 2, self.embedding_dim)
        # Kept as an attribute for compatibility; forward() does not apply it.
        self.dropout = nn.Dropout(0.2)

        # Layer i consumes the channels produced by layer i-1; the first layer
        # consumes the embedding dimension.
        in_channels = [self.embedding_dim] + num_filters[:-1]
        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(in_channels=in_channels[i],
                      out_channels=num_filters[i],
                      kernel_size=filter_sizes[i])
            for i in range(len(filter_sizes))
        ])

        self.fc = nn.Linear(num_filters[-1], 1)

    def forward(self, x):
        """Return probabilities of shape ``(batch, 1)`` for id matrix ``x``.

        Args:
            x: Tensor of token ids, shape ``(batch, seq_len)``. ``seq_len``
                must be long enough to survive every conv/pool stage.
        """
        out = self.word_embeddings(x.long())  # (batch, seq_len, embedding_dim)
        out = out.permute(0, 2, 1)            # (batch, embedding_dim, seq_len)

        last = len(self.conv1d_list) - 1
        for i, conv1d in enumerate(self.conv1d_list):
            out = F.relu(conv1d(out))
            # Halve the length after inner layers; pool over everything that
            # is left after the final layer.
            ks = 2 if i != last else out.shape[2]
            out = F.max_pool1d(out, kernel_size=ks)

        # Drop only the length axis (now of size 1). A bare squeeze() would
        # also remove the batch axis for a single-sample batch.
        out = self.fc(out.squeeze(dim=2))
        return torch.sigmoid(out)

In [None]:
import torch.optim as optim
from tqdm import tqdm_notebook as tqdm

model = CNNModel(embedding_dim=8, vocab_size=len(vocab),
                 filter_sizes=[6, 3, 1], num_filters=[16, 32, 64]).to(device)

# The model already ends in a sigmoid, so the matching loss is binary
# cross-entropy on probabilities with float targets. CrossEntropyLoss expects
# unnormalised multi-class logits and is not meaningful for this output.
loss_fn = nn.BCELoss()
# NOTE(review): the recorded run collapsed to AUC ~0.5 in the last epochs;
# lr=0.01 may be too high for Adam here — consider lowering it.
optimizer = optim.Adam(model.parameters(), lr=0.01)
NUM_EPOCHS = 10


def evaluate(model, loader, loss_fn):
    """Score ``model`` on every batch of ``loader`` without updating it.

    Args:
        model: Module returning one probability per sample.
        loader: Iterable of ``(X, y)`` batches.
        loss_fn: Loss taking ``(probabilities, float targets)``.

    Returns:
        Tuple ``(mean_loss, auc)``: the mean of the per-batch losses and the
        ROC AUC computed over all samples seen.
    """
    model.eval()
    losses, targets, preds = [], [], []
    with torch.no_grad():
        for X, y in loader:
            # view(-1) flattens to (batch,) without collapsing a 1-sample batch.
            probs = model(X).view(-1)
            losses.append(loss_fn(probs, y.float()).item())
            preds.append(probs.cpu())
            targets.append(y.cpu())
    y_true = torch.cat(targets).numpy()
    y_score = torch.cat(preds).numpy()
    return sum(losses) / len(losses), roc_auc_score(y_true, y_score)


for epoch in range(NUM_EPOCHS):
    model.train()
    train_losses, train_targets, train_preds = [], [], []
    for X_batch, y_batch in tqdm(train_loader):
        optimizer.zero_grad()
        ypred_batch = model(X_batch).view(-1)

        loss = loss_fn(ypred_batch, y_batch.float())
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())
        # Detach before storing so the autograd graph of every batch is not
        # kept alive for the whole epoch.
        train_preds.append(ypred_batch.detach().cpu())
        train_targets.append(y_batch.cpu())

    y = torch.cat(train_targets).numpy()
    pred = torch.cat(train_preds).numpy()

    print(f'EPOCH {epoch}')
    print('Train loss:', sum(train_losses) / len(train_losses))
    print('Train AUC:', roc_auc_score(y, pred))

    val_loss, val_auc = evaluate(model, val_loader, loss_fn)
    print('Val loss:', val_loss)
    print('Val AUC:', val_auc)


test_loss, test_auc = evaluate(model, test_loader, loss_fn)
print('\nFINAL TEST loss:', test_loss)
print('FINAL TEST AUC:', test_auc)

  0%|          | 0/2461 [00:00<?, ?it/s]

EPOCH 0
Train AUC: 0.9140718971000187
Val AUC: 0.9142169363978395


  0%|          | 0/2461 [00:00<?, ?it/s]

EPOCH 1
Train AUC: 0.8999235106055568
Val AUC: 0.8625245891088591


  0%|          | 0/2461 [00:00<?, ?it/s]

EPOCH 2
Train AUC: 0.8710616208161093
Val AUC: 0.8723817517745962


  0%|          | 0/2461 [00:00<?, ?it/s]

EPOCH 3
Train AUC: 0.8831372465783022
Val AUC: 0.8876692948871633


  0%|          | 0/2461 [00:00<?, ?it/s]

EPOCH 4
Train AUC: 0.8928905459165404
Val AUC: 0.8978401958291984


  0%|          | 0/2461 [00:00<?, ?it/s]

EPOCH 5
Train AUC: 0.855799029526848
Val AUC: 0.8463260772754436


  0%|          | 0/2461 [00:00<?, ?it/s]

EPOCH 6
Train AUC: 0.8817532549746315
Val AUC: 0.8950247747726022


  0%|          | 0/2461 [00:00<?, ?it/s]

EPOCH 7
Train AUC: 0.8900008371122378
Val AUC: 0.8847821783742791


  0%|          | 0/2461 [00:00<?, ?it/s]

EPOCH 8
Train AUC: 0.7540020042751647
Val AUC: 0.500962927440942


  0%|          | 0/2461 [00:00<?, ?it/s]

EPOCH 9
Train AUC: 0.5012404274864176
Val AUC: 0.5010426110935629


RuntimeError: ignored