In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
!pip install transformers==3

Collecting transformers==3
  Downloading transformers-3.0.0-py3-none-any.whl (754 kB)
[K     |████████████████████████████████| 754 kB 2.9 MB/s eta 0:00:01
Collecting tokenizers==0.8.0-rc4
  Downloading tokenizers-0.8.0rc4-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 16.4 MB/s eta 0:00:01
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.9.4
    Uninstalling tokenizers-0.9.4:
      Successfully uninstalled tokenizers-0.9.4
  Attempting uninstall: transformers
    Found existing installation: transformers 4.0.1
    Uninstalling transformers-4.0.1:
      Successfully uninstalled transformers-4.0.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
allennlp 1.3.0 requires transformers<4.1,>=4.0, but you have transformers 3.0.0 wh

In [3]:
import os

# Walk the Kaggle input mount and echo the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [4]:
# Directory holding the competition CSVs (train / test / sample_submission),
# written as a relative path; it is joined with the file names when loading.
DIR = '../input/nlp-getting-started'

In [5]:
import pandas as pd

# Load the three competition files from DIR in one pass:
# labelled training data, unlabelled test data and the submission template.
train, test, sb = (
    pd.read_csv(os.path.join(DIR, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [6]:
# Print the (rows, columns) shape of the training frame, then leave the first
# three rows as the cell's last expression so the notebook renders them.
print(train.shape)
train.head(3)

(7613, 5)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1


In [7]:
# Number of missing values in each column of the training frame.
train.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [8]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import BertTokenizer, BertModel
import numpy as np
from sklearn import metrics
from tqdm import tqdm



In [70]:
# --- Reproducibility ---------------------------------------------------------
# Seed the global PyTorch and NumPy generators once, up front, so stochastic
# steps (e.g. dropout) repeat after a kernel restart. Some GPU kernels may
# still be non-deterministic, so treat this as best effort.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# --- Run configuration -------------------------------------------------------
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Token length every encoded tweet is padded / truncated to.
MAX_LEN = 200
# Mini-batch sizes for the training and validation loaders.
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
# Number of passes over the training split.
EPOCH = 1
# Step size handed to the Adam optimizer.
LEARNING_RATE = 1e-05
# Shared tokenizer matching the "bert-base-uncased" checkpoint used by the model.
TOKENIZER = BertTokenizer.from_pretrained("bert-base-uncased")

In [71]:
class DisasterTweetDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per item.

    Args:
        text: Indexable collection of raw tweet texts.
        target: Indexable collection of labels, aligned with ``text``.
        tokenizer: Object exposing ``encode_plus``; defaults to the
            module-level ``TOKENIZER`` when omitted.
        max_len: Fixed token length of every encoded item; defaults to the
            module-level ``MAX_LEN`` when omitted.

    Each item is a dict with the keys ``"ids"``, ``"mask"`` and
    ``"token_type_ids"`` (``torch.long`` tensors) and ``"targets"``
    (``torch.float`` tensor).
    """

    def __init__(self, text, target, tokenizer=None, max_len=None):
        self.text = text
        self.target = target
        # Fall back to the notebook-wide configuration only when the caller
        # did not supply an explicit tokenizer / length.
        self.tokenizer = TOKENIZER if tokenizer is None else tokenizer
        self.max_len = MAX_LEN if max_len is None else max_len

    def __len__(self):
        """Number of tweets in the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Tokenize the tweet at ``index`` and return it with its label."""
        # Collapse every run of whitespace (including newlines) to one space.
        text = " ".join(str(self.text[index]).split())

        # `pad_to_max_length` is deprecated; request padding AND truncation
        # explicitly so every item has exactly `max_len` tokens instead of
        # relying on the tokenizer's implicit defaults.
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )

        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            "targets": torch.tensor(self.target[index], dtype=torch.float),
        }

In [72]:
class BERTClass(nn.Module):
    """Pretrained BERT encoder followed by dropout and a one-logit linear head.

    Args:
        model_name: Checkpoint name passed to ``BertModel.from_pretrained``.
        dropout: Dropout probability applied to the pooled output.

    ``forward`` returns raw logits (no sigmoid) with one column per example.
    """

    def __init__(self, model_name="bert-base-uncased", dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.bert_drop = nn.Dropout(dropout)
        # Size the head from the loaded checkpoint instead of hard-coding 768
        # (768 is the hidden size of "bert-base-uncased").
        self.out = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, ids, mask, token_type_ids):
        """Encode the batch and return one logit per example."""
        encoder_outputs = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
        )
        # Take the pooled output by position rather than tuple-unpacking:
        # indexing works for a plain tuple return and also for a dict-like
        # model output, where unpacking would yield the key names instead.
        pooled = encoder_outputs[1]
        return self.out(self.bert_drop(pooled))

In [73]:
def loss_fn(outputs, targets):
    """Binary cross-entropy computed on raw logits.

    ``targets`` is reshaped to a single column before being compared with
    ``outputs``; the loss module's default reduction is used.
    """
    criterion = nn.BCEWithLogitsLoss()
    column_targets = targets.view(-1, 1)
    return criterion(outputs, column_targets)

In [74]:
def train_fn(epoch, data_loader, model, optimizer, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        epoch: Epoch number, used only in the progress message.
        data_loader: Iterable of batches; each batch is a dict with the keys
            ``"ids"``, ``"mask"``, ``"token_type_ids"`` and ``"targets"``.
        model: Module called with ``ids``, ``mask`` and ``token_type_ids``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
    """
    model.train()
    for step, batch in enumerate(data_loader):
        # Integer model inputs, moved to the target device under their call names.
        long_inputs = {
            key: batch[key].to(device, dtype=torch.long)
            for key in ("ids", "mask", "token_type_ids")
        }
        targets = batch["targets"].to(device, dtype=torch.float)

        optimizer.zero_grad()
        outputs = model(**long_inputs)
        loss = loss_fn(outputs, targets)

        # Report the loss of every 500th batch (including the very first one).
        if step % 500 == 0:
            print(f"Epoch: {epoch}, Train loss: {loss.item()}")

        loss.backward()
        optimizer.step()

In [75]:
def eval_fn(data_loader, model, device):
    """Score every batch of ``data_loader`` without tracking gradients.

    Args:
        data_loader: Iterable of batches; each batch is a dict with the keys
            ``"ids"``, ``"mask"``, ``"token_type_ids"`` and ``"targets"``.
        model: Module called with ``ids``, ``mask`` and ``token_type_ids``;
            its outputs are treated as logits.
        device: Device the batch tensors are moved to.

    Returns:
        ``(predictions, labels)`` as plain Python lists: the sigmoid of the
        model outputs and the targets, concatenated over all batches.
    """
    model.eval()
    predictions, labels = [], []
    with torch.no_grad():
        for batch in data_loader:
            long_inputs = {
                key: batch[key].to(device, dtype=torch.long)
                for key in ("ids", "mask", "token_type_ids")
            }
            targets = batch["targets"].to(device, dtype=torch.float)

            logits = model(**long_inputs)

            # Bring results back to the host as nested Python lists.
            labels.extend(targets.detach().cpu().tolist())
            predictions.extend(torch.sigmoid(logits).detach().cpu().tolist())

    return predictions, labels

In [69]:
from sklearn import model_selection

def _release_gpu_memory():
    """Best-effort cleanup: collect unreachable objects, then free cached CUDA blocks."""
    import gc  # local import: only this cleanup helper needs it

    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def run():
    """Fine-tune the BERT classifier and validate it after every epoch.

    Splits the global ``train`` frame 70/30 (stratified on ``target``),
    builds the datasets and loaders, trains for ``EPOCH`` epochs and prints
    the validation accuracy of each epoch.

    Returns:
        The best validation accuracy seen across epochs (0 if no epoch ran).
    """
    dfx = train
    df_train, df_valid = model_selection.train_test_split(
        dfx,
        test_size=0.3,
        random_state=42,
        stratify=dfx.target.values,
    )

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_set = DisasterTweetDataset(
        text=df_train.text.values,
        target=df_train.target.values,
    )
    valid_set = DisasterTweetDataset(
        text=df_valid.text.values,
        target=df_valid.target.values,
    )

    train_loader = torch.utils.data.DataLoader(
        train_set,
        batch_size=TRAIN_BATCH_SIZE,
        shuffle=True,  # reshuffle each epoch instead of replaying one fixed order
        num_workers=1,
    )
    valid_loader = torch.utils.data.DataLoader(
        valid_set,
        batch_size=VALID_BATCH_SIZE,
        num_workers=1,
    )

    # Re-running this cell in a long-lived kernel can leave tensors from earlier
    # attempts behind; reclaim what is unreachable before allocating a new model.
    _release_gpu_memory()

    device = DEVICE
    model = BERTClass()
    model.to(device)

    optimizer = torch.optim.Adam(
        params=model.parameters(),
        lr=LEARNING_RATE,
    )

    best_accuracy = 0
    try:
        for epoch in range(EPOCH):
            train_fn(
                epoch,
                train_loader,
                model,
                optimizer,
                device,
            )
            outputs, targets = eval_fn(
                valid_loader,
                model,
                device,
            )
            # eval_fn yields one single-element row per example; flatten so the
            # predictions are 1-D like the targets before thresholding at 0.5.
            predictions = np.array(outputs).ravel() >= 0.5
            accuracy = metrics.accuracy_score(targets, predictions)
            best_accuracy = max(best_accuracy, accuracy)
            print(f"Accuracy={accuracy:.4f}")
    finally:
        # Drop this run's references and hand cached blocks back, whether the
        # loop finished or raised. Frames kept alive by a stored traceback may
        # still pin some tensors, so this is best effort.
        del model, optimizer
        _release_gpu_memory()

    return best_accuracy


if __name__ == "__main__":
    run()

RuntimeError: CUDA out of memory. Tried to allocate 90.00 MiB (GPU 0; 15.90 GiB total capacity; 14.91 GiB already allocated; 3.75 MiB free; 15.03 GiB reserved in total by PyTorch)