In [1]:
# all the import
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import numpy as np
from collections import defaultdict

%matplotlib inline

In [2]:
# all the constant
train_dir = "data/preprocessed/small/train.csv"
eval_dir = "data/preprocessed/small/dev.csv"
test_dir = "data/preprocessed/small/test.csv"

model_path = 'bert-base-uncased'
batch_size = 8
num_classes = 5
epoch = 10
learning_rate = 1e-5

random_seed = 42
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = 'cpu'
device

'cpu'

In [4]:
df_train = pd.read_csv(train_dir)
df_train['sentiment'] = df_train['sentiment'].map(lambda x: x-1)

In [5]:
df_eval = pd.read_csv(eval_dir)
df_eval['sentiment'] = df_eval['sentiment'].map(lambda x: x-1)
df_eval.head()

Unnamed: 0,sentiment,text
0,0,driver less cars could be potentially lethal n...
1,0,driverless cars. dumbest f..ing idea i have ev...
2,0,what is this i am hearing about driverless car...
3,0,no way would i put my trust in a driverless ca...
4,0,yes i just meant in the sense that people on i...


In [6]:
tokenizer = BertTokenizer.from_pretrained(model_path)

In [7]:
max_length = 64

In [8]:
class TwitterDataset(Dataset):
    def __init__(self, texts, sentiments, tokenizer, max_len):
        self.texts = texts
        self.sentiments = sentiments
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, item):
        text = str(self.texts[item])
        sentiment = int(self.sentiments[item])
        encoding = self.tokenizer.encode_plus(
            text, 
            add_special_tokens=True, 
            max_length=self.max_len, 
            return_token_type_ids=False, 
            pad_to_max_length=True, 
            return_attention_mask=True, 
            return_tensors='pt',
        )
        return {
            'text': text, 
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'sentiment': torch.tensor(sentiment, dtype=torch.long)
        }

In [9]:
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = TwitterDataset(
        texts=df.text.to_numpy(),
        sentiments=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )

    return DataLoader(
        ds, 
        batch_size=batch_size,
        num_workers=0,
        shuffle=True,
    )

In [10]:
train_data_loader = create_data_loader(df_train, tokenizer, max_length, batch_size)
eval_data_loader = create_data_loader(df_eval, tokenizer, max_length, batch_size)

In [11]:
class SentimentClassifier(nn.Module):
    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(model_path)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        output = self.drop(pooled_output)
        return self.out(output)

In [12]:
model = SentimentClassifier(num_classes)
model = model.to(device)


In [13]:
optimizer = AdamW(model.parameters(), lr=learning_rate, correct_bias=False)
total_steps = len(train_data_loader) * epoch

scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0, 
    num_training_steps=total_steps,
)

loss_fn = nn.CrossEntropyLoss().to(device)

In [14]:
def train_epoch(
    model, 
    data_loader,
    loss_fn, 
    optimizer, 
    device, 
    scheduler, 
    n_examples
):
    model = model.train()

    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['sentiment'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)

In [15]:
def eval_epoch(
    model, 
    data_loader,
    loss_fn, 
    device, 
    n_examples
):
    model = model.eval()

    losses = []
    correct_predictions = 0

    preds_list = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            targets = d['sentiment'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            _, preds = torch.max(outputs, dim=1)
            preds_list.append(preds)
            loss = loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    print(preds_list)

    return correct_predictions.double() / n_examples, np.mean(losses)

In [16]:
%%time

history = defaultdict(list)
best_accuracy = 0

for e in range(epoch):
  print(f'Epoch {e + 1}/{epoch}')
  print('-' * 10)

  train_acc, train_loss = train_epoch(
    model,
    train_data_loader,    
    loss_fn, 
    optimizer, 
    device, 
    scheduler, 
    len(df_train)
  )
  print(f'Train loss {train_loss} accuracy {train_acc}')

  eval_acc, eval_loss = eval_epoch(
    model,
    eval_data_loader,
    loss_fn, 
    device, 
    len(df_eval)
  )
  print(f'Eval loss {eval_loss} accuracy {eval_acc}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['eval_acc'].append(eval_acc)
  history['eval_loss'].append(eval_loss)

  if eval_acc > best_accuracy:
    torch.save(model.state_dict(), 'output/small_lr1e-5/best_model_state.bin')
    best_accuracy = eval_acc



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1/10
----------




Train loss 1.1197627259760488 accuracy 0.6053639846743295
[tensor([2, 2, 3, 2, 2, 3, 2, 2]), tensor([2, 2, 2, 2, 2, 3, 2, 2]), tensor([2, 2, 2, 2, 2, 3, 2, 2]), tensor([2, 3, 3, 2, 2, 2, 2, 2]), tensor([2, 2, 2, 2, 3, 3, 2, 2]), tensor([2, 3, 2, 2, 2, 3, 2, 2]), tensor([2, 2, 2, 2, 2, 2, 2, 2]), tensor([2, 2, 2, 2, 2, 2, 2, 2]), tensor([2, 3, 3, 2, 2, 3, 2, 2]), tensor([2, 2, 2, 2, 2, 2, 2, 2]), tensor([2, 3, 2, 2, 2, 3, 2, 3]), tensor([2, 2, 3, 2, 3, 2, 2, 2]), tensor([2, 2, 2, 2, 2, 2, 2, 2]), tensor([2, 2, 2, 2, 2, 2, 2, 2]), tensor([2, 2, 3, 2, 2, 2, 2, 2]), tensor([2, 2, 2, 2, 2, 2, 2, 2]), tensor([2, 2, 2, 2, 2, 2, 2, 2]), tensor([3, 2, 2, 2, 2, 2, 2, 3]), tensor([2, 2, 2, 2, 2, 2, 2, 2]), tensor([2, 2, 2, 2, 2, 2, 2, 2]), tensor([2, 2, 2, 2, 2, 2, 2, 2]), tensor([2, 2, 2, 2, 2, 2, 2, 2]), tensor([3, 3, 2, 2, 2, 3, 2, 2]), tensor([2, 2, 2, 2, 2, 2, 2, 2]), tensor([3, 2, 2, 2, 3, 2])]
Eval loss 1.0007480335235597 accuracy 0.6212121212121212

Epoch 2/10
----------
Train loss 0.8096

In [17]:
# test
test_dir = "data/preprocessed/small/test.csv"

In [18]:
df_test = pd.read_csv(test_dir)
df_test.head()

Unnamed: 0,id,text
0,1,two places i'd invest all my money if i could ...
1,2,awesome! google driverless cars will help the ...
2,3,autonomous vehicles could reduce traffic fatal...
3,4,really good presentation from jan becker on bo...
4,5,ford just revealed it's automated ford fusion ...


In [19]:
class TwitterDataset_test(Dataset):
    def __init__(self, texts, tokenizer, max_len):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, item):
        text = str(self.texts[item])
        encoding = self.tokenizer.encode_plus(
            text, 
            add_special_tokens=True, 
            max_length=self.max_len, 
            return_token_type_ids=False, 
            pad_to_max_length=True, 
            return_attention_mask=True, 
            return_tensors='pt',
        )
        return {
            'text': text, 
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

In [20]:
def create_data_loader_test(df, tokenizer, max_len, batch_size):
    ds = TwitterDataset_test(
        texts=df.text.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )

    return DataLoader(
        ds, 
        batch_size=batch_size,
        num_workers=0,
        shuffle=False,
    )

In [21]:
test_data_loader = create_data_loader_test(df_test, tokenizer, max_length, batch_size)

In [22]:
data = next(iter(test_data_loader))
data.keys()



dict_keys(['text', 'input_ids', 'attention_mask'])

In [23]:
def get_predictions(model, data_loader):
    model = model.eval()
  
    predictions = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds)

    predictions = torch.stack(predictions).cpu()
    return predictions

In [24]:
test_preds = get_predictions(model, test_data_loader)
test_preds

tensor([3, 4, 3, 3, 4, 3, 4, 3, 3, 2, 3, 3, 3, 3, 3, 3, 2, 3, 4, 3, 3, 3, 3, 3,
        3, 3, 2, 3, 4, 4, 3, 3, 4, 3, 3, 3, 2, 2, 2, 3, 4, 4, 4, 3, 3, 3, 2, 2,
        4, 3, 4, 4, 3, 4, 3, 3, 3, 3, 1, 2, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 2, 3,
        3, 3, 2, 2, 3, 3, 3, 3, 3, 3, 4, 2, 2, 3, 2, 1, 2, 3, 3, 3, 3, 4, 3, 4,
        3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 1, 4, 3, 3, 3, 3, 3, 3, 3, 4, 3,
        2, 4, 3, 1, 3, 3, 3, 3, 3, 4, 3, 3, 4, 3, 2, 3, 3, 3, 2, 2, 3, 2, 3, 4,
        4, 3, 1, 2, 3, 2, 1, 4, 4, 3, 2, 2, 2, 1, 1, 1, 2, 3, 3, 2, 3, 4, 3, 2,
        2, 2, 2, 3, 3, 1, 2, 3, 2, 2, 1, 3, 3, 2, 3, 2, 2, 3, 4, 2, 3, 3, 3, 2,
        2, 2, 3, 2, 4, 3, 4, 1, 3, 3, 3, 4, 2, 3, 2, 2, 3, 2, 1, 2, 4, 3, 3, 3,
        2, 3, 2, 3, 3, 3, 3, 4, 4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 3, 2, 2, 2, 3, 2, 2, 3, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2,

In [25]:
array = np.array(test_preds)
df_preds = pd.DataFrame(array)

import csv
with open("output/small_lr1e-5/submit.csv", 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id','sentiment'])
    index = 1
    for d in df_preds[0]:
        writer.writerow((index,d+1))
        index += 1