In [None]:
from pytorch_pretrained_bert.tokenization import BertTokenizer, WordpieceTokenizer
from pytorch_pretrained_bert.modeling import BertForPreTraining, BertPreTrainedModel, BertModel, BertConfig, BertForMaskedLM, BertForSequenceClassification
from pathlib import Path
import torch
import copy
import re
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
import time
import pandas as pd
import collections
import os
import pdb
from tqdm import tqdm, trange
import sys
import random
import numpy as np
import csv
import logging
logging.basicConfig(level=logging.INFO)

In [None]:
# Read the three CoLA splits. train/dev are read without a header row, so their
# columns are numbered (later cells use 1 as the label and 3 as the sentence);
# test.tsv is read with its own header row.
train = pd.read_table(Path('glue_data/CoLA') / 'train.tsv', header=None)
dev = pd.read_table(Path('glue_data/CoLA') / 'dev.tsv', header=None)
test = pd.read_table(Path('glue_data/CoLA') / 'test.tsv')

In [None]:
# Preview the test split (read with its own header row).
test.head()

In [None]:
# Preview the training split (no header row, so the columns are numbered).
train.head()

In [None]:
dev.head()

In [None]:
# Column 1 of the header-less train frame is used as the integer label.
train_labels = train[1].values.astype(int)

In [None]:
# Same label column for the dev split.
dev_labels = dev[1].values.astype(int)

In [None]:
len(train_labels)

In [None]:
# Sum of the labels (the number of positive examples if labels are 0/1 — TODO confirm).
np.sum(train_labels)

In [None]:
len(dev_labels)

In [None]:
# Load the tokenizer for the 'bert-large-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

In [None]:
# Tokenize one training sentence (column 3) as a sanity check.
tokenized_text = tokenizer.tokenize(train[3].values[1])

In [None]:
train[3].values[1]

In [None]:
tokenized_text

In [None]:
tokenizer.convert_tokens_to_ids(tokenized_text)

In [None]:
# Fixed length that tokenize_text pads every id sequence to.
# NOTE(review): 45 looks hand-picked — confirm no sentence tokenizes to more than this.
max_seq_len = 45

In [None]:
def tokenize_text(text, max_len=None):
    """Convert a sentence into a fixed-length array of BERT vocabulary ids.

    The sentence is WordPiece-tokenized, wrapped in the ``[CLS]`` / ``[SEP]``
    markers BERT expects around a single sequence, truncated so that the
    result never exceeds ``max_len`` and right-padded with 0.

    Args:
        text: raw sentence string.
        max_len: length of the returned array. Defaults to the notebook-level
            ``max_seq_len``.

    Returns:
        1-D ``np.ndarray`` of exactly ``max_len`` integer token ids.

    Raises:
        ValueError: if ``max_len`` is too small to hold the two special tokens.
    """
    if max_len is None:
        max_len = max_seq_len
    if max_len < 2:
        raise ValueError('max_len must be at least 2 to hold [CLS] and [SEP]')

    # Reserve two slots for the special tokens and cut over-long sentences;
    # without truncation a long sentence yields a longer row and the stacked
    # arrays become ragged.
    word_pieces = tokenizer.tokenize(text)[:max_len - 2]
    # The classifier consumes BERT's pooled output, which is read from the
    # first position, so that position must hold [CLS].
    word_pieces = ['[CLS]'] + word_pieces + ['[SEP]']
    indexed_tokens = tokenizer.convert_tokens_to_ids(word_pieces)

    # Right-pad with id 0 up to the fixed length.
    indexed_tokens += [0] * (max_len - len(indexed_tokens))
    return np.array(indexed_tokens)

In [None]:
# Convert every sentence of each split into a padded row of token ids.
# train/dev sentences are in column 3; the test frame has a named 'sentence' column.
tok_train = np.array([tokenize_text(t) for t in train[3].values])
tok_test = np.array([tokenize_text(t) for t in test['sentence'].values])
tok_dev = np.array([tokenize_text(t) for t in dev[3].values])

In [None]:
tok_test[0]

In [None]:
len(tok_train)

In [None]:
len(tok_dev)

In [None]:
class BertLayerNorm(nn.Module):
    """Layer normalisation over the last dimension with learnable scale and shift.

    Follows the TF convention: epsilon is added to the variance before the
    square root is taken.
    """

    def __init__(self, hidden_size, eps=1e-12):
        """Create the scale (ones) and shift (zeros) parameters of size ``hidden_size``."""
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps

    def forward(self, x):
        """Normalise ``x`` along its last dimension, then apply weight and bias."""
        mean = x.mean(-1, keepdim=True)
        centered = x - mean
        # Biased variance: mean of squared deviations over the last dimension.
        variance = centered.pow(2).mean(-1, keepdim=True)
        normalized = centered / torch.sqrt(variance + self.variance_epsilon)
        return self.weight * normalized + self.bias
        

class BertForSequenceClassification(nn.Module):
    """Pretrained BERT-large encoder with a small feed-forward classification head.

    The encoder's pooled output goes through dropout, a 100-unit linear layer,
    batch normalisation and ReLU, then a final ``num_labels``-way linear layer.

    NOTE: this definition shadows the class of the same name imported from
    ``pytorch_pretrained_bert.modeling`` in the import cell.
    """

    def __init__(self, num_labels=2):
        """Load the 'bert-large-uncased' encoder and build the head.

        Args:
            num_labels: number of output classes.
        """
        super(BertForSequenceClassification, self).__init__()
        self.num_labels = num_labels
        # Device placement is left to the caller (``model.to(device)``), so the
        # class no longer depends on a notebook-level ``device`` variable.
        self.bert = BertModel.from_pretrained('bert-large-uncased')
        # Take sizes from the config of the checkpoint that was actually loaded
        # instead of a separately hand-built global config that could disagree.
        bert_config = self.bert.config
        self.dropout = nn.Dropout(bert_config.hidden_dropout_prob)
        self.ff = nn.Linear(bert_config.hidden_size, 100)
        self.bn = nn.BatchNorm1d(num_features=100)
        self.classifier = nn.Linear(100, num_labels)
        nn.init.xavier_normal_(self.classifier.weight)
        nn.init.xavier_normal_(self.ff.weight)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        """Return the unnormalised class logits for a batch of token-id rows.

        Args:
            input_ids: integer tensor of token ids, padded with 0.
            token_type_ids: optional segment ids, passed through to the encoder.
            attention_mask: optional mask (1 = real token, 0 = padding). When
                omitted it is derived from ``input_ids`` so that padding
                positions are not attended to.
            labels: accepted for signature compatibility; not used.

        Returns:
            Tensor of logits with ``num_labels`` entries per input row.
        """
        if attention_mask is None:
            # The notebook pads with id 0; without a mask the encoder would
            # attend to the padding as if it were real input.
            attention_mask = (input_ids != 0).long()
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)
        hidden = F.relu(self.bn(self.ff(pooled_output)))
        logits = self.classifier(hidden)

        return logits

    def freeze_bert_encoder(self):
        """Stop gradient updates for every encoder parameter."""
        for param in self.bert.parameters():
            param.requires_grad = False

    def unfreeze_bert_encoder(self):
        """Re-enable gradient updates for every encoder parameter."""
        for param in self.bert.parameters():
            param.requires_grad = True



In [None]:
# NOTE(review): this re-creates the tokenizer that an earlier cell already built.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

In [None]:
# Scratch sentence for trying out the tokenizer.
text ='what is a pug'

In [None]:
# NOTE(review): `zz` is not referenced again in the cells visible here.
zz = tokenizer.tokenize(text)

In [None]:
# Use the first GPU when CUDA is available; otherwise fall back to the CPU so
# that the `.to(device)` calls below still work on a machine without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
# NOTE(review): BertConfig is already imported from pytorch_pretrained_bert.modeling
# in the import cell; this second import is redundant.
from pytorch_pretrained_bert import BertConfig

# Hand-built config describing a 24-layer, 1024-hidden, 16-head encoder.
# NOTE(review): the pretrained weights are loaded separately via from_pretrained
# inside the classifier; confirm these values (vocab size 32000 in particular)
# agree with that checkpoint.
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=1024,
        num_hidden_layers=24, num_attention_heads=16, intermediate_size=4096)

num_labels = 2
# Build the classifier and move all of its parameters to `device`.
model = BertForSequenceClassification(num_labels=num_labels)
model.to(device)

In [None]:
from torch.utils import data

class Dataset(data.Dataset):
    """Map-style dataset that pairs token-id vectors with their labels."""

    def __init__(self, vecs, labels):
        """Keep references to the feature vectors and their matching labels."""
        self.labels = labels
        self.vecs = vecs

    def __len__(self):
        """Number of samples, taken from ``vecs``."""
        return len(self.vecs)

    def __getitem__(self, index):
        """Return ``(X, y)`` for ``index``, each cast to int64."""
        features = self.vecs[index]
        target = self.labels[index]
        return features.astype(np.int64), target.astype(np.int64)

In [None]:
# Preview of the train and dev label vectors concatenated into one.
np.hstack([train_labels,dev_labels])

In [None]:
tok_train.shape

In [None]:
# Shape of the token matrix after stacking train on top of dev.
np.vstack([tok_train,tok_dev]).shape

In [None]:
# Train on the training split only. Stacking the dev rows into the training
# set while also validating on them leaks the validation data into training,
# which makes the 'val' loss used for checkpoint selection meaningless.
train_dataset = Dataset(tok_train, train_labels)
dev_dataset = Dataset(tok_dev,dev_labels)

In [None]:
# DataLoader settings shared by both loaders (so shuffle=True also applies to dev).
params = {'batch_size': 16,
          'shuffle': True}
# NOTE(review): max_epochs is not passed to the train_model call later in the
# notebook, which uses num_epochs=100.
max_epochs = 50

In [None]:
train_gen = data.DataLoader(train_dataset, **params)
dev_gen = data.DataLoader(dev_dataset, **params)

In [None]:
# train_model looks the loaders up in this dict by phase name.
dataloaders_dict = {
    'train': train_gen,
    'val': dev_gen
}

In [None]:
# Slice-index the dataset to pull ten samples at once for inspection.
X_sample, y_sample = train_dataset[0:10]

In [None]:
X_sample = torch.tensor(X_sample).to(device)
y_sample = torch.tensor(y_sample).to(device)

In [None]:
y_sample

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import matthews_corrcoef

In [None]:
def log_metrics(y_pred, y_true):
    """Print accuracy and Matthews correlation of ``y_pred`` against ``y_true``."""
    accuracy = accuracy_score(y_true, y_pred)
    print('Accuracy:', accuracy)
    mcc = matthews_corrcoef(y_true, y_pred)
    print('MCC:', mcc)

In [None]:
# Grab a single batch from the training loader for a smoke test.
for X_batch, y_batch in dataloaders_dict['train']:
    X = X_batch
    y = y_batch
    break

In [None]:
# Release cached, currently unused GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()

In [None]:
# Smoke-test one forward pass. Run it in eval mode and without autograd so
# dropout and batch-norm do not randomise the output and no graph is kept for
# BERT-large. train_model switches back to train mode itself.
X = X.to(device)
y = y.to(device)
model.eval()
with torch.no_grad():
    preds = F.softmax(model(X),dim=1)
preds = torch.argmax(preds,dim=1).cpu().data.numpy()

In [None]:
# Ground-truth labels of the smoke-test batch as a NumPy array.
true = y.cpu().data.numpy()

In [None]:
log_metrics(preds,true)

In [None]:
preds

In [None]:
X.shape

In [None]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    """Fine-tune ``model`` and return it with the best-validation-loss weights loaded.

    Every epoch runs a 'train' pass and a 'val' pass over the loaders in the
    notebook-level ``dataloaders_dict``; batches are moved to the notebook-level
    ``device``. Whenever the validation loss improves, the weights are
    snapshotted in memory and written to 'bert_model_test.pth'.

    Args:
        model: module mapping a batch of token ids to class logits.
        criterion: loss called as ``criterion(logits, labels)``; assumed to
            return the batch mean (the default for ``nn.CrossEntropyLoss``).
        optimizer: optimizer over the parameters to train.
        scheduler: learning-rate scheduler, stepped once per epoch.
        num_epochs: number of epochs to run.

    Returns:
        The same ``model`` object, with the best weights loaded.
    """
    since = time.time()
    print('starting')
    best_model_wts = copy.deepcopy(model.state_dict())
    # Any real loss beats this, so the first validation pass always saves.
    best_loss = float('inf')

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            if is_train:
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            n_samples = 0
            epoch_preds = []
            epoch_true = []

            for X_batch, y_batch in dataloaders_dict[phase]:
                X_batch = X_batch.to(device)
                y_batch = y_batch.to(device)
                optimizer.zero_grad()

                # Track gradients only while training.
                with torch.set_grad_enabled(is_train):
                    logits = model(X_batch)
                    loss = criterion(logits, y_batch)
                    if is_train:
                        loss.backward()
                        optimizer.step()

                # Weight the batch-mean loss by the batch size so a smaller
                # final batch does not distort the epoch average.
                batch_size = X_batch.size(0)
                running_loss += loss.item() * batch_size
                n_samples += batch_size
                # Collect predictions for the whole epoch, not just the last batch.
                epoch_preds.append(torch.argmax(logits, dim=1).detach().cpu().numpy())
                epoch_true.append(y_batch.cpu().numpy())

            if is_train:
                # Step the schedule after the epoch's optimizer updates, as
                # PyTorch recommends, so the initial learning rate is used.
                scheduler.step()

            epoch_loss = running_loss / max(n_samples, 1)

            print('{} total loss: {:.4f} '.format(phase, epoch_loss))
            if n_samples:
                log_metrics(np.concatenate(epoch_preds), np.concatenate(epoch_true))
            if phase == 'val' and epoch_loss < best_loss:
                print('saving with loss of {}'.format(epoch_loss),
                      'improved over previous {}'.format(best_loss))
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(model.state_dict(), 'bert_model_test.pth')

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    # The tracked quantity is the validation loss, not an accuracy.
    print('Best val loss: {:4f}'.format(float(best_loss)))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [None]:
# Make sure the model is on the target device before the optimizer is built.
model.to(device)

In [None]:
lrlast = .001
lrmain = .00001
# Per-group learning rates: a small one for the pretrained encoder and a larger
# one for the freshly initialised head. The head is ff + bn + classifier; ff
# and bn must be listed too, otherwise they are never updated and stay at
# their random initialisation.
optim1 = torch.optim.Adam(
    [
        {"params": model.bert.parameters(), "lr": lrmain},
        {"params": model.ff.parameters(), "lr": lrlast},
        {"params": model.bn.parameters(), "lr": lrlast},
        {"params": model.classifier.parameters(), "lr": lrlast},
    ])

optimizer_ft = optim1
criterion = nn.CrossEntropyLoss()

# Multiply every group's learning rate by 0.05 every 3 scheduler steps
# (train_model steps the scheduler once per epoch).
exp_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer_ft, step_size=3, gamma=0.05)

In [None]:
# Run fine-tuning; train_model returns the same model object with the best
# weights loaded.
model_ft1 = train_model(model, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=100)

In [None]:
# Token-id matrix of the test split as a tensor (not moved to `device`).
X_test = torch.tensor(tok_test)

In [None]:
X_test

In [None]:
# Move the model to the CPU so that it matches X_test.
model.cpu()

In [None]:
# Inference only: use eval mode and disable autograd so that no computation
# graph is kept while the whole test set goes through BERT-large in one call.
model.eval()
with torch.no_grad():
    test_pred = model(X_test)

In [None]:
# Turn the logits into class probabilities.
test_pred = F.softmax(test_pred,dim=1)

In [None]:
test_pred

In [None]:
# Predicted class = index of the largest probability.
test_pred = torch.argmax(test_pred,dim=1)

In [None]:
test_pred = test_pred.data.numpy()

In [None]:
test.head()

In [None]:
# Attach the predictions to the test frame (mutates `test` in place).
test['prediction'] = test_pred

In [None]:
# Copy the existing 'index' column under the name 'ID'.
test['ID'] = test['index']

In [None]:
test_ = test[['ID', 'prediction']]

In [None]:
# Write only the 'ID' and 'prediction' columns. index=False keeps the
# DataFrame index out of the file, matching the other submission files.
test_.to_csv('sample_submission.csv', index=False)

In [None]:
# Load the in-domain test set; its 'Sentence' column is tokenized below.
in_domain = pd.read_table('cola_in_domain_test.tsv')

In [None]:
in_domain.head()

In [None]:
tok_in_domain = np.array([tokenize_text(t) for t in in_domain['Sentence'].values])

In [None]:
X_in_domain = torch.tensor(tok_in_domain)

In [None]:
# Inference only: use eval mode and disable autograd so that no computation
# graph is kept for the full in-domain set.
model.eval()
with torch.no_grad():
    pred_in_domain = model(X_in_domain)
pred_in_domain = F.softmax(pred_in_domain,dim=1)
pred_in_domain = torch.argmax(pred_in_domain,dim=1).data.numpy()

In [None]:
# Attach the predicted class ids to the in-domain frame (mutates it in place).
in_domain['Prediction'] = pred_in_domain

In [None]:
# Take an explicit copy so that adding a column to `df` afterwards modifies an
# independent frame rather than a slice of `in_domain` (this avoids pandas'
# SettingWithCopyWarning).
df = in_domain[['Id','Prediction']].copy()

In [None]:
# Store the predictions under the column name 'Label'.
df['Label'] = pred_in_domain

In [None]:
# 'Prediction' holds the same values as 'Label', so drop it.
df = df.drop(['Prediction'],axis=1)

In [None]:
df.columns

In [None]:
df[['Id','Label']].head()

In [None]:
# Write the in-domain submission without the DataFrame index.
df.to_csv('in_domain_submission.csv',index=False)

In [None]:
len(df)

In [None]:
# Sum of the predicted labels.
sum(df.Label.values)

In [None]:
df.columns

In [None]:
# Load the out-of-domain test set.
ood = pd.read_table('cola_out_of_domain_test.tsv')

In [None]:
ood.head()

In [None]:
# Tokenize the out-of-domain sentences and predict their classes. Inference
# runs in eval mode with autograd disabled so that no computation graph is
# kept for the full set.
tok_out_domain = np.array([tokenize_text(t) for t in ood['Sentence'].values])
X_out_domain = torch.tensor(tok_out_domain)
model.eval()
with torch.no_grad():
    pred_out_domain = model(X_out_domain)
pred_out_domain = F.softmax(pred_out_domain,dim=1)
pred_out_domain = torch.argmax(pred_out_domain,dim=1).data.numpy()

In [None]:
# Record the predictions on `ood`, then start the submission frame from the
# 'Id' and 'Prediction' columns with 'Prediction' dropped again.
ood['Prediction'] = pred_out_domain
df = ood[['Id','Prediction']].drop(['Prediction'],axis=1)

In [None]:
# Store the out-of-domain predictions under the column name 'Label'.
df['Label'] = pred_out_domain

In [None]:
df[['Id','Label']].head()

In [None]:
len(df)

In [None]:
# Sum of the predicted labels.
sum(df.Label)

In [None]:
# Write the out-of-domain submission without the DataFrame index.
df.to_csv('out_domain_submission.csv',index=False)