# Detecting Political Bias

In [1]:
import sys, os
import numpy as np 
import pandas as pd 
import torch
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
%load_ext autoreload
%autoreload 2
%matplotlib inline
from tqdm import tqdm, tqdm_notebook
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings(action='once')
import pickle
from apex import amp
import shutil

In [2]:
# Let's activate CUDA for GPU based operations
device=torch.device('cuda')

Change the PATH variable to whereever your `week06/hw` directory is located.  
**For the final run we would like you to have a train_size of at least 1 Million rows, and a valid size of at least 500K rows. When you first run the script, feel free to work with a reduced train and valid size for speed.** 

In [3]:
# In bert we need all inputs to have the same length, we will use the first 220 characters. 
MAX_SEQUENCE_LENGTH = 220
SEED = 1234
# We shall run a single epoch (ie. one pass over the data)
EPOCHS = 1
PATH = '/root/w266_final_project' # /root/v2/week06/hw"
DATA_DIR = os.path.join(PATH, "../data")
WORK_DIR = os.path.join(PATH, "../workingdir")

# Validation and training sizes are here. 
train_size= 1000 #  10000
valid_size= 200  #  5000

This should be the files you downloaded earlier when you ran `download.sh`

In [4]:
os.listdir(DATA_DIR)

['uncased_L-12_H-768_A-12',
 'articles2.csv',
 'conCons2.pickle',
 'articles3.csv',
 'ibcData.pkl',
 'filteredConvote2.pickle',
 'filteredIBC.pickle',
 'conGrams2.pickle',
 'filteredConvote.pickle',
 'filteredIBC_rootOnly.pickle',
 'wwm_uncased_L-24_H-1024_A-16',
 'all-the-news.zip',
 'libGrams2.pickle',
 'libCons2.pickle',
 'cased_L-12_H-768_A-12',
 'filtered_sentences.pickle',
 'articles1.csv']

Load the data and transform into tsv for bert

In [5]:
with open('../data/filteredConvote.pickle', 'rb') as f:
    convote = pickle.load(f)
with open('../data/filteredIBC.pickle', 'rb') as f:
    ibc = pickle.load(f)
df = convote.append(ibc)

In [6]:
df.head()

Unnamed: 0,label,text
0,conservative,"madam speaker , i rise in strong support of th..."
1,conservative,"let me note that i am one of several , if not ..."
2,conservative,i supported the patriot act and would have aga...
3,conservative,"that , i do not believe , should be tolerated ..."
4,conservative,"second of all , let me note that any , any inv..."


We shall install pytorch BERT implementation.   
If you would like to experiment with or view any code (purely optional, and not graded :) ), you can copy the files from the repo https://github.com/huggingface/pytorch-pretrained-BERT  

In [7]:
%%capture
from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification,BertAdam
from pytorch_pretrained_bert.modeling import BertModel
from pytorch_pretrained_bert import BertConfig

We shall now load the model. When you run this, comment out the `capture` command to understand the archecture.

In [8]:
%%capture
# Translate model from tensorflow to pytorch
BERT_MODEL_PATH = os.path.join(DATA_DIR, 'cased_L-12_H-768_A-12')
convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
                            os.path.join(BERT_MODEL_PATH, 'bert_model.ckpt'),
                            os.path.join(BERT_MODEL_PATH, 'bert_config.json'), 
                            os.path.join(WORK_DIR, 'pytorch_model.bin'))

shutil.copyfile(os.path.join(BERT_MODEL_PATH, 'bert_config.json'), \
                os.path.join(WORK_DIR, 'bert_config.json'))
# This is the Bert configuration file
bert_config = BertConfig(os.path.join(WORK_DIR, 'bert_config.json'))

Bert needs a special formatting of sentences, so we have a sentence start and end token, as well as separators.   
Thanks to this [script](https://www.kaggle.com/httpwwwfszyc/bert-in-keras-taming) for a fast convertor of the sentences.

In [9]:
def convert_lines(example, max_seq_length,tokenizer):
    max_seq_length -=2
    all_tokens = []
    longer = 0
    for text in tqdm_notebook(example):
        tokens_a = tokenizer.tokenize(text)
        if len(tokens_a)>max_seq_length:
            tokens_a = tokens_a[:max_seq_length]
            longer += 1
        one_token = tokenizer.convert_tokens_to_ids(["[CLS]"]+tokens_a+["[SEP]"])+[0] * (max_seq_length - len(tokens_a))
        all_tokens.append(one_token)
    print(longer)
    return np.array(all_tokens)

Now we load the BERT tokenizer and convert the sentences.

In [10]:
# %%time
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None,do_lower_case=True)
# train_all = pd.read_csv(os.path.join(DATA_DIR, "train.csv")).sample(train_size+valid_size,random_state=SEED)
train_all = df.sample(train_size+valid_size,random_state=SEED)
print('loaded %d records' % len(train_all))

# Make sure all comment_text values are strings
train_all['text'] = train_all['text'].astype(str) 
# print(train_all)

sequences = convert_lines(train_all["text"],MAX_SEQUENCE_LENGTH,tokenizer)
# train_all=train_all.fillna(0)

loaded 1200 records


HBox(children=(IntProgress(value=0, max=1200), HTML(value='')))


1


Let us look at how the tokenising works in BERT, see below how it recongizes misspellings - words the model never saw. 

In [11]:
train_all[["text", 'label']].head()

Unnamed: 0,text,label
166053,Epstein argues on both sides of this issue by ...,neutral
8991,"if the gentleman will yield , i want us to tal...",conservative
84163,may benefit the United States as a whole,neutral
5072,i have looked carefully at the law and i have ...,conservative
23065,this is not the original version of the bill t...,conservative


Lets tokenize some text (I intentionally mispelled some words to check berts subword information handling)

In [12]:
text = 'Hi, I am learning new things in w251 about deep learning the cloud and teh edge.'
tokens = tokenizer.tokenize(text)
' '.join(tokens)

'hi , i am learning new things in w ##25 ##1 about deep learning the cloud and te ##h edge .'

Added start and end token and convert to ids. This is how it is fed into BERT.

In [13]:
tokens = ["[CLS]"] + tokens + ["[SEP]"]
input_ids = tokenizer.convert_tokens_to_ids(tokens)
' '.join(map(str, input_ids))

'101 20844 117 178 1821 3776 1207 1614 1107 192 17600 1475 1164 1996 3776 1103 7180 1105 21359 1324 2652 119 102'

When BERT converts this sentence to a torch tensor below is shape of the stored tensors.  
We have 12 input tensors, while the sentence tokens has length 23; where are can you see the 23 tokens in the tensors ?... **Feel free to post in slack or discuss in class**

In [14]:
# put input on gpu and make prediction
bert = BertModel.from_pretrained(WORK_DIR).cuda()
bert_output = bert(torch.tensor([input_ids]).cuda())

print('Sentence tokens {}'.format(tokens))
print('Number of tokens {}'.format(len(tokens)))
print('Tensor shapes : {}'.format([b.cpu().detach().numpy().shape for b in bert_output[0]]))
print('Number of torch tensors : {}'.format(len(bert_output[0])))

Sentence tokens ['[CLS]', 'hi', ',', 'i', 'am', 'learning', 'new', 'things', 'in', 'w', '##25', '##1', 'about', 'deep', 'learning', 'the', 'cloud', 'and', 'te', '##h', 'edge', '.', '[SEP]']
Number of tokens 23
Tensor shapes : [(1, 23, 768), (1, 23, 768), (1, 23, 768), (1, 23, 768), (1, 23, 768), (1, 23, 768), (1, 23, 768), (1, 23, 768), (1, 23, 768), (1, 23, 768), (1, 23, 768), (1, 23, 768)]
Number of torch tensors : 12


As it is a binary problem, we change our target to [0,1], instead of float.   
We also split the dataset into a training and validation set, 

In [15]:
train_all.loc[train_all['label'] == 'liberal', 'label'] = 0
train_all.loc[train_all['label'] == 'neutral', 'label'] = 1
train_all.loc[train_all['label'] == 'conservative', 'label'] = 2

In [16]:
# Training data - sentences
X = sequences[:train_size] 
# Target - the toxicity. 
y = train_all[['label']].values[:train_size]
X_val = sequences[train_size:]                
y_val = train_all[['label']].values[train_size:]

In [17]:
print(X)

[[  101   174  3491 ...     0     0     0]
 [  101  1191  1103 ...     0     0     0]
 [  101  1336  5257 ...     0     0     0]
 ...
 [  101 16821  2993 ...     0     0     0]
 [  101   182  1197 ...     0     0     0]
 [  101 22035  1174 ...     0     0     0]]


In [18]:
print(y)

[[1]
 [2]
 [1]
 [2]
 [2]
 [0]
 [2]
 [1]
 [0]
 [2]
 [1]
 [0]
 [1]
 [2]
 [2]
 [0]
 [2]
 [0]
 [2]
 [2]
 [0]
 [2]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [2]
 [0]
 [2]
 [2]
 [1]
 [1]
 [0]
 [2]
 [0]
 [0]
 [0]
 [2]
 [2]
 [2]
 [0]
 [2]
 [1]
 [0]
 [0]
 [0]
 [2]
 [2]
 [2]
 [1]
 [2]
 [2]
 [2]
 [2]
 [2]
 [2]
 [0]
 [2]
 [2]
 [0]
 [2]
 [1]
 [2]
 [2]
 [0]
 [1]
 [1]
 [1]
 [2]
 [0]
 [0]
 [0]
 [1]
 [2]
 [1]
 [0]
 [2]
 [2]
 [1]
 [0]
 [0]
 [0]
 [2]
 [1]
 [2]
 [2]
 [2]
 [0]
 [0]
 [2]
 [2]
 [2]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]
 [2]
 [0]
 [0]
 [2]
 [0]
 [0]
 [0]
 [2]
 [2]
 [0]
 [0]
 [2]
 [1]
 [0]
 [1]
 [0]
 [0]
 [2]
 [1]
 [2]
 [2]
 [2]
 [1]
 [2]
 [2]
 [0]
 [2]
 [1]
 [1]
 [0]
 [0]
 [1]
 [2]
 [0]
 [0]
 [2]
 [0]
 [2]
 [2]
 [2]
 [0]
 [2]
 [1]
 [2]
 [2]
 [1]
 [2]
 [0]
 [0]
 [1]
 [0]
 [2]
 [2]
 [2]
 [0]
 [0]
 [2]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [1]
 [0]
 [1]
 [2]
 [2]
 [2]
 [0]
 [0]
 [0]
 [2]
 [0]
 [1]
 [1]
 [2]


In [19]:
test_df=train_all.tail(valid_size).copy()
train_df=train_all.head(train_size)

**From here on in we would like you to run BERT.**   
**Please do rely on the script available -  [Kaggle kernel](https://www.kaggle.com/yuval6967/toxic-bert-plain-vanila) from [yuval r](https://www.kaggle.com/yuval6967) - for at least the first few steps up to training and prediction.**


**1)**   
**Load the training set to a training dataset. For this you need to load the X sequences and y objects to torch tensors**   
**You can use `torch.utils.data.TensorDataset` to input these into a train_dataset.**

In [20]:
# Training data creations
train_dataset = torch.utils.data.TensorDataset(torch.tensor(list(X),dtype=torch.float), torch.tensor(list(y),dtype=torch.float))

**2)**  
**Set your learning rate and batch size; and optionally random seeds if you want reproducable results**   
**Load your pretrained BERT using `BertForSequenceClassification`**   
**Initialise the gradients and place the model on cuda, set up your optimiser and decay parameters**
**Initialise the model with `apex` (we imprted this as `amp`) for mixed precision training**

In [21]:
output_model_file = "bert_pytorch.bin"

lr=2e-5
batch_size = 32
accumulation_steps=2
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

model = BertForSequenceClassification.from_pretrained(WORK_DIR,cache_dir=None,num_labels=3)
model.zero_grad()
model = model.to(device)
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
train = train_dataset

num_train_optimization_steps = int(EPOCHS*len(train)/batch_size/accumulation_steps)

optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=lr,
                     warmup=0.05,
                     t_total=num_train_optimization_steps)

model, optimizer = amp.initialize(model, optimizer, opt_level="O1",verbosity=0)
model=model.train()

<torch._C.Generator at 0x7f2cb201fdf0>

**3)**  
**Start training your model by iterating through batches in a single epoch of the data**

In [None]:
tq = tqdm_notebook(range(EPOCHS))
for epoch in tq:
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    avg_loss = 0.
    avg_accuracy = 0.
    lossf=None
    tk0 = tqdm_notebook(enumerate(train_loader),total=len(train_loader),leave=False)
    optimizer.zero_grad()   # Bug fix - thanks to @chinhuic
    for i,(x_batch, y_batch) in tk0:
#        optimizer.zero_grad()
        y_pred = model(x_batch.to(device), attention_mask=(x_batch>0).to(device), labels=None)
        loss =  F.cross_entropy(y_pred,y_batch.to(device))
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        if (i+1) % accumulation_steps == 0:             # Wait for several backward steps
            optimizer.step()                            # Now we can do an optimizer step
            optimizer.zero_grad()
        if lossf:
            lossf = 0.98*lossf+0.02*loss.item()
        else:
            lossf = loss.item()
        tk0.set_postfix(loss = lossf)
        avg_loss += loss.item() / len(train_loader)
        avg_accuracy += torch.mean(((torch.sigmoid(y_pred[:,0])>0.5) == (y_batch[:,0]>0.5).to(device)).to(torch.float) ).item()/len(train_loader)
    tq.set_postfix(avg_loss=avg_loss,avg_accuracy=avg_accuracy)

**4)**  
**Store your trained model to disk, you will need it if you choose section 8C.**

In [None]:
torch.save(model.state_dict(), output_model_file)

**5)**   
**Now make a prediction for your validation set.**  

In [None]:
model = BertForSequenceClassification(bert_config,num_labels=1)
model.load_state_dict(torch.load(output_model_file ))
model.to(device)
for param in model.parameters():
    param.requires_grad=False
model.eval()
valid_preds = np.zeros((len(X_val)))
valid = torch.utils.data.TensorDataset(torch.tensor(X_val,dtype=torch.long))
valid_loader = torch.utils.data.DataLoader(valid, batch_size=32, shuffle=False)

tk0 = tqdm_notebook(valid_loader)
for i,(x_batch,)  in enumerate(tk0):
    pred = model(x_batch.to(device), attention_mask=(x_batch>0).to(device), labels=None)
    valid_preds[i*32:(i+1)*32]=pred[:,0].detach().cpu().squeeze().numpy()

**6)**  
**In the yuval's kernel he get a metric based on the metric for the jigsaw competition - it is quite complicated. Instead, we would like you to measure the `AUC`, similar to how you did in homework 04. You can compare the results to HW04**  
*A tip, if your score is lower than homework 04 something is wrong....*

In [None]:
y_pred=torch.sigmoid(torch.tensor(valid_preds)).numpy()
print('AUC score : {:.5f}'.format(roc_auc_score(y_val, y_pred)))

**7)**  
**Can you show/print the validation sentences predicted with the highest and lowest toxicity ?**

In [None]:
sorted_pred = np.argsort(y_pred)
print("Highest Score: {}".format(y_pred[sorted_pred[-1]]))
print("Highest: {}".format(train_all[['comment_text']].values[sorted_pred[-1]]))
print("Lowest Score: {}".format(y_pred[sorted_pred[0]]))
print("Lowest: {}".format(train_all[['comment_text']].values[sorted_pred[0]]))

**8)**  
**Pick only one of the below items and complete it. The last two will take a good amount of time (and partial success on them is fine), so proceed with caution on your choice of items :)** 
  
  
**A. Can you train on two epochs ?**

**B. Can you change the learning rate and improve validation score ?**
   
**C. Make a prediction on the test data set with your downloaded model and submit to Kaggle to see where you score on public LB - check out [Abhishek's](https://www.kaggle.com/abhishek) script - https://www.kaggle.com/abhishek/pytorch-bert-inference**  
  
**D. Get BERT running on the tx2 for a sample of the data.** 
  
**E. Finally, and very challenging -- the `BertAdam` optimiser proved to be suboptimal for this task. There is a better optimiser for this dataset in this script [here](https://www.kaggle.com/cristinasierra/pretext-lstm-tuning-v3). Check out the `custom_loss` function. Can you implement it ? It means getting under the hood of the `BertForSequenceClassification` at the source repo and implementing a modified version locally .  `https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py`**

## Part A

In [None]:
### PART A ###
output_model_file = "bert_pytorch_part_a.bin"
EPOCHS_A = 2

lr=2e-5
batch_size = 32
accumulation_steps=2
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

model_a = BertForSequenceClassification.from_pretrained(WORK_DIR,cache_dir=None,num_labels=1)
model_a.zero_grad()
model_a = model_a.to(device)
param_optimizer = list(model_a.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
train = train_dataset

num_train_optimization_steps = int(EPOCHS_A*len(train)/batch_size/accumulation_steps)

optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=lr,
                     warmup=0.05,
                     t_total=num_train_optimization_steps)

model_a, optimizer = amp.initialize(model_a, optimizer, opt_level="O1",verbosity=0)
model_a=model_a.train()

In [None]:
### PART A ###
tq = tqdm_notebook(range(EPOCHS_A))
for epoch in tq:
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    avg_loss = 0.
    avg_accuracy = 0.
    lossf=None
    tk0 = tqdm_notebook(enumerate(train_loader),total=len(train_loader),leave=False)
    optimizer.zero_grad()   # Bug fix - thanks to @chinhuic
    for i,(x_batch, y_batch) in tk0:
#        optimizer.zero_grad()
        y_pred = model_a(x_batch.to(device), attention_mask=(x_batch>0).to(device), labels=None)
        loss =  F.binary_cross_entropy_with_logits(y_pred,y_batch.to(device))
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        if (i+1) % accumulation_steps == 0:             # Wait for several backward steps
            optimizer.step()                            # Now we can do an optimizer step
            optimizer.zero_grad()
        if lossf:
            lossf = 0.98*lossf+0.02*loss.item()
        else:
            lossf = loss.item()
        tk0.set_postfix(loss = lossf)
        avg_loss += loss.item() / len(train_loader)
        avg_accuracy += torch.mean(((torch.sigmoid(y_pred[:,0])>0.5) == (y_batch[:,0]>0.5).to(device)).to(torch.float) ).item()/len(train_loader)
    tq.set_postfix(avg_loss=avg_loss,avg_accuracy=avg_accuracy)

In [None]:
torch.save(model_a.state_dict(), output_model_file)

In [None]:
model_a = BertForSequenceClassification(bert_config,num_labels=1)
# model_a = BertForSequenceClassification.from_pretrained(output_model_file)
model_a.load_state_dict(torch.load(output_model_file ))
model_a.to(device)
for param in model_a.parameters():
    param.requires_grad=False
model_a.eval()
valid_preds_a = np.zeros((len(X_val)))
valid_a = torch.utils.data.TensorDataset(torch.tensor(X_val,dtype=torch.long))
valid_loader_a = torch.utils.data.DataLoader(valid_a, batch_size=32, shuffle=False)

tk0 = tqdm_notebook(valid_loader_a)
for i,(x_batch,)  in enumerate(tk0):
    pred = model_a(x_batch.to(device), attention_mask=(x_batch>0).to(device), labels=None)
    valid_preds_a[i*32:(i+1)*32]=pred[:,0].detach().cpu().squeeze().numpy()

In [None]:
y_pred=torch.sigmoid(torch.tensor(valid_preds_a)).numpy()
print('AUC score : {:.5f}'.format(roc_auc_score(y_val, y_pred)))

**We see a very small delta in the AUC score when we train on two epochs. This suggests that one epoch is enough considering everything else in our setup.**

## Part B

In [None]:
### PART B ###
output_model_file = "bert_pytorch_part_b.bin"

lr=1e-5
batch_size = 32
accumulation_steps=2
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

model_b = BertForSequenceClassification.from_pretrained(WORK_DIR,cache_dir=None,num_labels=1)
model_b.zero_grad()
model_b = model_b.to(device)
param_optimizer = list(model_b.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
train = train_dataset

num_train_optimization_steps = int(EPOCHS*len(train)/batch_size/accumulation_steps)

optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=lr,
                     warmup=0.05,
                     t_total=num_train_optimization_steps)

model_b, optimizer = amp.initialize(model_b, optimizer, opt_level="O1",verbosity=0)
model_b=model_b.train()

In [None]:
### PART B ###
tq = tqdm_notebook(range(EPOCHS))
for epoch in tq:
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    avg_loss = 0.
    avg_accuracy = 0.
    lossf=None
    tk0 = tqdm_notebook(enumerate(train_loader),total=len(train_loader),leave=False)
    optimizer.zero_grad()   # Bug fix - thanks to @chinhuic
    for i,(x_batch, y_batch) in tk0:
#        optimizer.zero_grad()
        y_pred = model_b(x_batch.to(device), attention_mask=(x_batch>0).to(device), labels=None)
        loss =  F.binary_cross_entropy_with_logits(y_pred,y_batch.to(device))
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        if (i+1) % accumulation_steps == 0:             # Wait for several backward steps
            optimizer.step()                            # Now we can do an optimizer step
            optimizer.zero_grad()
        if lossf:
            lossf = 0.98*lossf+0.02*loss.item()
        else:
            lossf = loss.item()
        tk0.set_postfix(loss = lossf)
        avg_loss += loss.item() / len(train_loader)
        avg_accuracy += torch.mean(((torch.sigmoid(y_pred[:,0])>0.5) == (y_batch[:,0]>0.5).to(device)).to(torch.float) ).item()/len(train_loader)
    tq.set_postfix(avg_loss=avg_loss,avg_accuracy=avg_accuracy)

In [None]:
torch.save(model_b.state_dict(), output_model_file)

In [None]:
model_b = BertForSequenceClassification(bert_config,num_labels=1)
model_b.load_state_dict(torch.load(output_model_file ))
model_b.to(device)
for param in model_b.parameters():
    param.requires_grad=False
model_b.eval()
valid_preds = np.zeros((len(X_val)))
valid = torch.utils.data.TensorDataset(torch.tensor(X_val,dtype=torch.long))
valid_loader = torch.utils.data.DataLoader(valid, batch_size=32, shuffle=False)

tk0 = tqdm_notebook(valid_loader)
for i,(x_batch,)  in enumerate(tk0):
    pred = model_b(x_batch.to(device), attention_mask=(x_batch>0).to(device), labels=None)
    valid_preds[i*32:(i+1)*32]=pred[:,0].detach().cpu().squeeze().numpy()

In [None]:
y_pred=torch.sigmoid(torch.tensor(valid_preds)).numpy()
print('AUC score : {:.5f}'.format(roc_auc_score(y_val, y_pred)))

**We see a very small change in our AUC when we halved the learning rate. This suggests that slowing down the learning rate any further will not yield better results. We could explore increasing the learning rate to speed up our training.**