In [114]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [115]:
from google.colab import files
uploaded=files.upload()

Saving in_domain_train.tsv to in_domain_train.tsv


In [116]:
!pip install transformers
!pip install torch



In [117]:
import pandas as pd
from transformers import BertTokenizer
import torch
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup

In [118]:
df = pd.read_csv("in_domain_train.tsv", sep ="\t", names = ['sentence_source', 'label', 'label_notes', 'sentence'])
print(df.shape[0])
sentences = df.sentence.values
labels = df.label.values

8551


In [119]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [120]:
print('Original: ', sentences[0])
print('Tokenized: ', tokenizer.tokenize(sentences[0]))
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))

Original:  Our friends won't buy this analysis, let alone the next one we propose.
Tokenized:  ['our', 'friends', 'won', "'", 't', 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.']
Token IDs:  [2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012]


In [121]:
input_ids =list()
attention_masks = list()
for sent in sentences:
  encoded_dict = tokenizer.encode_plus(sent, add_special_tokens=True, max_length = 64, pad_to_max_length = True, return_attention_mask = True, return_tensors = 'pt')
  input_ids.append(encoded_dict['input_ids'])
  attention_masks.append(encoded_dict['attention_mask'])
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [122]:
dataset = TensorDataset(input_ids, attention_masks, labels)
train_size = int(0.9*len(dataset))
val_size = len(dataset)- train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

7,695 training samples
  856 validation samples


In [123]:
batch_size = 32
train_loader = DataLoader(train_dataset, sampler = RandomSampler(train_dataset), batch_size = batch_size)
val_loader = DataLoader(val_dataset, sampler = SequentialSampler(val_dataset), batch_size = batch_size)

In [124]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 2, output_attentions = False, output_hidden_states = False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [125]:
optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)



In [126]:
epochs = 4
total_steps = len(train_loader)*epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps =0, num_training_steps = total_steps)

In [127]:
import random
import numpy as np
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
for i in range(0, epochs):
  model.train()
  for step,batch in enumerate(train_loader):
    b_input_ids = batch[0]
    b_input_mask = batch[1]
    b_labels = batch[2]
    model.zero_grad()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    scheduler.step()
  model.eval()
  for batch in val_loader:
    b_input_ids = batch[0]
    b_input_mask = batch[1]
    b_labels = batch[2]

In [128]:
uploaded=files.upload()

Saving out_of_domain_dev.tsv to out_of_domain_dev.tsv


In [129]:
df = pd.read_csv("out_of_domain_dev.tsv",sep = '\t', names = ['a','b','c','d'])
print('Number of test sentences: {:,}\n'.format(df.shape[0]))

Number of test sentences: 516



In [130]:
sentences = df['d'].tolist()
labels = df['b'].tolist()
input_ids = []
attention_masks = []
for sent in sentences:
  encoded_dict = tokenizer.encode_plus(
                        sent,                      
                        add_special_tokens = True, 
                        max_length = 64,           
                        pad_to_max_length = True,
                        return_attention_mask = True,   
                        return_tensors = 'pt',     
                   )
  input_ids.append(encoded_dict['input_ids'])
  attention_masks.append(encoded_dict['attention_mask'])
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)
batch_size = 32 
pred_data = TensorDataset(input_ids, attention_masks, labels)
pred_sampler = SequentialSampler(pred_data)
pred_loader = DataLoader(pred_data, sampler=pred_sampler, batch_size=batch_size)



In [131]:
model.eval()
predictions, true_labels = [], []
for batch in pred_loader:
  b_input_ids, b_input_mask, b_labels = batch
  with torch.no_grad():
    outputs = model(b_input_ids, token_type_ids = None, attention_mask = b_input_mask)

In [132]:
logits = outputs[0]
logits = logits.detach().numpy()
label_ids = b_labels.numpy()

In [133]:
predictions.append(logits)
true_labels.append(label_ids)

In [134]:
print(predictions, true_labels)

[array([[ 0.02973971, -0.43035555],
       [ 0.04980102, -0.48208573],
       [ 0.09929547, -0.29612836],
       [ 0.02587619, -0.36270148]], dtype=float32)] [array([1, 0, 1, 1])]


In [135]:
import os
output_dir = '/content/'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('/content/tokenizer_config.json',
 '/content/special_tokens_map.json',
 '/content/vocab.txt',
 '/content/added_tokens.json')

In [136]:
model_loaded = BertForSequenceClassification.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)

In [167]:
sent = "this sentence is correct"
encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 64,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
input_id = encoded_dict['input_ids']
    
    # And its attention mask (simply differentiates padding from non-padding).
attention_mask = encoded_dict['attention_mask']
input_id = torch.LongTensor(input_id)
attention_mask = torch.LongTensor(attention_mask)



In [168]:
with torch.no_grad():
  # Forward pass, calculate logit predictions
  outputs = model_loaded(input_id, token_type_ids=None, attention_mask=attention_mask)

logits = outputs[0]
index = logits.argmax()
if index == 1:
  print("Gramatically correct")
else:
  print("Gramatically in-correct")


Gramatically in-correct
