In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords

import torch
from tqdm.notebook import tqdm
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler


In [2]:
def decode_prediction(pred):
    label_dict = {'Multiplication': 0, 'Subtraction': 1, 'Addition': 2, 'Division': 3}    
    pred_flat = np.argmax(pred, axis=1).flatten()
    y_pred = [k for k, v in label_dict.items() if pred_flat[0] == v]
    return y_pred

# Clean Input and make into a df

In [3]:
def clean_text(text):
    """
    text: a string

    return: modified initial string
    """
    REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
    BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
    STOPWORDS = set(stopwords.words('english'))
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub('', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = ''.join([i for i in text if not i.isdigit()])
    text = " ".join(text.split())
    return text

In [4]:
# # create df from single input (SINGLE ROW)
# user_input = 'In the fridge, there are 4 stacks of chocolate puddings, 7 stacks of brownies and 5 stacks of pasta salad. How many stacks of dessert are there?'

# df = pd.DataFrame([[user_input, '0']], columns=['UserInput', 'label'])

# df = df.astype({'label':'int'})
# df = df.astype({'UserInput':'str'})

# # columns of dataframe
# print(list(df.columns))
# df.head()

# #storing the punctuation free text
# df['Clean']= df['UserInput'].apply(lambda x:clean_text(x))
# df['Clean'][0]
# df.label.dtype

In [41]:
df = pd.read_csv('combined.csv')
df.drop(df.index,inplace=True) 
df.label.dtype

user_input = 'In the fridge, there are 4 stacks of chocolate puddings, 7 stacks of brownies and 5 stacks of pasta salad. How many stacks of dessert are there?'
user_input = clean_text(user_input)
userdf = {"Type": ['Unknown'],
          "Clean": [user_input], 
          "label": [3]
         }
userdf = pd.DataFrame(userdf)

df = pd.concat([df, userdf], ignore_index = True)

df = df.tail(1)
df

Unnamed: 0.1,Unnamed: 0,Type,Clean,label
0,,Unknown,in the fridge there are stacks of chocolate pu...,3


In [42]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)

In [43]:
encoded_data_pred = tokenizer.batch_encode_plus(
    df.Clean.values,
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [44]:
input_ids_pred = encoded_data_pred['input_ids']
attention_masks_pred = encoded_data_pred['attention_mask']
labels_pred = torch.tensor(df.label.values)

In [45]:
dataset_pred = TensorDataset(input_ids_pred, attention_masks_pred, labels_pred)
# dataset_pred = TensorDataset(input_ids_pred, attention_masks_pred)

In [46]:
batch_size = 3
device = 'cpu'
dataloader_prediction = DataLoader(dataset_pred)
#                                    sampler=SequentialSampler(dataset_pred))

In [47]:
#  encode values in labels
label_dict = {'Multiplication': 0, 'Subtraction': 1, 'Addition': 2, 'Division': 3}
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)
model.load_state_dict(torch.load('ep2finetuned_BERT_epoch_2.model', map_location=torch.device('cpu')))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

<All keys matched successfully>

In [48]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }
        
        # disable gradient calculation
        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
#     loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return predictions, true_vals

In [49]:
prediction, true_val = evaluate(dataloader_prediction)
true_val

array([3], dtype=int64)

In [51]:
decode_prediction(prediction)[0]

'Addition'