In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords

import torch
from tqdm.notebook import tqdm
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler


In [2]:
def decode_prediction(pred):
    """
    pred: 2-D array-like of per-class scores, one row per sample
          (the arg-max is taken along axis 1)

    return: list with the label name whose id equals the arg-max of the
            FIRST row of pred; rows after the first are not decoded, and the
            list is empty when that id is not one of the four known ids
    """
    label_dict = {'Multiplication': 0, 'Subtraction': 1, 'Addition': 2, 'Division': 3}
    # Only row 0 is looked at, so the result describes a single sample.
    first_class_id = np.argmax(pred, axis=1).flatten()[0]
    y_pred = []
    for label_name, class_id in label_dict.items():
        if class_id == first_class_id:
            y_pred.append(label_name)
    return y_pred

# Clean the input and load it into a DataFrame

In [3]:
# Compiled once when the cell runs instead of on every call. Raw strings keep
# the backslash escapes inside the character class valid Python.
_REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
_BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')


def clean_text(text):
    """
    Normalise a raw problem statement before it is tokenized.

    text: a string

    return: the lower-cased string with separator punctuation turned into
            spaces, every character outside [0-9a-z #+_] dropped, all digits
            dropped, and runs of whitespace collapsed to single spaces
    """
    text = text.lower()
    # Separators become a space so the words around them stay apart
    # ('and/or' -> 'and or'); the previous code substituted '' here and
    # glued them together.
    # NOTE(review): if the fine-tuned model's training text was produced with
    # the old glue-together behaviour, confirm this does not shift inputs.
    text = _REPLACE_BY_SPACE_RE.sub(' ', text)
    text = _BAD_SYMBOLS_RE.sub('', text)
    # Numbers carry no signal for this cleaning step, so digits are removed.
    text = ''.join(ch for ch in text if not ch.isdigit())
    # split()/join collapses the gaps left by the removals above.
    return ' '.join(text.split())

In [None]:
# # create df from single input (SINGLE ROW)
# user_input = 'In the fridge, there are 4 stacks of chocolate puddings, 7 stacks of brownies and 5 stacks of pasta salad. How many stacks of dessert are there?'

# df = pd.DataFrame([[user_input, '0']], columns=['UserInput', 'label'])

# df = df.astype({'label':'int'})
# df = df.astype({'UserInput':'str'})

# # columns of dataframe
# print(list(df.columns))
# df.head()

# #storing the punctuation free text
# df['Clean']= df['UserInput'].apply(lambda x:clean_text(x))
# df['Clean'][0]
# df.label.dtype

In [6]:
# Keep only the final row of combined.csv as the single sample to classify;
# the trailing bare `df` displays that row.
df = pd.read_csv('combined.csv').tail(1)
df

Unnamed: 0.1,Unnamed: 0,Type,Clean,label
1561,239,Subtraction,zachary did pushups in gym class today david d...,1


In [None]:
# Load the tokenizer for the 'bert-base-uncased' checkpoint, the same
# checkpoint name the classifier below is initialised from.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)

In [None]:
# Tokenize the cleaned text into fixed-length model inputs.
# `padding='max_length'` is the current spelling of the deprecated
# `pad_to_max_length=True`, and `truncation=True` states explicitly that
# sequences are capped at `max_length` rather than relying on an implicit
# fallback. The column is passed as a plain list of strings.
encoded_data_pred = tokenizer.batch_encode_plus(
    df['Clean'].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors='pt'
)

In [None]:
# Pull the token ids and attention mask out of the tokenizer output and wrap
# the `label` column of `df` as a tensor so all three can go into one dataset.
input_ids_pred = encoded_data_pred['input_ids']
attention_masks_pred = encoded_data_pred['attention_mask']
labels_pred = torch.tensor(df.label.values)

In [None]:
# Labels are bundled in as the third tensor because evaluate() reads batch[2].
dataset_pred = TensorDataset(input_ids_pred, attention_masks_pred, labels_pred)
# Label-free alternative, currently disabled:
# dataset_pred = TensorDataset(input_ids_pred, attention_masks_pred)

In [None]:
# Inference settings. `device` is also read by the model cell and evaluate().
batch_size = 3
device = 'cpu'

# `batch_size` was previously defined but never handed to the loader (which
# then fell back to its default of 1); it is now passed through. The
# sequential sampler makes explicit that rows are visited in dataset order,
# so predictions line up with the rows of `df`.
dataloader_prediction = DataLoader(dataset_pred,
                                   sampler=SequentialSampler(dataset_pred),
                                   batch_size=batch_size)

In [None]:
# Class ids for the four operation types; the classifier head gets one output
# per entry.
label_dict = {'Multiplication': 0, 'Subtraction': 1, 'Addition': 2, 'Division': 3}

# Start from the base checkpoint, then overwrite its weights with the
# fine-tuned state dict loaded from disk.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)

# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
model.load_state_dict(
    torch.load('ep2finetuned_BERT_epoch_2.model', map_location=torch.device('cpu'))
)

In [None]:
def evaluate(dataloader_val):
    """
    Run the notebook-level `model` over every batch and collect its outputs.

    Relies on the globals `model` and `device` defined in earlier cells.

    dataloader_val: iterable of batches; each batch is indexed as
                    batch[0] = input ids, batch[1] = attention mask,
                    batch[2] = labels, and every element must support
                    `.to(device)`

    return: (predictions, true_vals) -- two numpy arrays built by
            concatenating the per-batch logits and the per-batch labels
            along axis 0
            (presumably one row of logits per sample -- confirm against the
            model's output shape)

    raises: ValueError from np.concatenate when the loader yields no batches
    """
    model.eval()

    predictions, true_vals = [], []

    # disable gradient calculation for the whole pass
    with torch.no_grad():
        for batch in dataloader_val:
            batch = tuple(b.to(device) for b in batch)

            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'labels':         batch[2],
                     }
            outputs = model(**inputs)

            # With labels supplied, outputs[0] is the loss and outputs[1] the
            # logits. The loss was previously summed into a total that nothing
            # read, so it is no longer extracted.
            logits = outputs[1]

            predictions.append(logits.detach().cpu().numpy())
            true_vals.append(inputs['labels'].cpu().numpy())

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return predictions, true_vals

In [None]:
# Run inference on the prepared loader; the bare `true_val` displays the label
# ids that were stored alongside the inputs.
prediction, true_val = evaluate(dataloader_prediction)
true_val

In [None]:
# Map the arg-max class id of the first prediction row back to its label name.
decode_prediction(prediction)