In [None]:
from dataloaders import BERTReviewData
from transformers import DistilBertTokenizer
from torch.utils.data import DataLoader
from shared_models import FullBERT, FEATURES
import pandas as pd
import torch
from sklearn import metrics
from ast import literal_eval
import json
import os

In [4]:
from torch import cuda

# Default to the CPU and switch to the GPU only when CUDA reports one available.
device = 'cpu'
if cuda.is_available():
    device = 'cuda'

In [5]:
# --- training hyperparameters ---
MAX_TOKENS = 512        # handed to BERTReviewData as max_tokens
TRAIN_BATCH_SIZE = 16   # batch size of the training DataLoader
VALID_BATCH_SIZE = 8    # batch size of the test/validation DataLoaders
EPOCHS = 4
LEARNING_RATE = 1e-5    # Adam learning rate

# When True the per-restaurant review lists are exploded so that every row is
# a single review; set to False to keep one row per restaurant.
EXPANDED = True

# Directory that receives the JSON result dumps; created up front if missing.
save_dir = 'bert_regular_all'
os.makedirs(save_dir, exist_ok=True)

In [6]:
def _load_split(csv_path, expanded=False):
    """Read one CSV split and normalise it for BERTReviewData.

    Args:
        csv_path: Path to a CSV file that has a 'reviews' column (a Python
            literal that `literal_eval` can parse) and an
            'Overall Compliance' column.
        expanded: When True, explode the parsed review lists so that every
            row holds a single review and rebuild a fresh 0..n-1 index.

    Returns:
        A DataFrame in which 'reviews' has been renamed to 'text' and
        'Overall Compliance' to 'label'.
    """
    split_df = pd.read_csv(csv_path)
    # The CSV stores the reviews as text; parse them back into Python objects.
    split_df['reviews'] = split_df['reviews'].apply(literal_eval)
    split_df = split_df.rename(columns={'reviews': 'text', 'Overall Compliance': 'label'})
    if expanded:
        # The column is called 'text' after the rename above; exploding the
        # old 'reviews' name would raise a KeyError.
        split_df = split_df.explode('text').reset_index(drop=True)
    return split_df


# Classify at review level when EXPANDED is True, otherwise at restaurant level.
review_df = _load_split('../data/split/train.csv', EXPANDED)
test_df = _load_split('../data/split/test.csv', EXPANDED)
val_df = _load_split('../data/split/val.csv', EXPANDED)

label_dict = {0 : 'Yes', 1 : 'No'}

# NOTE(review): do_lower_case=False is combined with an *uncased* checkpoint;
# confirm this is intended, since the uncased vocabulary is lower-case only.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=False)

# All three datasets are built with the same keyword arguments (the validation
# set previously passed `label=` instead of `labels=`).
train_data = BERTReviewData(review_df, tokenizer, max_tokens=MAX_TOKENS, expanded=EXPANDED, labels=label_dict, features=FEATURES)
test_data = BERTReviewData(test_df, tokenizer, max_tokens=MAX_TOKENS, expanded=EXPANDED, labels=label_dict, features=FEATURES)
val_data = BERTReviewData(val_df, tokenizer, max_tokens=MAX_TOKENS, expanded=EXPANDED, labels=label_dict, features=FEATURES)

Unnamed: 0,Overall Compliance,name,stars,review_count,is_open,reviews,ratings,n_reviews,avg_rating,IR_regular,...,Cheesesteaks,Middle Eastern,Wineries,Indian,Halal,Vegan,Vegetarian,Beer Bar,Soup,Sushi Bars
0,Yes,Pop's Homemade Water Ice - Havertown,4.0,16.0,1.0,water ice and pretzels were a staple evening t...,"[5.0, 5.0, 5.0]",3,5.000000,1,...,0,0,0,0,0,0,0,0,0,0
1,Yes,Pop's Homemade Water Ice - Havertown,4.0,16.0,1.0,Love this water ice/ ice cream shop. Used to g...,"[5.0, 5.0, 5.0]",3,5.000000,1,...,0,0,0,0,0,0,0,0,0,0
2,Yes,Pop's Homemade Water Ice - Havertown,4.0,16.0,1.0,"""I'm bored. It's a gorgeous night. You want to...","[5.0, 5.0, 5.0]",3,5.000000,1,...,0,0,0,0,0,0,0,0,0,0
3,Yes,South,4.0,501.0,1.0,We came here as I have heard that the food is ...,"[4.0, 5.0, 5.0, 4.0, 5.0, 1.0, 3.0, 5.0, 4.0, ...",48,4.229167,1,...,0,0,0,0,0,0,0,0,0,0
4,Yes,South,4.0,501.0,1.0,Wow! Great venue with excellent food. I went t...,"[4.0, 5.0, 5.0, 4.0, 5.0, 1.0, 3.0, 5.0, 4.0, ...",48,4.229167,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13587,Yes,Core De Roma,4.0,96.0,1.0,"Very good food, terrible service. Waited for ...","[1.0, 3.0]",2,2.000000,1,...,0,0,0,0,0,0,0,0,0,0
13588,Yes,Core De Roma,4.0,96.0,1.0,"We went here on a Tuesday night around 7pm, no...","[1.0, 3.0]",2,2.000000,1,...,0,0,0,0,0,0,0,0,0,0
13589,Yes,Wholesale Granite Marble & Tile,5.0,27.0,1.0,Wholesale Granite did an amazing job for my of...,"[5.0, 5.0, 5.0]",3,5.000000,1,...,0,0,0,0,0,0,0,0,0,0
13590,Yes,Wholesale Granite Marble & Tile,5.0,27.0,1.0,From start to finish Wholesale Granite provide...,"[5.0, 5.0, 5.0]",3,5.000000,1,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Build the DataLoaders.
# Training batches are reshuffled every epoch.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                }

# Evaluation batches keep the dataset order so that the prediction/label
# lists written to disk are reproducible and line up with the dataset rows.
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                }

training_loader = DataLoader(train_data, **train_params)
testing_loader = DataLoader(test_data, **test_params)
val_loader = DataLoader(val_data, **test_params)

In [16]:
# Build the classifier, move it to the selected device, then create the
# optimizer and the loss function used by the training loop.
num_features = len(FEATURES)
model = FullBERT(num_features)
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
# NOTE(review): BCELoss expects probabilities in [0, 1] — presumably FullBERT
# ends in a sigmoid/softmax; confirm against shared_models.
loss_fn = torch.nn.BCELoss()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
def validate(data_loader):
    """
    Run the global `model` over `data_loader` in eval mode without gradients.

    Each batch must be a dict with 'ids', 'mask', 'targets' and 'features'
    tensors. Predictions and labels are the argmax along dim 1 of the model
    output and of the targets respectively.

    Returns a `(predictions, labels)` tuple of flat Python lists of class
    indices, in the order the loader yielded them.
    """
    model.eval()
    predicted_classes = []
    true_classes = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['ids'].to(device, dtype=torch.long)
            attention_mask = batch['mask'].to(device, dtype=torch.long)
            batch_targets = batch['targets'].to(device, dtype=torch.float)
            extra_features = batch['features'].to(device, dtype=torch.float)

            scores = model(input_ids, attention_mask, extra_features)

            # Index of the largest entry in each row = class index.
            batch_preds = torch.max(scores, 1).indices
            batch_labels = torch.max(batch_targets, 1).indices

            true_classes += batch_labels.cpu().tolist()
            predicted_classes += batch_preds.cpu().tolist()

    return predicted_classes, true_classes

In [18]:
# Train the model.
# train_results maps epoch number -> {'preds', 'labels', 'losses'} so that the
# whole training history can be dumped to JSON afterwards.
train_results = {}

for epoch in range(EPOCHS):
    results = {}
    losses = []
    # validate() switches the model to eval mode, so re-enable training mode
    # at the start of every epoch.
    model.train()
    for idx, data in enumerate(training_loader):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)
        features = data['features'].to(device, dtype = torch.float)

        # Clear the gradients of the previous step exactly once per batch.
        optimizer.zero_grad()

        outputs = model(ids, mask, features)
        loss = loss_fn(outputs, targets)
        loss_value = loss.item()
        losses.append(loss_value)

        # Log the batch loss every 100 steps.
        if idx % 100 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss_value}')

        loss.backward()
        optimizer.step()

    # NOTE(review): per-epoch accuracy is measured on testing_loader while the
    # final evaluation cell uses val_loader — confirm this split usage is intended.
    preds, targets = validate(testing_loader)

    results['preds'] = preds
    results['labels'] = targets
    results['losses'] = losses
    train_results[epoch] = results

    # TODO: play with a softmax activation function in the classifier
    accuracy = metrics.accuracy_score(targets, preds)
    print(f"Epoch {epoch}: Accuracy Score = {accuracy}")

Epoch: 0, Loss:  5.791101455688477
Epoch: 0, Loss:  9.336081504821777
Epoch: 0, Loss:  5.445563793182373
Epoch: 0, Loss:  4.34694766998291
Epoch: 0, Loss:  11.507637023925781
Epoch: 0, Loss:  9.44841480255127
Epoch: 0, Loss:  4.800806522369385
Epoch: 0, Loss:  9.130693435668945
Epoch: 0, Loss:  8.437676429748535
Epoch 0: Accuracy Score = 0.17911392405063292
Epoch: 1, Loss:  5.000840663909912
Epoch: 1, Loss:  7.234920024871826
Epoch: 1, Loss:  8.992464065551758
Epoch: 1, Loss:  5.892909526824951
Epoch: 1, Loss:  2.989908218383789
Epoch: 1, Loss:  5.925521373748779
Epoch: 1, Loss:  7.750515460968018
Epoch: 1, Loss:  7.445897102355957
Epoch: 1, Loss:  4.566032409667969
Epoch 1: Accuracy Score = 0.4670886075949367
Epoch: 2, Loss:  5.805992603302002
Epoch: 2, Loss:  0.8108724355697632
Epoch: 2, Loss:  3.7756433486938477
Epoch: 2, Loss:  7.860302925109863
Epoch: 2, Loss:  4.2802581787109375
Epoch: 2, Loss:  2.810412883758545
Epoch: 2, Loss:  2.82631778717041
Epoch: 2, Loss:  5.69381618499755

In [19]:
# Persist the per-epoch predictions, labels and losses as JSON in save_dir.
train_results_path = f"{save_dir}/train_data.json"
with open(train_results_path, mode="w") as results_file:
    json.dump(train_results, results_file)

In [20]:
# Score the trained model on the validation loader and keep the raw
# predictions/labels so they can be written to disk.
preds, targets = validate(val_loader)

accuracy = metrics.accuracy_score(targets, preds)
print(f"Validation Acc = {accuracy}")

test_results = {'preds': preds, 'labels': targets}

Validation Acc = 0.5628985507246377


In [21]:
# Persist the validation predictions and labels as JSON in save_dir.
eval_results_path = f"{save_dir}/evaluate_data.json"
with open(eval_results_path, mode="w") as results_file:
    json.dump(test_results, results_file)