In [59]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.functional import log_softmax
from transformers import DistilBertTokenizer, DistilBertModel, DistilBertConfig
from ast import literal_eval
import re

In [60]:
from torch import cuda
# Run on the GPU when CUDA reports one as available; otherwise fall back to the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'

In [45]:
# Training hyperparameters
MAX_TOKENS = 512  # passed to ReviewData as max_tokens: pad/truncate length of each tokenized text
TRAIN_BATCH_SIZE = 16  # batch size of the training DataLoader
VALID_BATCH_SIZE = 8  # batch size of the test/validation DataLoader
EPOCHS = 1
LEARNING_RATE = 1e-05  # learning rate handed to the Adam optimizer

# Set to True to explode the review lists and classify one review per row;
# leave False to classify each row from all of its reviews joined together.
EXPANDED = False

In [46]:
def _load_split(csv_path, expand):
    """Read one split and turn its stringified ``reviews`` column into lists.

    When ``expand`` is true the frame is exploded to one review per row and
    given a fresh 0..n-1 index.
    """
    frame = pd.read_csv(csv_path)
    frame['reviews'] = frame['reviews'].apply(literal_eval)
    if expand:
        frame = frame.explode('reviews').reset_index(drop=True)
    return frame


# Test classifying at the review level, then at the restaurant level.
review_df = _load_split('../data/split/train.csv', EXPANDED)
test_df = _load_split('../data/split/test.csv', EXPANDED)

review_df

Unnamed: 0,Overall Compliance,name,stars,review_count,is_open,reviews,ratings,n_reviews,avg_rating,IR_regular,...,Cheesesteaks,Middle Eastern,Wineries,Indian,Halal,Vegan,Vegetarian,Beer Bar,Soup,Sushi Bars
0,Yes,Pop's Homemade Water Ice - Havertown,4.0,16.0,1.0,[water ice and pretzels were a staple evening ...,"[5.0, 5.0, 5.0]",3,5.000000,1,...,0,0,0,0,0,0,0,0,0,0
1,Yes,South,4.0,501.0,1.0,[We came here as I have heard that the food is...,"[4.0, 5.0, 5.0, 4.0, 5.0, 1.0, 3.0, 5.0, 4.0, ...",48,4.229167,1,...,0,0,0,0,0,0,0,0,0,0
2,No,Berwyn Pizza,4.0,64.0,1.0,[(3.5) ~ good overall food service.\n\nMENU:\n...,"[3.0, 5.0, 5.0, 5.0, 4.0, 5.0, 5.0, 5.0]",8,4.625000,1,...,0,0,0,0,0,0,0,0,0,0
3,Yes,Cedar Hollow Inn Restaurant & Bar,3.5,73.0,1.0,[Thank you cedar hollow for donating to the Ep...,"[5.0, 5.0]",2,5.000000,0,...,0,0,0,0,0,0,0,0,0,0
4,No,Spunktown Tavern,3.5,16.0,1.0,[Don't go here. Period. There's not going to b...,[2.0],1,2.000000,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1749,Yes,Yardley General,4.5,6.0,1.0,"['Round back there is a bar so small, so delic...","[5.0, 3.0, 5.0, 5.0]",4,4.500000,1,...,0,0,1,0,0,0,0,0,0,0
1750,No,One Stop Party Shop,4.0,5.0,1.0,[Let me tell you!!!!! This place is amazing!\...,[5.0],1,5.000000,1,...,0,0,0,0,0,0,0,0,0,0
1751,Yes,Chelsy's,4.0,9.0,1.0,[If you look up dive bar in the dictionary thi...,"[4.0, 5.0]",2,4.500000,1,...,0,0,0,0,0,0,0,1,0,0
1752,Yes,Core De Roma,4.0,96.0,1.0,"[Very good food, terrible service. Waited for...","[1.0, 3.0]",2,2.000000,1,...,0,0,0,0,0,0,0,0,0,0


In [61]:
# Model info: name of the pretrained DistilBERT checkpoint.
# NOTE(review): the tokenizer and model cells visible in this notebook pass the literal
# 'distilbert-base-uncased' rather than this variable — confirm whether it is used elsewhere.
model_checkpoint = "distilbert/distilbert-base-uncased"

In [62]:
# Tabular feature columns passed to ReviewData and concatenated with the text embedding
# inside the model. The list mixes numeric columns (e.g. stars, review_count, avg_rating)
# with columns that look like 0/1 indicators in the displayed frame (inspection-type,
# county-like and cuisine/category names) — TODO confirm the indicator encoding.
FEATURES = [
    "stars",
    "review_count",
    "is_open",
    "n_reviews",
    "avg_rating",
    "IR_regular",
    "IR_follow_up",
    "IR_other",
    "Chester",
    "Bucks",
    "Philadelphia",
    "Delaware",
    "Montgomery",
    "Berks",
    'Nightlife',
    'Bars',
    'Pizza',
    'Italian',
    'Sandwiches',
    'Breakfast & Brunch',
    'Cafes',
    'Burgers',
    'Delis',
    'Caterers',
    'Mexican',
    'Desserts',
    'Salad',
    'Sports Bars',
    'Pubs',
    'Chicken Wings',
    'Seafood',
    'Beer',
    'Wine & Spirits',
    'Juice Bars & Smoothies',
    'Mediterranean',
    'Gastropubs',
    'Diners',
    'Steakhouses',
    'Breweries',
    'Donuts',
    'Barbeque',
    'Cheesesteaks',
    'Middle Eastern',
    'Wineries',
    'Indian',
    'Halal',
    'Vegan',
    'Vegetarian',
    'Beer Bar',
    'Soup',
    'Sushi Bars'
    ]


In [63]:
class ReviewData(Dataset):
    """Torch dataset pairing tokenized review text with tabular features and a label.

    Each item is a dict with:
      * ``ids`` / ``mask``  – token ids and attention mask, padded/truncated to
        ``max_tokens`` (``torch.long``)
      * ``features``        – values of the ``features`` columns for that row
        (``torch.float``, so fractional values such as ``stars=3.5`` survive)
      * ``targets``         – one-hot label, ``[1, 0]`` when 'Overall Compliance'
        is ``'No'`` (fail) and ``[0, 1]`` otherwise (pass) (``torch.long``)

    Args:
        df: frame with a ``reviews`` column and an ``Overall Compliance`` column,
            plus every column named in ``features``. It is never modified.
        tokenizer: callable tokenizer returning ``input_ids`` and ``attention_mask``.
        max_tokens: fixed sequence length used for padding and truncation.
        expanded: ``True`` when ``reviews`` holds one string per row; ``False``
            when it holds a list of strings that are joined with spaces.
        features: names of the numeric columns to return; ``None``/empty yields
            an empty feature tensor.
    """

    def __init__(self, df: pd.DataFrame, tokenizer: "DistilBertTokenizer", max_tokens: int, expanded: bool = False, features: list = None):
        self.tokenizer = tokenizer
        self.df = df
        self.max_tokens = max_tokens
        self.expanded = expanded
        self.review_text = self.clean_text(self.df)
        self.target_cat = self.df['Overall Compliance']
        # Copy so the dataset never shares (or mutates) the caller's list.
        self.features = list(features) if features else []
        # Materialise the feature matrix once instead of re-slicing the frame per item.
        if self.features:
            self._feature_matrix = self.df[self.features].to_numpy(dtype=np.float32)
        else:
            self._feature_matrix = np.empty((len(self.df), 0), dtype=np.float32)

    @staticmethod
    def _clean_review(review) -> str:
        """Replace every non-alphanumeric character (newlines included) with a space and trim."""
        if not isinstance(review, str):
            # Missing reviews (e.g. NaN produced by exploding an empty list) become empty text.
            return ""
        return re.sub(r"[^a-zA-Z0-9]", ' ', review).strip()

    def clean_text(self, df: pd.DataFrame) -> pd.Series:
        """Return a cleaned copy of ``df['reviews']`` without touching ``df`` itself.

        In expanded mode each cell is a single cleaned string; otherwise each
        cell is a list of cleaned strings.
        """
        if self.expanded:
            return df['reviews'].apply(self._clean_review)
        return df['reviews'].apply(
            lambda reviews: [self._clean_review(review) for review in reviews]
        )

    def __len__(self):
        return len(self.review_text)

    def __getitem__(self, index):
        # Positional access everywhere, so the dataset works with any DataFrame index.
        reviews = self.review_text.iloc[index]
        target_cat = self.target_cat.iloc[index]

        # One review per row in expanded mode; otherwise join all reviews into one string.
        review_text = reviews if self.expanded else " ".join(reviews)

        inputs = self.tokenizer(
            review_text,
            add_special_tokens=True,
            max_length=self.max_tokens,
            padding='max_length',
            truncation=True,
        )

        # [0, 1] = pass, [1, 0] = fail
        target = [1, 0] if target_cat == 'No' else [0, 1]

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'features': torch.tensor(self._feature_matrix[index], dtype=torch.float),
            'targets': torch.tensor(target, dtype=torch.long)
        }

In [64]:
# One tokenizer instance is shared by the train and test datasets.
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased',
    truncation=True,
    do_lower_case=True,
)

# Both datasets are built with the same settings; only the source frame differs.
_dataset_kwargs = dict(max_tokens=MAX_TOKENS, expanded=EXPANDED, features=FEATURES)
train_data = ReviewData(review_df, tokenizer, **_dataset_kwargs)
test_data = ReviewData(test_df, tokenizer, **_dataset_kwargs)

In [65]:
# Build the DataLoaders.
# Training batches are reshuffled every epoch. The evaluation loader keeps the dataset
# order so the prediction/label lists returned by validate() line up with test_df rows
# and are reproducible between runs (accuracy itself does not depend on the order).
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                }

training_loader = DataLoader(train_data, **train_params)
testing_loader = DataLoader(test_data, **test_params)

In [66]:
# BERT-based model

class BERTAndErnie(torch.nn.Module):
    """DistilBERT text encoder fused with tabular features for 2-class prediction.

    The first-token hidden state of DistilBERT is projected from 768 to 50
    dimensions, concatenated with ``num_features`` tabular values, passed
    through ReLU and mapped to 2 output logits.

    Args:
        num_features: number of tabular feature values supplied per example.
    """

    def __init__(self, num_features):
        super(BERTAndErnie, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.l2 = torch.nn.Dropout(0.15)
        self.l3 = torch.nn.Linear(768, 50)

        # play with different activation functions
        self.l4 = torch.nn.ReLU()
        # projected text embedding (50) + tabular features -> 2 logits
        self.l5 = torch.nn.Linear(50 + num_features, 2)

    def forward(self, ids, mask, features):
        """Return logits of shape (batch, 2).

        Args:
            ids: token ids, shape (batch, seq_len).
            mask: attention mask, same shape as ``ids``.
            features: float tensor of shape (batch, num_features).
        """
        out1 = self.l1(ids, attention_mask=mask, return_dict=False)
        # With return_dict=False the encoder returns a tuple; element 0 is the
        # per-token hidden state.
        hidden_layer = out1[0]

        # regularize with dropout
        hidden_layer = self.l2(hidden_layer)

        # keep only the first token's vector: (batch, seq_len, 768) -> (batch, 768)
        bert_out = hidden_layer[:, 0]

        # project (batch, 768) -> (batch, 50)
        bert_out = self.l3(bert_out)

        # concatenate the tabular features, then apply the non-linearity to the
        # whole (batch, 50 + num_features) vector
        combined = self.l4(torch.cat((bert_out, features), dim=1))

        # collapse to the two class logits
        return self.l5(combined)

In [67]:
# check with Claire to standardize
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits ``outputs`` and float ``targets``.

    The sigmoid is applied inside the loss, so ``outputs`` must be unnormalised
    logits with the same shape as ``targets``.
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

In [68]:
# Build the classifier sized for the tabular feature list, move it to the
# selected device, and attach an Adam optimizer over all of its parameters.
num_features = len(FEATURES)
model = BERTAndErnie(num_features).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [69]:
def validate():
    """
    Evaluate the model on the test loader during training.

    Puts the global ``model`` in eval mode, runs every batch of the global
    ``testing_loader`` without gradient tracking, and returns
    ``(predicted, actual)``: two flat lists of class indices taken as the
    argmax over the model logits and over the one-hot targets respectively.
    """
    model.eval()
    predicted, actual = [], []

    with torch.no_grad():
        for batch in testing_loader:
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)
            features = batch['features'].to(device, dtype=torch.float)

            logits = model(ids, mask, features)

            # torch.max along dim 1 returns (values, indices); keep the indices.
            pred_idx = torch.max(logits, 1)[1]
            label_idx = torch.max(targets, 1)[1]

            actual.extend(label_idx.cpu().numpy().tolist())
            predicted.extend(pred_idx.cpu().detach().numpy().tolist())

    return predicted, actual

In [None]:
# Train the model: one optimisation pass over the training loader per epoch,
# followed by an accuracy check on the test loader.
saved_outputs = []
saved_accuracy = []

for epoch in range(EPOCHS):

    model.train()
    for idx, data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)
        features = data['features'].to(device, dtype=torch.float)

        # Clear gradients left over from the previous step (once is enough).
        optimizer.zero_grad()

        outputs = model(ids, mask, features)
        loss = loss_fn(outputs, targets)

        # Report the batch loss every 100 steps.
        if idx % 100 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        loss.backward()
        optimizer.step()

    preds, targets = validate()

    # play with a softmax activation function in the classifier
    accuracy = metrics.accuracy_score(targets, preds)
    saved_accuracy.append(accuracy)
    print(f"Accuracy Score = {accuracy}")