# Setup Environment, Load Data

In [26]:
#Import Libraries
import os
import json
import csv
import matplotlib.pyplot as plt
import sklearn
import tensorflow
import numpy as np
import pandas as pd
import time
import datetime
import gc
import random
import re
import torch
import torch.nn as nn
import time
import spacy

#NLP Packages
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dropout, Dense
from tensorflow.keras.layers import Flatten, LSTM
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Concatenate
from nltk.corpus import stopwords
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
import transformers
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup
from IPython.display import display, clear_output

In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/yelp-dataset/Dataset_User_Agreement.pdf
/kaggle/input/yelp-dataset/yelp_academic_dataset_review.json
/kaggle/input/yelp-dataset/yelp_academic_dataset_checkin.json
/kaggle/input/yelp-dataset/yelp_academic_dataset_business.json
/kaggle/input/yelp-dataset/yelp_academic_dataset_tip.json
/kaggle/input/yelp-dataset/yelp_academic_dataset_user.json


In [7]:
#Load in Data
def load_data(path, filename, truncate=None):
    data_file = open(os.path.join(path, filename))
    data = []
    if truncate is not None:
        for i, line in enumerate(data_file):
            data.append(json.loads(line))
            if i == truncate:
                break
        # for line in data_file[0:truncate]:
        # data.append(json.loads(line))
    else:
        for line in data_file:
            data.append(json.loads(line))
    ret_df = pd.DataFrame(data)
    data_file.close()
    return ret_df

path = "/kaggle/input/yelp-dataset"

print("\nDataset Load Times:\n")
start_time = time.time()
businesses_df = load_data(path, "yelp_academic_dataset_business.json")
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Business Load Time: {elapsed_time:.4f} seconds")

num_rev_load = 100000
start_time = time.time()
reviews_df = load_data(path, "yelp_academic_dataset_review.json", truncate=num_rev_load)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Review Load Time: {elapsed_time:.4f} seconds")

start_time = time.time()
tips_df = load_data(path, "yelp_academic_dataset_tip.json")
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Tips Load Time: {elapsed_time:.4f} seconds")


Dataset Load Times:

Business Load Time: 3.8558 seconds
Review Load Time: 1.2043 seconds
Tips Load Time: 5.1393 seconds



# Clean Data

Current Thoughts:
* Should we be counting categories weighted based on reviews, or should we just count categories by businesses?
* Do we need to lemmatize for a BERT model considering it has been trained? Also, can lemmatized words be tokenized?

In [8]:
#Filter out Data that Belongs to Businesses Under a Certain Threshold
minimum_business_reviews = 30
reviews_df = reviews_df[reviews_df['business_id'].map(reviews_df['business_id'].value_counts()).gt(minimum_business_reviews)]

#Create Merged DataFrame of Remaining Reviewed Businesses, and Split Categories into List
df_rb = pd.merge(reviews_df, businesses_df, on="business_id")
df_rb['categories'] = df_rb['categories'].str.split(", ")

#Print Metrics
print(f"\nNumber of Total Businesses Represented: {len(df_rb['business_id'].unique())}")
print(f"Number of Total Reviews: {len(df_rb)}")
print(f"Percentage of Reviews Kept: {len(df_rb)/num_rev_load*100}%")


Number of Total Businesses Represented: 718
Number of Total Reviews: 50182
Percentage of Reviews Kept: 50.182%


In [9]:
#Dictionary to Store Categories, and Variables to Report Later
all_unique_cats = {}
total_cats_before = df_rb['categories'].apply(len).sum()
len_before = len(df_rb)

#Iterate through each Business' Categories and Count Occurances (Weighted Based on Reviews)
for cats in df_rb['categories']:
    for cat in cats:
        if cat in all_unique_cats:
            all_unique_cats[cat] += 1
        else:
            all_unique_cats[cat] = 1

#Number of Top Categories to Keep
num_cats = 30
sorted_cats = sorted(all_unique_cats.items(), key=lambda x: x[1], reverse=True)

#Select Top Categories
top_cats = [x[0] for x in sorted_cats[:num_cats]]

#Remove Reviews that Belong to Businesses with None of the Top Categories
df_rb = df_rb[df_rb['categories'].apply(lambda x: any(cat in top_cats for cat in x) if x else False)]

#Remove Categories from Remaining Reviews that are not in the Top Categories
df_rb['categories'] = df_rb['categories'].apply(lambda x: ([i for i in x if i in top_cats]))

#Count Total Number of Categories
total_cats_after = df_rb['categories'].apply(len).sum()

print(f"\nTotal Initial Unique Categories: {len(all_unique_cats)}")
print(f"Number of Total Reviews: {len(df_rb)}")
print(f"Percentage of Reviews Kept: {len(df_rb)/num_rev_load*100}%")
print(f"Average Number of Categories Before: {total_cats_before/len_before}")
print(f"Average Number of Categories After: {total_cats_after/len(df_rb)}")


Total Initial Unique Categories: 274
Number of Total Reviews: 49374
Percentage of Reviews Kept: 49.374%
Average Number of Categories Before: 5.821170937786457
Average Number of Categories After: 4.18432778385385


In [10]:
#Clean review text field
sw = stopwords.words('english')
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z?.,!¿]+", " ", text) # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    text = re.sub(r"http\S+", "",text) #Removing URLs 
    #text = re.sub(r"http", "",text)
    html = re.compile(r'<.*?>')
    text = html.sub(r'',text) #Removing html tags
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^,' + "'`" + '_'
    for p in punctuations:
        text = text.replace(p,'') #Removing punctuations
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    text = " ".join(text) #removing stopwords
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text) #Removing emojis
    return text

df_rb['text'] = df_rb['text'].apply(lambda x: clean_text(x))

In [None]:
# Load the spaCy English model
nlp = spacy.load('en_core_web_sm', disable=["parser","ner"])


def lemmatize(text):
    # Process the text using spaCy
    start_time = time.time()
    doc = nlp(text)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Step 1 Load Time: {elapsed_time:.4f} seconds")
    
    # Extract lemmatized tokens
    start_time = time.time()
    lemmatized_tokens = [token.lemma_ for token in doc]
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Step 2 Load Time: {elapsed_time:.4f} seconds")
    
    # Join the lemmatized tokens into a sentence
    lemmatized_text = ' '.join(lemmatized_tokens)

    return lemmatized_text

#Uncomment when running on better computer
#df_rb['text'] = df_rb['text'].apply(lambda x: lemmatize(x))


# BERT Model

References:
* https://github.com/chrizchow/BERTClassifier/blob/main/BertMultiLabelClassifier.ipynb
* https://www.kaggle.com/code/neerajmohan/fine-tuning-bert-for-text-classification#Data-preprocessing

In [12]:
#Create inputs (review text) and outputs (categories)
inputs = df_rb.text.values

#One-Hot Encode
mlb = MultiLabelBinarizer()
outputs = mlb.fit_transform(df_rb['categories']).astype(float)

#Total Number of Unique Labels
num_labels = outputs.shape[1]

In [13]:
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [14]:
#Make sure Tokenizer is Working
print('\nOriginal: {}\n'.format(inputs[0]))

# Print the sentence split into tokens.
print('Tokenized: {}\n'.format(tokenizer.tokenize(inputs[0])))

# Print the sentence mapped to token ids.
print('Token IDs: {}'.format(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(inputs[0]))))


Original: wow yummy different delicious favorite lamb curry korma different kinds naan let outside deter almost changed minds go try something new glad

Tokenized: ['wow', 'yu', '##mmy', 'different', 'delicious', 'favorite', 'lamb', 'curry', 'ko', '##rma', 'different', 'kinds', 'na', '##an', 'let', 'outside', 'deter', 'almost', 'changed', 'minds', 'go', 'try', 'something', 'new', 'glad']

Token IDs: [10166, 9805, 18879, 2367, 12090, 5440, 12559, 15478, 12849, 17830, 2367, 7957, 6583, 2319, 2292, 2648, 28283, 2471, 2904, 9273, 2175, 3046, 2242, 2047, 5580]


In [15]:
input_ids = []
attention_masks = []

print('Start Run')
start_time = time.time()

# For every review...
for rev in inputs:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        rev,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        padding = 'max_length',           
                        max_length = 200,           # Pad & truncate all sentences. Reviews over 512 so truncate there for now.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                        truncation = True        # Truncate at max_length
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Tokenizer Encoding: {elapsed_time:.4f} seconds")

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
outputs = torch.tensor(outputs).clone().detach()

# Print sentence 0, now as a list of IDs.
print('Original: ', inputs[0])
print('Token IDs:', input_ids[0])
print('Done')

Start Run
Tokenizer Encoding: 90.0699 seconds
Original:  wow yummy different delicious favorite lamb curry korma different kinds naan let outside deter almost changed minds go try something new glad
Token IDs: tensor([  101, 10166,  9805, 18879,  2367, 12090,  5440, 12559, 15478, 12849,
        17830,  2367,  7957,  6583,  2319,  2292,  2648, 28283,  2471,  2904,
         9273,  2175,  3046,  2242,  2047,  5580,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
          

In [78]:
# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, outputs)

# Create a 75-15-10 train-validation-test split.

# Calculate the number of samples to include in each set.
train_size = int(0.75 * len(dataset))
val_size_b4_test = int(len(dataset) - train_size)

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size_b4_test])

val_size = int(0.15 * len(dataset))
test_size = int(len(val_dataset)-val_size)
val_dataset, test_dataset = random_split(val_dataset, [val_size, test_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
print('{:>5,} testing samples'.format(test_size))

37,030 training samples
7,406 validation samples
4,938 testing samples


In [79]:
# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
batch_size = 32

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

test_dataloader = DataLoader(
            test_dataset, # The validation samples.
            sampler = SequentialSampler(test_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [80]:
# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = num_labels, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
    problem_type = "multi_label_classification" # Defaults loss function to BCEWithLogitsLoss
)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# if device == "cuda:0":
# # Tell pytorch to run this model on the GPU.
#     model = model.cuda()
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [81]:
optimizer = torch.optim.AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

# Define Loss function Compatible with Multi-label classification may be redundant givent that "problem_type" specified as multi-label-classification in model
criterion = nn.BCEWithLogitsLoss()

In [82]:
# Number of training epochs. The BERT authors recommend between 2 and 4. 
# We chose to run for 4, but we'll see later that this may be over-fitting the
# training data.
epochs = 4

# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [83]:
###########################
# Train with training set #
###########################
def train(model, iterator, optimizer, criterion, device, scheduler, epoch):
    
    # Enter Train Mode
    model.train()
    train_loss = 0

    #Number of iterations equal to total train dataset / batch size
    for step, batch in enumerate(iterator):
        #Print progress in epoch
        print(f"Progress: {step+1}/{len(iterator)}", end='\r')
        # Parse iterator tensor dataset for important information
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        output = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)
        
        # Generate prediction
        optimizer.zero_grad()
        
        # Compute gradients and update weights
        loss = criterion(output.logits, b_labels) # BCEWithLogitsLoss has sigmoid
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        # accumulate train loss
        train_loss += loss
    
    # print completed result
    print()
    print('Train Loss: %f' % (train_loss))
    return train_loss

#############################
# Validate with testing set #
#############################
def test(model, iterator, optimizer, criterion, device, epoch):

    # Enter Evaluation Mode
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for step, batch in enumerate(iterator):
        
            #Print progress in epoch
            print(f"Progress: {step+1}/{len(iterator)}", end='\r')
            
            # Parse iterator tensor dataset for important information
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            
            # generate prediction
            output = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)
            prob = output.logits.sigmoid()   # BCEWithLogitsLoss has sigmoid
            
            # record processed data count
            total += (b_labels.size(0)*b_labels.size(1))

            # take the index of the highest prob as prediction output
            THRESHOLD = 0.7
            prediction = prob.detach().clone()
            prediction[prediction > THRESHOLD] = 1
            prediction[prediction <= THRESHOLD] = 0
            correct += prediction.eq(b_labels).sum().item()

        print()
    
    #print completed result
    acc = 100.*correct/total
    print('Correct: %i  / Total: %i / Test Accuracy: %f' % (correct, total, acc))
    return acc

In [None]:
for e in range(epochs):
    
    print(f"\n===== Epoch {e+1}/{epochs} =====")
    
    # training
    print("Training started ...")
    train(model, train_dataloader, optimizer, criterion, device, scheduler, e)

    # validation testing
    print("Testing started ...")
    test(model, validation_dataloader, optimizer, criterion, device, e)

In [None]:
#Load best model
model = torch.load('bert_model')

In [None]:
#Will have to adjust this but not sure how to do so yet without the model - save for later
predictions = []
for batch in test_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        with torch.no_grad():        
            output= model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask)
            logits = output.logits
            logits = logits.detach().cpu().numpy()
            pred_flat = np.argmax(logits, axis=1).flatten()
            
            predictions.extend(list(pred_flat))