## import packages

In [57]:
import time, os
import tqdm 
import pandas as pd 
import numpy as np 
import random 

import torch
from torch import nn, optim 

from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset, DataLoader 
from transformers import BertTokenizer, BertModel

## Load dataset

In [42]:
data_path = os.path.join('data', 'yelp_sample.csv')
dataset = pd.read_csv(data_path, encoding='utf-8-sig')

## Preprocessing

In [43]:
train, test = train_test_split(dataset, test_size=0.2, random_state=42)
train, valid = train_test_split(dataset, test_size=0.1, random_state=42)

In [44]:
def sentiment_score(x):
    if x >= 3.5 : return 1
    elif x < 3.5 : return 0

In [45]:
train.user_id = train.loc[:, 'user_id'].astype('category')
train.business_id = train.loc[:, 'business_id'].astype('category')

total_user_category = train.loc[:, 'user_id'].cat.categories
total_rest_category = train.loc[:, 'business_id'].cat.categories

valid.user_id = valid.loc[:, 'user_id'].astype('category')
valid.business_id = valid.loc[:, 'business_id'].astype('category')

test.user_id = test.loc[:, 'user_id'].astype('category')
test.business_id = test.loc[:, 'business_id'].astype('category')

valid.user_id= valid.loc[:, 'user_id'].cat.set_categories(total_user_category)
valid.business_id = valid.loc[:, 'business_id'].cat.set_categories(total_rest_category)

test.user_id = test.loc[:, 'user_id'].cat.set_categories(total_user_category)
test.business_id = test.loc[:, 'business_id'].cat.set_categories(total_rest_category)

train.user_id = train.loc[:, 'user_id'].cat.codes
train.business_id = train.loc[:, 'business_id'].cat.codes

valid.user_id = valid.loc[:, 'user_id'].cat.codes
valid.business_id = valid.loc[:, 'business_id'].cat.codes

test.user_id = test.loc[:, 'user_id'].cat.codes
test.business_id = test.loc[:, 'business_id'].cat.codes

train.loc[:,'stars'] = train.loc[:, 'stars'].apply(sentiment_score)
valid.loc[:, 'stars'] = valid.loc[:, 'stars'].apply(sentiment_score)
test.loc[:, 'stars'] = test.loc[:, 'stars'].apply(sentiment_score)

train = train.dropna().reset_index(drop=True)
valid = valid.dropna().reset_index(drop=True)
test = test.dropna().reset_index(drop=True)

In [46]:
train.loc[:, 'text'] = train.loc[:, 'text'].apply(lambda x: '[CLS] ' + str(x) + ' [SEP]' )
valid.loc[:, 'text'] = valid.loc[:, 'text'].apply(lambda x: '[CLS] ' + str(x) + ' [SEP]' )
test.loc[:, 'text'] = test.loc[:, 'text'].apply(lambda x: '[CLS] ' + str(x) + ' [SEP]' )

## Load model and tokenizer

In [47]:
bert_model = BertModel.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')
tokenizer = BertTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')

Some weights of the model checkpoint at huawei-noah/TinyBERT_General_4L_312D were not used when initializing BertModel: ['fit_denses.1.weight', 'fit_denses.4.weight', 'cls.seq_relationship.weight', 'fit_denses.2.bias', 'cls.predictions.transform.LayerNorm.bias', 'fit_denses.3.bias', 'fit_denses.0.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'fit_denses.4.bias', 'cls.predictions.transform.dense.weight', 'fit_denses.3.weight', 'cls.predictions.bias', 'fit_denses.2.weight', 'cls.predictions.transform.LayerNorm.weight', 'fit_denses.0.weight', 'cls.predictions.transform.dense.bias', 'fit_denses.1.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing

In [50]:
class BERTClassifier(nn.Module):
    def __init__(self, bertmodel):
        super(BERTClassifier, self).__init__()
        self.bert = bertmodel
        self.classifier = nn.Linear(312, 1) # tinybert=312, bert-base=768
        self.dropout = nn.Dropout(0.3)

    
    def forward(self, input_ids, attention_mask, token_type_ids):
        output = self.bert(input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids)
        pooler = output['pooler_output']
        pooler = self.dropout(pooler)
        fc_layer = self.classifier(pooler)
        return fc_layer 

In [51]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = BERTClassifier(bert_model)
model.to(device)

BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=Tru

## Build Dataset and DataLoader

In [54]:
class BERTDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.tokenizer = tokenizer 
        self.data = dataframe 
        self.reviews = dataframe.text 
        self.labels = dataframe.stars
        self.max_seq_length = 512

        

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):

        review = self.reviews[idx]

        inputs = self.tokenizer.encode_plus(
            review, 
            None,
            add_special_tokens=True, 
            max_length=self.max_seq_length, 
            padding='max_length', 
            return_token_type_ids=True, 
            truncation=True
        )

        input_ids = inputs['input_ids']
        masks = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        return (
            torch.tensor(input_ids, dtype=torch.long), # token_ids
            torch.tensor(masks, dtype=torch.long), # attention_mask
            torch.tensor(token_type_ids, dtype=torch.long), # token_type_ids
            torch.tensor(self.labels[idx], dtype = float) # labels
        )

In [55]:
train_dataset = BERTDataset( train, tokenizer)
valid_dataset = BERTDataset( valid, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=32, num_workers=1)
valid_dataloader = DataLoader(valid_dataset, batch_size=32, num_workers=1)

## Training

In [60]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time 
    elapsed_mins = int(elapsed_time/60)
    elapsed_secs = elapsed_time - elapsed_mins*60 

    return elapsed_mins, elapsed_secs

def calc_accuracy(pred_y, true_y):
    return ((pred_y > 0.5) == true_y).sum().detach().cpu().item()

In [61]:
num_epochs = 50
learning_rate = 1e-5

criterion = nn.BCEWithLogitsLoss().to(device)
optimizer =optim.Adam(model.parameters(), lr = learning_rate, weight_decay = 0.001)

In [62]:
train_loss_list = []
train_acc_list = []
test_loss_list = []
test_acc_list = []

best_loss = float('inf')
set_seed(42)
for epoch in range(1, num_epochs+1):
    train_loss = 0 
    train_acc = 0

    test_loss = 0
    test_acc = 0 

    model.train()
    start_time = time.time()
    for batch in tqdm.tqdm(train_dataloader, desc = 'training...'):
        batch = tuple(t.to(device) for t in batch)

        inputs = {'input_ids':      batch[0], 
                  'attention_mask': batch[1], 
                  'token_type_ids':     batch[2]}
        label = batch[3]

        pred_y = model(**inputs).squeeze()
        loss = criterion(pred_y, label)

        train_acc += calc_accuracy(pred_y, label)
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_acc /= len(train_dataset)
    train_loss /= len(train_dataset)
    train_acc_list.append(train_acc)
    train_loss_list.append(train_loss)

    model.eval()
    with torch.no_grad():
        for batch in tqdm.tqdm(valid_dataloader, desc = 'evaluating...'):
            batch = tuple(t.to(args.device) for t in batch)

            inputs = {'input_ids':      batch[0], 
                      'attention_mask': batch[1], 
                      'token_type_ids':     batch[2]}
            label = batch[3]

            pred_y = model(**inputs).squeeze()
            loss = criterion(pred_y, label)

            test_acc += calc_accuracy(pred_y, label)
            test_loss += loss.item()

    test_acc /= len(valid_dataset)
    test_loss /= len(valid_dataset)
    test_acc_list.append(test_acc)
    test_loss_list.append(test_loss)


    end_time = time.time()
    elapsed_mins, elapsed_secs = epoch_time(start_time, end_time)
    print(f'epoch [{epoch}/{num_epochs}], elapsed time: {elapsed_mins}m, {elapsed_secs:.2f}s')
    print(f'train loss: {train_loss:.4f}\ttrain accuracy: {train_acc*100:.2f}%')
    print(f'test loss: {test_loss:.4f}\ttest accuracy: {test_acc*100:.2f}%\n')

    if best_loss > test_loss :
        best_loss = test_loss 

        torch.save(model.state_dict(), 'model_parameters.pt')
    
df = pd.DataFrame([train_loss_list, test_loss_list, train_acc_list, test_acc_list], index = ['train_loss', 'test_loss', 'train_acc', 'test_acc']).T 

df.to_csv(f'results.csv', encoding='utf-8-sig', index=False)

training...:   0%|          | 0/282 [00:00<?, ?it/s]