In [136]:
import time
import tqdm 
import pandas as pd 

from __init__ import * 

import torch
import torch.nn as nn 
import torch.optim as optim 


from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

In [137]:
class args:
    """Notebook-wide settings, read as plain class attributes.

    The class itself (not an instance) is handed to ``set_seed`` and
    ``LSTMDataset`` further down, so it acts as a simple namespace.
    """
    # Reproducibility.
    seed = 42

    # Data pipeline.
    max_seq_length = 256  # LSTMDataset passes this to the tokenizer as max_length
    batch_size = 2048

    # Model / optimisation.
    hidden_dim = 256
    num_epochs = 100

    # Run on the GPU whenever torch reports one, otherwise fall back to the CPU.
    device = 'cpu' if not torch.cuda.is_available() else 'cuda'

In [138]:
def sentiment_score(x):
    """Binarize a star rating into a sentiment label.

    Returns:
        1 when ``x`` is at least 3.5, 0 when it is below 3.5, and ``None``
        when neither comparison holds (e.g. ``x`` is NaN).
    """
    if x < 3.5:
        return 0
    if x >= 3.5:
        return 1
    # Neither comparison was true (NaN-like input): no label can be assigned.
    return None

In [139]:
# Resolve the CSV file of each split inside YELP_DIR.
train_path, valid_path, test_path = (
    os.path.join(YELP_DIR, file_name)
    for file_name in ('train.csv', 'valid.csv', 'test.csv')
)

# Read the three splits; 'utf-8-sig' strips a leading byte-order mark if present.
train, valid, test = (
    pd.read_csv(csv_path, encoding='utf-8-sig')
    for csv_path in (train_path, valid_path, test_path)
)

In [140]:
# Pretrained 'bert-base-uncased' tokenizer. In this notebook it is only used to
# turn review text into token ids (LSTMDataset) and to size the embedding table
# (tokenizer.vocab_size); no BERT model weights are loaded here.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [141]:
class LSTMDataset(Dataset):
    """Map-style dataset yielding ``(token_ids, label)`` pairs for review classification.

    Args:
        args: configuration object; only ``args.max_seq_length`` is read.
        dataframe: frame with a ``text`` column (review strings) and a
            ``stars`` column (numeric labels).
        tokenizer: object exposing ``encode_plus`` that returns a mapping
            with an ``'input_ids'`` entry.
    """
    def __init__(self, args, dataframe, tokenizer):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.reviews = dataframe.text
        self.labels = dataframe.stars
        self.max_seq_length = args.max_seq_length

    def __len__(self):
        """Number of examples (rows) in the dataset."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as ``(input_ids, label)`` tensors.

        ``input_ids`` is a ``torch.long`` tensor padded/truncated to
        ``max_seq_length``; ``label`` is a ``torch.float32`` scalar tensor.
        """
        # Samplers hand out positions 0..len-1, so look rows up by position.
        # Label-based ``series[idx]`` breaks on frames whose index is not a
        # clean RangeIndex (e.g. after filtering or sampling rows).
        review = self.reviews.iloc[idx]
        label = self.labels.iloc[idx]

        inputs = self.tokenizer.encode_plus(
            review,
            add_special_tokens=False,
            max_length=self.max_seq_length,
            padding='max_length',
            return_token_type_ids=False,
            return_attention_mask=False,
            truncation=True
        )

        return (
            torch.tensor(inputs['input_ids'], dtype=torch.long),
            # float32 (not Python ``float`` == float64) so the label dtype
            # matches the model's float32 logits in the loss.
            torch.tensor(label, dtype=torch.float32)
        )

In [142]:
class LSTMClassifier(nn.Module):
    """Embedding -> (bi)LSTM -> dropout -> linear head, one prediction per sequence.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding (LSTM input size).
        hidden_dim: LSTM hidden size per direction.
        n_classes: number of outputs produced by the linear head.
        n_layers: number of stacked LSTM layers.
        bidirectional: run the LSTM in both directions when True.
        drop_rate: dropout probability; ``None`` (the default) means no dropout.
    """
    def __init__(self,vocab_size, embedding_dim, hidden_dim, n_classes, n_layers, bidirectional=False, drop_rate=None):
        super(LSTMClassifier, self).__init__()
        # ``None`` is not a valid probability for nn.LSTM / nn.Dropout, so the
        # documented default has to be normalised before building the layers.
        drop_rate = 0.0 if drop_rate is None else drop_rate

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            input_size = embedding_dim ,
            hidden_size = hidden_dim,
            num_layers = n_layers,
            batch_first = True,
            bidirectional=bidirectional,
            # Inter-layer dropout only exists between stacked layers; a
            # non-zero value with a single layer merely triggers a warning.
            dropout = drop_rate if n_layers > 1 else 0.0
        )
        self.drop_rate = drop_rate

        self.fc = nn.Linear(2*hidden_dim if bidirectional else hidden_dim, n_classes)

        self.dropout = nn.Dropout(drop_rate)
        # self._init_weight()

    def forward(self, idx):
        """Map a batch of token ids ``(batch, seq_len)`` to ``(batch, n_classes)`` outputs."""
        embedding = self.embedding(idx)
        if self.drop_rate != 0:
            # Keep the result: dropout is not in-place, so discarding the
            # return value would leave the embeddings untouched.
            embedding = self.dropout(embedding)

        # Only the final hidden states are needed; per-step outputs and the
        # cell state are ignored.
        _, (hidden, _) = self.lstm(embedding)

        if self.lstm.bidirectional:
            # The last two entries of ``hidden`` belong to the top layer's two
            # directions; concatenating them matches the 2*hidden_dim head.
            features = torch.cat([hidden[-1], hidden[-2]], dim = -1)
        else:
            # Top layer's final hidden state -> one feature vector per sequence
            # (feeding the full per-step output would yield one prediction per
            # token instead of per review).
            features = hidden[-1]

        return self.fc(self.dropout(features))

    def _init_weight(self):
        """Optional re-initialisation (currently not called from ``__init__``).

        Linear layers get normally distributed weights and zero biases; LSTM
        weights are made orthogonal and LSTM biases are zeroed.
        """
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight)
                nn.init.zeros_(m.bias)

            elif isinstance(m, nn.LSTM):
                for name, param in m.named_parameters():
                    if 'bias' in name:
                        nn.init.zeros_(param)

                    elif 'weight' in name:
                        nn.init.orthogonal_(param)

In [143]:
# Collapse the star ratings of every split into binary sentiment labels, in place.
# NOTE: this cell is not idempotent -- running it a second time sends the
# already-binarised 1 labels through sentiment_score again, and 1 < 3.5 maps to 0.
for _split in (train, valid, test):
    _split.loc[:, 'stars'] = _split.loc[:, 'stars'].apply(sentiment_score)
del _split  # do not leave the loop variable behind in the notebook namespace


In [144]:
# set_seed comes from the star import at the top of the notebook
# (presumably it seeds the RNGs from args.seed -- verify in that module).
set_seed(args)

# One dataset per split; tokenisation happens lazily inside __getitem__.
trainset = LSTMDataset(args, train, tokenizer)
validset = LSTMDataset(args, valid, tokenizer)
testset = LSTMDataset(args, test, tokenizer)

# Shuffle the training split every epoch so batches do not follow the row
# order of the CSV; validation keeps a fixed order.
train_dataloader = DataLoader(trainset, batch_size=args.batch_size, shuffle=True, num_workers=4)
valid_dataloader = DataLoader(validset, batch_size=args.batch_size, num_workers=4)

In [145]:
# Model hyperparameters.
# NOTE(review): hidden_dim here (128) differs from args.hidden_dim (256), which
# is not used for the model -- confirm which value is intended.
vocab_size = tokenizer.vocab_size
embedding_dim = 512
hidden_dim = 128
output_dim = 1  # single logit per review, paired with BCEWithLogitsLoss below
n_layers = 2
bidirectional = True
dr_rate = 0
lr = 1e-3

models = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dr_rate).to(args.device)

# Use the ``lr`` defined above instead of repeating the literal, so changing
# the hyperparameter actually changes the optimiser.
optimizer = optim.Adam(models.parameters(), lr=lr)
criterion = nn.BCEWithLogitsLoss().to(args.device)

In [146]:
def elapsed_time(start, end):
    """Split the interval between two timestamps into minutes and seconds.

    Args:
        start: start timestamp in seconds (e.g. from ``time.time()``).
        end: end timestamp in seconds.

    Returns:
        ``(minutes, seconds)`` where ``minutes`` is the whole number of
        minutes and ``seconds`` is the remainder rounded to two decimals, so
        the pair can be printed as "Xm, Ys" without counting time twice.
    """
    elapsed_mins, elapsed_secs = divmod(end - start, 60)
    return int(elapsed_mins), round(elapsed_secs, 2)

def calc_accuracy(pred_y, true_y):
    """Fraction of predictions that match the targets, as a Python float.

    ``pred_y`` holds raw logits; they are squashed with a sigmoid and counted
    as positive when the resulting probability is strictly above 0.5.
    """
    probabilities = torch.sigmoid(pred_y)
    predicted_positive = probabilities > 0.5
    hits = predicted_positive == true_y
    return hits.float().mean().item()

In [147]:
def _run_epoch(dataloader, desc, training):
    """Run one full pass over ``dataloader`` and return ``(mean_loss, mean_accuracy)``.

    Uses the notebook-level ``models``, ``criterion``, ``optimizer`` and
    ``args``. When ``training`` is True the model is put in train mode and
    updated after every batch; otherwise it is put in eval mode and gradients
    are disabled. Both metrics are averaged over the number of batches.
    """
    models.train(training)
    total_loss, total_acc = 0.0, 0.0

    with torch.set_grad_enabled(training):
        for reviews, labels in tqdm.tqdm(dataloader, desc = desc):
            reviews = reviews.to(args.device)
            # Cast in both phases so the targets match the float32 logits.
            labels = labels.float().to(args.device)

            # squeeze(-1) drops only the trailing logit axis; a bare squeeze()
            # would also collapse a final batch of size 1 into a 0-d tensor.
            pred_y = models(reviews).squeeze(-1)
            loss = criterion(pred_y, labels)

            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # The criterion already averages over the batch, so the value is
            # accumulated as-is (no extra division by the batch size).
            total_loss += loss.item()
            total_acc += calc_accuracy(pred_y, labels)

    n_batches = max(len(dataloader), 1)  # avoid dividing by zero on an empty loader
    return total_loss / n_batches, total_acc / n_batches


best_loss = float('inf')

train_loss_list, train_acc_list = [], []
valid_loss_list, valid_acc_list = [], []

for epoch in range(1, args.num_epochs + 1):

    start_time = time.time()

    train_loss, train_acc = _run_epoch(train_dataloader, 'training...', training=True)
    valid_loss, valid_acc = _run_epoch(valid_dataloader, 'evaluating...', training=False)

    # Time the whole epoch once, after both phases have finished.
    elapsed_mins, elapsed_secs = elapsed_time(start_time, time.time())

    train_loss_list.append(train_loss)
    train_acc_list.append(train_acc)
    valid_loss_list.append(valid_loss)
    valid_acc_list.append(valid_acc)
    # NOTE(review): the "test" labels below actually report validation metrics.
    print(f'epoch [{epoch}/{args.num_epochs}] | elapsed time: {elapsed_mins}m, {elapsed_secs:.2f}s')
    print(f'train loss: {train_loss:.6f}\ttrain accuracy: {train_acc*100:.2f}%')
    print(f'test loss: {valid_loss:.6f}\ttest accuracy: {valid_acc*100:.2f}% \n')

    # Checkpoint whenever the validation loss improves.
    if best_loss > valid_loss :
        best_loss = valid_loss
        lstm_path = os.path.join(BASE_DIR, 'baseline_parameters')
        os.makedirs(lstm_path, exist_ok=True)
        torch.save(models.state_dict(), os.path.join(lstm_path, 'lstm_parameters_2.pt'))

# One row per epoch, one column per tracked metric.
results = pd.DataFrame([train_loss_list, valid_loss_list, train_acc_list, valid_acc_list], index = ['train_loss', 'test_loss', 'train_acc', 'test_acc']).T


save_path = os.path.join(BASE_DIR, 'baseline')
os.makedirs(save_path, exist_ok=True)

results.to_csv(os.path.join(save_path, 'lstm_results_2.csv'), encoding='utf-8-sig', index=False)

training...:   0%|          | 0/425 [00:08<?, ?it/s]


RuntimeError: CUDA error: unknown error
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.