In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import torch

import lstm_utils

In [None]:
# --- Run configuration ------------------------------------------------------
# Every tunable value used by the cells below is defined here.

# Base learning rate. Note that the training cell assigns `lr` itself on
# every epoch, so this value only matters until that cell runs.
lr = 1e-3
# Not referenced by any other visible cell; kept so existing code that
# expects the name keeps working.
running_loss = []
# Number of samples per batch; handed to both the model and pass_through.
BATCH_SIZE = 32
# How many loss values are logged per pass over a split (drives the
# *_log_interval values computed in the model-setup cell).
logs_per_epoch = 10
# Total number of training epochs.
num_epochs = 14
# Forwarded to pass_through as `batch=`; when False the log-interval
# computation uses a step size of 1 instead of BATCH_SIZE.
PAD_BATCH_FLAG = True
# Move the model and loss to the GPU only when CUDA is actually available.
use_gpu = torch.cuda.is_available()

# Location of the Yelp review dump. The default is the original author's
# local path; set the YELP_REVIEW_JSON environment variable to point the
# notebook at a different copy without editing this cell.
json_file = os.environ.get('YELP_REVIEW_JSON',
                           '/home/asilva/Data/yelp_dataset_2017/review.json')
# Passed to lstm_utils.build_dataset_yelp.
# NOTE(review): presumably the percentage of reviews used for training - confirm in lstm_utils.
train_percent = 80

In [None]:
# Load the raw reviews from `json_file`, then let lstm_utils turn them into
# the two splits used below (`dataset` is later passed with train=True,
# `test_dataset` with train=False) plus a word -> id mapping.
# NOTE(review): how `train_percent` drives the split is defined inside
# lstm_utils.build_dataset_yelp and is not visible here - confirm.
raw_data = lstm_utils.load_yelp_data(json_file)
dataset, test_dataset, word_to_id = lstm_utils.build_dataset_yelp(raw_data, train_percent)
# Number of entries in the mapping; used as the model's vocab_size.
vocab = len(word_to_id)

In [None]:
def _first_field_length(sample):
    """Return the length of the first element of a sample."""
    return len(sample[0])


# Order both splits in place, longest first, using the length of each
# sample's first element as the sort key.
# NOTE(review): presumably needed so padded batches group similar lengths -
# confirm against lstm_utils.pass_through.
dataset.sort(key=_first_field_length, reverse=True)
test_dataset.sort(key=_first_field_length, reverse=True)

In [None]:
# Run each split through lstm_utils.clean_dataset and rebind the name to the
# result. What "cleaning" removes is defined in lstm_utils, not in this file.
# NOTE(review): this cell overwrites its own inputs, so re-running it feeds
# already-cleaned data through clean_dataset again - confirm that is harmless.
dataset = lstm_utils.clean_dataset(dataset)
test_dataset = lstm_utils.clean_dataset(test_dataset)

In [None]:
# Number of rating classes; ratings are encoded as 0 .. NUM_RATING_CLASSES - 1
# and reported as 1 .. NUM_RATING_CLASSES.
NUM_RATING_CLASSES = 5


def summarize_split(split):
    """Print the rating distribution and length statistics of one split.

    Each element of ``split`` is indexed as ``point[0]`` (a sized object whose
    length is recorded) and ``point[1][0]`` (the rating, compared against the
    class ids 0..4). Ratings that match none of the class ids are not counted
    towards any class but still count towards the total, exactly as in the
    original if/elif chain.

    Args:
        split: a sized iterable of samples laid out as described above.

    Returns:
        None. Results are printed as "Percent <k>:" lines for k = 1..5,
        followed by the mean and standard deviation of the lengths.
    """
    total = len(split)
    if total == 0:
        # Avoid a ZeroDivisionError (and a NaN mean) on an empty split.
        print("Split is empty; no statistics to report.")
        return

    class_counts = [0] * NUM_RATING_CLASSES
    lengths = []
    for point in split:
        rating = point[1][0]
        lengths.append(len(point[0]))
        # Compare with == rather than indexing by the rating so that any
        # rating type supporting equality with an int keeps working.
        for class_id in range(NUM_RATING_CLASSES):
            if rating == class_id:
                class_counts[class_id] += 1
                break

    for class_id, count in enumerate(class_counts):
        # float() keeps this a true division under Python 2 as well.
        print("Percent %d:" % (class_id + 1), 100*(float(count)/total))

    lengths = np.array(lengths)
    print("Mean length:", np.mean(lengths))
    print("Stddev length:", np.std(lengths))


# Training split first, then the held-out split.
summarize_split(dataset)
summarize_split(test_dataset)
# OUTPUT OF THIS BLOCK (recorded from an earlier run; the first seven lines
# belong to `dataset`, the last seven to `test_dataset`):
# ('Percent 1:', 13.64312871947628)
# ('Percent 2:', 8.460983894496179)
# ('Percent 3:', 11.975605302213054)
# ('Percent 4:', 23.8934195220915)
# ('Percent 5:', 42.026862561722986)
# ('Mean length:', 113.77964893632742)
# ('Stddev length:', 106.17315470136312)
# ('Percent 1:', 12.967181927784615)
# ('Percent 2:', 8.631450705325678)
# ('Percent 3:', 12.348636328125826)
# ('Percent 4:', 24.316263769211442)
# ('Percent 5:', 41.73646726955244)
# ('Mean length:', 114.77220862691111)
# ('Stddev length:', 107.75916040435948)

In [None]:
# Build the classifier defined in lstm_utils. `vocab` and BATCH_SIZE come from
# earlier cells; the remaining hyper-parameters are fixed here.
model = lstm_utils.Net(vocab_size=vocab,
                       hidden_dim=512,
                       embed_dim=128,
                       num_layers=1,
                       num_classes=5,
                       dropout=0.4,
                       batch_size=BATCH_SIZE)
# NOTE(review): NLLLoss expects log-probabilities as input - confirm that
# lstm_utils.Net ends in a log-softmax.
loss_fn = torch.nn.NLLLoss()
if use_gpu:
    model = model.cuda()
    loss_fn = loss_fn.cuda()

# Lists to keep track of average train / test losses over time (for plotting)
train_loss = []
test_loss = []
# Samples consumed per step: a whole batch when batching is on, else one.
step_size = BATCH_SIZE if PAD_BATCH_FLAG else 1
# Steps between two logged loss values. Floor division keeps these integers
# under Python 3, matching what `/` produced on ints under Python 2.
# NOTE(review): a split with fewer than step_size * logs_per_epoch samples
# yields an interval of 0 - confirm pass_through tolerates that.
train_log_interval = len(dataset) // step_size // logs_per_epoch
test_log_interval = len(test_dataset) // step_size // logs_per_epoch

In [None]:
# Learning-rate schedule: the higher rate for the first half of training,
# the lower rate from the midpoint onwards.
INITIAL_LR = 1e-2
FINAL_LR = 1e-3

# Build the optimizer once. Adam keeps running moment estimates and a step
# count per parameter; creating a fresh optimizer every epoch would throw
# that state away and make the checkpointed optimizer state cover only a
# single epoch.
optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.7, 0.99))

for epoch in range(1, num_epochs+1):
    print('EPOCH:', epoch)

    # Apply this epoch's learning rate to the existing optimizer.
    lr = INITIAL_LR if epoch < num_epochs/2 else FINAL_LR
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # Training over all training data
    train_loss.extend(lstm_utils.pass_through(model,
                                              loss_fn,
                                              optimizer,
                                              dataset,
                                              batch=PAD_BATCH_FLAG,
                                              batch_size=BATCH_SIZE,
                                              train=True,
                                              log_every=train_log_interval))
    # Evaluation over the held-out data (train=False); the optimizer is
    # still passed because pass_through takes it positionally.
    test_loss.extend(lstm_utils.pass_through(model,
                                             loss_fn,
                                             optimizer,
                                             test_dataset,
                                             batch=PAD_BATCH_FLAG,
                                             batch_size=BATCH_SIZE,
                                             train=False,
                                             log_every=test_log_interval))

    # Checkpoint the model with the state_dict, optimizer, and epoch number.
    # NOTE(review): both the stored 'epoch' and the filename use epoch + 1,
    # i.e. the next epoch to run rather than the one just finished - confirm
    # this matches whatever code loads these checkpoints.
    lstm_utils.save_checkpoint({
        'epoch': epoch + 1,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
    }, filename='yelp_sentiment_epoch'+str(epoch+1))


In [None]:
# Plot the logged train (red) and test (blue) losses on one set of axes.
# The x axis is the index of the logged value, not the epoch number.
fig, ax = plt.subplots()
ax.plot(train_loss, 'r', label='train')
ax.plot(test_loss, 'b', label='test')
ax.set_title('Yelp sentiment LSTM: loss over training')
ax.set_xlabel('Logged value index')
ax.set_ylabel('Average loss')
ax.legend();
