## LSTM in pytorch

### Reference
* https://blog.floydhub.com/long-short-term-memory-from-zero-to-hero-with-pytorch/

In [155]:
import torch
import torch.nn as nn
import torch.nn.functional as F

### Quick View on LSTM

* input_dim: dimension of word vector
* hidden_dim: dimension of hidden layer and cell state
* n_layers: number of LSTM layers stacked on top of each other

In [2]:
# Demo sizes: input vector size, hidden/cell state size, number of stacked layers.
input_dim, hidden_dim, n_layers = 5, 10, 1

# batch_first=True means inputs and outputs are laid out as (batch, seq, feature).
lstm_layer = nn.LSTM(
    input_size=input_dim,
    hidden_size=hidden_dim,
    num_layers=n_layers,
    batch_first=True,
)

* batch_size: number of sequences in a batch
* seq_len: length of the input sequence

In [9]:
batch_size, seq_len = 1, 1

# Random input batch laid out as (batch, seq, feature).
inp = torch.randn((batch_size, seq_len, input_dim))

# Random initial hidden and cell states; both share the same shape.
state_shape = (n_layers, batch_size, hidden_dim)
hidden_state = torch.randn(state_shape)
cell_state = torch.randn(state_shape)

# nn.LSTM takes the two states packed as a (hidden, cell) tuple.
hidden = (hidden_state, cell_state)

In [10]:
# One forward pass: returns the per-step outputs and the updated (hidden, cell) tuple.
out, hidden = lstm_layer(inp, hidden)
for caption, value in (("Output shape: ", out.shape), ("Hidden: ", hidden)):
    print(caption, value)

Output shape:  torch.Size([1, 1, 10])
Hidden:  (tensor([[[ 0.0873,  0.2074,  0.3934,  0.3425,  0.1360,  0.1399,  0.0492,
          -0.1113,  0.5139, -0.3220]]], grad_fn=<StackBackward>), tensor([[[ 0.1367,  0.3643,  0.4995,  0.7336,  0.1851,  0.5483,  0.0989,
          -0.2053,  0.7556, -0.5570]]], grad_fn=<StackBackward>))


#### With sequence length 3 (i.e., when the input sentence has 3 words)

In [11]:
# Feed a three-step sequence, reusing the state returned by the previous call.
seq_len = 3
inp = torch.randn((batch_size, seq_len, input_dim))
out, hidden = lstm_layer(inp, hidden)
print(out.size())

torch.Size([1, 3, 10])


In [12]:
# if sentiment classification is the goal, take the last output of hidden layer
out = out.squeeze()[-1, :]
print(out.shape)

torch.Size([10])


## Amazon review sentiment analysis (kaggle)

In [132]:
import bz2
from collections import Counter
import re
import nltk
import numpy as np
# Tokenizer data needed by nltk.word_tokenize.
nltk.download('punkt')

# Read every raw (bytes) line of both archives. The context managers close the
# compressed file handles as soon as the lines are in memory.
with bz2.BZ2File('./data/amazon_train.bz2') as train_handle:
    train_file = train_handle.readlines()

with bz2.BZ2File('./data/amazon_test.bz2') as test_handle:
    test_file = test_handle.readlines()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sbh0613\anaconda\lib\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [133]:
# Number of lines kept from each file.
num_train, num_test = 10000, 5000

# Keep only the leading lines of each split and decode them from bytes to str.
train_file = [raw_line.decode('utf-8') for raw_line in train_file[:num_train]]
test_file = [raw_line.decode('utf-8') for raw_line in test_file[:num_test]]

### Quick Cleaning

In [134]:
# Patterns are raw strings so that the backslash in \d is passed to the regex
# engine instead of being read as an (invalid) string escape.
_DIGIT_RE = re.compile(r'\d')
_URL_RE = re.compile(r"([^ ]+(?<=\.[a-z]{3}))")
# Substrings whose presence marks a sentence as containing a URL.
_URL_MARKERS = ('www.', 'http:', 'https:', '.com')


def _parse_line(line):
    """Split one raw line into ``(label, cleaned_sentence)``.

    The first space-separated field is the label tag: '__label__1' maps to 0,
    anything else to 1. The rest of the line has its last character dropped,
    is lower-cased, has every digit replaced by '0', and, when it contains one
    of the URL markers, has URL-like tokens replaced by '<url>'.
    """
    tag, text = line.split(' ', 1)
    label = 0 if tag == '__label__1' else 1

    sentence = text[:-1].lower()
    sentence = _DIGIT_RE.sub('0', sentence)
    if any(marker in sentence for marker in _URL_MARKERS):
        sentence = _URL_RE.sub("<url>", sentence)
    return label, sentence


def _parse_lines(lines):
    """Apply ``_parse_line`` to every line and return ``(labels, sentences)`` lists."""
    labels, sentences = [], []
    for line in lines:
        label, sentence = _parse_line(line)
        labels.append(label)
        sentences.append(sentence)
    return labels, sentences


# The same extraction and cleaning is applied to both splits.
train_labels, train_sentences = _parse_lines(train_file)
test_labels, test_sentences = _parse_lines(test_file)

### Create a dictionary that maps each word to its number of occurrences

In [135]:
# Maps each (lower-cased) token to the number of times it appears in the training sentences.
words = Counter()
for i, sentence in enumerate(train_sentences):
    tokens = nltk.word_tokenize(sentence)
    # Count the lower-cased form of every token.
    words.update(token.lower() for token in tokens)
    # From here on the sentence is stored as its list of tokens.
    train_sentences[i] = list(tokens)
    if i % 20000 == 0:
        print(str((i*100)/num_train) + "% done")
print("100% done")

0.0% done
100% done


### to use nn.Embedding()
* remove words that occur only once
* add vocabulary 'unknown', 'padding'
* create dictionary that maps vocab to integer and vice versa

In [136]:
# Drop every word that was seen only once.
words = {token: count for token, count in words.items() if count > 1}
# Most frequent words first, preceded by the padding and unknown tokens so that
# these two receive the first indices.
words = ['_PAD', '_UNK'] + sorted(words, key=words.get, reverse=True)
# Lookup tables in both directions: token -> index and index -> token.
word2idx = {token: index for index, token in enumerate(words)}
idx2word = dict(enumerate(words))

#### Using word2idx, we can convert each natural-language sentence into a sequence of integer indices

In [137]:
# Training sentences are already token lists: replace each token by its index,
# falling back to 1 for tokens that are not in the vocabulary.
for i, sentence in enumerate(train_sentences):
    train_sentences[i] = [word2idx.get(word, 1) for word in sentence]

# Test sentences are still raw strings, so they are tokenized (and lower-cased) first.
for i, sentence in enumerate(test_sentences):
    test_sentences[i] = [
        word2idx.get(word.lower(), 1) for word in nltk.word_tokenize(sentence)
    ]

### Pad short sentences with 0 and truncate long sentences to a fixed length

In [138]:
# Defining a function that either shortens sentences or pads sentences with 0 to a fixed length
def pad_input(sentences, seq_len):
    """Return an integer array of shape (len(sentences), seq_len).

    Each row holds one sentence: shorter sentences are left-padded with 0,
    longer ones keep only their first ``seq_len`` entries, and empty
    sentences become all-zero rows.
    """
    padded = np.zeros((len(sentences), seq_len), dtype=int)
    for row, tokens in zip(padded, sentences):
        if len(tokens) == 0:
            continue
        kept = np.array(tokens)[:seq_len]
        # `row` is a view into `padded`; fill it from the right-hand side.
        row[seq_len - len(kept):] = kept
    return padded

# Fixed length every sentence is padded or shortened to.
seq_len = 200

train_sentences, test_sentences = [
    pad_input(split, seq_len) for split in (train_sentences, test_sentences)
]

# Labels become NumPy arrays as well.
train_labels, test_labels = np.array(train_labels), np.array(test_labels)

In [139]:
# (number of sentences, padded length)
np.shape(train_sentences)

(10000, 200)

In [140]:
# Inspect the first encoded sentence: its length, then its contents.
first_sentence = train_sentences[0]
print(len(first_sentence))
first_sentence

200


array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     1,    87,    17,     3,
           1,    13,    11,   221,   487,    18,   357,    15,     9,
        6009,     3,

### Split the held-out data into validation and test sets

In [141]:
# Fraction of the held-out data used for validation; the remainder stays as the test set.
split_frac = 0.5
split_id = int(split_frac * len(test_sentences))

# Take the validation part first, then shrink the test arrays to what is left.
val_sentences = test_sentences[:split_id]
val_labels = test_labels[:split_id]
test_sentences = test_sentences[split_id:]
test_labels = test_labels[split_id:]

In [142]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

batch_size = 50


def _as_dataset(features, labels):
    """Wrap a (features, labels) pair of NumPy arrays in a TensorDataset."""
    return TensorDataset(torch.from_numpy(features), torch.from_numpy(labels))


train_data = _as_dataset(train_sentences, train_labels)
val_data = _as_dataset(val_sentences, val_labels)
test_data = _as_dataset(test_sentences, test_labels)

# Every loader shuffles and uses the same batch size.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [143]:
# torch.cuda.is_available() checks and returns a Boolean True if a GPU is available, else it'll return False
# True when a CUDA-capable GPU is usable, False otherwise.
is_cuda = torch.cuda.is_available()

# Device used later for the model and the batches: GPU if present, else CPU.
device = torch.device("cuda" if is_cuda else "cpu")

In [239]:
class SentimentNet(nn.Module):
    """Embedding -> stacked LSTM -> dropout -> linear classifier.

    The logits are computed from the LSTM output at the last time step.

    Args:
        vocab_size: number of rows in the embedding table.
        n_classes: number of output logits.
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the LSTM hidden and cell states.
        n_layers: number of stacked LSTM layers.
        drop_prob: dropout probability used between LSTM layers and before
            the final linear layer.
    """

    def __init__(self, vocab_size, n_classes, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super(SentimentNet, self).__init__()
        self.n_classes = n_classes
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # nn.LSTM applies dropout only between stacked layers; with a single
        # layer a non-zero value has no effect and only triggers a warning.
        lstm_dropout = drop_prob if n_layers > 1 else 0.0

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=lstm_dropout, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim, n_classes)

    def forward(self, x, hidden):
        """Compute class logits for a batch of index sequences.

        Args:
            x: integer tensor of token indices, shape (batch, seq_len).
            hidden: (hidden_state, cell_state) tuple passed to the LSTM.

        Returns:
            (logit, hidden): logits of shape (batch, n_classes) and the LSTM
            state after the last time step.
        """
        embeds = self.embedding(x.long())
        lstm_out, hidden = self.lstm(embeds, hidden)

        # lstm_out is (batch, seq_len, hidden_dim); keep the last time step only.
        h_t = lstm_out[:, -1, :]
        h_t = self.dropout(h_t)
        logit = self.fc(h_t)

        return logit, hidden

    def init_hidden(self, batch_size):
        """Return zero-filled (hidden_state, cell_state) tensors.

        Each tensor has shape (n_layers, batch_size, hidden_dim) and is created
        with the dtype and device of the model's own parameters, so no
        module-level ``device`` variable is required.
        """
        weight = next(self.parameters()).data
        shape = (self.n_layers, batch_size, self.hidden_dim)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

In [240]:
# Hyper-parameters of the classifier.
vocab_size = len(word2idx) + 1
n_classes = 2
embedding_dim = 200
hidden_dim = 512
n_layers = 2

model = SentimentNet(
    vocab_size=vocab_size,
    n_classes=n_classes,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
)
model.to(device)

# No criterion object is created here: the loss is computed with
# F.cross_entropy inside the training and evaluation functions.
lr = 0.005
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [241]:
def train(model, optimizer, train_loader, batch_size, print_every):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: module exposing ``init_hidden(batch_size)`` and returning
            ``(logits, hidden)`` from ``model(inputs, hidden)``.
        optimizer: optimizer that updates ``model``'s parameters.
        train_loader: loader yielding ``(inputs, labels)`` batches.
        batch_size: kept for backward compatibility; the hidden state is
            sized from each batch actually received.
        print_every: a progress line is printed whenever the number of
            processed samples is a multiple of this value.
    """
    model.train()
    # Use the device the model lives on rather than a notebook-level global.
    model_device = next(model.parameters()).device

    print('starting training...')
    print('총 {0}개의 훈련 데이터에 대해서 훈련 시작'.format(len(train_loader.dataset.tensors[1])))

    counter = 0

    for inputs, labels in train_loader:
        inputs = inputs.to(model_device)
        labels = labels.to(model_device).long()

        # Fresh zero state for every batch: the batches are independent, so
        # state must not leak from one batch into the next, and sizing it from
        # the batch itself keeps a smaller final batch from crashing the LSTM.
        h = model.init_hidden(inputs.size(0))

        optimizer.zero_grad()
        output, _ = model(inputs, h)

        loss = F.cross_entropy(output, labels)
        loss.backward()
        optimizer.step()

        counter += len(labels)
        if counter % print_every == 0:
            print('{0}개째 하는 중ㅎㅎ'.format(counter))
        
def evaluate(model, val_loader, batch_size):
    """Compute the average loss and accuracy of ``model`` on ``val_loader``.

    Args:
        model: module exposing ``init_hidden(batch_size)`` and returning
            ``(logits, hidden)`` from ``model(inputs, hidden)``.
        val_loader: loader yielding ``(inputs, labels)`` batches.
        batch_size: kept for backward compatibility; the hidden state is
            sized from each batch actually received.

    Returns:
        (avg_loss, avg_accuracy): per-sample cross-entropy loss and accuracy
        in percent, both as Python floats.
    """
    model.eval()
    # Use the device the model lives on rather than a notebook-level global.
    model_device = next(model.parameters()).device

    corrects, total_loss = 0, 0.0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for inp, lab in val_loader:
            inp = inp.to(model_device)
            lab = lab.to(model_device).long()

            # Fresh zero state sized from the batch itself, so batches stay
            # independent and a smaller final batch is handled.
            val_h = model.init_hidden(inp.size(0))
            out, _ = model(inp, val_h)

            total_loss += F.cross_entropy(out, lab, reduction='sum').item()
            # Count hits as a plain int so the accuracy below is ordinary
            # float arithmetic rather than tensor arithmetic.
            corrects += (out.argmax(dim=1) == lab).sum().item()

    size = len(val_loader.dataset)
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy

In [243]:
import os

# Lowest validation loss seen so far. None means that no model has been saved
# yet. It is initialised here, before any training has run, so this cell no
# longer reads `val_loss` / `best_val_loss` values that do not exist on a
# fresh kernel.
best_val_loss = None

In [244]:
epochs = 2
print_every = 100

# Start without a reference loss so this cell runs on a fresh kernel.
best_val_loss = None
for epoch in range(1, epochs + 1):
    train(model, optimizer, train_loader, batch_size, print_every)
    val_loss, val_accuracy = evaluate(model, val_loader, batch_size)

    print("[이폭: %d] 검증 오차:%5.2f | 검증 정확도:%5.2f" % (epoch, val_loss, val_accuracy))

    # Save the model whenever it reaches the lowest validation loss so far.
    # `is None` is used because a loss of exactly 0.0 is a valid value and
    # must not be mistaken for "no loss recorded yet".
    if best_val_loss is None or val_loss < best_val_loss:
        os.makedirs("snapshot", exist_ok=True)
        torch.save(model.state_dict(), './snapshot/txtclassification.pt')
        best_val_loss = val_loss

starting training...
총 10000개의 훈련 데이터에 대해서 훈련 시작
100개째 하는 중ㅎㅎ
200개째 하는 중ㅎㅎ
300개째 하는 중ㅎㅎ
400개째 하는 중ㅎㅎ
500개째 하는 중ㅎㅎ
600개째 하는 중ㅎㅎ
700개째 하는 중ㅎㅎ
800개째 하는 중ㅎㅎ
900개째 하는 중ㅎㅎ
1000개째 하는 중ㅎㅎ
1100개째 하는 중ㅎㅎ
1200개째 하는 중ㅎㅎ
1300개째 하는 중ㅎㅎ
1400개째 하는 중ㅎㅎ
1500개째 하는 중ㅎㅎ
1600개째 하는 중ㅎㅎ
1700개째 하는 중ㅎㅎ
1800개째 하는 중ㅎㅎ
1900개째 하는 중ㅎㅎ
2000개째 하는 중ㅎㅎ
2100개째 하는 중ㅎㅎ
2200개째 하는 중ㅎㅎ
2300개째 하는 중ㅎㅎ
2400개째 하는 중ㅎㅎ
2500개째 하는 중ㅎㅎ
2600개째 하는 중ㅎㅎ
2700개째 하는 중ㅎㅎ
2800개째 하는 중ㅎㅎ
2900개째 하는 중ㅎㅎ
3000개째 하는 중ㅎㅎ
3100개째 하는 중ㅎㅎ
3200개째 하는 중ㅎㅎ
3300개째 하는 중ㅎㅎ
3400개째 하는 중ㅎㅎ
3500개째 하는 중ㅎㅎ
3600개째 하는 중ㅎㅎ
3700개째 하는 중ㅎㅎ
3800개째 하는 중ㅎㅎ
3900개째 하는 중ㅎㅎ
4000개째 하는 중ㅎㅎ
4100개째 하는 중ㅎㅎ
4200개째 하는 중ㅎㅎ
4300개째 하는 중ㅎㅎ
4400개째 하는 중ㅎㅎ
4500개째 하는 중ㅎㅎ
4600개째 하는 중ㅎㅎ
4700개째 하는 중ㅎㅎ
4800개째 하는 중ㅎㅎ
4900개째 하는 중ㅎㅎ
5000개째 하는 중ㅎㅎ
5100개째 하는 중ㅎㅎ
5200개째 하는 중ㅎㅎ
5300개째 하는 중ㅎㅎ
5400개째 하는 중ㅎㅎ
5500개째 하는 중ㅎㅎ
5600개째 하는 중ㅎㅎ
5700개째 하는 중ㅎㅎ
5800개째 하는 중ㅎㅎ
5900개째 하는 중ㅎㅎ
6000개째 하는 중ㅎㅎ
6100개째 하는 중ㅎㅎ
6200개째 하는 중ㅎㅎ
6300개째 하는 중ㅎㅎ
6400개째 하는 중ㅎㅎ
6500개째 하는 중ㅎㅎ
6600개째 하는 중ㅎㅎ
6700개째 하는 중ㅎㅎ
6800개째 하는 중ㅎㅎ
6900개째 하