In [93]:
import pandas as pd
import numpy as np
import time
import torch 
import torch.nn as nn
import torchtext
from torchtext import data

In [83]:
BATCH_SIZE = 32
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dir_all_data = '/Users/apple/Desktop/data/sentiment-analysis-on-movie-reviews/train.tsv'
data_all = pd.read_csv(dir_all_data, sep='\t')
print(data_all.keys())

Index(['PhraseId', 'SentenceId', 'Phrase', 'Sentiment'], dtype='object')


In [84]:
seed = 0
idx = np.arange(len(data_all))
np.random.seed(seed)
np.random.shuffle(idx)

train_size = len(idx * 0.6)
test_size = len(idx * 0.8)

data_all.iloc[idx[:train_size], :].to_csv('/Users/apple/Desktop/data/RNN_train.csv', index = False)
data_all.iloc[idx[train_size:test_size], :].to_csv('/Users/apple/Desktop/data/RNN_test.csv', index = False)
data_all.iloc[idx[test_size:], :].to_csv('/Users/apple/Desktop/data/RNN_val.csv', index = False)

In [85]:
PAD_TOKEN = '<pad>'

TEXT = data.Field(sequential = True, lower = True, batch_first = True, pad_token = PAD_TOKEN)
LABEL = data.Field(sequential = False, batch_first = False)

In [86]:
datafields = [
    ("PhraseId", None),
    ("SentenceId", None),
    ("Phrase", TEXT),
    ("Sentiment", LABEL)
]

train_data = data.TabularDataset(path = '/Users/apple/Desktop/data/RNN_train.csv', format = 'csv', fields = datafields)
test_data = data.TabularDataset(path = '/Users/apple/Desktop/data/RNN_test.csv', format = 'csv', fields = datafields)
val_data = data.TabularDataset(path = '/Users/apple/Desktop/data/RNN_val.csv', format = 'csv', fields = datafields)


In [87]:
TEXT.build_vocab(train_data, vectors = 'glove.6B.50d')
LABEL.build_vocab(train_data)
PAD_INDEX = TEXT.vocab.stoi[PAD_TOKEN]
TEXT.vocab.vectors[PAD_INDEX] = 0.0

In [88]:
train_iterator = data.BucketIterator(train_data, batch_size = BATCH_SIZE, train = True, shuffle = True, device = DEVICE)
test_iterator = data.Iterator(test_data, batch_size = BATCH_SIZE, train = False, sort = False, device = DEVICE)
val_iterator = data.Iterator(val_data, batch_size = BATCH_SIZE, train = False, sort = False, device = DEVICE)

In [89]:
embedding_choice = 'glove'
embedding_dim = 50
num_embeddings = len(TEXT.vocab)
word_size = len(TEXT.vocab)
label_size = len(LABEL.vocab)
hidden_layer = 50
num_layers = 2
dropout = 0.5


In [90]:
class LSTM(nn.Module):
    def __init__(self):
        super (LSTM, self).__init__()
        self.embedding_choice = embedding_choice
        self.embedding_dim = embedding_dim
        self.num_embeddings = num_embeddings
        self.hidden_layer = hidden_layer
        self.num_layers = num_layers
        self.embedding = nn.Embedding(num_embeddings, embedding_dim, padding_idx = PAD_INDEX).from_pretrained(TEXT.vocab.vectors, freeze = True)
        self.lstm = nn.LSTM(embedding_dim, hidden_layer, num_layers, batch_first = True, dropout=dropout, bidirectional = True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(2 * hidden_layer, label_size)
    
    def forward(self, x):
        c0 = torch.zeros(num_layers * 2, x.size(0), self.hidden_layer).to(DEVICE)
        h0 = torch.zeros(num_layers * 2, x.size(0), self.hidden_layer).to(DEVICE)
        x = self.embedding(x)
        out, _ = self.lstm(x, (h0, c0))
        out = self.dropout(out)
        out = torch.cat((out[:, 0, self.hidden_layer: ], out[:, -1, :self.hidden_layer]), dim = 1)
        out = self.fc(out)
        return out

In [95]:
model = LSTM()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.005)
criterion = nn.CrossEntropyLoss()
model.to(DEVICE)

LSTM(
  (embedding): Embedding(16533, 50)
  (lstm): LSTM(50, 50, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=100, out_features=7, bias=True)
)

In [None]:
#开始训练
epoch = 1
best_accuracy = 0.0
start_time = time.time()

for i in range(epoch):
    model.train()
    total_loss = 0.0
    accuracy = 0.0
    total_correct = 0.0
    total_data_num = len(train_iterator.dataset)
    steps = 0.0
    for batch in train_iterator:
        steps += 1
        optimizer.zero_grad() #  梯度缓存清零
        batch_text = batch.Phrase
        batch_label = batch.Sentiment
        out = model(batch_text)    #[batch_size, label_num]
        loss = criterion(out, batch_label)
        total_loss += loss.item() 
        loss.backward()
        optimizer.step()        
        correct = (torch.max(out, dim=1)[1] == batch_label).sum()
        total_correct += correct.item()
        if steps % 100 == 0:
            print("Epoch %d_%.3f%%: Training average Loss: %f" 
                  % (i, steps * train_iterator.batch_size * 100 / len(train_iterator.dataset), total_loss / steps))  
    #每个epoch都验证一下
    model.eval()
    total_loss = 0.0
    accuracy = 0.0
    total_correct = 0.0
    total_data_num = len(val_iterator.dataset)
    steps = 0.0    
    for batch in val_iterator:
        steps += 1
        batch_text = batch.Phrase
        batch_label = batch.Sentiment
        out = model(batch_text)
        loss = criterion(out, batch_label)
        total_loss += loss.item()
        correct = (torch.max(out, dim=1)[1] == batch_label).sum()
        total_correct += correct.item()
        print("Epoch %d :  Verification average Loss: %f, Verification accuracy: %f%%, Total Time:%f"
          %(i, total_loss / steps, total_correct * 100 / total_data_num, time.time() - start_time))  
    if best_accuracy < total_accuracy / total_data_num:
        best_accuracy = total_accuracy / total_data_num
        torch.save(model, '/Users/apple/desktop/NLP_base/RNN_TEXT_CLASSIFICATION')
        print("Model is saved %f with accuracy %f" % (i, total_accuracy / total_data_num))
        

Epoch 0_2.050%: Training average Loss: 0.963284
Epoch 0_4.101%: Training average Loss: 0.956799
Epoch 0_6.151%: Training average Loss: 0.952459
Epoch 0_8.202%: Training average Loss: 0.955679
Epoch 0_10.252%: Training average Loss: 0.955541
Epoch 0_12.303%: Training average Loss: 0.954844
Epoch 0_14.353%: Training average Loss: 0.955034
Epoch 0_16.404%: Training average Loss: 0.955254
Epoch 0_18.454%: Training average Loss: 0.955624
Epoch 0_20.505%: Training average Loss: 0.954579
Epoch 0_22.555%: Training average Loss: 0.955337
Epoch 0_24.606%: Training average Loss: 0.954604
Epoch 0_26.656%: Training average Loss: 0.953264
Epoch 0_28.707%: Training average Loss: 0.954883
Epoch 0_30.757%: Training average Loss: 0.956191
Epoch 0_32.808%: Training average Loss: 0.954977
Epoch 0_34.858%: Training average Loss: 0.955198
Epoch 0_36.909%: Training average Loss: 0.954853
Epoch 0_38.959%: Training average Loss: 0.955667
Epoch 0_41.010%: Training average Loss: 0.955096
Epoch 0_43.060%: Trainin