In [35]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext import data,datasets
# Seed torch's global RNG before anything else in the notebook runs.
torch.manual_seed(123)
print('GPU:', torch.cuda.is_available())

# Field definitions: review text is tokenised with spaCy, labels are stored as floats.
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

# IMDB train / test splits (both hold 25,000 examples).
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
for caption, split in (('len of train data:', train_data),
                       ('len of test data:', test_data)):
    print(caption, len(split))

# Peek at one tokenised review and its label.
example = train_data.examples[15]
print(example.text)
print(example.label)


GPU: True
len of train data: 25000
len of test data: 25000
['Piece', 'of', 'subtle', 'art', '.', 'Maybe', 'a', 'masterpiece', '.', 'Doubtlessly', 'a', 'special', 'story', 'about', 'the', 'ambiguity', 'of', 'existence', '.', 'Tale', 'in', 'Kafka', 'style', 'about', 'impossibility', 'of', 'victory', 'or', 'surviving', 'in', 'a', 'perpetual', 'strange', 'world', '.', 'The', 'life', 'is', ',', 'in', 'this', 'film', ',', 'only', 'exercise', 'of', 'adaptation', '.', 'Lesson', 'about', 'limits', 'and', 'original', 'sin', ',', 'about', 'the', 'frailty', 'of', 'innocence', 'and', 'error', 'of', 'his', 'ways.<br', '/><br', '/>Leopold', 'Kessle', 'is', 'another', 'Joseph', 'K.', 'Images', 'of', 'Trial', 'and', 'same', 'ambiguous', 'woman', '.', 'And', 'Europa', 'is', 'symbol', 'of', 'basic', 'crisis', 'who', 'has', 'many', 'aspects', 'like', 'chimeric', 'wars', 'or', 'unavailing', 'search', 'of', 'truth', '/', 'essence', '/', 'golden', 'age.<br', '/><br', '/>Methaphor', 'or', 'parable', ',', 'the

In [36]:
# Build both vocabularies from the training split only. The text vocabulary is
# capped with max_size=10000 and is given the pretrained 'glove.6B.100d' vectors.
TEXT.build_vocab(train_data, max_size=10000, vectors='glove.6B.100d')
LABEL.build_vocab(train_data)

batchsz = 30
# Use the GPU when one is present, otherwise fall back to the CPU instead of
# naming a CUDA device that does not exist on this machine.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size = batchsz
    # NOTE: `device=device` is intentionally left disabled. The model in this
    # notebook is never moved to `device`, so batches must stay on the
    # iterator's default device; enable it only together with model.to(device).
    #device=device
)

http://nlp.stanford.edu/data/glove.6B.zip
-----------------------------
None


In [40]:
class RNN(nn.Module):
    """Bidirectional stacked-LSTM classifier that emits one logit per sequence.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_size: width of each token embedding (LSTM input size).
        hidden_size: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers (default 2).
        dropout: dropout probability used on the embeddings, between LSTM
            layers, and on the final hidden features (default 0.5).
    """

    def __init__(self, vocab_size, embedding_size, hidden_size,
                 num_layers=2, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=True,
            # nn.LSTM applies dropout only between stacked layers and warns
            # when a non-zero value is requested for a single layer.
            dropout=dropout if num_layers > 1 else 0.0,
        )
        # Forward and backward final states are concatenated -> hidden_size * 2.
        self.fc = nn.Linear(hidden_size * 2, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map token ids of shape (seq_len, batch_size) to logits of shape (batch_size, 1)."""
        # (seq_len, batch_size) -> (seq_len, batch_size, embedding_size)
        embedded = self.dropout(self.embedding(x))
        # output: (seq_len, batch_size, hidden_size * 2)   -- unused here
        # hidden: (num_layers * 2, batch_size, hidden_size); cell has the same shape
        _, (hidden, _) = self.lstm(embedded)
        # The last two entries are the top layer's forward and backward final states.
        last_forward, last_backward = hidden[-2], hidden[-1]
        # (batch_size, hidden_size * 2)
        features = self.dropout(torch.cat([last_forward, last_backward], dim=1))
        return self.fc(features)

In [41]:
# Classifier with 100-d embeddings (the width of the 'glove.6B.100d' vectors
# attached to the vocabulary) and 256 hidden units per LSTM direction.
model = RNN(vocab_size=len(TEXT.vocab), embedding_size=100, hidden_size=256)

# Overwrite the freshly initialised embedding table with the pretrained
# vectors stored on the text vocabulary.
pretrained_embedding = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embedding)

# Binary classification on raw logits: the sigmoid is applied inside the loss.
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)
optimizer = optim.Adam(params=model.parameters(), lr=3e-4)


In [None]:
def binary_acc(preds, y):
    """Return the fraction of correct binary predictions as a 0-d tensor.

    `preds` are raw logits: they are squashed with a sigmoid and rounded to
    0/1 before being compared element-wise with the targets `y`.
    """
    predicted = torch.sigmoid(preds).round()
    hits = (predicted == y).float()
    # Number of matches divided by the size of the first dimension.
    return hits.sum() / len(hits)

# Train for a fixed number of epochs, evaluating on the test split after each.
num_epochs = 10
for i in range(num_epochs):
    print('epoch:{}'.format(i))

    # ---- training pass ----
    train_accs = []
    model.train()
    # The loop variable is named `batch` (not `data`) so that it does not
    # shadow the torchtext `data` module imported at the top of the notebook.
    for index, batch in enumerate(train_iterator):
        # batch.text is [seq_len, b]; the model returns [b, 1] => [b]
        logits = model(batch.text).squeeze(1)
        loss = criterion(logits, batch.label)
        acc = binary_acc(logits, batch.label).item()
        train_accs.append(acc)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report the accuracy of the current batch every 100 steps.
        if index % 100 == 0:
            print('[{}/{}] acc:{}'.format(index, len(train_iterator), acc))

    avg_acc = np.array(train_accs).mean()
    print('avg acc:', avg_acc)

    # ---- evaluation pass (dropout disabled, no gradients) ----
    test_accs = []
    model.eval()
    with torch.no_grad():
        for batch in test_iterator:
            # [b, 1] => [b]
            pred = model(batch.text).squeeze(1)
            test_accs.append(binary_acc(pred, batch.label).item())
    avg_acc = np.array(test_accs).mean()
    print('>>test:', avg_acc)

epoch:0
[0/834] acc:0.36666667461395264
