In [1]:
import torch
import numpy as np
import pandas as pd
import sklearn
BATCH_SIZE=10

### 1 数据提取和划分

In [2]:
train_data=pd.read_csv("train.tsv",sep='\t')
idx=np.arange(train_data.shape[0])
idx

array([     0,      1,      2, ..., 156057, 156058, 156059])

In [3]:
test_data=pd.read_csv("test.tsv",sep='\t')
test_data.shape

(66292, 3)

In [4]:
# 计算训练集、开发集和测试集的大小
np.random.shuffle(idx)
train_size=int(len(idx)*0.6)
test_size=int(len(idx)*0.8)

In [5]:
# 将原数据集划分，生成各自的文件
train_data.iloc[idx[:train_size], :].to_csv('data/cnn_train.csv',index=False)
train_data.iloc[idx[train_size:test_size], :].to_csv("data/cnn_test.csv", index=False)
train_data.iloc[idx[test_size:], :].to_csv("data/cnn_dev.csv", index=False)

In [6]:
test_data.to_csv("data/cnn_pred.csv", index=False)

In [7]:
# torchtext加载数据
from torchtext import data
TEXT = data.Field(sequential=True,batch_first=True,lower=True)
LABEL =data.Field(sequential=False,batch_first=True,unk_token=None)

In [8]:
# 读取数据
datafields = [# 不需要的filed设置为None
    ("PhraseId", None), 
    ("SentenceId", None),
    ('Phrase', TEXT),
    ('Sentiment', LABEL)
]
datafields2 = [# 不需要的filed设置为None
    ("PhraseId", None), 
    ("SentenceId", None),
    ('Phrase', TEXT),
]

In [9]:
train_data=data.TabularDataset(path='data/cnn_train.csv',format='csv',fields=datafields,skip_header=True)
dev_data=data.TabularDataset(path='data/cnn_dev.csv',format='csv',fields=datafields,skip_header=True)
test_data=data.TabularDataset(path='data/cnn_test.csv',format='csv',fields=datafields,skip_header=True)
pred=data.TabularDataset(path='data/cnn_pred.csv',format='csv',fields=datafields2,skip_header=True)

In [10]:
len(pred)

66292

In [11]:
TEXT.build_vocab(train_data,vectors='glove.6B.50d',unk_init= lambda x:torch.nn.init.uniform_(x, a=-0.25, b=0.25))
LABEL.build_vocab(train_data)
PAD_INDEX = TEXT.vocab.stoi['<pad>']
TEXT.vocab.vectors[PAD_INDEX] = 0.0

In [12]:
# 迭代器
train_iterator = data.BucketIterator(train_data,batch_size=BATCH_SIZE,train=True,shuffle=True)
dev_iterator = data.BucketIterator(dev_data,batch_size=len(dev_data),train=False,sort=False)
test_iterator = data.BucketIterator(test_data,batch_size=len(test_data),train=False,sort=False)
pred_iterator = data.BucketIterator(pred,batch_size=len(pred),train=False,sort=False)

### 2 模型

In [13]:
# 设置参数
embedding_choice='glove'
num_embeddings=len(TEXT.vocab)
embedding_dim=50
dropoutp=0.5
hidden_size=50  #隐藏单元数
num_layers=2  #层数
vocab_size=len(TEXT.vocab)
label_num=len(LABEL.vocab)
vocab_size,label_num

(16464, 5)

In [14]:
# CNN实现
from torch import nn
import torch.nn.functional as F
class LSTM(nn.Module):
    def __init__(self):
        super(LSTM,self).__init__()
        
        self.embedding_choice=embedding_choice        
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(num_embeddings, embedding_dim, 
            padding_idx=PAD_INDEX).from_pretrained(TEXT.vocab.vectors, freeze=True)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers,
            batch_first=True,dropout=dropoutp,bidirectional=True)
        self.dropout = nn.Dropout(dropoutp)    
        self.fc = nn.Linear(hidden_size * 2, label_num)
          
    def forward(self,x):
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size)
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size)
 
        x=self.embedding(x)
        out, _ = self.lstm(x, (h0, c0)) 
        out=self.dropout(out)
        out = self.fc(out[:, -1, :]) 
        return out 

In [15]:
model = LSTM()
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
criterion = nn.CrossEntropyLoss()

In [16]:
import time
epoch = 1
best_accuracy = 0.0
start_time = time.time()
for i in range(epoch):
    model.train()
    total_loss = 0.0
    accuracy = 0.0
    total_correct = 0.0
    total_data_num = len(train_iterator.dataset)
    steps = 0.0

    for batch in train_iterator:
        steps+=1
        optimizer.zero_grad()

        batch_text = batch.Phrase
        batch_label = batch.Sentiment
        out = model(batch_text)
        loss = criterion(out,batch_label)
        total_loss+=loss.item()

        loss.backward()
        optimizer.step()
        correct = (torch.max(out,dim=1)[1].view(batch_label.size())==batch_label).sum()
        total_correct+=correct.item()
        if steps%100==0:
            print("Epoch %d_%.3f%%:  Training average Loss: %f"
                      %(i, steps * train_iterator.batch_size*100/len(train_iterator.dataset),total_loss/steps))
    model.eval()
    total_loss=0.0
    accuracy=0.0
    total_correct=0.0
    total_data_num = len(dev_iterator.dataset)
    steps = 0.0    
    for batch in dev_iterator:
        steps+=1
        batch_text=batch.Phrase
        batch_label=batch.Sentiment
        out=model(batch_text)
        loss = criterion(out, batch_label)
        total_loss = total_loss + loss.item()
        
        correct = (torch.max(out, dim=1)[1].view(batch_label.size()) == batch_label).sum()
        total_correct = total_correct + correct.item()
        
        print("Epoch %d :  Verification average Loss: %f, Verification accuracy: %f%%,Total Time:%f"
          %(i, total_loss/steps, total_correct*100/total_data_num,time.time()-start_time))  
        
        if best_accuracy < total_correct/total_data_num :
            best_accuracy =total_correct/total_data_num 
            torch.save(model,'model_dict/model_lstm/epoch_%d_accuracy_%f'%(i,total_correct/total_data_num))
            print('Model is saved in model_dict/model_lstm/epoch_%d_accuracy_%f'%(i,total_correct/total_data_num))

Epoch 0_1.068%:  Training average Loss: 1.359605
Epoch 0_2.136%:  Training average Loss: 1.322123
Epoch 0_3.204%:  Training average Loss: 1.290417
Epoch 0_4.272%:  Training average Loss: 1.267840
Epoch 0_5.340%:  Training average Loss: 1.261175
Epoch 0_6.408%:  Training average Loss: 1.249108
Epoch 0_7.476%:  Training average Loss: 1.244924
Epoch 0_8.544%:  Training average Loss: 1.238481
Epoch 0_9.612%:  Training average Loss: 1.237244
Epoch 0_10.680%:  Training average Loss: 1.235349
Epoch 0_11.748%:  Training average Loss: 1.233563
Epoch 0_12.816%:  Training average Loss: 1.232846
Epoch 0_13.884%:  Training average Loss: 1.231464
Epoch 0_14.952%:  Training average Loss: 1.230923
Epoch 0_16.019%:  Training average Loss: 1.228025
Epoch 0_17.087%:  Training average Loss: 1.225674
Epoch 0_18.155%:  Training average Loss: 1.221756
Epoch 0_19.223%:  Training average Loss: 1.217847
Epoch 0_20.291%:  Training average Loss: 1.216206
Epoch 0_21.359%:  Training average Loss: 1.215820
Epoch 0_2

In [17]:
PATH='model_dict/model_lstm/epoch_0_accuracy_0.581763'
model = torch.load(PATH)
total_loss=0.0
accuracy=0.0
total_correct=0.0
total_data_num = len(train_iterator.dataset)
steps = 0.0    
start_time=time.time()
for batch in test_iterator:
    steps+=1
    batch_text=batch.Phrase
    batch_label=batch.Sentiment
    out=model(batch_text)
    loss = criterion(out, batch_label)
    total_loss = total_loss + loss.item()

    correct = (torch.max(out, dim=1)[1].view(batch_label.size()) == batch_label).sum()
    total_correct = total_correct + correct.item()
    #break   

print("Test average Loss: %f, Test accuracy: %f，Total time: %f"
  %(total_loss/steps, total_correct/total_data_num,time.time()-start_time) ) 

Test average Loss: 1.009824, Test accuracy: 0.194188，Total time: 23.237828


In [20]:
PATH='model_dict/model_lstm/epoch_0_accuracy_0.581763'
model = torch.load(PATH)
model.eval()
with torch.no_grad():
    predicts=[]
    for batch in pred_iterator:
        batch_text=batch.Phrase
        out=model(batch_text)
        predicts.extend(out.argmax(1).cpu().numpy())
    
    test_data=pd.read_csv("test.tsv",sep='\t')
    test_data["Sentiment"]=predicts
    test_data[['PhraseId','Sentiment']].set_index('PhraseId').to_csv('rnn.csv')


In [21]:
len(predicts)

66292