# 文本分类——pytorch版本

In [127]:
import os
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from torch.nn import init
from torchtext.legacy import data

## 读取文件数据函数

In [128]:
def read_data(file_path, sent_col_name, label_col_name):
    data = pd.read_csv(file_path, sep = "\t")
    X = data[sent_col_name].values
    y = data[label_col_name].values
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
    train_df, val_df = pd.DataFrame(), pd.DataFrame()
    train_df["Phrase"], train_df["Sentiment"] = X_train, y_train
    val_df["Phrase"],val_df["Sentiment"] = X_val,y_val

    train_df_path = "data/train.csv"
    val_df_path = "data/val.csv"
    train_df.to_csv(train_df_path, index = False)
    val_df.to_csv(val_df_path, index = False)

    return train_df_path, val_df_path

In [129]:
def data_loader(batch_size):
    X_col_name = "Phrase"
    y_col_name = "Sentiment"
    train_path = "data/train.tsv"
    train_df_path, val_df_path = read_data(train_path,X_col_name,y_col_name)
    
    spacy_en = spacy.load('en_core_web_sm')

    def tokenizer(text): # create a tokenizer function
        """
        定义分词操作
        """
        return [tok.text for tok in spacy_en.tokenizer(text)]

    TEXT = data.Field(sequential = True, tokenize = tokenizer, lower = True)
    LABEL = data.Field(sequential = False, use_vocab = False)
    # TabularDataset: 用来从文件中读取数据，生成Dataset， Dataset是Example实例的集合
    train, val = data.TabularDataset.splits(
        path='', train=train_df_path, validation=val_df_path, format='csv', skip_header=True,
        fields=[("Phrase", TEXT), ("Sentiment", LABEL)])

    # 使用训练集构建单词表
    TEXT.build_vocab(train, vectors='glove.6B.50d')
    TEXT.vocab.vectors.unk_init = init.xavier_uniform
    print(type(TEXT))
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # 生成数据迭代器
    train_iter = data.BucketIterator(train, batch_size = batch_size, 
                        sort_key = lambda x: len(x.review), device = DEVICE)
    val_iter = data.BucketIterator(val, batch_size = batch_size, 
                        sort_key = lambda x: len(x.review), shuffle = True, device = DEVICE)
    
    print(type(train_iter))
    return train_iter, val_iter, TEXT.vocab.vectors

In [130]:
import torch.nn as nn
import torch
import torch.nn.functional as F

In [131]:
class TextCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, class_num, filter_num, filter_size, dropout_p):
        super(TextCNN, self).__init__()
        self.embed = nn.Embedding(num_embeddings = vocab_size, embedding_dim = embedding_dim)
        # 卷积层
        self.conv1 = nn.Conv2d(in_channels = 1, out_channels = filter_num,
                            kernel_size = (filter_size[0], embedding_dim))
        self.conv2 = nn.Conv2d(in_channels = 1, out_channels = filter_num,
                            kernel_size = (filter_size[1], embedding_dim))
        self.conv3 = nn.Conv2d(in_channels = 1, out_channels = filter_num,
                            kernel_size = (filter_size[2], embedding_dim))
        # dropout
        self.dropout = nn.Dropout(dropout_p)
        # 全连接层
        self.fc = nn.Linear(3 * filter_num, class_num)

    def forward(self, x):
        # x的维度为(Batch_size, Length) 
        # (Batch_size, Length, Dimention) 
        # 增加维度后(Batch_size, 1, Length, Dimention) 
        embed_out = self.embed(x).unsqueeze(1)

        #(Batch_size, filter_num, length+padding, 1) 
        # 降低维度后(Batch_size, filter_num, length+padding) 
        conv1_out = F.relu(self.conv1(embed_out)).squeeze(3)

        #(Batch_size, filters_num, 1)
        # 降低维度后(Batch_size, filters_num) 
        pool1_out = F.max_pool1d(conv1_out, conv1_out.size(2)).squeeze(2)

        conv2_out = F.relu(self.conv2(embed_out)).squeeze(3)
        pool2_out = F.max_pool1d(conv2_out, conv2_out.size(2)).squeeze(2)

        conv3_out = F.relu(self.conv3(embed_out)).squeeze(3)
        pool3_out = F.max_pool1d(conv2_out, conv3_out.size(2)).squeeze(2)

        # (Batch_size, filters_num *3 )
        out_cat = torch.cat((pool1_out,pool2_out,pool3_out), dim = 1)
        out_cat = self.dropout(out_cat)
        out = self.fc(out_cat)

        return out

In [132]:
class TextRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, class_num, 
                    rnn_type, hidden_size,dropout_p,num_layers):
        super(TextRNN, self).__init__()
        self.embed = nn.Embedding(num_embeddings = vocab_size, embedding_dim = embedding_dim)
        self.rnn_type = rnn_type
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        if rnn_type == "RNN":
            self.rnn = nn.RNN(input_size = embedding_dim, num_layers = num_layers,
                            hidden_size = hidden_size, batch_first=True,dropout = dropout_p)
            #self.dropout = nn.Dropout(dropout_p)    
            self.fc = nn.Linear(hidden_size, class_num)
        elif rnn_type == "LSTM":
             # 双向传播
            self.lstm = nn.LSTM(input_size = embedding_dim, hidden_size = hidden_size, num_layers = num_layers,
                            batch_first=True, bidirectional=True,dropout = dropout_p)
            #self.dropout = nn.Dropout(dropout_p)    
            self.fc = nn.Linear(hidden_size * 2, class_num)


    def forward(self, x):
        # input_sents (batch_size, seq_len)
        batch_size, seq_len = x.shape
        # (batch_size, seq_len, embedding_dim)
        embed_out = self.embed(x)

        if self.rnn_type == "RNN":
            h0 = torch.randn(self.num_layers, batch_size, self.hidden_size)
            output, hn = self.rnn(embed_out, h0)
        elif self.rnn_type == "LSTM":
            h0, c0 = torch.randn(self.num_layers * 2, batch_size, self.hidden_size), torch.randn(self.num_layers * 2, batch_size, self.hidden_size)
            output, (hn, _) = self.lstm(embed_out, (h0, c0))

        #print(output.shape,output[:, -1, :].shape)
        
        out = self.fc(output[:, -1, :]) 
        #print(out.shape)
        return out
        

## 训练模型

In [133]:
from torch import optim
import numpy as np

## 设置超参数

In [134]:
#model_names = ["RNN","LSTM", "CNN"]
model_names = ["CNN"]
learning_rate = 0.001
batch_size = 128
epoch_num = 10
class_num = 5
embedding_dim = 50
filter_num = 100
hidden_size = 50
dropout_p = 0.2
num_layers=2  #层数

## 加载数据

In [135]:
train_iter, val_iter, word_vectors = data_loader(batch_size = batch_size)
# 三种模型轮流训练
for model_name in model_names:
    if model_name == "RNN":
        model = TextRNN(vocab_size = len(word_vectors), embedding_dim = embedding_dim, 
                    rnn_type = "RNN",hidden_size = hidden_size, class_num = class_num,dropout_p = dropout_p,num_layers = num_layers)
    elif model_name == "CNN":
        model = TextCNN(vocab_size=len(word_vectors), embedding_dim = embedding_dim, 
                    class_num = class_num, filter_num = filter_num, filter_size = [3, 4, 5],dropout_p = dropout_p)
    elif model_name == "LSTM":
        model = TextRNN(vocab_size = len(word_vectors), embedding_dim = embedding_dim, 
                    rnn_type = "LSTM",hidden_size = hidden_size, class_num = class_num,dropout_p = dropout_p, num_layers = num_layers)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)# 创建优化器SGD
    criterion = nn.CrossEntropyLoss()   # 损失函数

    
    for epoch in range(epoch_num):
        model.train()
        train_loss = []
        train_accs = []
        for i, batch in enumerate(train_iter):
            x, y = batch.Phrase.t(), batch.Sentiment
            optimizer.zero_grad() # 梯度缓存清零
            predict = model(x)
            loss = criterion(predict, y)
            train_loss.append(loss.item())
            loss.backward()         # 反向传播
            optimizer.step()
            acc = torch.mean((torch.tensor(torch.max(predict,1)[1] == y, dtype=torch.float)))
            train_accs.append(acc)
            #total_correct = total_correct + correct.item()
        train_acc = np.array(train_accs).mean() * 100
        train_loss = np.array(train_loss).mean()

        model.eval()
        val_accs = []
        for i, batch in enumerate(val_iter):
            x, y = batch.Phrase.t(), batch.Sentiment
            predict = model(x)
            acc = torch.mean((torch.tensor(torch.max(predict,1)[1] == y, dtype=torch.float)))
            val_accs.append(acc)
        val_acc = np.array(val_accs).mean() * 100

        print("Epoch %d: Training average accuracy: %.3f%%, Training average Loss: %f,Validation average accuracy: %.3f%%"
                %(epoch, train_acc ,train_loss,val_acc))  



Epoch 0: Training average accuracy: 53.840%, Training average Loss: 1.155302,Validation average accuracy: 57.765%
Epoch 1: Training average accuracy: 60.090%, Training average Loss: 0.994987,Validation average accuracy: 61.290%
Epoch 2: Training average accuracy: 64.119%, Training average Loss: 0.893090,Validation average accuracy: 63.049%
Epoch 3: Training average accuracy: 66.458%, Training average Loss: 0.827922,Validation average accuracy: 64.675%
Epoch 4: Training average accuracy: 68.327%, Training average Loss: 0.781886,Validation average accuracy: 65.238%
Epoch 5: Training average accuracy: 69.616%, Training average Loss: 0.747658,Validation average accuracy: 65.126%
Epoch 6: Training average accuracy: 70.775%, Training average Loss: 0.720210,Validation average accuracy: 66.119%
Epoch 7: Training average accuracy: 71.493%, Training average Loss: 0.698419,Validation average accuracy: 66.142%
Epoch 8: Training average accuracy: 72.484%, Training average Loss: 0.678371,Validation 