In this notebook I will try TextCNN to classify the questions. Unlike the original paper which used two layers of text embeddings, I only used the non-static one.

In [1]:
import os
import re
import jieba
import pandas as pd
import numpy as np

from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import torch
import torch.nn as nn
import torch.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data.dataset import TensorDataset

import warnings
warnings.filterwarnings("ignore")

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
roots = {'history' : './data/百度题库/高中_历史/origin/', 
         'geology' : './data/百度题库/高中_地理/origin/',
         'politics' : './data/百度题库/高中_政治/origin/',
         'biology' : './data/百度题库/高中_生物/origin/'}

In [3]:
def load_stop_words(path):
    file = open(path, 'r', encoding='utf-8')
    stopwords = file.readlines()
    stopwords = [word.strip() for word in stopwords]
    return stopwords

stopwords = load_stop_words('./stopwords/stopwords2.txt')

In [4]:
def read_files(root):
    '''
    This function reads in all csv files lies directly under the root directory
    
    Returns the file directories as well as class names (file names)
    '''
    file_names = os.listdir(root)
    file_names = [name for name in file_names if name.endswith('csv')]
    classes = [name.split('.')[0] for name in file_names]
    file_names = [root + name for name in file_names]
    datasets = [pd.read_csv(name) for name in file_names]
    return datasets, classes

In [5]:
def clean_line(line):
    '''
    This function cleans the context
    '''
    line = re.sub(
            "[a-zA-Z0-9]|[\s+\-\|\!\/\[\]\{\}_,.$%^*(+\"\')]+|[:：+——()?【】《》“”！，。？、~@#￥%……&*（）]+|题目", '',line)
    tokens = jieba.cut(line, cut_all=False)
    tokens = [token for token in tokens if token not in stopwords]
    return " ".join(tokens)

In [6]:
def build_dataset(root):
    
    datasets, classes = read_files(root)
    
    for dataset, label in zip(datasets, classes):
        dataset['item'] = dataset['item'].apply(lambda x:clean_line(x))
        dataset['label'] = label
    
    dataset = pd.concat(datasets, ignore_index = True)
    dataset = dataset[['item', 'label']]
        
    return dataset

## TextCNN

Here the parameters that can be tuned about network structure are : 
    * the number of CNN filters and the kernel size of each of them
    * dropout rate
    * Embedding size

In [7]:
class Network(nn.Module):
    
    def __init__(self, window_size_list, word_size, num_classes, pad_token, dropout_rate = 0.1, embedding_size = 300):
        
        super(Network, self).__init__()

        self.embedding = nn.Embedding(word_size, embedding_size, pad_token)
        self.CNN_list = []
        for window_size in window_size_list:
            self.CNN_list.append(nn.Conv2d(1, 1, (window_size, embedding_size)))
        self.fc = nn.Linear(len(window_size_list), num_classes)
        self.output = nn.LogSoftmax(dim = -1)
        self.dropout = nn.Dropout(p = dropout_rate)
        
    def forward(self, sentences):
        
        embedded = self.embedding(sentences)
        embedded = embedded.unsqueeze(1) # add in_channel into shape
        
        feature_list = []
        for cnn_layer in self.CNN_list:
            features = cnn_layer(embedded) # the last dimension should be 1
            features = features.squeeze() # remove the channel dimension and the last dimension
            features = torch.tanh(features) # activation layer
            features, _ = features.max(dim = -1) # MaxPooling
            feature_list.append(features)
            
        features = torch.stack(feature_list, dim = -1) # now shape is bs, max_window_size
        features = self.dropout(features) # dropout for normalization
        
        logits = self.fc(features)
        logits = self.output(logits)
        
        return logits

In [8]:
def sentence_proc(sentence, max_len, word2id):
    
    if len(sentence) > max_len:
        sentence = sentence[:max_len]
    else:
        sentence += ['<PAD>'] * (max_len - len(sentence))
        
    sentence = [word2id.get(word, word2id['<OOV>']) for word in sentence]
    return sentence

In [9]:
def filter_pad_words(texts, max_feature):
    
    word_list = [word for sentence in texts for word in sentence]
    counter = Counter(word_list)
    counter = [(word, count) for word, count in counter.items()]
    counter.sort(key = lambda x : x[1], reverse = True)
    
    valid_words = [word for word, _ in counter[:max_feature]]
    word2id = dict(zip(valid_words, range(1, len(valid_words) + 1) ) )
    word2id['<OOV>'] = 0
    word2id['<PAD>'] = len(word2id)
    
    lens = [len(sentence) for sentence in texts]
    max_len = int(np.mean(lens) + 2 * np.std(lens))
    
    texts = [sentence_proc(sentence, max_len, word2id) for sentence in texts]
    
    return texts, word2id

In [10]:
def train_TextCNN(subject, window_size_list, dropout_rate, NGramRange=1, max_feature=10000, embedding_size = 300, epoch = 20):
    
    print('Reading Data')
    root = roots[subject]
    dataset = build_dataset(root)
    num_topics = len(dataset['label'].unique())
    dataset['item'] = dataset['item'].apply(lambda x:x.split())
    common_texts=dataset['item'].tolist()
    
    print('Cleaning Data')
    common_texts, word2id = filter_pad_words(common_texts, max_feature)
    
    TextCNN = Network(window_size_list, len(word2id), num_topics, len(word2id)-1, dropout_rate = dropout_rate).to(device)
    TextCNN.train()
    optimizer = optim.Adam(TextCNN.parameters(), 0.01)
    
    print('Creating training/testing set')
    label2id = dict(zip(dataset['label'].unique(), range(num_topics)))
    id2label = dict(zip(label2id.values(), label2id.keys()))
    X = np.array(common_texts)
    y = np.array([label2id[label] for label in dataset['label']]).reshape(-1, 1)
    X_train, X_test, y_train, y_test = train_test_split(X, 
                                                        y, 
                                                        test_size = 0.2, 
                                                        random_state = 101)
    
    X_train = torch.tensor(X_train).long()
    y_train = torch.tensor(y_train).long()
    X_test = torch.tensor(X_test).long()
    y_test = torch.tensor(y_test).long()
    train = TensorDataset(X_train, y_train)
    test = TensorDataset(X_test, y_test)
    train_loader = DataLoader(train, 64, True)
    test_loader = DataLoader(test, 64, False)
    
    print('Training\n')
    criterion = nn.NLLLoss()
    for i in range(1, epoch + 1):
        
        log = []
        
        for X_sample, y_sample in iter(train_loader):
            
            X_sample = X_sample.to(device)
            y_sample = y_sample.view(-1).to(device)
            logits = TextCNN(X_sample)
            loss = criterion(logits, y_sample)
            log.append(loss.item())
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
        print('Epoch {}. Average loss {:.4f}'.format(i, np.mean(log)))
        
        if i == 10:
            for param_group in optimizer.param_groups:
                param_group['lr'] = 0.005
                
        if i == 20:
            for param_group in optimizer.param_groups:
                param_group['lr'] = 0.001
        
    print('\nTesting\n')
    predictions = []
    TextCNN.eval()
    with torch.no_grad():
        
        for X_sample, _ in iter(test_loader):
            
            X_sample = X_sample.to(device)
            logits = TextCNN(X_sample)
            _, index = logits.topk(1, 1)
            index = index.view(-1).numpy().tolist()
            predictions += index
    
    y_test = y_test.reshape(-1).tolist()
    y_test = [id2label[ind] for ind in y_test]
    predictions = [id2label[ind] for ind in predictions]
    
    print('\nTest result for {} :'.format(subject))
    print(classification_report(y_test, predictions))
    
    return TextCNN

## Examples

Due to the training speed of CNN, only a small number of filters are used and only one example is provided.

Besides, the learning schedule is changed. Using 0.001 is too small in the begin phase and 20 epochs will not be enough. The learning schedule is now 0.01, 0.005, 0.001 for 10 epochs each.

To get better results, training with GPU with more filters will be necessary

In [11]:
_ = train_TextCNN('history', [2, 2, 3, 3, 4, 4, 5, 6, 7], 0.1, epoch = 30)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/09/k_9rj22d0dgbjd8832nhvlbh0000gn/T/jieba.cache


Reading Data


Loading model cost 0.533 seconds.
Prefix dict has been built succesfully.


Cleaning Data
Creating training/testing set
Training

Epoch 1. Average loss 1.0478
Epoch 2. Average loss 0.8822
Epoch 3. Average loss 0.6329
Epoch 4. Average loss 0.4705
Epoch 5. Average loss 0.4015
Epoch 6. Average loss 0.3751
Epoch 7. Average loss 0.3480
Epoch 8. Average loss 0.3543
Epoch 9. Average loss 0.3444
Epoch 10. Average loss 0.3359
Epoch 11. Average loss 0.3326
Epoch 12. Average loss 0.3128
Epoch 13. Average loss 0.2991
Epoch 14. Average loss 0.3150
Epoch 15. Average loss 0.3065
Epoch 16. Average loss 0.3169
Epoch 17. Average loss 0.3076
Epoch 18. Average loss 0.2910
Epoch 19. Average loss 0.3018
Epoch 20. Average loss 0.3050
Epoch 21. Average loss 0.2885
Epoch 22. Average loss 0.2919
Epoch 23. Average loss 0.2929
Epoch 24. Average loss 0.2872
Epoch 25. Average loss 0.2998
Epoch 26. Average loss 0.2836
Epoch 27. Average loss 0.2902
Epoch 28. Average loss 0.2794
Epoch 29. Average loss 0.2849
Epoch 30. Average loss 0.2836

Testing


Test result for history :
              prec