In [1]:
import os
import re
import jieba
import pandas as pd
import numpy as np

from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from itertools import permutations

import torch
import torch.nn as nn
import torch.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data.dataset import TensorDataset

import warnings
warnings.filterwarnings("ignore")

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
jieba.load_userdict('./stopwords/Special_words.txt')

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/09/k_9rj22d0dgbjd8832nhvlbh0000gn/T/jieba.cache
Loading model cost 0.586 seconds.
Prefix dict has been built succesfully.


In [3]:
roots = {'history' : './data/百度题库/高中_历史/origin/', 
         'geology' : './data/百度题库/高中_地理/origin/',
         'politics' : './data/百度题库/高中_政治/origin/',
         'biology' : './data/百度题库/高中_生物/origin/'}

In [4]:
def read_files(root):
    '''
    This function reads in all csv files lies directly under the root directory
    
    Returns the file directories as well as class names (file names)
    '''
    file_names = os.listdir(root)
    file_names = [name for name in file_names if name.endswith('csv')]
    classes = [name.split('.')[0] for name in file_names]
    file_names = [root + name for name in file_names]
    datasets = [pd.read_csv(name) for name in file_names]
    return datasets, classes

In [5]:
def load_stop_words(path):
    file = open(path, 'r', encoding='utf-8')
    stopwords = file.readlines()
    stopwords = [word.strip() for word in stopwords]
    return stopwords

stopwords = load_stop_words('./stopwords/stopwords2.txt')

In [6]:
remove = "[a-zA-Z0-9]|[\s+\-\|\!\/\[\]\{\}_,.$%^*(+\"\')]+|[:：+——()?【】《》“”！，。？、~@#￥%……&*（）]+|题目|排除|选项|知识点"
def clean_sentence(line):
    '''
    This function cleans the context
    '''
    line = re.sub(remove, '', line)
    tokens = jieba.cut(line, cut_all=False)
    tokens = [token for token in tokens if token not in stopwords]
    return " ".join(tokens)

In [7]:
def clean_line(line):
    part1, part2 = line.split('题型', 1) # part 1 is 题目
    part2, part3 = part2.split('解析', 1) # part 2 is abanddoned
    part3 = part3.split('解析')[1]
    try:
        part3, part4 = part3.split('知识点', 1) # part 3 is 解析, part 4 is 知识点
    except ValueError:
        part4 = ''
    result = []
    for line in [part1, part3, part4]:
        result.append(clean_sentence(line))
    return result

In [8]:
def build_dataset(root):
    
    datasets, classes = read_files(root)
    
    for dataset, label in zip(datasets, classes):
        dataset['item'] = dataset['item'].apply(lambda x : clean_line(x))
        dataset['question'] = dataset['item'].apply(lambda x : x[0]).apply(lambda x : x.split())
        dataset['solution'] = dataset['item'].apply(lambda x : x[1]).apply(lambda x : x.split())
        dataset['keypoints'] = dataset['item'].apply(lambda x : x[2]).apply(lambda x : x.split())
        dataset['item'] = dataset['item'].apply(lambda x : ' '.join(x)).apply(lambda x : x.split())
        dataset['label'] = label
    
    dataset = pd.concat(datasets, ignore_index = True)
    dataset = dataset[['item', 'question', 'solution', 'keypoints', 'label']]
        
    return dataset

## LDA

The topics in LDA are numbers but the label are strings. The correspondence might change among different runs. After different runs I realized that the results are not stable at all.

In [9]:
def train_LDA(root):
    
    dataset = build_dataset(root)
    num_topics = len(dataset['label'].unique())
    common_texts=dataset['item'].tolist()
    
    dictionary = Dictionary(common_texts)
    corpus = [dictionary.doc2bow(text) for text in common_texts]
    lda = LdaModel(corpus, id2word=dictionary, num_topics=num_topics)
    
    predictions = [lda.get_document_topics(text) for text in corpus]
    
    dataset['prediction'] = predictions
    
    labels = dataset['label'].unique()
    pred = list(range(num_topics))
    predictions = dataset['prediction'].apply(lambda x : sorted(x, key = lambda x : x[1], reverse=True)[0][0])
    
    best_match = 0
    for match in permutations(pred):
        num_to_pred = dict(zip(match, labels))
        temp_prediction = predictions.apply(lambda x : num_to_pred[x])
        correct = sum(temp_prediction.values == dataset['label'].values)
        if correct > best_match:
            best_match = correct
            final_prediction = temp_prediction.values.copy()
    
    print(classification_report(dataset['label'].values, final_prediction))
    
    return dataset[['item', 'label', 'prediction']]

In [10]:
history = train_LDA(roots['history'])
history.head()

              precision    recall  f1-score   support

         古代史       0.51      0.91      0.65      1000
         现代史       0.54      0.43      0.48      2330
         近代史       0.41      0.33      0.36      1640

    accuracy                           0.50      4970
   macro avg       0.49      0.56      0.50      4970
weighted avg       0.49      0.50      0.48      4970



Unnamed: 0,item,label,prediction
0,"[左传, 记载, 春秋, 后期, 鲁国, 大夫, 季孙氏, 家臣, 阳虎, 独掌, 权柄, ...",古代史,"[(0, 0.99040365)]"
1,"[秦始皇, 统一, 六国后, 创制, 一套, 御玺, 任命, 国家, 官员, 封印, 皇帝,...",古代史,"[(0, 0.9924977)]"
2,"[北宋, 加强, 中央集权, 主要, 措施, 主要, 将领, 兵权, 收归, 中央, 派, ...",古代史,"[(0, 0.9890275)]"
3,"[商朝人, 崇信, 鬼神, 占卜, 祭祀, 神灵, 沟通, 手段, 负责, 通神, 事务, ...",古代史,"[(0, 0.72350544), (2, 0.27250484)]"
4,"[公元, 年, 北宋, 政府, 江淮地区, 设置, 包括, 盐业, 管理, 控制, 茶叶, ...",古代史,"[(0, 0.7180145), (2, 0.27803382)]"


In [11]:
geology = train_LDA(roots['geology'])
geology.head()

              precision    recall  f1-score   support

       人口与城市       0.49      0.85      0.62      1570
     区域可持续发展       0.00      0.02      0.00       130
       地球与地图       0.11      0.29      0.15       431
      宇宙中的地球       0.88      0.44      0.59      3716
   生产活动与地域联系       0.05      0.03      0.04      1340

    accuracy                           0.44      7187
   macro avg       0.31      0.32      0.28      7187
weighted avg       0.58      0.44      0.46      7187



Unnamed: 0,item,label,prediction
0,"[太阳系, 中, 地球, 行星, 重要, 区别, 存在, 生命, 物质, 质量, 最小, 平...",宇宙中的地球,"[(0, 0.74310714), (1, 0.24203418)]"
1,"[时区, 叙述, 不, 全球, 共, 分成, 时区, 北京, 时间, 不是, 北京, 地方,...",宇宙中的地球,"[(2, 0.9884135)]"
2,"[太阳活动, 叙述, 黑子, 实际上, 不, 黑, 温度, 太阳, 表面, 地方, 高, 耀...",宇宙中的地球,"[(1, 0.9887558)]"
3,"[各图, 中, 阴影, 部分, 代表, 黑夜, 代表, 晨线, 晨昏, 线, 定义, 地球,...",宇宙中的地球,"[(4, 0.98192185)]"
4,"[年, 地球, 将会, 遭遇, 强烈, 超级, 太阳风暴, 破坏力, 远远, 超过, 卡特里...",宇宙中的地球,"[(0, 0.0813264), (1, 0.7201778), (4, 0.1939864)]"


In [12]:
politics = train_LDA(roots['politics'])
politics.head()

              precision    recall  f1-score   support

   公民道德与伦理常识       0.68      0.29      0.40      1760
        时事政治       0.07      0.52      0.13        67
    生活中的法律常识       0.21      0.35      0.26       170
      科学思维常识       0.25      0.51      0.33       260
    科学社会主义常识       0.53      0.51      0.52       573
       经济学常识       0.29      0.41      0.34       566

    accuracy                           0.37      3396
   macro avg       0.34      0.43      0.33      3396
weighted avg       0.52      0.37      0.39      3396



Unnamed: 0,item,label,prediction
0,"[年, 政府, 工作, 报告, 提出, 缩小, 收入, 分配, 差距, 使, 发展, 成果,...",公民道德与伦理常识,"[(1, 0.98935336)]"
1,"[做, 蛋糕, 分, 蛋糕, 经济社会, 面临, 最, 基本, 问题, 既要, 蛋糕, 做,...",公民道德与伦理常识,"[(5, 0.9843866)]"
2,"[最近, 常有, 手机用户, 收到, 老朋友, 名义, 发来, 短信, 短信, 极易, 引诱...",公民道德与伦理常识,"[(2, 0.9902019)]"
3,"[家庭, 人生, 第一, 课堂, 父母, 子女, 第一任, 教师, 家庭教育, 子女, 健康...",公民道德与伦理常识,"[(0, 0.9766222)]"
4,"[社会主义, 市场经济, 指, 市场, 国家, 下, 资源配置, 作用, 经济, 社会主义,...",公民道德与伦理常识,"[(4, 0.97597456)]"


In [13]:
biology = train_LDA(roots['biology'])
biology.head()

              precision    recall  f1-score   support

       分子与细胞       0.57      0.37      0.45      2980
    现代生物技术专题       0.31      0.58      0.41      1000
      生物技术实践       0.17      0.14      0.15      1770
     生物科学与社会       0.74      0.85      0.79      3900
       稳态与环境       0.70      0.56      0.62      3570
       遗传与进化       0.01      0.02      0.01      1040

    accuracy                           0.51     14260
   macro avg       0.42      0.42      0.41     14260
weighted avg       0.54      0.51      0.52     14260



Unnamed: 0,item,label,prediction
0,"[细胞, 内, 含量, 最多, 有机, 化合物, 化合物, 分别, 蛋白质, 水, 蛋白质,...",分子与细胞,"[(0, 0.21096516), (1, 0.33608967), (4, 0.44554..."
1,"[下图, 生物膜, 流动, 镶嵌, 模型, 物质, 跨膜, 运输, 示意图, 离子通道, 一...",分子与细胞,"[(0, 0.087426126), (3, 0.2820882), (4, 0.62777..."
2,"[多肽, 有个, 氨基酸, 天冬氨酸, 分别, 位于, 第位, 如图所示, 肽酶, 专门, ...",分子与细胞,"[(0, 0.19144966), (2, 0.79875475)]"
3,"[具有, 细胞, 结构, 没有, 核膜, 一组, 生物, 病毒, 乳酸菌, 细菌, 念珠, ...",分子与细胞,"[(3, 0.958468), (4, 0.035375662)]"
4,"[实验, 中, 同一, 显微镜, 观察, 同一, 装片次, 得到, 清晰, 四个, 物像, ...",分子与细胞,"[(3, 0.9949663)]"


## FastText

In [14]:
class Network(nn.Module):
    
    def __init__(self, embedding_size, word_size, class_num, pad_token):
        
        super(Network, self).__init__()
        self.embedding = nn.Embedding(word_size, embedding_size, pad_token)
        self.fc1 = nn.Linear(embedding_size, class_num)
        self.output = nn.LogSoftmax(dim=-1)
        
    def forward(self, sentences):
        
        embedded = self.embedding(sentences)
        with torch.no_grad():
            # number of effective words (remove <PAD>)
            word_count = (embedded.pow(2).sum(dim=-1)>0).sum(dim=-1).view(-1, 1).float()
        embedded = embedded.sum(dim = 1) / word_count
        logits = self.output(self.fc1(embedded))
        
        return logits

In [15]:
def sentence_proc(sentence, max_len, word2id):
    
    if len(sentence) > max_len:
        sentence = sentence[:max_len]
    else:
        sentence += ['<PAD>'] * (max_len - len(sentence))
        
    sentence = [word2id.get(word, word2id['<OOV>']) for word in sentence]
    return sentence

In [16]:
def filter_pad_words(texts, max_feature):
    
    word_list = [word for sentence in texts for word in sentence]
    counter = Counter(word_list)
    counter = [(word, count) for word, count in counter.items()]
    counter.sort(key = lambda x : x[1], reverse = True)
    
    valid_words = [word for word, _ in counter[:max_feature]]
    word2id = dict(zip(valid_words, range(1, len(valid_words) + 1) ) )
    word2id['<OOV>'] = 0
    word2id['<PAD>'] = len(word2id)
    
    lens = [len(sentence) for sentence in texts]
    max_len = int(np.mean(lens) + 2 * np.std(lens))
    
    texts = [sentence_proc(sentence, max_len, word2id) for sentence in texts]
    
    return texts, word2id

In [17]:
def train_FastText(subject, NGramRange=1, max_feature=10000, embedding_size = 300, epoch = 20):
    
    print('Reading Data')
    root = roots[subject]
    dataset = build_dataset(root)
    num_topics = len(dataset['label'].unique())
    dataset['item'] = dataset['item']
    common_texts=dataset['item'].tolist()
    
    print('Cleaning Data')
    common_texts, word2id = filter_pad_words(common_texts, max_feature)
    
    FastText = Network(embedding_size, len(word2id), num_topics, len(word2id)-1).to(device)
    optimizer = optim.Adam(FastText.parameters(), 0.001)
    
    print('Creating training/testing set')
    label2id = dict(zip(dataset['label'].unique(), range(num_topics)))
    id2label = dict(zip(label2id.values(), label2id.keys()))
    X = np.array(common_texts)
    y = np.array([label2id[label] for label in dataset['label']]).reshape(-1, 1)
    X_train, X_test, y_train, y_test = train_test_split(X, 
                                                        y, 
                                                        test_size = 0.2, 
                                                        random_state = 101)
    
    X_train = torch.tensor(X_train).long()
    y_train = torch.tensor(y_train).long()
    X_test = torch.tensor(X_test).long()
    y_test = torch.tensor(y_test).long()
    train = TensorDataset(X_train, y_train)
    test = TensorDataset(X_test, y_test)
    train_loader = DataLoader(train, 64, True)
    test_loader = DataLoader(test, 64, False)
    
    print('Training\n')
    criterion = nn.NLLLoss()
    for i in range(1, epoch + 1):
        
        log = []
        
        for X_sample, y_sample in iter(train_loader):
            
            X_sample = X_sample.to(device)
            y_sample = y_sample.view(-1).to(device)
            logits = FastText(X_sample)
            loss = criterion(logits, y_sample)
            log.append(loss.item())
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
        print('Epoch {}. Average loss {:.4f}'.format(i, np.mean(log)))
        
    print('\nTesting\n')
    predictions = []
    with torch.no_grad():
        
        for X_sample, _ in iter(test_loader):
            
            X_sample = X_sample.to(device)
            logits = FastText(X_sample)
            _, index = logits.topk(1, 1)
            index = index.view(-1).numpy().tolist()
            predictions += index
    
    y_test = y_test.reshape(-1).tolist()
    y_test = [id2label[ind] for ind in y_test]
    predictions = [id2label[ind] for ind in predictions]
    
    print('\nTest result for {} :'.format(subject))
    print(classification_report(y_test, predictions))
    
    return FastText

In [18]:
_ = train_FastText('history')

Reading Data
Cleaning Data
Creating training/testing set
Training

Epoch 1. Average loss 0.9701
Epoch 2. Average loss 0.7412
Epoch 3. Average loss 0.5872
Epoch 4. Average loss 0.4968
Epoch 5. Average loss 0.4409
Epoch 6. Average loss 0.4036
Epoch 7. Average loss 0.3766
Epoch 8. Average loss 0.3633
Epoch 9. Average loss 0.3432
Epoch 10. Average loss 0.3248
Epoch 11. Average loss 0.3143
Epoch 12. Average loss 0.3021
Epoch 13. Average loss 0.2947
Epoch 14. Average loss 0.2876
Epoch 15. Average loss 0.2805
Epoch 16. Average loss 0.2750
Epoch 17. Average loss 0.2688
Epoch 18. Average loss 0.2662
Epoch 19. Average loss 0.2608
Epoch 20. Average loss 0.2540

Testing


Test result for history :
              precision    recall  f1-score   support

         古代史       0.90      0.81      0.85       203
         现代史       0.68      0.70      0.69       464
         近代史       0.61      0.62      0.61       327

    accuracy                           0.70       994
   macro avg       0.73      0.71

In [19]:
_ = train_FastText('geology')

Reading Data
Cleaning Data
Creating training/testing set
Training

Epoch 1. Average loss 0.9883
Epoch 2. Average loss 0.4322
Epoch 3. Average loss 0.2613
Epoch 4. Average loss 0.1841
Epoch 5. Average loss 0.1413
Epoch 6. Average loss 0.1135
Epoch 7. Average loss 0.0938
Epoch 8. Average loss 0.0791
Epoch 9. Average loss 0.0678
Epoch 10. Average loss 0.0588
Epoch 11. Average loss 0.0515
Epoch 12. Average loss 0.0455
Epoch 13. Average loss 0.0407
Epoch 14. Average loss 0.0365
Epoch 15. Average loss 0.0331
Epoch 16. Average loss 0.0300
Epoch 17. Average loss 0.0277
Epoch 18. Average loss 0.0258
Epoch 19. Average loss 0.0240
Epoch 20. Average loss 0.0226

Testing


Test result for geology :
              precision    recall  f1-score   support

       人口与城市       0.94      0.94      0.94       308
     区域可持续发展       0.89      0.65      0.76        26
       地球与地图       0.91      0.81      0.86        93
      宇宙中的地球       0.97      0.99      0.98       726
   生产活动与地域联系       0.90      0.93 

In [20]:
_ = train_FastText('politics')

Reading Data
Cleaning Data
Creating training/testing set
Training

Epoch 1. Average loss 1.5284
Epoch 2. Average loss 1.0353
Epoch 3. Average loss 0.7145
Epoch 4. Average loss 0.5094
Epoch 5. Average loss 0.3839
Epoch 6. Average loss 0.2990
Epoch 7. Average loss 0.2366
Epoch 8. Average loss 0.1927
Epoch 9. Average loss 0.1587
Epoch 10. Average loss 0.1322
Epoch 11. Average loss 0.1114
Epoch 12. Average loss 0.0948
Epoch 13. Average loss 0.0805
Epoch 14. Average loss 0.0695
Epoch 15. Average loss 0.0603
Epoch 16. Average loss 0.0525
Epoch 17. Average loss 0.0459
Epoch 18. Average loss 0.0399
Epoch 19. Average loss 0.0353
Epoch 20. Average loss 0.0313

Testing


Test result for politics :
              precision    recall  f1-score   support

   公民道德与伦理常识       0.95      0.98      0.96       357
        时事政治       1.00      0.89      0.94         9
    生活中的法律常识       1.00      0.81      0.90        37
      科学思维常识       0.94      0.90      0.92        51
    科学社会主义常识       0.93      0.97

In [21]:
_ = train_FastText('biology')

Reading Data
Cleaning Data
Creating training/testing set
Training

Epoch 1. Average loss 1.0818
Epoch 2. Average loss 0.5393
Epoch 3. Average loss 0.4493
Epoch 4. Average loss 0.4130
Epoch 5. Average loss 0.3893
Epoch 6. Average loss 0.3726
Epoch 7. Average loss 0.3609
Epoch 8. Average loss 0.3479
Epoch 9. Average loss 0.3395
Epoch 10. Average loss 0.3296
Epoch 11. Average loss 0.3208
Epoch 12. Average loss 0.3142
Epoch 13. Average loss 0.3072
Epoch 14. Average loss 0.3022
Epoch 15. Average loss 0.2969
Epoch 16. Average loss 0.2920
Epoch 17. Average loss 0.2867
Epoch 18. Average loss 0.2832
Epoch 19. Average loss 0.2794
Epoch 20. Average loss 0.2761

Testing


Test result for biology :
              precision    recall  f1-score   support

       分子与细胞       0.90      0.90      0.90       584
    现代生物技术专题       0.16      0.13      0.14       207
      生物技术实践       0.45      0.49      0.47       332
     生物科学与社会       0.72      0.77      0.75       796
       稳态与环境       0.95      0.94 