In [1]:
import os
import re
import jieba
import pandas as pd
import numpy as np

from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import torch
import torch.nn as nn
import torch.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data.dataset import TensorDataset

import warnings
warnings.filterwarnings("ignore")

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
roots = {'history' : './data/百度题库/高中_历史/origin/', 
         'geology' : './data/百度题库/高中_地理/origin/',
         'politics' : './data/百度题库/高中_政治/origin/',
         'biology' : './data/百度题库/高中_生物/origin/'}

In [3]:
def load_stop_words(path):
    file = open(path, 'r', encoding='utf-8')
    stopwords = file.readlines()
    stopwords = [word.strip() for word in stopwords]
    return stopwords

stopwords = load_stop_words('./stopwords/stopwords2.txt')

In [4]:
def read_files(root):
    '''
    This function reads in all csv files lies directly under the root directory
    
    Returns the file directories as well as class names (file names)
    '''
    file_names = os.listdir(root)
    file_names = [name for name in file_names if name.endswith('csv')]
    classes = [name.split('.')[0] for name in file_names]
    file_names = [root + name for name in file_names]
    datasets = [pd.read_csv(name) for name in file_names]
    return datasets, classes

In [5]:
def clean_line(line):
    '''
    This function cleans the context
    '''
    line = re.sub(
            "[a-zA-Z0-9]|[\s+\-\|\!\/\[\]\{\}_,.$%^*(+\"\')]+|[:：+——()?【】《》“”！，。？、~@#￥%……&*（）]+|题目", '',line)
    tokens = jieba.cut(line, cut_all=False)
    tokens = [token for token in tokens if token not in stopwords]
    return " ".join(tokens)

In [6]:
def build_dataset(root):
    
    datasets, classes = read_files(root)
    
    for dataset, label in zip(datasets, classes):
        dataset['item'] = dataset['item'].apply(lambda x:clean_line(x))
        dataset['label'] = label
    
    dataset = pd.concat(datasets, ignore_index = True)
    dataset = dataset[['item', 'label']]
        
    return dataset

## LDA

The topics in LDA are numbers but the label are strings. The correspondence might change among different runs.

In [7]:
def train_LDA(root):
    
    dataset = build_dataset(root)
    num_topics = len(dataset['label'].unique())
    common_texts=dataset['item'].apply(lambda x:x.split()).tolist()
    
    dictionary = Dictionary(common_texts)
    corpus = [dictionary.doc2bow(text) for text in common_texts]
    lda = LdaModel(corpus,id2word=dictionary, num_topics=num_topics)
    
    predictions = [lda.get_document_topics(text) for text in corpus]
    
    dataset['prediction'] = predictions
    return dataset

In [8]:
history = train_LDA(roots['history'])
history.head()

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/09/k_9rj22d0dgbjd8832nhvlbh0000gn/T/jieba.cache
Loading model cost 0.540 seconds.
Prefix dict has been built succesfully.


Unnamed: 0,item,label,prediction
0,左传 记载 春秋 后期 鲁国 大夫 季孙氏 家臣 阳虎 独掌 权柄 后 标榜 鲁国 国君 整...,古代史,"[(0, 0.6487391), (1, 0.34762985)]"
1,秦始皇 统一 六国后 创制 一套 御玺 任命 国家 官员 封印 皇帝 之玺 任命 四夷 官员...,古代史,"[(0, 0.11025148), (1, 0.8868649)]"
2,北宋 加强 中央集权 主要 措施 主要 将领 兵权 收归 中央 派 文官 担任 地方 长官 ...,古代史,"[(0, 0.46865705), (1, 0.5271264)]"
3,商朝人 崇信 鬼神 占卜 祭祀 神灵 沟通 手段 负责 通神 事务 商王 巫师 往往 出身 ...,古代史,"[(0, 0.9933193)]"
4,公元 年 北宋 政府 江淮地区 设置 包括 盐业 管理 控制 茶叶 销售 专卖 主要职责 转...,古代史,"[(0, 0.21858494), (1, 0.77807844)]"


In [9]:
geology = train_LDA(roots['geology'])
geology.head()

Unnamed: 0,item,label,prediction
0,太阳系 中 地球 行星 重要 区别 存在 生命 物质 质量 最小 平均 密度 最小 日地 距...,宇宙中的地球,"[(0, 0.67531663), (3, 0.31273198)]"
1,时区 叙述 不 全球 共 分成 时区 北京 时间 不是 北京 地方 时 北京 所在 时区 区...,宇宙中的地球,"[(2, 0.51076627), (3, 0.4830745)]"
2,下列 太阳活动 叙述 黑子 实际上 不 黑 温度 太阳 表面 地方 高 耀斑 大小 太阳活动...,宇宙中的地球,"[(0, 0.036548425), (3, 0.3594231), (4, 0.59948..."
3,下列 各图 中 阴影 部分 代表 黑夜 代表 晨线 简单 使用 纠错 复制 空间 加入 选题...,宇宙中的地球,"[(3, 0.98535085)]"
4,年 地球 将会 遭遇 强烈 超级 太阳风暴 破坏力 远远 超过 卡特里娜 飓风 地球 上 几...,宇宙中的地球,"[(3, 0.18131445), (4, 0.81293)]"


In [10]:
politics = train_LDA(roots['politics'])
politics.head()

Unnamed: 0,item,label,prediction
0,年 政府 工作 报告 提出 缩小 收入 分配 差距 使 发展 成果 更 更 公平 惠及 全体...,公民道德与伦理常识,"[(0, 0.9904346)]"
1,做 蛋糕 分 蛋糕 经济社会 面临 最 基本 问题 既要 蛋糕 做 大 蛋糕 分 好 蛋糕 ...,公民道德与伦理常识,"[(2, 0.98662996)]"
2,材料 最近 常有 手机用户 收到 老朋友 名义 发来 短信 短信 极易 引诱 收信人 回复 ...,公民道德与伦理常识,"[(0, 0.2021024), (4, 0.7944467)]"
3,家庭 人生 第一 课堂 父母 子女 第一任 教师 家庭教育 子女 健康成长 终生 都 不可 ...,公民道德与伦理常识,"[(1, 0.0713294), (2, 0.9133009)]"
4,社会主义 市场经济 指 市场 国家 下 资源配置 作用 经济 社会主义 市场经济 有何 基本...,公民道德与伦理常识,"[(2, 0.98864925)]"


In [11]:
biology = train_LDA(roots['biology'])
biology.head()

Unnamed: 0,item,label,prediction
0,细胞 内 含量 最多 有机 化合物 化合物 分别 蛋白质 水 蛋白质 无机盐 核酸 水脂质 ...,分子与细胞,"[(1, 0.98983645)]"
1,下图 生物膜 流动 镶嵌 模型 物质 跨膜 运输 示意图 离子通道 一种 通道 蛋白 通道 ...,分子与细胞,"[(1, 0.43832564), (2, 0.1061929), (3, 0.218384..."
2,多肽 有个 氨基酸 天冬氨酸 分别 位于 第位 如图所示 肽酶 专门 作用 天冬氨酸 羧基端...,分子与细胞,"[(1, 0.9416803), (2, 0.055218704)]"
3,具有 细胞 结构 没有 核膜 一组 生物 病毒 乳酸菌 细菌 念珠 藻 变形虫 草履虫 蓝藻...,分子与细胞,"[(5, 0.993133)]"
4,实验 中 同一 显微镜 观察 同一 装片次 得到 清晰 四个 物像 如下 图 实验 说法 换...,分子与细胞,"[(1, 0.11662456), (4, 0.7380982), (5, 0.142535..."


## FastText

In [12]:
class Network(nn.Module):
    
    def __init__(self, embedding_size, word_size, class_num, pad_token):
        
        super(Network, self).__init__()
        self.embedding = nn.Embedding(word_size, embedding_size, pad_token)
        self.fc1 = nn.Linear(embedding_size, class_num)
        self.output = nn.LogSoftmax(dim=-1)
        
    def forward(self, sentences):
        
        embedded = self.embedding(sentences)
        with torch.no_grad():
            # number of effective words (remove <PAD>)
            word_count = (embedded.pow(2).sum(dim=-1)>0).sum(dim=-1).view(-1, 1).float()
        embedded = embedded.sum(dim = 1) / word_count
        logits = self.output(self.fc1(embedded))
        
        return logits

In [13]:
def sentence_proc(sentence, max_len, word2id):
    
    if len(sentence) > max_len:
        sentence = sentence[:max_len]
    else:
        sentence += ['<PAD>'] * (max_len - len(sentence))
        
    sentence = [word2id.get(word, word2id['<OOV>']) for word in sentence]
    return sentence

In [14]:
def filter_pad_words(texts, max_feature):
    
    word_list = [word for sentence in texts for word in sentence]
    counter = Counter(word_list)
    counter = [(word, count) for word, count in counter.items()]
    counter.sort(key = lambda x : x[1], reverse = True)
    
    valid_words = [word for word, _ in counter[:max_feature]]
    word2id = dict(zip(valid_words, range(1, len(valid_words) + 1) ) )
    word2id['<OOV>'] = 0
    word2id['<PAD>'] = len(word2id)
    
    lens = [len(sentence) for sentence in texts]
    max_len = int(np.mean(lens) + 2 * np.std(lens))
    
    texts = [sentence_proc(sentence, max_len, word2id) for sentence in texts]
    
    return texts, word2id

In [15]:
def train_FastText(subject, NGramRange=1, max_feature=10000, embedding_size = 300, epoch = 20):
    
    print('Reading Data')
    root = roots[subject]
    dataset = build_dataset(root)
    num_topics = len(dataset['label'].unique())
    dataset['item'] = dataset['item'].apply(lambda x:x.split())
    common_texts=dataset['item'].tolist()
    
    print('Cleaning Data')
    common_texts, word2id = filter_pad_words(common_texts, max_feature)
    
    FastText = Network(embedding_size, len(word2id), num_topics, len(word2id)-1).to(device)
    optimizer = optim.Adam(FastText.parameters(), 0.001)
    
    print('Creating training/testing set')
    label2id = dict(zip(dataset['label'].unique(), range(num_topics)))
    id2label = dict(zip(label2id.values(), label2id.keys()))
    X = np.array(common_texts)
    y = np.array([label2id[label] for label in dataset['label']]).reshape(-1, 1)
    X_train, X_test, y_train, y_test = train_test_split(X, 
                                                        y, 
                                                        test_size = 0.2, 
                                                        random_state = 101)
    
    X_train = torch.tensor(X_train).long()
    y_train = torch.tensor(y_train).long()
    X_test = torch.tensor(X_test).long()
    y_test = torch.tensor(y_test).long()
    train = TensorDataset(X_train, y_train)
    test = TensorDataset(X_test, y_test)
    train_loader = DataLoader(train, 64, True)
    test_loader = DataLoader(test, 64, False)
    
    print('Training\n')
    criterion = nn.NLLLoss()
    for i in range(1, epoch + 1):
        
        log = []
        
        for X_sample, y_sample in iter(train_loader):
            
            X_sample = X_sample.to(device)
            y_sample = y_sample.view(-1).to(device)
            logits = FastText(X_sample)
            loss = criterion(logits, y_sample)
            log.append(loss.item())
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
        print('Epoch {}. Average loss {:.4f}'.format(i, np.mean(log)))
        
    print('\nTesting\n')
    predictions = []
    with torch.no_grad():
        
        for X_sample, _ in iter(test_loader):
            
            X_sample = X_sample.to(device)
            logits = FastText(X_sample)
            _, index = logits.topk(1, 1)
            index = index.view(-1).numpy().tolist()
            predictions += index
    
    y_test = y_test.reshape(-1).tolist()
    y_test = [id2label[ind] for ind in y_test]
    predictions = [id2label[ind] for ind in predictions]
    
    print('\nTest result for {} :'.format(subject))
    print(classification_report(y_test, predictions))
    
    return FastText

In [17]:
_ = train_FastText('history')

Reading Data
Cleaning Data
Creating training/testing set
Training

Epoch 1. Average loss 0.9733
Epoch 2. Average loss 0.7595
Epoch 3. Average loss 0.6106
Epoch 4. Average loss 0.5116
Epoch 5. Average loss 0.4575
Epoch 6. Average loss 0.4214
Epoch 7. Average loss 0.3911
Epoch 8. Average loss 0.3651
Epoch 9. Average loss 0.3506
Epoch 10. Average loss 0.3341
Epoch 11. Average loss 0.3223
Epoch 12. Average loss 0.3088
Epoch 13. Average loss 0.3041
Epoch 14. Average loss 0.2954
Epoch 15. Average loss 0.2845
Epoch 16. Average loss 0.2770
Epoch 17. Average loss 0.2732
Epoch 18. Average loss 0.2678
Epoch 19. Average loss 0.2618
Epoch 20. Average loss 0.2548

Testing


Test result for history :
              precision    recall  f1-score   support

         古代史       0.89      0.83      0.86       203
         现代史       0.69      0.71      0.70       464
         近代史       0.62      0.63      0.62       327

   micro avg       0.71      0.71      0.71       994
   macro avg       0.73      0.72

In [18]:
_ = train_FastText('geology')

Reading Data
Cleaning Data
Creating training/testing set
Training

Epoch 1. Average loss 1.0517
Epoch 2. Average loss 0.4860
Epoch 3. Average loss 0.2941
Epoch 4. Average loss 0.2042
Epoch 5. Average loss 0.1553
Epoch 6. Average loss 0.1236
Epoch 7. Average loss 0.1019
Epoch 8. Average loss 0.0860
Epoch 9. Average loss 0.0737
Epoch 10. Average loss 0.0641
Epoch 11. Average loss 0.0562
Epoch 12. Average loss 0.0497
Epoch 13. Average loss 0.0444
Epoch 14. Average loss 0.0397
Epoch 15. Average loss 0.0358
Epoch 16. Average loss 0.0327
Epoch 17. Average loss 0.0300
Epoch 18. Average loss 0.0277
Epoch 19. Average loss 0.0256
Epoch 20. Average loss 0.0240

Testing


Test result for geology :
              precision    recall  f1-score   support

       人口与城市       0.94      0.93      0.93       308
     区域可持续发展       1.00      0.69      0.82        26
       地球与地图       0.91      0.85      0.88        93
      宇宙中的地球       0.98      0.99      0.98       726
   生产活动与地域联系       0.90      0.93 

In [19]:
_ = train_FastText('politics')

Reading Data
Cleaning Data
Creating training/testing set
Training

Epoch 1. Average loss 1.4524
Epoch 2. Average loss 0.9920
Epoch 3. Average loss 0.7033
Epoch 4. Average loss 0.5152
Epoch 5. Average loss 0.3870
Epoch 6. Average loss 0.3002
Epoch 7. Average loss 0.2401
Epoch 8. Average loss 0.1921
Epoch 9. Average loss 0.1576
Epoch 10. Average loss 0.1294
Epoch 11. Average loss 0.1080
Epoch 12. Average loss 0.0915
Epoch 13. Average loss 0.0789
Epoch 14. Average loss 0.0674
Epoch 15. Average loss 0.0583
Epoch 16. Average loss 0.0500
Epoch 17. Average loss 0.0443
Epoch 18. Average loss 0.0388
Epoch 19. Average loss 0.0346
Epoch 20. Average loss 0.0300

Testing


Test result for politics :
              precision    recall  f1-score   support

   公民道德与伦理常识       0.95      0.99      0.97       357
        时事政治       1.00      0.89      0.94         9
    生活中的法律常识       1.00      0.84      0.91        37
      科学思维常识       0.94      0.88      0.91        51
    科学社会主义常识       0.94      0.98

In [20]:
_ = train_FastText('biology')

Reading Data
Cleaning Data
Creating training/testing set
Training

Epoch 1. Average loss 1.1070
Epoch 2. Average loss 0.5495
Epoch 3. Average loss 0.4502
Epoch 4. Average loss 0.4125
Epoch 5. Average loss 0.3886
Epoch 6. Average loss 0.3725
Epoch 7. Average loss 0.3593
Epoch 8. Average loss 0.3485
Epoch 9. Average loss 0.3384
Epoch 10. Average loss 0.3291
Epoch 11. Average loss 0.3217
Epoch 12. Average loss 0.3140
Epoch 13. Average loss 0.3065
Epoch 14. Average loss 0.3014
Epoch 15. Average loss 0.2967
Epoch 16. Average loss 0.2923
Epoch 17. Average loss 0.2876
Epoch 18. Average loss 0.2836
Epoch 19. Average loss 0.2789
Epoch 20. Average loss 0.2759

Testing


Test result for biology :
              precision    recall  f1-score   support

       分子与细胞       0.90      0.90      0.90       584
    现代生物技术专题       0.16      0.14      0.15       207
      生物技术实践       0.44      0.46      0.45       332
     生物科学与社会       0.73      0.79      0.76       796
       稳态与环境       0.96      0.95 