In this notebook we try to classify the questions within each subject. So there would be four subjects in total, different subjects may contain different number of classes. 

In [1]:
import os
import re
import jieba
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings("ignore")

In [2]:
jieba.load_userdict('./stopwords/Special_words.txt')

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/09/k_9rj22d0dgbjd8832nhvlbh0000gn/T/jieba.cache
Loading model cost 0.562 seconds.
Prefix dict has been built succesfully.


In [3]:
roots = {'history' : './data/百度题库/高中_历史/origin/', 
         'geology' : './data/百度题库/高中_地理/origin/',
         'politics' : './data/百度题库/高中_政治/origin/',
         'biology' : './data/百度题库/高中_生物/origin/'}

In [4]:
def read_files(root):
    '''
    This function reads in all csv files lies directly under the root directory
    
    Returns the file directories as well as class names (file names)
    '''
    file_names = os.listdir(root)
    file_names = [name for name in file_names if name.endswith('csv')]
    classes = [name.split('.')[0] for name in file_names]
    file_names = [root + name for name in file_names]
    datasets = [pd.read_csv(name) for name in file_names]
    return datasets, classes

In [5]:
def load_stop_words(path):
    file = open(path, 'r', encoding='utf-8')
    stopwords = file.readlines()
    stopwords = [word.strip() for word in stopwords]
    return stopwords

stopwords = load_stop_words('./stopwords/stopwords2.txt')

In [6]:
remove = "[a-zA-Z0-9]|[\s+\-\|\!\/\[\]\{\}_,.$%^*(+\"\')]+|[:：+——()?【】《》“”！，。？、~@#￥%……&*（）]+|题目|排除|选项|知识点"
def clean_sentence(line):
    '''
    This function cleans the context
    '''
    line = re.sub(remove, '', line)
    tokens = jieba.cut(line, cut_all=False)
    tokens = [token for token in tokens if token not in stopwords]
    return " ".join(tokens)

In [7]:
def clean_line(line):
    part1, part2 = line.split('题型', 1) # part 1 is 题目
    part2, part3 = part2.split('解析', 1) # part 2 is abanddoned
    part3 = part3.split('解析')[1]
    try:
        part3, part4 = part3.split('知识点', 1) # part 3 is 解析, part 4 is 知识点
    except ValueError:
        part4 = ''
    result = []
    for line in [part1, part3, part4]:
        result.append(clean_sentence(line))
    return result

In [8]:
def build_dataset(root):
    
    datasets, classes = read_files(root)
    
    for dataset, label in zip(datasets, classes):
        dataset['item'] = dataset['item'].apply(lambda x : clean_line(x))
        dataset['question'] = dataset['item'].apply(lambda x : x[0]).apply(lambda x : x.split())
        dataset['solution'] = dataset['item'].apply(lambda x : x[1]).apply(lambda x : x.split())
        dataset['keypoints'] = dataset['item'].apply(lambda x : x[2]).apply(lambda x : x.split())
        dataset['item'] = dataset['item'].apply(lambda x : ' '.join(x)).apply(lambda x : x.split())
        dataset['label'] = label
    
    dataset = pd.concat(datasets, ignore_index = True)
    dataset = dataset[['item', 'question', 'solution', 'keypoints', 'label']]
        
    return dataset

In [9]:
def train_test_report(name, clf, X_train, y_train, X_test, y_test):
    print('Training {}'.format(name))
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print('{} classification report: \n'.format(name))
    print(classification_report(y_test, predictions))

In [10]:
def train_models(root, vectorizer):
    
    dataset = build_dataset(root)
    
    corpus = dataset['item'].apply(lambda x : ' '.join(x))
    
    X = vectorizer.fit_transform(corpus)
    
    X_train, X_test, y_train, y_test = train_test_split(X.toarray(), 
                                                        dataset['label'].values, 
                                                        test_size = 0.2, 
                                                        random_state = 101)
    
    train_test_report('Guassian Naive Bayes', GaussianNB(), X_train, y_train, X_test, y_test)
    
    train_test_report('Multinomial Naive Bayes', MultinomialNB(), X_train, y_train, X_test, y_test)
    
    train_test_report('Complement Naive Bayes', ComplementNB(), X_train, y_train, X_test, y_test)
    
    train_test_report('Bernoulli Naive Bayes', BernoulliNB(), X_train, y_train, X_test, y_test)

In [11]:
vectorizer = TfidfVectorizer(max_features = 1000, min_df = 3)
train_models(roots['history'], vectorizer)

Training Guassian Naive Bayes
Guassian Naive Bayes classification report: 

              precision    recall  f1-score   support

         古代史       0.65      0.95      0.77       203
         现代史       0.86      0.40      0.55       464
         近代史       0.60      0.89      0.72       327

    accuracy                           0.67       994
   macro avg       0.71      0.75      0.68       994
weighted avg       0.73      0.67      0.65       994

Training Multinomial Naive Bayes
Multinomial Naive Bayes classification report: 

              precision    recall  f1-score   support

         古代史       0.90      0.83      0.87       203
         现代史       0.80      0.77      0.79       464
         近代史       0.73      0.80      0.77       327

    accuracy                           0.79       994
   macro avg       0.81      0.80      0.81       994
weighted avg       0.80      0.79      0.80       994

Training Complement Naive Bayes
Complement Naive Bayes classification report: 



In [12]:
vectorizer = TfidfVectorizer(max_features = 1000, min_df = 3)
train_models(roots['geology'], vectorizer)

Training Guassian Naive Bayes
Guassian Naive Bayes classification report: 

              precision    recall  f1-score   support

       人口与城市       0.85      0.86      0.86       308
     区域可持续发展       0.37      0.42      0.39        26
       地球与地图       0.46      0.80      0.58        93
      宇宙中的地球       0.97      0.89      0.93       726
   生产活动与地域联系       0.87      0.82      0.84       285

    accuracy                           0.86      1438
   macro avg       0.70      0.76      0.72      1438
weighted avg       0.88      0.86      0.86      1438

Training Multinomial Naive Bayes
Multinomial Naive Bayes classification report: 

              precision    recall  f1-score   support

       人口与城市       0.89      0.94      0.91       308
     区域可持续发展       1.00      0.31      0.47        26
       地球与地图       0.96      0.78      0.86        93
      宇宙中的地球       0.97      0.99      0.98       726
   生产活动与地域联系       0.90      0.93      0.91       285

    accuracy               

In [13]:
vectorizer = TfidfVectorizer(max_features = 1000, min_df = 3)
train_models(roots['politics'], vectorizer)

Training Guassian Naive Bayes
Guassian Naive Bayes classification report: 

              precision    recall  f1-score   support

   公民道德与伦理常识       0.89      0.89      0.89       357
        时事政治       1.00      0.56      0.71         9
    生活中的法律常识       0.92      0.65      0.76        37
      科学思维常识       0.91      0.63      0.74        51
    科学社会主义常识       0.76      0.89      0.82       103
       经济学常识       0.82      0.90      0.86       123

    accuracy                           0.86       680
   macro avg       0.88      0.75      0.80       680
weighted avg       0.86      0.86      0.85       680

Training Multinomial Naive Bayes
Multinomial Naive Bayes classification report: 

              precision    recall  f1-score   support

   公民道德与伦理常识       0.91      0.99      0.95       357
        时事政治       1.00      0.78      0.88         9
    生活中的法律常识       0.97      0.78      0.87        37
      科学思维常识       1.00      0.84      0.91        51
    科学社会主义常识       0.94     

In [14]:
vectorizer = TfidfVectorizer(max_features = 1000, min_df = 3)
train_models(roots['biology'], vectorizer)

Training Guassian Naive Bayes
Guassian Naive Bayes classification report: 

              precision    recall  f1-score   support

       分子与细胞       0.86      0.88      0.87       584
    现代生物技术专题       0.40      0.85      0.55       207
      生物技术实践       0.64      0.36      0.46       332
     生物科学与社会       0.70      0.05      0.09       796
       稳态与环境       0.95      0.90      0.93       733
       遗传与进化       0.21      0.92      0.34       200

    accuracy                           0.59      2852
   macro avg       0.63      0.66      0.54      2852
weighted avg       0.73      0.59      0.56      2852

Training Multinomial Naive Bayes
Multinomial Naive Bayes classification report: 

              precision    recall  f1-score   support

       分子与细胞       0.86      0.88      0.87       584
    现代生物技术专题       0.42      0.56      0.48       207
      生物技术实践       0.60      0.57      0.58       332
     生物科学与社会       0.79      0.86      0.83       796
       稳态与环境       0.95     