In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# 读取数据

In [2]:
train_data = pd.read_csv('train.txt', sep='\t', header=None, names=['content', 'label'])
test_data = pd.read_csv('test.txt', sep='\t', header=None, names=['content', 'label'])

In [3]:
train_data.head()

Unnamed: 0,content,label
0,中华女子学院：本科层次仅1专业招男生,3
1,两天价网站背后重重迷雾：做个网站究竟要多少钱,4
2,东5环海棠公社230-290平2居准现房98折优惠,1
3,卡佩罗：告诉你德国脚生猛的原因 不希望英德战踢点球,7
4,82岁老太为学生做饭扫地44年获授港大荣誉院士,5


In [4]:
test_data.head()

Unnamed: 0,content,label
0,词汇阅读是关键 08年考研暑期英语复习全指南,3
1,中国人民公安大学2012年硕士研究生目录及书目,3
2,日本地震：金吉列关注在日学子系列报道,3
3,名师辅导：2012考研英语虚拟语气三种用法,3
4,自考经验谈：自考生毕业论文选题技巧,3


In [5]:
train_data.info()  # 180000 train samples in total

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180000 entries, 0 to 179999
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   content  180000 non-null  object
 1   label    180000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.7+ MB


In [6]:
test_data.info()  # 10000 test samples in total

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   content  10000 non-null  object
 1   label    10000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 156.4+ KB


In [7]:
train_data.label.unique()  # 10 classes 

array([3, 4, 1, 7, 5, 9, 8, 2, 6, 0], dtype=int64)

In [8]:
test_data.label.unique()

array([3, 9, 0, 8, 6, 1, 4, 5, 7, 2], dtype=int64)

In [9]:
train_data.label.value_counts()  # 18000 train samples each class

3    18000
4    18000
1    18000
7    18000
5    18000
9    18000
8    18000
2    18000
6    18000
0    18000
Name: label, dtype: int64

In [10]:
test_data.label.value_counts()  # 1000 test samples each class

3    1000
9    1000
0    1000
8    1000
6    1000
1    1000
4    1000
5    1000
7    1000
2    1000
Name: label, dtype: int64

In [11]:
train_data.content.shape, test_data.content.shape

((180000,), (10000,))

In [12]:
train_data.content[0]

'中华女子学院：本科层次仅1专业招男生'

# 中文分词

In [13]:
import jieba

train_seg = list(map(lambda s: jieba.lcut(s), train_data.content))
test_seg = list(map(lambda s: jieba.lcut(s), test_data.content))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\yasho\AppData\Local\Temp\jieba.cache
Loading model cost 0.314 seconds.
Prefix dict has been built successfully.


In [14]:
len(train_seg), len(test_seg)

(180000, 10000)

In [15]:
train_seg[0], test_seg[0]

(['中华', '女子', '学院', '：', '本科', '层次', '仅', '1', '专业', '招', '男生'],
 ['词汇', '阅读', '是', '关键', ' ', '08', '年', '考研', '暑期', '英语', '复习', '全', '指南'])

In [16]:
def get_stop_words(stop_words_path):
    with open(stop_words_path, encoding='utf-8') as f:
        return [line.strip() for line in f]
baidu_stopwords = get_stop_words('./baidu_stopwords.txt')
cn_stopwords = get_stop_words('./cn_stopwords.txt')
hit_stopwords = get_stop_words('./hit_stopwords.txt')
scu_stopwords = get_stop_words('./scu_stopwords.txt')
stop_words = baidu_stopwords + cn_stopwords + hit_stopwords + scu_stopwords

In [17]:
print(stop_words)

['--', '?', '“', '”', '》', '－－', 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', "ain't", 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', "aren't", 'around', 'as', "a's", 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'both', 'brief', 'but', 'by', 'came', 'can', 'cannot', 'cant', "can't", 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', "c'mon", 'co', 'com', 'come', 'comes', 'concerning', 'consequently', 'consider', 'considering', 'contain', 'containin

In [18]:
def drop_stopwords(line, stopwords):
    line_clean = []
    for word in line:
        if word not in stopwords:
            line_clean.append(word)
    return line_clean

In [19]:
x_train = list(map(lambda s: drop_stopwords(s,stop_words), train_seg))
x_test = list(map(lambda s: drop_stopwords(s, stop_words), test_seg))

In [20]:
x_train[0], x_test[0]

(['中华', '女子', '学院', '本科', '层次', '仅', '专业', '招', '男生'],
 ['词汇', '阅读', '关键', ' ', '08', '年', '考研', '暑期', '英语', '复习', '全', '指南'])

In [21]:
x_train = list(map(lambda s: ' '.join(s), x_train))
x_test = list(map(lambda s: ' '.join(s), x_test))

In [22]:
x_train[0], x_test[0]

('中华 女子 学院 本科 层次 仅 专业 招 男生', '词汇 阅读 关键   08 年 考研 暑期 英语 复习 全 指南')

# feature extraction

In [23]:
vectorizer = TfidfVectorizer().fit(x_train)
x_train = vectorizer.transform(x_train)
x_test = vectorizer.transform(x_test)
print(vectorizer.get_feature_names)

<bound method CountVectorizer.get_feature_names of TfidfVectorizer()>


In [24]:
print(x_train.shape, x_test.shape)

(180000, 111809) (10000, 111809)


In [25]:
print(type(x_train))

<class 'scipy.sparse._csr.csr_matrix'>


# Modeling

In [26]:
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report

多项式朴素贝叶斯

In [27]:
%%time
mul_clf = MultinomialNB()
mul_clf.fit(x_train, train_data.label)

Wall time: 74.1 ms


MultinomialNB()

# performance

In [28]:
mul_clf.score(x_test, test_data.label)

0.892

In [29]:
y_pred = mul_clf.predict(x_test)
print(confusion_matrix(test_data.label, y_pred))

[[897  11  50   3   3  17   9   2   5   3]
 [ 30 880  26   8   1  18   9   2   8  18]
 [ 88  14 831   4  20   3  29   4   3   4]
 [  1   4   0 957   4   8  11   5   4   6]
 [ 14   5  50  13 818  27  24   5  27  17]
 [  2   7   3  27   9 900  19   4   4  25]
 [ 10   4  18  15  15  36 886   3   1  12]
 [  0   5   1   3   5  13  14 943   3  13]
 [  3   0   4   9  53  10   6   7 894  14]
 [  4   3   1   5   7  31   5  11  19 914]]


In [30]:
print(classification_report(test_data.label, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.90      0.88      1000
           1       0.94      0.88      0.91      1000
           2       0.84      0.83      0.84      1000
           3       0.92      0.96      0.94      1000
           4       0.87      0.82      0.85      1000
           5       0.85      0.90      0.87      1000
           6       0.88      0.89      0.88      1000
           7       0.96      0.94      0.95      1000
           8       0.92      0.89      0.91      1000
           9       0.89      0.91      0.90      1000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



补集朴素贝叶斯

In [31]:
%%time
com_clf = ComplementNB()
com_clf.fit(x_train, train_data.label)

Wall time: 69.4 ms


ComplementNB()

In [32]:
com_clf.score(x_test, test_data.label)

0.8952

In [33]:
y_pred = com_clf.predict(x_test)
print(confusion_matrix(test_data.label, y_pred))

[[894  11  44   6   3  15  14   4   6   3]
 [ 18 898  18  10   4  20   8   5   7  12]
 [ 98  19 793   4  20   5  42  10   5   4]
 [  0   5   1 961   2   9   9   3   2   8]
 [ 12   6  31  18 825  29  27   7  31  14]
 [  2   7   3  33   7 896  20   3   6  23]
 [  9  10  12  13  13  48 875   5   5  10]
 [  0   3   0   1   3  10   6 963   4  10]
 [  0   2   8   4  26  10   4   8 924  14]
 [  3   3   1   7   5  20   2  14  22 923]]


In [34]:
print(classification_report(test_data.label, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.89      0.88      1000
           1       0.93      0.90      0.91      1000
           2       0.87      0.79      0.83      1000
           3       0.91      0.96      0.93      1000
           4       0.91      0.82      0.86      1000
           5       0.84      0.90      0.87      1000
           6       0.87      0.88      0.87      1000
           7       0.94      0.96      0.95      1000
           8       0.91      0.92      0.92      1000
           9       0.90      0.92      0.91      1000

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.89     10000
weighted avg       0.90      0.90      0.89     10000



逻辑回归

In [35]:
%%time
log_clf = LogisticRegression(max_iter=3000)
log_clf.fit(x_train, train_data.label)

Wall time: 54.2 s


LogisticRegression(max_iter=3000)

In [36]:
log_clf.score(x_test, test_data.label)

0.8967

In [37]:
y_pred = log_clf.predict(x_test)
print(confusion_matrix(test_data.label, y_pred))

[[874  10  62   2   7  13  13   5   4  10]
 [ 11 904  21   3   4  14  10   3   5  25]
 [ 51  16 867   1  23   4  27   6   3   2]
 [  3   3   2 932   4  17  18   3   6  12]
 [  6   3  36   7 853  23  24   4  22  22]
 [  1   7   5  18  15 891  25   2   5  31]
 [  6   8  27   9  18  36 876   2   1  17]
 [  0   3   2   2   6  11  11 944   5  16]
 [  1   2   2   3  47  11   5   9 902  18]
 [  1   4   2   2  12  24   6  12  13 924]]


In [38]:
print(classification_report(test_data.label, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.87      0.89      1000
           1       0.94      0.90      0.92      1000
           2       0.85      0.87      0.86      1000
           3       0.95      0.93      0.94      1000
           4       0.86      0.85      0.86      1000
           5       0.85      0.89      0.87      1000
           6       0.86      0.88      0.87      1000
           7       0.95      0.94      0.95      1000
           8       0.93      0.90      0.92      1000
           9       0.86      0.92      0.89      1000

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



随机梯度下降

In [39]:
%%time
sgd = SGDClassifier()
sgd.fit(x_train, train_data.label)

Wall time: 1.14 s


SGDClassifier()

In [40]:
sgd.score(x_test, test_data.label)

0.8804

In [41]:
y_pred = sgd.predict(x_test)
print(confusion_matrix(test_data.label, y_pred))

[[869  17  56   4   5  12  12   7   6  12]
 [ 15 906  15   5   2  13   8   7   9  20]
 [ 61  27 820   4  26   4  43  11   2   2]
 [  2   4   1 940   4  18  16   4   5   6]
 [  7   4  43  11 824  20  32   7  30  22]
 [  2   8   7  31  10 868  25   8   6  35]
 [  6  14  20  18  15  37 866   6   5  13]
 [  0   3   1   3   4  12  14 940   5  18]
 [  3   2   4   5  50  13   3  21 882  17]
 [  2   5   2   7  12  37   9  17  20 889]]


In [42]:
print(classification_report(test_data.label, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.87      0.88      1000
           1       0.92      0.91      0.91      1000
           2       0.85      0.82      0.83      1000
           3       0.91      0.94      0.93      1000
           4       0.87      0.82      0.84      1000
           5       0.84      0.87      0.85      1000
           6       0.84      0.87      0.85      1000
           7       0.91      0.94      0.93      1000
           8       0.91      0.88      0.90      1000
           9       0.86      0.89      0.87      1000

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



随机森林

In [43]:
from sklearn.ensemble import RandomForestClassifier

In [44]:
%%time
rdf = RandomForestClassifier()
rdf.fit(x_train, train_data.label)

Wall time: 11min 45s


RandomForestClassifier()

In [45]:
rdf.score(x_test, test_data.label)

0.8368

In [46]:
y_pred = rdf.predict(x_test)
print(confusion_matrix(test_data.label, y_pred))

[[834  14  87   2  11  15  12   4   7  14]
 [  8 883  39   8  11  10   9   4   8  20]
 [ 67  24 816   4  23   7  41   4   7   7]
 [  4   5   9 915   5  19  19   2  11  11]
 [ 11   9  72  16 779  24  27   4  39  19]
 [  3  20  31  31  16 811  35   1   9  43]
 [  8  16  50  25  23  43 797   5  10  23]
 [  1  18  15   8   8  16  21 863  17  33]
 [  5   6  18  12  47  12   9  14 864  13]
 [  5  12  27  13  20  39  21  24  33 806]]


In [47]:
print(classification_report(test_data.label, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.83      0.86      1000
           1       0.88      0.88      0.88      1000
           2       0.70      0.82      0.75      1000
           3       0.88      0.92      0.90      1000
           4       0.83      0.78      0.80      1000
           5       0.81      0.81      0.81      1000
           6       0.80      0.80      0.80      1000
           7       0.93      0.86      0.90      1000
           8       0.86      0.86      0.86      1000
           9       0.81      0.81      0.81      1000

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



AdaBoost

In [48]:
from sklearn.ensemble import AdaBoostClassifier

In [49]:
%%time
adb = AdaBoostClassifier(n_estimators=1000)
adb.fit(x_train, train_data.label)

Wall time: 3min 12s


AdaBoostClassifier(n_estimators=1000)

In [50]:
adb.score(x_test, test_data.label)

0.7425