In [1]:
# 关于简单的NLP项目的实例
# 由于数据具有敏感性，所以数据并不上传，主要是其结构的变化。

In [2]:
# -*- coding:utf-8 -*-

In [3]:
import os
import xlrd
import numpy as np
import jieba
import jieba.analyse
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [4]:
BASEDIR = os.getcwd()

In [5]:
FILENAME = 'repository.xls'
SHEETNAME = 'main'

In [6]:
# 加载数据集
def load_data(filename,sheetname):
    '''
    从excel中加载数据集。
    入参：filename:文件名
         sheetname:工作簿名
    返回值：repository 知识库
    '''
    filepath = os.path.join(BASEDIR,filename)
    work_book = xlrd.open_workbook(filename)
    sheet = work_book.sheet_by_name(sheetname)
    row = 1
    col = 4
    repository = {}
    index = 0
    flag = True
    if flag:
        while row < 491:
            while col < 30:
                key = sheet.cell(row,4).value
                if key.strip():
                    text = sheet.cell(row,col).value
                    sub_dict = {}
                    if text.strip():
                        sub_dict['question'] = text.strip()
                        sub_dict['knowledge_id'] = sheet.cell(row,0).value
                        repository[index] = sub_dict
                        index +=1
                if col == 4:
                    col = 7
                col +=1
            col = 4
            row +=1
        return repository

In [7]:
repository = load_data(FILENAME,SHEETNAME)

In [8]:
len(repository)

3568

In [9]:
# 查看repository的格式和内容
repository[0]

{'knowledge_id': 'A0000', 'question': '通知：Concur App发生故障，登陆提示Error403或错误的公司代码'}

In [10]:
# 数据预处理
def preprocessing(ques):
    '''
    问题字母小写化,并去除里面的换行符
    '''
    return ques.lower().replace('\n','')

In [11]:
with open('question.txt','w') as f:
    for k,v in repository.items():
        repository[k]['question'] = preprocessing(repository[k]['question'])
        f.write(repository[k]['question'])
        f.write('\n')

In [12]:
with open('question.txt','r') as fr:
    contents = fr.read()

In [13]:
# 分词（ 自定义词汇，停用词表）
# 添加用户自定义词典
jieba.load_userdict('userdict.txt')
# 设置停用词表
jieba.analyse.set_stop_words('stop_words.txt')
# 获取去除停用词的关键词
keywords = jieba.analyse.extract_tags(contents, topK = 2000)
print('Words Number:',len(keywords))
stopwords = open('stop_words.txt').read()
# 分词
with open('segments.txt','w') as fs:
    for k,v in repository.items():
        segments = jieba.cut(repository[k]['question'],cut_all =True)
        for i in segments:
            if i not in stopwords:
                fs.write(i)
                fs.write('\t')
        fs.write('\n')

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/w2/qnnfb62x2g760j3nkh9q4ptr0000gn/T/jieba.cache
Loading model cost 0.681 seconds.
Prefix dict has been built succesfully.


Words Number: 1916


In [14]:
corpus = []
with open('segments.txt','r') as fr:
    for line in fr:
        corpus.append(line)

In [15]:
corpus[:5]

['通知\tconcur\tapp\t发生\t故障\t登陆\t提示\terror403\t错误\t公司\t代码\t\n',
 'concur\tapp\t登陆\t\n',
 'concur\t手机\t手机端\t登陆\t\n',
 'concur\t手机\t手机端\t登陆\t403\t报错\t\n',
 'concur\tapp\t登陆\t提示\t公司\t代码\t不正\t正确\t\n']

In [16]:
# 三种文本特征提取（TF-IDF/Word2Vec/CountVectorizer）

In [17]:
# 第一种：TF-IDF值，
# CountVectorizer 构建词频矩阵
# TfidfTransformer 构建tfidf权值计算
# 文本的关键字
# 对应的tfidf矩阵

In [18]:
# CountVectorizer 构建词频矩阵
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(X[:1,:])
X.shape # 3576个问题*2451个词汇

  (0, 424)	1
  (0, 533)	1
  (0, 2364)	1
  (0, 56)	1
  (0, 1427)	1
  (0, 1829)	1
  (0, 1468)	1
  (0, 779)	1
  (0, 19)	1
  (0, 38)	1
  (0, 2304)	1


(3568, 2451)

In [19]:
# 构建TF-IDF权值计算
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)
print(tfidf[:1,:])
print(tfidf.shape)

  (0, 2304)	0.27379809813895223
  (0, 38)	0.2156030330282288
  (0, 19)	0.26086682456934407
  (0, 779)	0.3455812051435754
  (0, 1468)	0.43244588601264666
  (0, 1829)	0.20728582808083776
  (0, 1427)	0.20852868372690628
  (0, 56)	0.43244588601264666
  (0, 2364)	0.2880807601116773
  (0, 533)	0.25746859524213583
  (0, 424)	0.2851682785551384
(3568, 2451)


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [20]:
# 获取词袋模型中的关键词
word = vectorizer.get_feature_names()
# tfidf矩阵
weight = tfidf.toarray()
print(weight.shape)

(3568, 2451)


In [21]:
np.set_printoptions(threshold=np.inf)
weight[0,2307] # 可以通过验证矩阵中的值来检测

0.0

In [22]:
# 构造Y向量
labels = []
for k,v in repository.items():
    labels.append(v['knowledge_id'])
labels = list(set(labels)) #490个label

for k,v in repository.items():
    repository[k]['y_index'] = labels.index(v['knowledge_id'])

Y = np.zeros(X.shape[0])
for k,v in repository.items():
    Y[k] = v['y_index']

In [54]:
# 建立贝叶斯模型
clf = MultinomialNB(0.1)
# 参数alpha的是平滑因子
clf.fit(weight,Y)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [57]:
# 获取在训练集上的准确度
def train_accuracy():
    count=0
    correct_count = 0
    for ele in weight:
        train_test_vec = ele.reshape((1,a.shape[0]))  # clf.predict()需要的向量为(1,2451)的形状
        answer = clf.predict(train_test_vec)[0]
        if int(answer)==repository[count]['y_index']:
            correct_count +=1
        count +=1
    # 计算准确度
    accuracy = correct_count/count
    print('Training Accuracy:',accuracy)
train_accuracy()

Training Accuracy: 0.9588004484304933


In [81]:
# 加载测试集
filename = 'test_data.xlsx'
sheetname = '2834'
def load_test_data(filename,sheetname):
    work_book = xlrd.open_workbook(filename)
    sheet = work_book.sheet_by_name(sheetname)
    row = 1
    col = 1
    testset = {}
    index = 0
    while row < 2385:
        key = sheet.cell(row,col).value
        if isinstance(key,str):  
            if key.strip():
                text = sheet.cell(row,col).value
                sub_dict = {}
                if text.strip():
                    sub_dict['question'] = text.strip() if isinstance(text,str) else str(text)
                    sub_dict['knowledge_id'] = sheet.cell(row,col+1).value
                    testset[index] = sub_dict
                    index +=1
        row +=1
    return testset
testset = load_test_data(filename, sheetname)

In [83]:
# 预处理
for k,v in testset.items():
    testset[k]['question'] = preprocessing(v['question'])
# 用自定义词库和中止词等，进行分词并向量化。


{0: {'knowledge_id': 'C0003', 'question': 'password怎么使用'},
 1: {'knowledge_id': 'C0001', 'question': '账号能能重置吗'},
 2: {'knowledge_id': 'J0001', 'question': '您好！能发我下国旅运通订票网址吗？'},
 3: {'knowledge_id': 'J0001', 'question': '差旅预定系统的网址有么'},
 4: {'knowledge_id': 'I0001', 'question': '掌上az的下载链接可以给我一下么？'},
 5: {'knowledge_id': 'G0066', 'question': '你好，现在活动关闭，讲者协议第一张需要重新打印。如何处理'},
 6: {'knowledge_id': 'D0004', 'question': '您好，有没有阿凡达网页版链接？'},
 7: {'knowledge_id': 'A0110', 'question': '7575电话'},
 8: {'knowledge_id': 'A0039', 'question': '怎么联系上扫描中心'},
 9: {'knowledge_id': 'H0001', 'question': '澳门年会'},
 10: {'knowledge_id': 'H0001', 'question': '年会议程'},
 11: {'knowledge_id': 'D0014',
  'question': '请把q4季度3k考试?ria-普米克令舒+信必可的考试重新推送给我一下可以吗'},
 12: {'knowledge_id': 'D0016', 'question': '只收到了课程过期的邮件啊'},
 13: {'knowledge_id': 'E0022', 'question': '嫌疑拜访比例不能超过多少？'},
 14: {'knowledge_id': 'E0001', 'question': 'concur的公司代码是多少？谢谢'},
 15: {'knowledge_id': 'E0022', 'question': '请问嫌疑拜访比例不能超过多少？'},
 16: {'knowledg

In [73]:
print(type(str(7575.0)))

<class 'str'>


In [None]:
# 第二种文本特征提取方式：Word2Vec

In [None]:
# 第三种文本特征提取方式：CountVectorizer-One Hot类型的，此处就不取了