# 文本分析

## 基本原理

## 常用工具

### jieba

In [2]:
import jieba  # jieba适用中文场景

In [4]:
text = "此时相望不相闻，愿逐月华流照君。"

# jieba.cut直接得到generator形式的分词结果
seg = jieba.cut(text)
print(' '.join(seg))

# 也可以使用jieba.lcut得到list的分词结果
seg = jieba.lcut(text)
print(seg)

此时 相望 不 相闻 ， 愿逐 月华 流照 君 。
['此时', '相望', '不', '相闻', '，', '愿逐', '月华', '流照', '君', '。']


### nltk

In [8]:
import nltk  # nltk适用英文场景
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

word_tokenize('hello world.')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\IKAS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['hello', 'world', '.']

### pyhanlp

In [9]:
from pyhanlp import *

ModuleNotFoundError: No module named 'pyhanlp'

### sklearn

In [17]:
#LDA主题分类（聚类）
import jieba
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

data = pd.read_csv('../data/sentence.csv',encoding = 'gb2312')
text = list(data['sentence'].values) 
 
docs = [jieba.lcut(doc) for doc in text] #分词
stopwords = ['we','i','to','and','not','is','you','.','?','of'] #停用词表
docs = [[w for w in doc if w not in stopwords ] for doc in docs] 
docs = [[s for s in doc if s != ' '] for doc in docs] #去掉多余的空格
print("去除停用词：\n",docs)

corpus = [' '.join(doc) for doc in docs]#处理之后，每个文档的词语列表，加上空格
print("分词：\n",corpus)
 
tfidf = TfidfVectorizer() #TF-IDF词频统计
tfidf_matrix = tfidf.fit_transform(corpus)
print("词频统计矩阵:\n",tfidf_matrix)  #词频统计
print("稀疏矩阵：\n",tfidf_matrix.toarray()) #稀疏矩阵

lda = LatentDirichletAllocation(n_components=2,random_state=123456) #n_components，话题数K等于2，也就是聚类数为2
docres = lda.fit_transform(tfidf_matrix)
print("分类结果矩阵：\n",docres)
print("属于第一类的概率：{0}".format(docres[:,0]))
print("属于第二类的概率：{0}".format(docres[:,1]))

去除停用词：
 [['Once', 'dreamt', 'that', 'were', 'strangers', 'We', 'wake', 'up', 'find', 'that', 'were', 'dear', 'each', 'other'], ['We', 'come', 'nearest', 'the', 'great', 'when', 'are', 'great', 'in', 'humility'], ['My', 'heart', ',', 'the', 'bird', 'the', 'wilderness', ',', 'has', 'found', 'its', 'sky', 'in', 'your', 'eyes'], ['The', 'perfect', 'decks', 'itself', 'in', 'beauty', 'for', 'the', 'love', 'the', 'Imperfect'], ['What', 'are', 'do', 'see', ',', 'what', 'see', 'your', 'shadow'], ['Like', 'the', 'meeting', 'the', 'seagulls', 'the', 'waves', 'meet', 'come', 'near', 'The', 'seagulls', 'fly', 'off', ',', 'the', 'waves', 'roll', 'away', 'depart']]
分词：
 ['Once dreamt that were strangers We wake up find that were dear each other', 'We come nearest the great when are great in humility', 'My heart , the bird the wilderness , has found its sky in your eyes', 'The perfect decks itself in beauty for the love the Imperfect', 'What are do see , what see your shadow', 'Like the meeting the se