In [1]:
import pandas as pd
import numpy as np
import re
import nltk

## 构造一个文本数据集

In [2]:
# Six toy sentences, each paired with its topic label.
labelled_sentences = [
    ('The sky is blue and beautiful.', 'weather'),
    ('Love this blue and beautiful sky!', 'weather'),
    ('The quick brown fox jumps over the lazy dog.', 'animals'),
    ('The brown fox is quick and the blue dog is lazy!', 'animals'),
    ('The sky is very blue and the sky is very beautiful today', 'weather'),
    ('The dog is layz but the brown fox is quick!', 'animals'),
]

# Split the pairs into the document array and the parallel label list.
corpus = np.array([sentence for sentence, _ in labelled_sentences])
labels = [topic for _, topic in labelled_sentences]

# `columns=` pins the column order explicitly: Document first, then Category.
corpus_df = pd.DataFrame({'Document': corpus, 'Category': labels},
                         columns=['Document', 'Category'])
corpus_df  # labelled data: each sentence carries its topic

Unnamed: 0,Document,Category
0,The sky is blue and beautiful.,weather
1,Love this blue and beautiful sky!,weather
2,The quick brown fox jumps over the lazy dog.,animals
3,The brown fox is quick and the blue dog is lazy!,animals
4,The sky is very blue and the sky is very beaut...,weather
5,The dog is layz but the brown fox is quick!,animals


任务：分类任务，基于一句话分类成相应的标签

## 基本预处理

In [3]:
# Request the needed resources by id instead of calling nltk.download() with no
# arguments: the bare call opens the interactive downloader, which blocks
# "Restart & Run All" and cannot run headless.
#   'stopwords' -> used by nltk.corpus.stopwords.words('english')
#   'book'      -> data loaded by `from nltk.book import *`
# If the download fails, see https://blog.csdn.net/qq_37891889/article/details/104418106
nltk.download(['stopwords', 'book'])

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [5]:
# Loads the NLTK book examples. Per the loader banner printed by this cell, the
# star import makes text1..text9 and sent1..sent9 (plus texts()/sents())
# available as globals.
# NOTE(review): none of these names are used in the cells visible here; confirm
# whether a later cell needs them before narrowing this to explicit imports.
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [10]:
# Word frequency and stop words
# English stop-word list from the NLTK stopwords corpus.
stop_words = nltk.corpus.stopwords.words('english')
# Tokenizer instance reused by the normalization step.
wpt = nltk.WordPunctTokenizer()
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no

停用词：

这里面除了与天气和动物相关的词，其余的词基本没用，如 i、me、my 等，这些词就是停用词。

In [11]:
def normalize_document(doc, tokenizer=None, stopwords=None):
    """Strip punctuation, lower-case, tokenize and drop stop words.

    Parameters
    ----------
    doc : str
        Raw text of a single document.
    tokenizer : object with a ``tokenize(str)`` method, optional
        Splits the cleaned text into tokens. Defaults to the notebook-level
        ``wpt`` tokenizer.
    stopwords : iterable of str, optional
        Tokens to discard. Defaults to the notebook-level ``stop_words`` list.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    if tokenizer is None:
        tokenizer = wpt
    if stopwords is None:
        stopwords = stop_words
    # Set membership is O(1) per token; the raw list would be scanned each time.
    excluded = frozenset(stopwords)

    # Remove every character that is not an ASCII letter, digit or whitespace.
    # No flag is passed: the class already lists both cases, and the original
    # positional `re.I` landed in re.sub's `count` slot (re.I == 2), so only
    # the first two unwanted characters were ever removed.
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc)
    # Normalize case and trim surrounding whitespace.
    doc = doc.lower().strip()
    # Tokenize, filter out stop words, and stitch the rest back together.
    tokens = tokenizer.tokenize(doc)
    return ' '.join(token for token in tokens if token not in excluded)


# Element-wise version that accepts a whole array of documents.
normalize_corpus = np.vectorize(normalize_document)

In [8]:
norm_corpus = normalize_corpus(corpus)
norm_corpus  # 处理完成的结果

array(['sky blue beautiful', 'love blue beautiful sky',
       'quick brown fox jumps lazy dog', 'brown fox quick blue dog lazy',
       'sky blue sky beautiful today', 'dog layz brown fox quick'],
      dtype='<U30')

## 词袋模型
将所有词语装进一个袋子里,不考虑其词法和语序的问题,即每个词语都是独立的。

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
print(norm_corpus)
# Build a vocabulary from the words that occur in the sentences.
# min_df / max_df ignore terms whose document frequency is below / above the
# given threshold; as floats they are proportions of documents, so 0.0 and 1.0
# keep every term.
# API reference: https://scikit-learn.org/stable/modules/classes.html (search for CountVectorizer)
cv = CountVectorizer(min_df=0., max_df=1.)
# A single fit_transform learns the vocabulary and builds the count matrix;
# a separate fit() beforehand would only repeat the same work.
cv_matrix = cv.fit_transform(norm_corpus).toarray()
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) replaces it. tolist() keeps the printed value a plain
# list of str, as before.
print(cv.get_feature_names_out().tolist())
# Dense document-term count matrix: one row per document, one column per term.
cv_matrix

['sky blue beautiful' 'love blue beautiful sky'
 'quick brown fox jumps lazy dog' 'brown fox quick blue dog lazy'
 'sky blue sky beautiful today' 'dog layz brown fox quick']
['beautiful', 'blue', 'brown', 'dog', 'fox', 'jumps', 'layz', 'lazy', 'love', 'quick', 'sky', 'today']


array([[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0],
       [0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0],
       [0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1],
       [0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0]], dtype=int64)

如上所示，向量的每一位对应词汇表中的一个词，取值为该词在句子中出现的次数：未出现记为0，出现一次记为1，出现两次记为2。

如：第一句话'sky blue beautiful'中，词汇表的第一个词（beautiful）出现一次，第二个词（blue）出现一次，倒数第二个词（sky）出现一次，因此对应的向量是[1,1,...,1,0]。