#### 获取原始数据

In [4]:
# Raw data download link: https://ai.stanford.edu/~amaas/data/sentiment/
# The extraction code below is disabled by wrapping it in a string literal.
"""
import os
import tarfile

target = '../Python-MachineLearning/Datasets/IMDB_data/aclImdb_v1.tar.gz'

if not os.path.isdir('../Python-MachineLearning/Datasets/IMDB_data/aclImdb'):

    with tarfile.open(target, 'r:gz') as tar:
        tar.extractall()
"""

"\nimport os\nimport tarfile\n\ntarget = '../Python-MachineLearning/Datasets/IMDB_data/aclImdb_v1.tar.gz'\n\nif not os.path.isdir('../Python-MachineLearning/Datasets/IMDB_data/aclImdb'):\n\n    with tarfile.open(target, 'r:gz') as tar:\n        tar.extractall()\n"

#### 将解压提取的数据转换为DataFrame

In [5]:
import pyprind
import pandas as pd
import os
import sys
from packaging import version


# # change the `basepath` to the directory of the
# # unzipped movie dataset

# basepath = r'C:/Users/Administrator/Desktop/aclImdb/'

# labels = {'pos': 1, 'neg': 0}

# # if the progress bar does not show, change stream=sys.stdout to stream=2
# pbar = pyprind.ProgBar(50000, stream=sys.stdout)

# df = pd.DataFrame()
# for s in ('test', 'train'):
#     for l in ('pos', 'neg'):
#         path = os.path.join(basepath, s, l)
#         for file in sorted(os.listdir(path)):
#             with open(os.path.join(path, file), 
#                       'r', encoding='utf-8') as infile:
#                 txt = infile.read()
                
#             if version.parse(pd.__version__) >= version.parse("1.3.2"):
#                 x = pd.DataFrame([[txt, labels[l]]], columns=['review', 'sentiment'])
#                 df = pd.concat([df, x], ignore_index=False)

#             else:
#                 df = df.append([[txt, labels[l]]], 
#                                ignore_index=True)
#             pbar.update()
# df.columns = ['review', 'sentiment']

In [6]:
# 将处理好的数据保存到本地CSV.
# print(pd.__version__)       # 1.3.2

# import numpy as np

# np.random.seed(0)
# df = df.reindex(np.random.permutation(df.index))
# df.to_csv('C:/Users/Administrator/Desktop/movie_data.csv', index=False, encoding='utf-8')

In [7]:
# Read the preprocessed IMDB dataset from the Datasets/Imdb_data directory and
# map the column labels "0"/"1" to descriptive names.
df = (
    pd.read_csv('../Python-MachineLearning/Datasets/Imdb_data/movie_data.zip',
                encoding='utf-8')
    .rename(columns={"0": "review", "1": "sentiment"})
)
df.head(3)

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1


In [8]:
# Confirm the number of rows and columns of the loaded DataFrame.
print(df.shape)

(50000, 2)


#### 引言--介绍词袋模型

In [9]:
 # 将词转换为特征向量.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Three toy documents used to illustrate the bag-of-words model.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two',
])

# Learn the vocabulary and build the document-term count matrix in one call.
count = CountVectorizer()
bag = count.fit_transform(docs)

In [10]:
# Show the learned vocabulary: each token mapped to its column index.
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [11]:
# Each column is one feature (word); the first column is "and", which occurs twice in the third sentence.
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


#### 通过词频-逆文档频率(tf-idf)评估词的相关性

有时候，一些词会频繁出现在多个文档中，这些高频词通常不包含有用或具有区分度(discriminatory)的信息。为此，可以使用`term frequency-inverse document frequency (tf-idf)`来降低特征向量中这些高频词的权重。其定义如下：

$$tf-idf(t,d)=tf(t,d)\times idf(t,d)$$

其中，$tf(t,d)$是特定的term $t$在文档中出现的频次， $idf(t,d)$是逆文档频率，其定义如下：

$$idf(t,d)=\log\frac{n_d}{1+df(d,t)}$$

其中，$n_d$是文档总个数，$df(d,t)$是包含有term $t$的文档数量.

In [12]:
from sklearn.feature_extraction.text import TfidfTransformer

# Print floats with two decimals, then re-weight the raw counts with tf-idf
# (idf enabled, smoothed, L2-normalised rows) and show the sparse result.
np.set_printoptions(precision=2)
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
print(tfidf.fit_transform(bag))

  (0, 1)	0.4337078595086741
  (0, 3)	0.5584778353707552
  (0, 4)	0.5584778353707552
  (0, 6)	0.4337078595086741
  (1, 1)	0.4337078595086741
  (1, 5)	0.5584778353707552
  (1, 6)	0.4337078595086741
  (1, 8)	0.5584778353707552
  (2, 0)	0.5023864481078999
  (2, 1)	0.4450762939064939
  (2, 2)	0.5023864481078999
  (2, 3)	0.19103892151222399
  (2, 4)	0.19103892151222399
  (2, 5)	0.19103892151222399
  (2, 6)	0.29671752927099593
  (2, 7)	0.25119322405394995
  (2, 8)	0.19103892151222399


In [13]:
# The sparse printout above lists only the non-zero entries; convert to a dense array to see every cell.
print(tfidf.fit_transform(bag).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


#### sklearn中tf-idf的实现

sklearn中tf-idf的实现如下：

$$idf(t,d)=\log\frac{1+n_d}{1+df(d,t)}$$

对应的有：

$$tf-idf(t,d)=tf(t,d)\times(idf(t,d)+1)$$

这里分子和分母中同时加1对应于上述参数设置`smooth_idf=True`。这样，对于出现在所有文档中的词，其idf为$\log 1=0$；再结合上式中的加1，这类词的tf-idf就等于其词频$tf(t,d)$，而不会被完全置零.

此外，更常见的做法是在计算tf-idf之前先对原始词频进行归一化，而TfidfTransformer类则直接对tf-idf值进行归一化，对应于上述参数`norm='l2'`，其定义如下：

$$v_{norm}=\frac{v}{\|v\|_{2}}=\frac{v}{\sqrt{v_{1}^{2}+v_{2}^{2}+\cdots+v_{n}^{2}}}=\frac{v}{(\sum_{i=1}^{n}v_{i}^{2})^{1/2}}$$

#### 一个计算示例

以"is"为例：

$$idf(\text{"is"},d_3)=\log\frac{1+3}{1+3}=0$$

所以有：
$$tf-idf("is",d_3)=3\times(0+1)=3$$

重复上述计算过程，得到tf-idf向量为：$\text{vectors: [3.39, 3.0, 3.39, 1.29, 1.29, 1.29, 2.0, 1.69, 1.29]}$

所以有在进行L2归一化之后得到：

$$\begin{aligned}tf-idf(d_{3})_{norm}&=\quad\frac{[3.39,3.0,3.39,1.29,1.29,1.29,2.0,1.69,1.29]}{\sqrt{3.39^2+3.0^2+3.39^2+1.29^2+1.29^2+1.29^2+2.0^2+1.69^2+1.29^2}}\\&=\quad[0.5,0.45,0.5,0.19,0.19,0.19,0.3,0.25,0.19]\end{aligned}$$

同时，可以看到：
$$tf-idf("is",d_3)\quad=\quad0.45$$

#### 文本数据清洗

In [14]:
# Example: reload the review data so the cleaning steps start from the raw text.
df = (
    pd.read_csv('../Python-MachineLearning/Datasets/Imdb_data/movie_data.zip',
                encoding='utf-8')
    .rename(columns={"0": "review", "1": "sentiment"})
)
df.head(3)

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1


In [20]:
# df.columns  # Index(['review', 'sentiment'], dtype='object')

# Inspect one raw review (row label 215); the text still contains HTML <br /> tags.
df.loc[215, 'review']

"This movie is an evolutionary piece - from Terminator to Robocop .<br /><br />Stan Winston did the SPFX !<br /><br />In this film, a scientist working in a sinister robotics company with a really creepy boss(they always are) gets is killed by them in a horrible lab explosion and has his brain placed inside an indestructible robot body .<br /><br />The rest of this movie goes on with a romance angle as this Cyborg/Man regains consciousness and wreaks havoc while trying to communicate with his wife, played by the gorgeous(back then in 1986) Terri Austin . (He tries to reconnect with his old life, like in that scene in RoboCop)<br /><br />The rest of this movie is about breaking things, while trying to defeat the evil his evil boss from recapturing him for some ill-defined 'turn humans into cyborgs' project .<br /><br />This film pays homage to previous movies like THE DAY THE EARTH STOOD STILL - - as the cyborg breaks free like the giant robot Gort does .<br /><br />Except for the 'Fran

In [21]:
import re
def preprocessor(text):
    """Clean one raw review string for bag-of-words processing.

    Steps:
      1. Remove HTML markup (every match of ``<[^>]*>``).
      2. Collect the emoticons found in the text (eyes ``:``, ``;`` or ``=``,
         an optional ``-`` nose, and a mouth out of ``)``, ``(``, ``D``, ``P``).
      3. Lowercase the text and replace every run of non-word characters
         (anything that is not a letter, digit or underscore) with one space.
      4. Append the emoticons, space-separated and with the ``-`` nose removed.

    Parameters
    ----------
    text : str
        Raw document text, possibly containing HTML tags.

    Returns
    -------
    str
        The cleaned, lowercased text followed by the emoticons found in it.
    """
    # Raw string literals keep the regex backslashes from being read as
    # (invalid) Python string escapes, which recent Python versions warn about.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    emoticon_str = ' '.join(emoticons).replace('-', '')
    # When the cleaned text ends in a word character, insert a separator so the
    # first emoticon is not glued onto the last word ("movie:)").
    if emoticon_str and cleaned and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + emoticon_str

In [22]:
# Run the cleaner on the review with row label 215.
preprocessor(df.loc[215, 'review'])

'this movie is an evolutionary piece from terminator to robocop stan winston did the spfx in this film a scientist working in a sinister robotics company with a really creepy boss they always are gets is killed by them in a horrible lab explosion and has his brain placed inside an indestructible robot body the rest of this movie goes on with a romance angle as this cyborg man regains consciousness and wreaks havoc while trying to communicate with his wife played by the gorgeous back then in 1986 terri austin he tries to reconnect with his old life like in that scene in robocop the rest of this movie is about breaking things while trying to defeat the evil his evil boss from recapturing him for some ill defined turn humans into cyborgs project this film pays homage to previous movies like the day the earth stood still as the cyborg breaks free like the giant robot gort does except for the frankenstein suite designed by stan winston this movie s production values are typically canadian s

In [23]:
# Quick check on a string containing an HTML tag and three emoticons.
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [24]:
# Apply the preprocessor function to the 'review' column of df.
df['review'] = df['review'].apply(preprocessor)

#### 将文档处理成tokens.

In [25]:
# Example: the simplest tokenizer just splits on whitespace.
def tokenizer(text):
    """Return the whitespace-separated tokens of ``text`` as a list."""
    tokens = text.split()
    return tokens

tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

在tokenization处理中，另一种有用的方法是词干提取(word stemming), 也就是将单词转换为其词根形式的过程.代码如下：

In [26]:
from nltk.stem.porter import PorterStemmer

# One stemmer instance, reused by every call to tokenizer_porter.
porter = PorterStemmer()

def tokenizer_porter(text):
    """Split ``text`` on whitespace and pass each token through ``porter.stem``."""
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

#### 去除停用词

In [27]:
import nltk

# Fetch the NLTK 'stopwords' package.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to D:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
from nltk.corpus import stopwords

# English stop-word list from NLTK.
stop = stopwords.words('english')
# Example: stem a sentence, then drop every stem that appears in the stop-word list.
[w for w in tokenizer_porter('a runner likes running and runs a lot') if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

#### 利用Logistic Regression进行文档分类

In [30]:
# Split into training and test sets.
# ``.loc`` slices by label and includes BOTH endpoints, so the pair
# ``df.loc[:25000]`` / ``df.loc[25000:]`` would place row 25000 in both sets
# (25001 training rows, one of them shared with the test set). Slicing the
# underlying arrays by position is half-open and gives disjoint halves.
n_train = 25000
reviews = df['review'].values
sentiments = df['sentiment'].values

X_train, X_test = reviews[:n_train], reviews[n_train:]
y_train, y_test = sentiments[:n_train], sentiments[n_train:]

In [31]:
# Tune a logistic-regression text classifier with a grid search.
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer combines CountVectorizer and TfidfTransformer.
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

small_param_grid = [
    # Sub-grid 1: leave the vectorizer's idf/norm settings untouched and
    # compare the plain tokenizer with the Porter-stemming tokenizer.
    dict(
        vect__ngram_range=[(1, 1)],
        vect__stop_words=[None],
        vect__tokenizer=[tokenizer, tokenizer_porter],
        clf__penalty=['l2'],
        clf__C=[1.0, 10.0],
    ),
    # Sub-grid 2: use_idf=False and norm=None; compare with and without
    # stop-word removal using the plain tokenizer.
    dict(
        vect__ngram_range=[(1, 1)],
        vect__stop_words=[stop, None],
        vect__tokenizer=[tokenizer],
        vect__use_idf=[False],
        vect__norm=[None],
        clf__penalty=['l2'],
        clf__C=[1.0, 10.0],
    ),
]

# solver='liblinear' is chosen here because, per the author's note, it performs
# better than the default choice ('lbfgs') on relatively large datasets.
lr_tfidf = Pipeline([
    ('vect', tfidf),
    ('clf', LogisticRegression(solver='liblinear')),
])
gs_lr_tfidf = GridSearchCV(
    lr_tfidf,
    small_param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)

In [32]:
# Fit the grid search on the training data.
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


  arr = np.array(param_list)


In [33]:
# Report the best parameter combination.
print(f'Best parameter set: {gs_lr_tfidf.best_params_}')

Best parameter set: {'clf__C': 1.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x000001F19DF94AF0>}


In [34]:
# Report the cross-validation result.
print(f'CV Accuracy: {gs_lr_tfidf.best_score_: .3f}')

CV Accuracy:  0.873


In [35]:
# Evaluate the best estimator on the test set.
clf = gs_lr_tfidf.best_estimator_
print(f'Test Accuracy: {clf.score(X_test, y_test): .3f}')

Test Accuracy:  0.881


#### 处理更大规模的数据--online algorithm and out of core learning

上面的训练过程非常耗时. 针对这种情况，可以应用一种被称为"out-of-core learning"的技术，它可以使我们在较小批次的数据集上逐步拟合分类器来处理庞大的数据集.

In [36]:
# First: text cleaning, tokenization and stop-word removal.
import numpy as np
import re
from nltk.corpus import stopwords

# English stop-word list used by the tokenizer defined below.
stop = stopwords.words('english')


def tokenizer(text):
    """Clean a raw review and split it into stop-word-free tokens.

    The text is stripped of HTML tags, lowercased, and every run of non-word
    characters is replaced by a space. Emoticons found in the original text are
    appended (with the ``-`` nose removed) as separate tokens. Tokens that
    appear in the module-level ``stop`` list are dropped.

    NOTE(review): this redefines the whitespace-only ``tokenizer`` defined
    earlier in the notebook; cells run after this point see this version.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The remaining tokens, in order of appearance, emoticons last.
    """
    # Raw string literals avoid invalid-escape warnings for the regex backslashes.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    # The explicit ' ' keeps the first emoticon from being glued onto the last
    # word when the text ends in a word character; split() absorbs any extra
    # whitespace.
    text = cleaned + ' ' + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a CSV file, one per line.

    The first line is treated as a header and skipped. Every other non-empty
    line is split at its LAST comma: everything before it is returned as the
    text exactly as stored in the file (including any surrounding CSV quotes),
    and everything after it is parsed as the integer label.

    Parameters
    ----------
    path : str
        Path of the UTF-8 encoded CSV file.

    Yields
    ------
    tuple of (str, int)
        The raw text field and its label.

    Raises
    ------
    ValueError
        If the part after the last comma of a line is not an integer.
    """
    with open(path, 'r', encoding='utf-8') as csv_file:
        # Skip the header; the default avoids a RuntimeError on an empty file.
        next(csv_file, None)
        for line in csv_file:
            line = line.rstrip('\n')
            if not line:
                continue  # tolerate blank lines, e.g. at the end of the file
            # Splitting at the last comma does not depend on the line ending in
            # a newline, unlike fixed offsets counted from the end of the line.
            text, _, label = line.rpartition(',')
            yield text, int(label)

In [38]:
# Sanity-check stream_docs by pulling the first (text, label) pair.
next(stream_docs(path="C:/Users/Administrator/Desktop/movie_data/movie_data.csv"))

('"I went and saw this movie last night after being coaxed to by a few friends of mine. I\'ll admit that I was reluctant to see it because from what I knew of Ashton Kutcher he was only able to do comedy. I was wrong. Kutcher played the character of Jake Fischer very well, and Kevin Costner played Ben Randall with such professionalism. The sign of a good movie is that it can toy with our emotions. This one did exactly that. The entire theater (which was sold out) was overcome by laughter during the first half of the movie, and were moved to tears during the second half. While exiting the theater I not only saw many women in tears, but many full grown men as well, trying desperately not to let anyone see them crying. This movie was great, and I suggest that you go see it before you judge."',
 1)

In [40]:
# get_minibatch pulls documents from the stream produced by stream_docs and
# returns a given number of them.
def get_minibatch(doc_stream, size):
    """Collect up to ``size`` documents from an iterator of ``(text, label)``.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` pairs, e.g. from ``stream_docs``.
    size : int
        Maximum number of documents to collect.

    Returns
    -------
    tuple
        ``(docs, y)``, the collected texts and their labels as two lists. If
        the stream ends early, the documents gathered so far are returned as a
        shorter batch. ``(None, None)`` is returned only when the stream was
        already exhausted and nothing could be collected.
    """
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        # Keep a partially filled final batch instead of silently dropping it;
        # signal exhaustion with (None, None) only when nothing was read.
        if not docs:
            return None, None
    return docs, y

需要注意的是，不能使用CountVectorizer进行out-of-core学习，因为它需要在内存中保存完整的词汇表.

但是，可以使用另外一个有用的工具： HashingVectorizer.

In [43]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# Hashing vectorizer with 2**21 features, fed by the custom tokenizer.
vect = HashingVectorizer(
    decode_error='ignore',
    n_features=2**21,
    preprocessor=None,
    tokenizer=tokenizer,
)

# loss='log' is rejected with an InvalidParameterError ("Got 'log' instead");
# 'log_loss' is among the accepted values, so it is used here.
clf = SGDClassifier(loss='log_loss', random_state=1)

# Generator over the (text, label) rows of the CSV file.
doc_stream = stream_docs(path="C:/Users/Administrator/Desktop/movie_data/movie_data.csv")

In [44]:
# Out-of-core learning: feed the classifier 45 mini-batches of 1000 documents.
import pyprind

pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])

for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if X_train:
        # Vectorize the batch and update the model incrementally.
        X_train = vect.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)
        pbar.update()
    else:
        # The stream yielded no batch: stop training.
        break

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:30


In [45]:
# Evaluate on the next 5000 documents of the stream, which the training loop
# has not consumed.
import warnings

X_test, y_test = get_minibatch(doc_stream, size=5000)
if not X_test:
    raise RuntimeError("The document stream ran out before any hold-out "
                       "documents could be read.")
if len(set(y_test)) < 2:
    # A hold-out set with a single label cannot measure classification
    # quality; an accuracy of exactly 1.000 is a typical symptom.
    warnings.warn("The hold-out batch contains only one class label, so the "
                  "accuracy below is not a meaningful estimate; shuffle the "
                  "CSV rows before streaming them.")
X_test = vect.transform(X_test)
print(f"Accuracy: {clf.score(X_test, y_test):.3f}")

Accuracy: 1.000


In [46]:
# Finally, use the last 5000 documents to update the model.
clf = clf.partial_fit(X_test, y_test)

#### 使用隐狄利克雷分配(LDA)进行主题建模

主题建模描述了将主题分配给未标记的文本文档的广泛任务。可以将主题建模视为聚类任务，即无监督学习的一个子类别。给定一个词袋矩阵作为输入，LDA(Latent Dirichlet Allocation)将其分解为两个新矩阵：

* A document-to-topic matrix;
* A word-to-topic matrix;

利用sklearn实现LDA的过程如下：

In [47]:
# Reload the review data; the example below limits the analysis to 10 topics.
import pandas as pd

df = (
    pd.read_csv('../Python-MachineLearning/Datasets/Imdb_data/movie_data.zip',
                encoding='utf-8')
    .rename(columns={"0": "review", "1": "sentiment"})
)

In [48]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer for LDA: English stop words removed, vocabulary
# restricted through max_df and max_features.
count = CountVectorizer(stop_words='english', 
                        max_df=.1, 
                        max_features=5000)

# Document-term count matrix of all reviews.
X = count.fit_transform(df['review'].values)

In [49]:
from sklearn.decomposition import LatentDirichletAllocation

# LDA with 10 topics and a fixed random_state.
lda = LatentDirichletAllocation(n_components=10, 
                                random_state=123, 
                                learning_method='batch')
# Fit on the count matrix and keep the transformed output, one row per review.
X_topics = lda.fit_transform(X)

In [50]:
# Inspect the shape of lda.components_: one row per topic (10) and one column per vocabulary term (5000).
lda.components_.shape

(10, 5000)

In [51]:
# Print the 5 most important words of each of the 10 topics.
n_top_words = 5
feature_names = count.get_feature_names_out()

for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {(topic_idx + 1)}:')
    # argsort() is ascending, so the reversed slice walks the n_top_words
    # largest weights from highest to lowest.
    print(' '.join(feature_names[i]
                   for i in topic.argsort()[:-n_top_words - 1:-1]))

Topic 1:
worst minutes script awful stupid
Topic 2:
family mother father girl children
Topic 3:
american war dvd music tv
Topic 4:
human audience cinema art feel
Topic 5:
police guy car murder dead
Topic 6:
horror house gore blood sex
Topic 7:
role performance comedy actor performances
Topic 8:
series episode war episodes tv
Topic 9:
book version original read effects
Topic 10:
action fight guy guys fun


In [52]:
# Column 5 of X_topics belongs to "Topic 6" in the printout above (horror, house, gore, ...).
# Sort the review positions by that column, highest value first.
horror = X_topics[:, 5].argsort()[::-1]

# Show the first 300 characters of the three top-ranked reviews.
for iter_idx, movie_idx in enumerate(horror[:3]):
    print(f'\nHorror movie #{(iter_idx + 1)}:')
    print(df['review'][movie_idx][:300], '...')


Horror movie #1:
House of Dracula works from the same basic premise as House of Frankenstein from the year before; namely that Universal's three most famous monsters; Dracula, Frankenstein's Monster and The Wolf Man are appearing in the movie together. Naturally, the film is rather messy therefore, but the fact that ...

Horror movie #2:
"House of the Damned" (also known as "Spectre") is one of your low budget haunted house horror flicks, filled with mediocre performances and cheap effects. It is about a family that inherits an old Irish mansion, and after moving in begin to experience strange phenomenon and ghostly apparitions, inc ...

Horror movie #3:
This film marked the end of the "serious" Universal Monsters era (Abbott and Costello meet up with the monsters later in "Abbott and Costello Meet Frankentstein"). It was a somewhat desparate, yet fun attempt to revive the classic monsters of the Wolf Man, Frankenstein's monster, and Dracula one "la ...
