#### 获取原始数据

In [12]:
# Raw dataset download link: https://ai.stanford.edu/~amaas/data/sentiment/
# The extraction code below is disabled by wrapping it in a bare triple-quoted
# string: nothing inside it runs, and the notebook simply echoes the string as
# the cell's output.
# NOTE(review): if this is ever re-enabled, tar.extractall() is called without a
# path, so the archive would be unpacked into the current working directory,
# while the isdir() guard checks
# '../Python-MachineLearning/Datasets/IMDB_data/aclImdb' -- confirm the
# intended extraction target before running.
"""
import os
import tarfile

target = '../Python-MachineLearning/Datasets/IMDB_data/aclImdb_v1.tar.gz'

if not os.path.isdir('../Python-MachineLearning/Datasets/IMDB_data/aclImdb'):

    with tarfile.open(target, 'r:gz') as tar:
        tar.extractall()
"""

"\nimport os\nimport tarfile\n\ntarget = '../Python-MachineLearning/Datasets/IMDB_data/aclImdb_v1.tar.gz'\n\nif not os.path.isdir('../Python-MachineLearning/Datasets/IMDB_data/aclImdb'):\n\n    with tarfile.open(target, 'r:gz') as tar:\n        tar.extractall()\n"

#### 将解压提取的数据转换为DataFrame

In [13]:
import pyprind
import pandas as pd
import os
import sys
from packaging import version


# # change the `basepath` to the directory of the
# # unzipped movie dataset

# basepath = r'C:/Users/Administrator/Desktop/aclImdb/'

# labels = {'pos': 1, 'neg': 0}

# # if the progress bar does not show, change stream=sys.stdout to stream=2
# pbar = pyprind.ProgBar(50000, stream=sys.stdout)

# df = pd.DataFrame()
# for s in ('test', 'train'):
#     for l in ('pos', 'neg'):
#         path = os.path.join(basepath, s, l)
#         for file in sorted(os.listdir(path)):
#             with open(os.path.join(path, file), 
#                       'r', encoding='utf-8') as infile:
#                 txt = infile.read()
                
#             if version.parse(pd.__version__) >= version.parse("1.3.2"):
#                 x = pd.DataFrame([[txt, labels[l]]], columns=['review', 'sentiment'])
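#                 # NOTE: ignore_index must be True (as in the else branch); otherwise every
#                 # row keeps label 0 and the later df.reindex(np.random.permutation(df.index))
#                 # cannot reindex on duplicate labels.
#                 df = pd.concat([df, x], ignore_index=True)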

#             else:
#                 df = df.append([[txt, labels[l]]], 
#                                ignore_index=True)
#             pbar.update()
# df.columns = ['review', 'sentiment']

In [14]:
# 将处理好的数据保存到本地CSV.
# print(pd.__version__)       # 1.3.2

# import numpy as np

# np.random.seed(0)
# df = df.reindex(np.random.permutation(df.index))
# df.to_csv('C:/Users/Administrator/Desktop/movie_data.csv', index=False, encoding='utf-8')

In [16]:
# Load the preprocessed IMDB dataset from the Datasets/Imdb_data directory and
# map the column labels "0"/"1" (if present) to descriptive names.
df = (
    pd.read_csv(
        '../Python-MachineLearning/Datasets/Imdb_data/movie_data.zip',
        encoding='utf-8',
    )
    .rename(columns={"0": "review", "1": "sentiment"})
)
df.head(3)

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1


In [17]:
# Sanity-check the loaded frame's dimensions as (rows, columns).
print(df.shape)

(50000, 2)


#### 引言--介绍词袋模型

In [18]:
 # 将词转换为特征向量.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words demo: learn the vocabulary of three toy sentences and build the
# sparse matrix of per-sentence word counts.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two',
])

count = CountVectorizer()
bag = count.fit_transform(docs)

In [19]:
# Show the learned vocabulary: a dict mapping each token to its column index
# in the count matrix.
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [20]:
# Each column corresponds to one vocabulary word (feature) and each row to one
# sentence; the first column is "and", which occurs twice in the third sentence.
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


#### 通过词频-逆文档频率评估词的相关性

有时候，一些词会频繁出现在多个文档中，这些高频词通常不包含有用或具有区分度(discriminatory)的信息。因此可以使用`term frequency-inverse document frequency (tf-idf)`来降低这些高频词在特征向量中的权重。其定义如下：

$$tf-idf(t,d)=tf(t,d)\times idf(t,d)$$

其中，$tf(t,d)$是词项(term) $t$在文档$d$中出现的频次，$idf(t,d)$是逆文档频率，其定义如下：

$$idf(t,d)=\log\frac{n_d}{1+df(d,t)}$$

其中，$n_d$是文档总个数，$df(d,t)$是包含有term $t$的文档数量.

In [21]:
from sklearn.feature_extraction.text import TfidfTransformer

# Show floats with two decimals whenever NumPy arrays are printed.
np.set_printoptions(precision=2)

# tf-idf transformer with smoothed idf weighting and L2 normalisation.
tfidf = TfidfTransformer(smooth_idf=True, use_idf=True, norm='l2')

# Printing the sparse result lists (row, column) coordinates with their weights.
print(tfidf.fit_transform(bag))

  (0, 1)	0.4337078595086741
  (0, 3)	0.5584778353707552
  (0, 4)	0.5584778353707552
  (0, 6)	0.4337078595086741
  (1, 1)	0.4337078595086741
  (1, 5)	0.5584778353707552
  (1, 6)	0.4337078595086741
  (1, 8)	0.5584778353707552
  (2, 0)	0.5023864481078999
  (2, 1)	0.4450762939064939
  (2, 2)	0.5023864481078999
  (2, 3)	0.19103892151222399
  (2, 4)	0.19103892151222399
  (2, 5)	0.19103892151222399
  (2, 6)	0.29671752927099593
  (2, 7)	0.25119322405394995
  (2, 8)	0.19103892151222399


In [24]:
# The sparse printout above lists only the non-zero entries; convert the result
# to a dense array to see the full matrix, zeros included.
print(tfidf.fit_transform(bag).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


#### sklearn中tf-idf的实现

sklearn中tf-idf的实现如下：

$$idf(t,d)=\log\frac{1+n_d}{1+df(d,t)}$$

对应的有：

$$tf-idf(t,d)=tf(t,d)\times(idf(t,d)+1)$$

这里分子的加1对应于上述参数设置`smooth_idf=True`。其效果是使出现在所有文档中的词的idf为零（即$\log 1=0$）；再结合上式中的"+1"，这类词的tf-idf等于其原始词频，而不会被完全忽略。

此外，虽然更常见的做法是在计算tf-idf之前先对原始词频进行归一化，但TfidfTransformer类是直接对tf-idf值进行归一化的，对应于上述参数`norm='l2'`，其定义如下：

$$v_{norm}=\frac{v}{\|v\|_{2}}=\frac{v}{\sqrt{v_{1}^{2}+v_{2}^{2}+\cdots+v_{n}^{2}}}=\frac{v}{(\sum_{i=1}^{n}v_{i}^{2})^{1/2}}$$

#### 一个计算示例

以"is"为例：

$$idf(\text{"is"},d_3)=\log\frac{1+3}{1+3}=0$$

所以有：
$$tf-idf("is",d_3)=3\times(0+1)=3$$

重复上述计算过程，得到tf-idf向量为：$\text{vectors: [3.39, 3.0, 3.39, 1.29, 1.29, 1.29, 2.0, 1.69, 1.29]}$

所以有在进行L2归一化之后得到：

$$\begin{aligned}tf-idf(d_{3})_{norm}&=\quad\frac{[3.39,3.0,3.39,1.29,1.29,1.29,2.0,1.69,1.29]}{\sqrt{3.39^2+3.0^2+3.39^2+1.29^2+1.29^2+1.29^2+2.0^2+1.69^2+1.29^2}}\\&=\quad[0.5,0.45,0.5,0.19,0.19,0.19,0.3,0.25,0.19]\end{aligned}$$

同时，可以看到：
$$tf-idf("is",d_3)\quad=\quad0.45$$

#### 文本数据清洗