# 1. 实例化TfidfVectorizer

In [1]:
# TfidfVectorizer converts a collection of raw documents to a matrix of
# TF-IDF features (CountVectorizer followed by TfidfTransformer).
from sklearn.feature_extraction.text import TfidfVectorizer
# Print the class documentation to see the available parameters.
help(TfidfVectorizer)

Help on class TfidfVectorizer in module sklearn.feature_extraction.text:

class TfidfVectorizer(CountVectorizer)
 |  Convert a collection of raw documents to a matrix of TF-IDF features.
 |  
 |  Equivalent to CountVectorizer followed by TfidfTransformer.
 |  
 |  Read more in the :ref:`User Guide <text_feature_extraction>`.
 |  
 |  Parameters
 |  ----------
 |  input : string {'filename', 'file', 'content'}
 |      If 'filename', the sequence passed as an argument to fit is
 |      expected to be a list of filenames that need reading to fetch
 |      the raw content to analyze.
 |  
 |      If 'file', the sequence items must have a 'read' method (file-like
 |      object) that is called to fetch the bytes in memory.
 |  
 |      Otherwise the input is expected to be the sequence strings or
 |      bytes items are expected to be analyzed directly.
 |  
 |  encoding : string, 'utf-8' by default.
 |      If bytes or files are given to analyze, this encoding is used to
 |      decode.
 |

In [2]:
# Instantiate with no arguments so that get_params() shows every default value.
tv = TfidfVectorizer()
tv.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

# 2. 参数

- 计算tf

In [3]:
# Toy corpus of four short documents; later cells reuse `texts`.
texts = ["Chinese Beijing Chinese",
         "Chinese Chinese Shanghai",
         "Chinese Macao",
         "Tokyo Japan Chinese"]
# Raw term frequency only: no idf weighting and no normalisation.
# smooth_idf is a boolean flag, so pass False instead of None; newer
# scikit-learn releases validate parameter types and reject None.
tv = TfidfVectorizer(use_idf=False, smooth_idf=False, norm=None)
tv_fit = tv.fit_transform(texts)
# get_feature_names() was removed in scikit-learn 1.2. Its replacement,
# get_feature_names_out() (scikit-learn >= 1.0), returns an ndarray, so
# .tolist() keeps the printed output a plain Python list.
print(tv.get_feature_names_out().tolist())
tv_fit.toarray()

['beijing', 'chinese', 'japan', 'macao', 'shanghai', 'tokyo']


array([[1., 2., 0., 0., 0., 0.],
       [0., 2., 0., 0., 1., 0.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 1., 0., 0., 1.]])

- 向量归一化

In [4]:
# Term frequency with each document vector scaled to unit L2 length.
tv = TfidfVectorizer(use_idf=False, smooth_idf=False, norm='l2')
tv_fit = tv.fit_transform(texts)
# get_feature_names() was removed in scikit-learn 1.2. Its replacement,
# get_feature_names_out() (scikit-learn >= 1.0), returns an ndarray, so
# .tolist() keeps the printed output a plain Python list.
print(tv.get_feature_names_out().tolist())
tv_fit.toarray()

['beijing', 'chinese', 'japan', 'macao', 'shanghai', 'tokyo']


array([[0.4472136 , 0.89442719, 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.89442719, 0.        , 0.        , 0.4472136 ,
        0.        ],
       [0.        , 0.70710678, 0.        , 0.70710678, 0.        ,
        0.        ],
       [0.        , 0.57735027, 0.57735027, 0.        , 0.        ,
        0.57735027]])

In [5]:
from math import log, pow
import numpy as np

# Reproduce the first row of the L2-normalised matrix by hand:
# divide the raw counts [1, 2, 0, 0, 0, 0] by their Euclidean length sqrt(1^2 + 2^2).
first_doc_counts = np.array([1., 2., 0., 0., 0., 0.])
euclidean_length = np.sqrt(pow(1.0, 2) + pow(2.0, 2))
(1.0 / euclidean_length) * first_doc_counts

array([0.4472136 , 0.89442719, 0.        , 0.        , 0.        ,
       0.        ])

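- 计算tf-idf，不加平滑：$\text{tf-idf}(t, d) = \text{tf}(t, d) \times \left(1 + \ln\frac{N}{\text{df}(t)}\right)$，其中 $N$ 为文档总数，$\text{df}(t)$ 为包含词 $t$ 的文档数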

In [6]:
# tf-idf without idf smoothing and without normalisation.
tv = TfidfVectorizer(use_idf=True, smooth_idf=False, norm=None)
tv_fit = tv.fit_transform(texts)
# get_feature_names() was removed in scikit-learn 1.2. Its replacement,
# get_feature_names_out() (scikit-learn >= 1.0), returns an ndarray, so
# .tolist() keeps the printed output a plain Python list.
print(tv.get_feature_names_out().tolist())
tv_fit.toarray()

['beijing', 'chinese', 'japan', 'macao', 'shanghai', 'tokyo']


array([[2.38629436, 2.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 2.        , 0.        , 0.        , 2.38629436,
        0.        ],
       [0.        , 1.        , 0.        , 2.38629436, 0.        ,
        0.        ],
       [0.        , 1.        , 2.38629436, 0.        , 0.        ,
        2.38629436]])

In [7]:
# Hand check for 'beijing' in the first document: tf = 1 and the term occurs
# in 1 of the 4 documents. Unsmoothed weight: tf * (1 + ln(N / df)).
n_docs, doc_freq = 4, 1
1.0 * (1 + log(n_docs / doc_freq))

2.386294361119891

In [8]:
# Hand check for 'chinese' in the first document: tf = 2 and the term occurs
# in all 4 documents, so ln(N / df) = 0 and the weight equals tf.
n_docs, doc_freq = 4, 4
2.0 * (1 + log(n_docs / doc_freq))

2.0

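- 计算tf-idf，加入平滑：$\text{tf-idf}(t, d) = \text{tf}(t, d) \times \left(1 + \ln\frac{1 + N}{1 + \text{df}(t)}\right)$，其中 $N$ 为文档总数，$\text{df}(t)$ 为包含词 $t$ 的文档数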

In [9]:
# tf-idf with idf smoothing enabled and no normalisation.
tv = TfidfVectorizer(use_idf=True, smooth_idf=True, norm=None)
tv_fit = tv.fit_transform(texts)
# get_feature_names() was removed in scikit-learn 1.2. Its replacement,
# get_feature_names_out() (scikit-learn >= 1.0), returns an ndarray, so
# .tolist() keeps the printed output a plain Python list.
print(tv.get_feature_names_out().tolist())
tv_fit.toarray()

['beijing', 'chinese', 'japan', 'macao', 'shanghai', 'tokyo']


array([[1.91629073, 2.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 2.        , 0.        , 0.        , 1.91629073,
        0.        ],
       [0.        , 1.        , 0.        , 1.91629073, 0.        ,
        0.        ],
       [0.        , 1.        , 1.91629073, 0.        , 0.        ,
        1.91629073]])

In [10]:
# Hand check for 'beijing' with smoothing: tf = 1, df = 1, N = 4.
# Smoothed weight: tf * (1 + ln((N + 1) / (df + 1))).
n_docs, doc_freq = 4, 1
1.0 * (1 + log((n_docs + 1) / (doc_freq + 1)))

1.916290731874155

In [11]:
 # Hand check for 'chinese' with smoothing: tf = 2, df = 4, N = 4, so the
 # log term is ln(5 / 5) = 0 and the weight equals tf.
 n_docs, doc_freq = 4, 4
 2.0 * (1 + log((n_docs + 1) / (doc_freq + 1)))

2.0

# 3. 方法

- inverse_transform(X)：返回某篇训练文档向量中的非0特征值所对应的特征词

In [12]:
# Fit with default settings, then map the first document's vector back to
# the terms whose entries are non-zero.
tv = TfidfVectorizer()
tv_fit = tv.fit_transform(texts)
first_doc_vector = tv_fit[0]
tv.inverse_transform(first_doc_vector)

[array(['chinese', 'beijing'], dtype='<U8')]

- transform(X)：返回测试文档的向量（使用训练时得到的词表和idf）

In [13]:
# Fit on the training corpus, then vectorise an unseen document with the
# already-fitted vectorizer (transform does not refit).
tv = TfidfVectorizer(use_idf=True, smooth_idf=True, norm=None)
tv_fit = tv.fit_transform(texts)
print("tv_fit.toarray()=\n", tv_fit.toarray())
test_text = ["Chinese Chinese Chinese Tokyo Japan"]
X_test = tv.transform(test_text)
# get_feature_names() was removed in scikit-learn 1.2. Its replacement,
# get_feature_names_out() (scikit-learn >= 1.0), returns an ndarray, so
# .tolist() keeps the printed output a plain Python list.
print(tv.get_feature_names_out().tolist())
print("X_test=\n", X_test.toarray())

tv_fit.toarray()=
 [[1.91629073 2.         0.         0.         0.         0.        ]
 [0.         2.         0.         0.         1.91629073 0.        ]
 [0.         1.         0.         1.91629073 0.         0.        ]
 [0.         1.         1.91629073 0.         0.         1.91629073]]
['beijing', 'chinese', 'japan', 'macao', 'shanghai', 'tokyo']
X_test=
 [[0.         3.         1.91629073 0.         0.         1.91629073]]


In [14]:
# Weight of 'chinese' in the test document: tf = 3, and the term occurs in
# all 4 training documents, so the smoothed log term is ln(5 / 5) = 0.
n_docs, doc_freq = 4, 4
3.0 * (1 + log((1 + n_docs) / (1 + doc_freq)))

3.0

In [15]:
# Weight of 'japan' in the test document: tf = 1, and the term occurs in
# 1 of the 4 training documents.
n_docs, doc_freq = 4, 1
1.0 * (1 + log((1 + n_docs) / (1 + doc_freq)))

1.916290731874155