### TF-IDF算法代码示例
1.引入依赖

2.定义数据和预处理

3.进行词数统计

4.计算词频 TF

5.计算逆文档频率 IDF

6.计算 TF-IDF

#### 1. 引入依赖

In [9]:
import numpy as np
import pandas as pd

#### 2. 定义数据和预处理

In [10]:
# Two toy documents used throughout this walkthrough.
docA, docB = 'The cat sat on my bed', 'The dog sat on my knees'

# Tokenize each document into a bag of words by splitting on single spaces.
bowA, bowB = (doc.split(" ") for doc in (docA, docB))
# bowA -> ['The', 'cat', 'sat', 'on', 'my', 'bed']
# bowB -> ['The', 'dog', 'sat', 'on', 'my', 'knees']

# Vocabulary: every distinct word that appears in either document.
wordSet = {*bowA, *bowB}
# wordSet -> {'The', 'bed', 'cat', 'dog', 'knees', 'my', 'on', 'sat'}

#### 3. 进行词数统计

In [11]:
# Per-document occurrence counts, keyed by every word of the shared vocabulary.
# Words that do not occur in a document get a count of 0, e.g. for document A:
# {'knees': 0, 'bed': 1, 'on': 1, 'The': 1, 'dog': 0, 'my': 1, 'sat': 1, 'cat': 1}
wordDictA = {word: bowA.count(word) for word in wordSet}
wordDictB = {word: bowB.count(word) for word in wordSet}

# Show both count vectors side by side (one row per document).
pd.DataFrame(data=[wordDictA, wordDictB])

Unnamed: 0,The,bed,cat,dog,knees,my,on,sat
0,1,1,1,0,0,1,1,1
1,1,0,0,1,1,1,1,1


#### 4. 计算词频 TF

In [12]:
def computeTF(wordDict, bow):
    """Compute the term frequency (TF) of every word for a single document.

    Args:
        wordDict: Mapping of word -> number of occurrences in the document.
        bow: The document's list of tokens; its length is the denominator.

    Returns:
        A new dict mapping each word of ``wordDict`` to ``count / len(bow)``.
        When ``bow`` is empty, every word maps to ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    nbowCount = len(bow)
    if nbowCount == 0:
        # An empty document contains no terms, so every frequency is zero.
        return dict.fromkeys(wordDict, 0.0)

    return {word: count / nbowCount for word, count in wordDict.items()}

# Smoke test: term frequencies of both documents, printed one dict per line.
tfA, tfB = computeTF(wordDictA, bowA), computeTF(wordDictB, bowB)
print(tfA, tfB, sep="\n")

{'knees': 0.0, 'bed': 0.16666666666666666, 'on': 0.16666666666666666, 'The': 0.16666666666666666, 'dog': 0.0, 'my': 0.16666666666666666, 'sat': 0.16666666666666666, 'cat': 0.16666666666666666}
{'knees': 0.16666666666666666, 'bed': 0.0, 'on': 0.16666666666666666, 'The': 0.16666666666666666, 'dog': 0.16666666666666666, 'my': 0.16666666666666666, 'sat': 0.16666666666666666, 'cat': 0.0}


#### 5. 计算逆文档频率 IDF

In [13]:
def computeIDF(wordDictList):
    """Compute the smoothed inverse document frequency (IDF) of every word.

    For each word, ``Ni`` is the number of documents whose count for that word
    is greater than zero, and ``N`` is the total number of documents. The
    result is ``log10((N + 1) / (Ni + 1))``; the ``+ 1`` terms keep the ratio
    finite for words that occur in no document.

    Args:
        wordDictList: List of per-document dicts mapping word -> count.

    Returns:
        A dict mapping every word seen in any of the input dicts to its IDF.
        Keys keep first-seen order. An empty ``wordDictList`` yields ``{}``.
    """
    import math

    # Total number of documents.
    N = len(wordDictList)

    # Document frequency per word. Accumulating over every dict (rather than
    # seeding from the first one only) supports dicts with differing keys and
    # an empty input list.
    docFreq = {}
    for wordDict in wordDictList:
        for word, count in wordDict.items():
            docFreq[word] = docFreq.get(word, 0) + (1 if count > 0 else 0)

    return {word: math.log10((N + 1) / (Ni + 1)) for word, Ni in docFreq.items()}

# Smoke test: derive the IDF weights from the two per-document count dicts.
idfs = computeIDF([wordDictA, wordDictB])

#### 6. 计算 TF-IDF

In [14]:
def computeTFIDF(tf, idfs):
    """Multiply each term frequency by the matching inverse document frequency.

    Args:
        tf: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> IDF; must contain every key of ``tf``.

    Returns:
        A new dict mapping each word of ``tf`` to ``tf[word] * idfs[word]``.
    """
    return {term: weight * idfs[term] for term, weight in tf.items()}

# Smoke test: TF-IDF vectors of both documents, built from the shared IDF table.
tfidfA, tfidfB = (computeTFIDF(tfVec, idfs) for tfVec in (tfA, tfB))

# Show both TF-IDF vectors side by side (one row per document).
pd.DataFrame(data=[tfidfA, tfidfB])

Unnamed: 0,The,bed,cat,dog,knees,my,on,sat
0,0.0,0.029349,0.029349,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.029349,0.029349,0.0,0.0,0.0
