In [1]:
# Sample texts: three short English passages (football, basketball, volleyball)
# that serve as the toy corpus for the TF-IDF experiments in this notebook.
# Every line inside the literals ends with a trailing space, so neighbouring
# words stay separated even when the newline characters are stripped.
text1 ="""
Football is a family of team sports that involve, to varying degrees, kicking a ball to score a goal. 
Unqualified, the word football is understood to refer to whichever form of football is the most popular 
in the regional context in which the word appears. Sports commonly called football in certain places 
include association football (known as soccer in some countries); gridiron football (specifically American 
football or Canadian football); Australian rules football; rugby football (either rugby league or rugby union); 
and Gaelic football. These different variations of football are known as football codes.
"""

text2 = """
Basketball is a team sport in which two teams of five players, opposing one another on a rectangular court, 
compete with the primary objective of shooting a basketball (approximately 9.4 inches (24 cm) in diameter) 
through the defender's hoop (a basket 18 inches (46 cm) in diameter mounted 10 feet (3.048 m) high to a backboard 
at each end of the court) while preventing the opposing team from shooting through their own hoop. A field goal is 
worth two points, unless made from behind the three-point line, when it is worth three. After a foul, timed play stops 
and the player fouled or designated to shoot a technical foul is given one or more one-point free throws. The team with 
the most points at the end of the game wins, but if regulation play expires with the score tied, an additional period 
of play (overtime) is mandated.
"""

text3 = """
Volleyball, game played by two teams, usually of six players on a side, in which the players use their hands to bat a 
ball back and forth over a high net, trying to make the ball touch the court within the opponents’ playing area before 
it can be returned. To prevent this a player on the opposing team bats the ball up and toward a teammate before it touches 
the court surface—that teammate may then volley it back across the net or bat it to a third teammate who volleys it across 
the net. A team is allowed only three touches of the ball before it must be returned over the net.
"""

In [2]:
import nltk
import string
# Text preprocessing
# Step 1: flatten line breaks, split into sentences, tokenize, then drop punctuation.
def get_tokens(text):
    """Tokenize ``text`` into a flat list of non-punctuation tokens.

    Args:
        text: Raw text; may span several lines.

    Returns:
        list[str]: Tokens in document order with their original casing,
        excluding tokens made up solely of ASCII punctuation characters.
    """
    # Join the lines with a space rather than deleting the newline, so the last
    # word of one line is never fused with the first word of the next line.
    text = ' '.join(text.splitlines())
    punctuation = set(string.punctuation)
    tokens = []
    for sent in nltk.sent_tokenize(text):  # sentence splitting
        for word in nltk.word_tokenize(sent):
            # Drop a token when every character is punctuation. Unlike a
            # substring test against string.punctuation, this also removes
            # multi-character tokens such as '...', '--', and the `` / ''
            # tokens that word_tokenize produces for double quotes.
            if not all(ch in punctuation for ch in word):
                tokens.append(word)
    return tokens


In [3]:
from nltk.corpus import stopwords
from collections import Counter
# Step 2: remove stop words and build the count dictionary (occurrences of each word).
def make_count(text):
    """Count how often each non-stop-word token occurs in ``text``.

    Args:
        text: Raw document text; it is tokenized with ``get_tokens``.

    Returns:
        collections.Counter: Maps each remaining token to its number of
        occurrences. Matching against the stop-word list is case-sensitive,
        so a capitalised token such as 'These' is kept.
    """
    # Build the stop-word set once instead of calling stopwords.words() for
    # every token; set membership gives the same answer as list membership.
    stop_words = set(stopwords.words('english'))
    return Counter(w for w in get_tokens(text) if w not in stop_words)

In [4]:
# Build and print the count dictionary for text1 as an example.
print(make_count(text1))

Counter({'football': 12, 'rugby': 3, 'word': 2, 'known': 2, 'Football': 1, 'family': 1, 'team': 1, 'sports': 1, 'involve': 1, 'varying': 1, 'degrees': 1, 'kicking': 1, 'ball': 1, 'score': 1, 'goal': 1, 'Unqualified': 1, 'understood': 1, 'refer': 1, 'whichever': 1, 'form': 1, 'popular': 1, 'regional': 1, 'context': 1, 'appears': 1, 'Sports': 1, 'commonly': 1, 'called': 1, 'certain': 1, 'places': 1, 'include': 1, 'association': 1, 'soccer': 1, 'countries': 1, 'gridiron': 1, 'specifically': 1, 'American': 1, 'Canadian': 1, 'Australian': 1, 'rules': 1, 'either': 1, 'league': 1, 'union': 1, 'Gaelic': 1, 'These': 1, 'different': 1, 'variations': 1, 'codes': 1})


In [5]:
""" 手动实现TF-IDF """
import math

# Compute the term frequency (TF).
def tf(word, count):
    """Return the relative frequency of ``word`` within a single document.

    Args:
        word: The term to look up.
        count: Mapping from term to number of occurrences in the document
            (a ``collections.Counter`` or a plain ``dict``).

    Returns:
        float: ``count[word] / total occurrences``; ``0.0`` when the word is
        absent or the document has no terms at all.
    """
    total = sum(count.values())
    if total == 0:
        # An empty document contains no term, so every frequency is zero;
        # this avoids dividing by zero.
        return 0.0
    # .get() keeps plain dicts working: only Counter returns 0 for a missing key.
    return count.get(word, 0) / total

# Count how many documents in count_list contain word.
def n_containing(word, count_list):
    """Return the number of documents whose count mapping has ``word`` as a key.

    Args:
        word: The term to look for.
        count_list: Iterable of per-document count mappings.

    Returns:
        int: How many of the mappings contain ``word``.
    """
    hits = 0
    for doc_counts in count_list:
        if word in doc_counts:
            hits += 1
    return hits

# Compute the inverse document frequency (IDF).
def idf(word, count_list):
    """Return ``log2(N / df)`` for ``word`` over the corpus ``count_list``.

    Args:
        word: The term to score.
        count_list: Sequence of per-document count mappings (the corpus);
            ``N`` is its length.

    Returns:
        float: Base-2 log of the number of documents divided by the number of
        documents containing ``word``; ``0.0`` when no document contains it
        (which includes the empty corpus).
    """
    doc_freq = n_containing(word, count_list)
    if doc_freq == 0:
        # The ratio is undefined for a term that occurs nowhere. Return a
        # neutral 0.0 instead of raising ZeroDivisionError, so such a term
        # simply contributes nothing to a TF-IDF product.
        return 0.0
    return math.log2(len(count_list) / doc_freq)

# Compute the TF-IDF score.
def tdidf(word,count,count_list):
    """Return the TF-IDF weight of ``word`` for one document of the corpus.

    Args:
        word: The term to score.
        count: Count mapping of the document being scored.
        count_list: Count mappings of all documents in the corpus.

    Returns:
        float: ``tf(word, count)`` multiplied by ``idf(word, count_list)``.
    """
    term_frequency = tf(word, count)
    inverse_doc_frequency = idf(word, count_list)
    return term_frequency * inverse_doc_frequency

# TF-IDF test: score every word of each sample document with the manual
# implementation and print the five highest-scoring words per document.
count1,count2,count3=make_count(text1),make_count(text2),make_count(text3)
# The corpus: one Counter per document.
count_list=[count1,count2,count3]
print("Training by original algorithm......\n")
for i,count in enumerate(count_list):
    print("Top words in document %d"%(i + 1))
    # TF-IDF score of every distinct word in this document.
    scores={word: tdidf(word,count,count_list) for word in count}
    # Highest score first; sorted() is stable, so equal scores keep the
    # order in which the words were inserted into the Counter.
    sorted_words=sorted(scores.items(),key=lambda x:x[1],reverse=True)
    for word,score in sorted_words[:5]:
        print("    Word: %s, TF-IDF: %s"%(word, round(score, 5)))

Training by original algorithm......

Top words in document 1
    Word: football, TF-IDF: 0.30677
    Word: rugby, TF-IDF: 0.07669
    Word: word, TF-IDF: 0.05113
    Word: known, TF-IDF: 0.05113
    Word: Football, TF-IDF: 0.02556
Top words in document 2
    Word: play, TF-IDF: 0.05283
    Word: one, TF-IDF: 0.03522
    Word: shooting, TF-IDF: 0.03522
    Word: inches, TF-IDF: 0.03522
    Word: cm, TF-IDF: 0.03522
Top words in document 3
    Word: net, TF-IDF: 0.10226
    Word: teammate, TF-IDF: 0.07669
    Word: bat, TF-IDF: 0.05113
    Word: back, TF-IDF: 0.05113
    Word: returned, TF-IDF: 0.05113


In [8]:
""" 使用Gensim模块 """
from nltk.corpus import stopwords
from gensim import corpora,models,matutils

# Tokenize a document and drop stop words; the resulting token lists are the
# input for gensim's TF-IDF model.
def get_words(text):
    """Return the tokens of ``text`` with English stop words removed.

    Args:
        text: Raw document text; it is tokenized with ``get_tokens``.

    Returns:
        list[str]: Remaining tokens in document order (duplicates kept).
        Matching against the stop-word list is case-sensitive.
    """
    # Build the stop-word set once instead of calling stopwords.words() for
    # every token; set membership gives the same answer as list membership.
    stop_words = set(stopwords.words('english'))
    return [w for w in get_tokens(text) if w not in stop_words]

# NOTE: despite their names, count1..count3 hold token lists here (the return
# value of get_words), not Counter objects.
count1,count2,count3=get_words(text1),get_words(text2),get_words(text3)
countlist=[count1,count2,count3]
# Train gensim's TfidfModel on the three tokenized documents.
# Dictionary assigns an integer id to every distinct token.
dictionary=corpora.Dictionary(countlist)
# Reverse lookup (id -> token), used when printing the results.
new_dict={v:k for k,v in dictionary.token2id.items()}
# Convert each document to its bag-of-words representation.
corpus2=[dictionary.doc2bow(count) for count in countlist]
tfidf2=models.TfidfModel(corpus2)
# Apply the trained model to the corpus it was fitted on.
corpus_tfidf=tfidf2[corpus2]

# Output: the five highest-weighted words of each document.
print("\nTraining by gensim Tfidf Model.......\n")
for i, doc in enumerate(corpus_tfidf):
    print("Top words in document %d"%(i + 1))
    sorted_words = sorted(doc, key=lambda x: x[1], reverse=True)    # list of (token id, weight) pairs, highest weight first
    for num, score in sorted_words[:5]:
        print("    Word: %s, TF-IDF: %s"%(new_dict[num], round(score, 5)))


Training by gensim Tfidf Model.......

Top words in document 1
    Word: football, TF-IDF: 0.84766
    Word: rugby, TF-IDF: 0.21192
    Word: known, TF-IDF: 0.14128
    Word: word, TF-IDF: 0.14128
    Word: American, TF-IDF: 0.07064
Top words in document 2
    Word: play, TF-IDF: 0.29872
    Word: cm, TF-IDF: 0.19915
    Word: diameter, TF-IDF: 0.19915
    Word: end, TF-IDF: 0.19915
    Word: foul, TF-IDF: 0.19915
Top words in document 3
    Word: net, TF-IDF: 0.45775
    Word: teammate, TF-IDF: 0.34331
    Word: across, TF-IDF: 0.22888
    Word: back, TF-IDF: 0.22888
    Word: bat, TF-IDF: 0.22888


In [10]:
""" 可以观察到手动实现的和调包实现的有区别，区别在于gensim对得到的tf-idf向量做了规范化（normalize） """
import numpy as np

# Normalize a vector of scores to unit L2 length.
def unitvec(sorted_words):
    """Scale the scores of ``(word, score)`` pairs so their L2 norm is 1.

    Args:
        sorted_words: List of pairs whose second element is a numeric score.

    Returns:
        list[tuple]: New ``(word, score / norm)`` pairs in the input order.
        When the norm is 0 (empty input or all scores zero) the scores are
        returned unchanged, since a zero vector cannot be scaled.
    """
    # Plain left-to-right sum of squares; no array needs to be built for this.
    norm = math.sqrt(sum(item[1] * item[1] for item in sorted_words))
    if norm == 0:
        # Happens e.g. when every word occurs in every document (all IDF = 0);
        # dividing would raise ZeroDivisionError.
        return [(item[0], item[1]) for item in sorted_words]
    return [(item[0], item[1] / norm) for item in sorted_words]

# TF-IDF test: repeat the manual computation, this time L2-normalizing each
# document's scores, and print the three highest-scoring words per document.
# count1..count3 are rebound to Counter objects here.
count1, count2, count3 = make_count(text1), make_count(text2), make_count(text3)
countlist = [count1, count2, count3]
print("Training by original algorithm......\n")
for i, count in enumerate(countlist):
    print("Top words in document %d"%(i + 1))
    # TF-IDF score of every distinct word in this document.
    scores = {word: tdidf(word, count, countlist) for word in count}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)    # list of (word, score) pairs, highest score first
    sorted_words = unitvec(sorted_words)   # scale the scores to unit L2 norm
    for word, score in sorted_words[:3]:
        print("    Word: %s, TF-IDF: %s"%(word, round(score, 5)))

Training by original algorithm......

Top words in document 1
    Word: football, TF-IDF: 0.84766
    Word: rugby, TF-IDF: 0.21192
    Word: word, TF-IDF: 0.14128
Top words in document 2
    Word: play, TF-IDF: 0.29872
    Word: one, TF-IDF: 0.19915
    Word: shooting, TF-IDF: 0.19915
Top words in document 3
    Word: net, TF-IDF: 0.45775
    Word: teammate, TF-IDF: 0.34331
    Word: bat, TF-IDF: 0.22888
