In [1]:
!pip install sklearn



In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
# Let pandas render up to 600 rows of a frame before it switches to a truncated view.
pd.set_option("display.max_rows", 600)
from pathlib import Path  
import glob

In [5]:
# Folder searched for the .txt input files ("." = the notebook's working directory).
directory_path = "."

In [6]:
# Gather every .txt path under `directory_path`, then order the paths lexicographically.
text_files = glob.glob(f"{directory_path}/*.txt")
text_files.sort()

In [7]:
# Display the matched paths to confirm which files will be analysed.
text_files

['./week1_tokenized.txt',
 './week2_tokenized.txt',
 './week3_tokenized.txt',
 './week4_tokenized.txt',
 './week5_tokenized.txt',
 './week6_tokenized.txt',
 './week7_tokenized.txt',
 './week8_tokenized.txt']

In [8]:
# Use each file's name without directory or extension as its document title.
text_titles = [Path(file_path).stem for file_path in text_files]

In [9]:
# Display the derived titles; they become the row labels of the tf-idf table.
text_titles

['week1_tokenized',
 'week2_tokenized',
 'week3_tokenized',
 'week4_tokenized',
 'week5_tokenized',
 'week6_tokenized',
 'week7_tokenized',
 'week8_tokenized']

## Calculate tf–idf

In [80]:
# input='filename' makes the vectorizer open and read each path it is given,
# rather than treating the strings themselves as the text to analyse.
# NOTE(review): scikit-learn's default token_pattern keeps only tokens of two or more
# characters, so any single-character words in the files are dropped -- confirm this is intended.
tfidf_vectorizer = TfidfVectorizer(input='filename')

Run TfidfVectorizer on our `text_files`

In [81]:
# Learn the vocabulary and idf weights from the files and return the document-term tf-idf matrix.
tfidf_vector = tfidf_vectorizer.fit_transform(text_files)

Make a DataFrame out of the resulting tf–idf vector, setting the "feature names" or words as columns and the titles as rows

In [82]:
# Wide table of scores: one row per document title, one column per vocabulary term.
tfidf_df = pd.DataFrame(
    data=tfidf_vector.toarray(),
    index=text_titles,
    columns=tfidf_vectorizer.get_feature_names_out(),
)

Add a row for document frequency, i.e. the number of documents in which each word appears

In [83]:
# Document frequency = number of documents in which a term has a non-zero tf-idf score.
# Count over the document rows only: if this cell is re-run, the previously added
# 'Document Frequency' row must not be counted as a document itself.
tfidf_df.loc['Document Frequency'] = (tfidf_df.drop(index='Document Frequency', errors='ignore') > 0).sum()

In [90]:
# Keep only four terms of interest, then show their scores rounded to two
# decimals with the rows ordered by label.
tfidf_slice = tfidf_df.loc[:, ['政府', '国家', '美国', '香港']]
tfidf_slice.round(decimals=2).sort_index()

Unnamed: 0,政府,国家,美国,香港
Document Frequency,8.0,8.0,8.0,8.0
week1_tokenized,0.05,0.04,0.07,0.07
week2_tokenized,0.04,0.04,0.06,0.07
week3_tokenized,0.04,0.05,0.07,0.07
week4_tokenized,0.01,0.02,0.03,0.03
week5_tokenized,0.07,0.06,0.08,0.13
week6_tokenized,0.05,0.05,0.09,0.08
week7_tokenized,0.03,0.04,0.1,0.05
week8_tokenized,0.05,0.12,0.07,0.06


In [29]:
# Preview the long format: one row per (row label, term) pair with its score.
# Nothing is assigned here; the result is only displayed.
tfidf_df.stack().reset_index()

Unnamed: 0,level_0,level_1,0
0,week1_tokenized,ªª,0.000000
1,week1_tokenized,ª½,0.000044
2,week1_tokenized,ªà,0.000000
3,week1_tokenized,ªå,0.000066
4,week1_tokenized,ªæ,0.000066
...,...,...,...
28233547,week8_tokenized,勇野,0.000000
28233548,week8_tokenized,勇闖,0.000000
28233549,week8_tokenized,吆原,0.000000
28233550,week8_tokenized,晉少,0.000000


In [30]:
# Reshape wide -> long: one row per (document, term) pair.
# The 'Document Frequency' summary row is not a document, so drop it first;
# otherwise it is stacked as an extra "document" whose values are counts rather
# than tf-idf scores, and it would dominate every later ranking and the heatmap.
# NOTE: this rebinds tfidf_df, so the wide frame must be rebuilt before this
# cell can be run a second time.
tfidf_df = tfidf_df.drop(index='Document Frequency', errors='ignore').stack().reset_index()

In [31]:
# Name the stacked columns: former row label -> document, former column label -> term,
# stacked value -> tfidf. (The stacked frame has no 'level_2' column, and mapping two
# source columns to the same name 'term' would only risk duplicate column names.)
tfidf_df = tfidf_df.rename(columns={'level_0': 'document', 'level_1': 'term', 0: 'tfidf'})

To find the 50 words with the highest tf–idf in every document, we sort by document and tf–idf score, then group by document and take the first 50 rows of each group.

In [66]:
# Within each document, order terms from highest to lowest tf-idf and keep the first 50.
top_tfidf = (
    tfidf_df
    .sort_values(by=['document', 'tfidf'], ascending=[True, False])
    .groupby(['document'])
    .head(50)
)

In [67]:
# Show the first ten rows of the ranked table.
top_tfidf.head(10)

Unnamed: 0,document,term,tfidf
1193410,week1_tokenized,微博,0.253599
664382,week1_tokenized,喜欢,0.243759
2296261,week1_tokenized,知道,0.212054
1633307,week1_tokenized,朋友,0.162581
289640,week1_tokenized,关注,0.161368
2166962,week1_tokenized,生活,0.160896
34493,week1_tokenized,世界,0.157142
3037166,week1_tokenized,轉發,0.147022
916969,week1_tokenized,孩子,0.140884
3250493,week1_tokenized,问题,0.125499


We can zoom in on particular words and particular documents.

In [72]:
# Rows of the top-terms table whose term contains '政府'.
top_tfidf.loc[lambda frame: frame['term'].str.contains('政府')]

Unnamed: 0,document,term,tfidf
15630598,week5_tokenized,政府,0.069408
26218180,week8_tokenized,政府,0.054138


In [60]:
# Rows of the top-terms table whose term contains '国家'.
top_tfidf.loc[lambda frame: frame['term'].str.contains('国家')]

Unnamed: 0,document,term,tfidf
705092,week1_tokenized,国家,0.041265
4234286,week2_tokenized,国家,0.043882
7763480,week3_tokenized,国家,0.053949
14821868,week5_tokenized,国家,0.0596
18351062,week6_tokenized,国家,0.053793
21880256,week7_tokenized,国家,0.041539
25409450,week8_tokenized,国家,0.122977


In [77]:
# Rows of the top-terms table whose term contains '美国'.
top_tfidf.loc[lambda frame: frame['term'].str.contains('美国')]

Unnamed: 0,document,term,tfidf
2534322,week1_tokenized,美国,0.067827
6063516,week2_tokenized,美国,0.062304
9592710,week3_tokenized,美国,0.074176
16651098,week5_tokenized,美国,0.07786
20180292,week6_tokenized,美国,0.091771
23709486,week7_tokenized,美国,0.096565
27238680,week8_tokenized,美国,0.073824


In [74]:
# Rows of the top-terms table whose term contains '台湾'.
top_tfidf.loc[lambda frame: frame['term'].str.contains('台湾')]

Unnamed: 0,document,term,tfidf
7607096,week3_tokenized,台湾,0.054601


In [75]:
# Rows of the top-terms table whose term contains '香港'.
top_tfidf.loc[lambda frame: frame['term'].str.contains('香港')]

Unnamed: 0,document,term,tfidf
3405417,week1_tokenized,香港,0.068351
6934611,week2_tokenized,香港,0.072253
10463805,week3_tokenized,香港,0.074415
17522193,week5_tokenized,香港,0.127512
21051387,week6_tokenized,香港,0.079572
24580581,week7_tokenized,香港,0.05075
28109775,week8_tokenized,香港,0.064576


## Visualize TF-IDF

In [None]:
!pip install altair

Let's make a heatmap that shows the highest TF-IDF scoring words for each weekly document, and let's put a red dot next to the terms of interest listed in `term_list` (国家, 政府, 美国, 台湾, 香港):

The code below was contributed by [Eric Monson](https://github.com/emonson). Thanks, Eric!

In [70]:
# Imported here rather than at the top of the notebook because altair is only
# installed by the cell directly above.
import altair as alt
import numpy as np

# Terms in this list will get a red dot in the visualization
term_list = ['国家', '政府','美国','台湾','香港']

# Add a tiny jitter so that terms with identical tf-idf scores get distinct ranks.
# The generator is seeded so the jitter -- and therefore the order of tied terms
# in the chart -- is the same on every run.
jitter_rng = np.random.default_rng(0)
top_tfidf_plusRand = top_tfidf.copy()
top_tfidf_plusRand['tfidf'] = top_tfidf_plusRand['tfidf'] + jitter_rng.random(top_tfidf.shape[0])*0.0001

# Base for all layers: x is each term's rank within its document (highest
# tf-idf first), y is the document.
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'document:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("tfidf", order="descending")],
    groupby = ["document"],
)

# Heatmap layer: cell colour encodes the tf-idf score.
heatmap = base.mark_rect().encode(
    color = 'tfidf:Q'
)

# Marker layer: red circle over terms in term_list, fully transparent otherwise.
circle = base.mark_circle(size=100).encode(
    color = alt.condition(
        alt.FieldOneOfPredicate(field='term', oneOf=term_list),
        alt.value('red'),
        alt.value('#FFFFFF00')
    )
)

# Label layer: the term itself, drawn in white once the score reaches 0.23
# (the darker heatmap cells) and in black below that.
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# Display the three superimposed layers; this must stay the last expression so
# the chart is rendered as the cell output.
(heatmap + circle + text).properties(width = 2000)