In [146]:
!pip install sklearn



In [147]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
pd.set_option("display.max_rows", 600)
from pathlib import Path  
import glob

In [148]:
directory_path = "."

In [149]:
text_files = sorted(glob.glob(f"{directory_path}/*.txt"))

In [150]:
text_files

['./week1_tokenized.txt',
 './week2_tokenized.txt',
 './week3_tokenized.txt',
 './week4_tokenized.txt',
 './week5_tokenized.txt',
 './week6_tokenized.txt',
 './week7_tokenized.txt',
 './week8_tokenized.txt']

In [151]:
text_titles = [Path(text).stem for text in text_files]

In [152]:
text_titles

['week1_tokenized',
 'week2_tokenized',
 'week3_tokenized',
 'week4_tokenized',
 'week5_tokenized',
 'week6_tokenized',
 'week7_tokenized',
 'week8_tokenized']

In [153]:
tfidf_vectorizer = TfidfVectorizer(input='filename')

In [154]:
tfidf_vector = tfidf_vectorizer.fit_transform(text_files)

In [155]:
tfidf_df = pd.DataFrame(tfidf_vector.toarray(), index=text_titles, columns=tfidf_vectorizer.get_feature_names_out())

In [156]:
tfidf_df

Unnamed: 0,ªª,ª½,ªà,ªå,ªæ,ªç,ªè,ªñ,²²,²³,...,勇熊,勇猛,勇直,勇絕,勇退,勇野,勇闖,吆原,晉少,貫家
week1_tokenized,0.0,4.4e-05,0.0,6.6e-05,6.6e-05,4.4e-05,2.2e-05,0.0,0.0,2.2e-05,...,0.0,0.0,1.2e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
week2_tokenized,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.2e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
week3_tokenized,2.1e-05,0.0,0.0,0.0,0.0,0.0,0.0,1.3e-05,0.0,0.0,...,0.0,2.1e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
week4_tokenized,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,9e-06,0.0,1.6e-05,0.0,0.0,0.0,1.6e-05,1.6e-05
week5_tokenized,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.1e-05,0.0,1.2e-05,0.0,0.0,0.0,0.0,2.1e-05,0.0,0.0
week6_tokenized,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.3e-05,0.0,0.0,...,0.0,0.0,1.1e-05,2e-05,0.0,0.0,2e-05,0.0,0.0,0.0
week7_tokenized,0.0,0.0,1.9e-05,0.0,0.0,0.0,0.0,1.2e-05,3.8e-05,0.0,...,0.0,0.0,0.0,0.0,0.0,1.9e-05,0.0,0.0,0.0,0.0
week8_tokenized,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5e-05,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Add column for document frequency aka number of times word appears in all documents

In [142]:
tfidf_df1 = tfidf_df.stack().reset_index()

In [133]:
tfidf_df2 = tfidf_df.rename(columns={0:'tfidf', 'level_0': 'document','level_1': 'term'})

In [141]:
tfidf_df2

Unnamed: 0,document,term,tfidf
0,0,level_0,week1_tokenized
1,0,level_1,ªª
2,0,0,0.0
3,1,level_0,week1_tokenized
4,1,level_1,ª½
...,...,...,...
84700651,28233550,level_1,晉少
84700652,28233550,0,0.0
84700653,28233551,level_0,week8_tokenized
84700654,28233551,level_1,貫家


To find out the top 10 words with the highest tf–idf for every story, we're going to sort by document and tfidf score and then groupby document and take the first 10 values.

In [145]:
top_tfidf = tfidf_df2.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document']).head(200)

KeyError: 'document'

In [None]:
top_tfidf

We can zoom in on particular words and particular documents.

In [144]:
top_tfidf[top_tfidf['term'].str.contains('政府')]

ValueError: Cannot mask with non-boolean array containing NA / NaN values

It turns out that the term "women" is very distinctive in Obama's Inaugural Address.

In [140]:
top_tfidf[top_tfidf['document'].str.contains('obama')]

AttributeError: Can only use .str accessor with string values!

## Visualize TF-IDF

In [98]:
!pip3 install altair



Let's make a heatmap that shows the highest TF-IDF scoring words for each president, and let's put a red dot next to two terms of interest: "war" and "peace":

The code below was contributed by [Eric Monson](https://github.com/emonson). Thanks, Eric!

In [138]:
top_tfidf = top_tfidf[top_tfidf["document"] != 'week9_tokenized']

In [139]:
import altair as alt
import numpy as np

# Terms in this list will get a red dot in the visualization
term_list = []

# adding a little randomness to break ties in term ranking
top_tfidf_plusRand = top_tfidf.copy()
top_tfidf_plusRand['tfidf'] = top_tfidf_plusRand['tfidf'] + np.random.rand(top_tfidf.shape[0])*0.0001

# base for all visualizations, with rank calculation
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'document:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("tfidf", order="descending")],
    groupby = ["document"],
)

# heatmap specification
heatmap = base.mark_rect().encode(
    color = 'tfidf:Q'
)

# red circle over terms in above list
circle = base.mark_circle(size=100).encode(
    color = alt.condition(
        alt.FieldOneOfPredicate(field='term', oneOf=term_list),
        alt.value('red'),
        alt.value('#FFFFFF00')        
    )
)

# text labels, white for darker heatmap colors
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# display the three superimposed visualizations
(heatmap + circle + text).properties(width = 6000)

TypeError: can only concatenate str (not "float") to str

In [109]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

mylist = text_files
df = pd.DataFrame({"texts": mylist})
tfidf_vectorizer = TfidfVectorizer(ngram_range=[1, 1])
tfidf_separate = tfidf_vectorizer.fit_transform(df["texts"])

df_tfidf = pd.DataFrame(
    tfidf_separate.toarray(), columns=tfidf_vectorizer.get_feature_names(), index=df.index
)
df_tfidf




Unnamed: 0,txt,week1_tokenized,week2_tokenized,week3_tokenized,week4_tokenized,week5_tokenized,week6_tokenized,week7_tokenized,week8_tokenized,week9_tokenized
0,0.357847,0.93378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.357847,0.0,0.93378,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.357847,0.0,0.0,0.93378,0.0,0.0,0.0,0.0,0.0,0.0
3,0.357847,0.0,0.0,0.0,0.93378,0.0,0.0,0.0,0.0,0.0
4,0.357847,0.0,0.0,0.0,0.0,0.93378,0.0,0.0,0.0,0.0
5,0.357847,0.0,0.0,0.0,0.0,0.0,0.93378,0.0,0.0,0.0
6,0.357847,0.0,0.0,0.0,0.0,0.0,0.0,0.93378,0.0,0.0
7,0.357847,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.93378,0.0
8,0.357847,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.93378
