## Calculate tf-idf scores using a collection of plain text (.txt) files

In [46]:
# Calculate tf-idf scores using a collection of plain text (.txt) files and the Python library scikit-learn, with a module called TfidfVectorizer.
# Tf-idf is a method that tries to identify the most distinctively frequent or significant words in a document.
# Calculate and normalize tf-idf scores for U.S. Inaugural Addresses with scikit-learn
# How distinctive is Obama’s inclusion of women in this address compared to all other U.S. Presidents? This is one of the questions that we’re going to try to answer with tf-idf.
# tf-idf = term_frequency * inverse_document_frequency
# We use scikit-learn’s TfidfVectorizer and CountVectorizer.
# https://melaniewalsh.github.io/Intro-Cultural-Analytics/05-Text-Analysis/03-TF-IDF-Scikit-Learn.html#visualize-tf-idf

!pip install scikit-learn



In [9]:
# Importing the necessary libraries for tfid calculation.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
pd.set_option("display.max_rows", 600)
from pathlib import Path
import glob

In [16]:
# The dataset was downloaded from Blackboard and uploaded to drive so that it could be accessed in colab env, and it was placed at the following path.
directory_path = "/content/drive/MyDrive/Colab Notebooks/Sem 2 - NLP Class/US_Inaugural_Addresses/US_Inaugural_Addresses/"

In [17]:
# The glob module finds all the pathnames matching a specified pattern according to the rules used by the Unix.
text_files = glob.glob(f"{directory_path}/*.txt")

In [18]:
# Display and confirm that we can read the files.
text_files

['/content/drive/MyDrive/Colab Notebooks/Sem 2 - NLP Class/US_Inaugural_Addresses/US_Inaugural_Addresses/01_washington_1789.txt',
 '/content/drive/MyDrive/Colab Notebooks/Sem 2 - NLP Class/US_Inaugural_Addresses/US_Inaugural_Addresses/02_washington_1793.txt',
 '/content/drive/MyDrive/Colab Notebooks/Sem 2 - NLP Class/US_Inaugural_Addresses/US_Inaugural_Addresses/19_lincoln_1861.txt',
 '/content/drive/MyDrive/Colab Notebooks/Sem 2 - NLP Class/US_Inaugural_Addresses/US_Inaugural_Addresses/22_grant_1873.txt',
 '/content/drive/MyDrive/Colab Notebooks/Sem 2 - NLP Class/US_Inaugural_Addresses/US_Inaugural_Addresses/27_cleveland_1893.txt',
 '/content/drive/MyDrive/Colab Notebooks/Sem 2 - NLP Class/US_Inaugural_Addresses/US_Inaugural_Addresses/30_roosevelt_theodore_1905.txt',
 '/content/drive/MyDrive/Colab Notebooks/Sem 2 - NLP Class/US_Inaugural_Addresses/US_Inaugural_Addresses/33_wilson_1917.txt',
 '/content/drive/MyDrive/Colab Notebooks/Sem 2 - NLP Class/US_Inaugural_Addresses/US_Inaugural_

In [19]:
# Extracting the filenames from the file path.
text_titles = [Path(text).stem for text in text_files]

In [20]:
text_titles

['01_washington_1789',
 '02_washington_1793',
 '19_lincoln_1861',
 '22_grant_1873',
 '27_cleveland_1893',
 '30_roosevelt_theodore_1905',
 '33_wilson_1917',
 '06_madison_1809',
 '18_buchanan_1857',
 '03_adams_john_1797',
 '16_taylor_1849',
 '09_monroe_1821',
 '14_harrison_1841',
 '25_cleveland_1885',
 '15_polk_1845',
 '10_adams_john_quincy_1825',
 '28_mckinley_1897',
 '29_mckinley_1901',
 '13_van_buren_1837',
 '04_jefferson_1801',
 '21_grant_1869',
 '31_taft_1909',
 '17_pierce_1853',
 '12_jackson_1833',
 '26_harrison_1889',
 '24_garfield_1881',
 '08_monroe_1817',
 '23_hayes_1877',
 '05_jefferson_1805',
 '07_madison_1813',
 '11_jackson_1829',
 '20_lincoln_1865',
 '34_harding_1921',
 '32_wilson_1913',
 '52_clinton_1993',
 '49_reagan_1981',
 '42_eisenhower_1953',
 '35_coolidge_1925',
 '47_nixon_1973',
 '38_roosevelt_franklin_1937',
 '54_bush_george_w_2001',
 '36_hoover_1929',
 '43_eisenhower_1957',
 '37_roosevelt_franklin_1933',
 '51_bush_george_h_w_1989',
 '53_clinton_1997',
 '50_reagan_1

In [26]:
# Term frequency Inverse document frequency (TFIDF) is a statistical formula to convert text documents into vectors based on the relevancy of the word.
# It is based on the bag of the words model to create a matrix containing the information about less relevant and most relevant words in the document.
# Calculate tf–idf which Convert a collection of raw documents to a matrix of TF-IDF features.
# Initialize TfidfVectorizer with desired parameters.

tfidf_vectorizer = TfidfVectorizer(input='filename', stop_words='english')

In [27]:
# Running TfidfVectorizer on our text_files
tfidf_vector = tfidf_vectorizer.fit_transform(text_files)

In [29]:
# Creating a dataframe from resulting tf–idf vector, setting the “feature names” or words as columns and the titles as rows.
tfidf_df = pd.DataFrame(tfidf_vector.toarray(), index=text_titles, columns=tfidf_vectorizer.get_feature_names_out())

In [32]:
# Adding document frequency to calculate number of times word appears in all documents
tfidf_df.loc['00_Document Frequency'] = (tfidf_df > 0).sum()

In [33]:
# Create a Dataframe and pick only those words that we are interested in.
tfidf_slice = tfidf_df[['government', 'borders', 'people', 'obama', 'war', 'honor','foreign', 'men', 'women', 'children']]
tfidf_slice.sort_index().round(decimals=2)

Unnamed: 0,government,borders,people,obama,war,honor,foreign,men,women,children
00_Document Frequency,54.0,6.0,57.0,4.0,46.0,33.0,33.0,48.0,16.0,23.0
01_washington_1789,0.11,0.0,0.05,0.0,0.0,0.0,0.0,0.02,0.0,0.0
02_washington_1793,0.06,0.0,0.05,0.0,0.0,0.08,0.0,0.0,0.0,0.0
03_adams_john_1797,0.16,0.0,0.19,0.0,0.01,0.1,0.12,0.04,0.0,0.0
04_jefferson_1801,0.16,0.0,0.01,0.0,0.01,0.04,0.0,0.04,0.0,0.0
05_jefferson_1805,0.03,0.0,0.0,0.0,0.04,0.0,0.06,0.01,0.0,0.02
06_madison_1809,0.0,0.0,0.02,0.0,0.02,0.05,0.05,0.0,0.0,0.0
07_madison_1813,0.04,0.0,0.04,0.0,0.25,0.02,0.02,0.0,0.0,0.0
08_monroe_1817,0.17,0.0,0.11,0.0,0.09,0.01,0.1,0.04,0.0,0.0
09_monroe_1821,0.08,0.0,0.06,0.0,0.11,0.02,0.04,0.01,0.0,0.01


In [34]:
# Drop OO_Document Frequency since we just used them for our understanding about the data.
tfidf_df = tfidf_df.drop('00_Document Frequency', errors='ignore')

In [35]:
# Reorganize the DataFrame so that the words are in rows rather than columns.
tfidf_df.stack().reset_index()

Unnamed: 0,level_0,level_1,0
0,01_washington_1789,000,0.000000
1,01_washington_1789,03,0.000000
2,01_washington_1789,04,0.023259
3,01_washington_1789,05,0.000000
4,01_washington_1789,100,0.000000
...,...,...,...
521937,44_kennedy_1961,zachary,0.000000
521938,44_kennedy_1961,zeal,0.000000
521939,44_kennedy_1961,zealous,0.000000
521940,44_kennedy_1961,zealously,0.000000


In [36]:
tfidf_df = tfidf_df.stack().reset_index()

In [37]:
# Rename the columns with appropriate names.
tfidf_df = tfidf_df.rename(columns={0:'tfidf', 'level_0': 'document','level_1': 'term', 'level_2': 'term'})

In [38]:
# To find out the top 10 words with the highest tf–idf for every story, we’re going to sort by document and tfidf score and then groupby document and take the first 10 values
tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document']).head(10)

Unnamed: 0,document,term,tfidf
3707,01_washington_1789,government,0.113681
4108,01_washington_1789,immutable,0.103883
4175,01_washington_1789,impressions,0.103883
6337,01_washington_1789,providential,0.103883
5631,01_washington_1789,ought,0.103728
6351,01_washington_1789,public,0.103102
6117,01_washington_1789,present,0.097516
6389,01_washington_1789,qualifications,0.096372
5811,01_washington_1789,peculiarly,0.090546
653,01_washington_1789,article,0.085786


In [39]:
# Finding out the top 10 words with the highest tf–idf for every story.
top_tfidf = tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document']).head(10)

In [40]:
# Zoom into a particular word 'women' to identify its Tfid in the documents.
top_tfidf[top_tfidf['term'].str.contains('women')]

Unnamed: 0,document,term,tfidf
503861,56_obama_2009,women,0.084859


In [41]:
# Zoom in on particular words 'obama 'and particular document.
top_tfidf[top_tfidf['document'].str.contains('obama')]

Unnamed: 0,document,term,tfidf
495406,56_obama_2009,america,0.148351
500298,56_obama_2009,nation,0.120229
500358,56_obama_2009,new,0.118002
503093,56_obama_2009,today,0.114792
498590,56_obama_2009,generation,0.100654
499762,56_obama_2009,let,0.0911
499578,56_obama_2009,jobs,0.090727
496911,56_obama_2009,crisis,0.087235
498779,56_obama_2009,hard,0.084859
503861,56_obama_2009,women,0.084859


In [42]:
# Zoom in on particular words 'trump' and particular document.
top_tfidf[top_tfidf['document'].str.contains('trump')]

Unnamed: 0,document,term,tfidf
477408,58_trump_2017,america,0.350162
479589,58_trump_2017,dreams,0.156436
477409,58_trump_2017,american,0.149226
481580,58_trump_2017,jobs,0.142766
483266,58_trump_2017,protected,0.132439
482413,58_trump_2017,obama,0.120288
482770,58_trump_2017,people,0.11237
485005,58_trump_2017,thank,0.109171
477993,58_trump_2017,borders,0.107075
485600,58_trump_2017,ve,0.107075


In [43]:
# Zoom in on particular words 'kennedy' and particular document.
top_tfidf[top_tfidf['document'].str.contains('kennedy')]

Unnamed: 0,document,term,tfidf
517760,44_kennedy_1961,let,0.267869
520292,44_kennedy_1961,sides,0.262849
518907,44_kennedy_1961,pledge,0.16096
513618,44_kennedy_1961,ask,0.107713
513850,44_kennedy_1961,begin,0.106495
514977,44_kennedy_1961,dare,0.106495
521881,44_kennedy_1961,world,0.10311
516299,44_kennedy_1961,final,0.102311
518356,44_kennedy_1961,new,0.0966
516106,44_kennedy_1961,explore,0.094223


In [44]:
# visualize our TF-IDF results with the data visualization library Altair.
!pip install altair



In [45]:
# Preparing a heatmap that shows the highest TF-IDF scoring words for each president, and we had added a red dot next to two terms of interest: "war" and "peace".
import altair as alt
import numpy as np

# Terms in this list will get a red dot in the visualization
term_list = ['war', 'peace']

# adding a little randomness to break ties in term ranking
top_tfidf_plusRand = top_tfidf.copy()
top_tfidf_plusRand['tfidf'] = top_tfidf_plusRand['tfidf'] + np.random.rand(top_tfidf.shape[0])*0.0001

# base for all visualizations, with rank calculation
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'document:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("tfidf", order="descending")],
    groupby = ["document"],
)

# heatmap specification
heatmap = base.mark_rect().encode(
    color = 'tfidf:Q'
)

# red circle over terms in above list
circle = base.mark_circle(size=100).encode(
    color = alt.condition(
        alt.FieldOneOfPredicate(field='term', oneOf=term_list),
        alt.value('red'),
        alt.value('#FFFFFF00')
    )
)

# text labels, white for darker heatmap colors
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# display the three superimposed visualizations
(heatmap + circle + text).properties(width = 600)