# TF-IDF for 9/11 Report and Speech Compared to General Corpus

In [1]:
# Load in filtered speech and report tokens
%store -r commission_nohtml_filtered
%store -r bush_nohtml_filtered

In [2]:
# Load in Moby Dick from Gutenberg corpus
import nltk
moby = nltk.corpus.gutenberg.words('melville-moby_dick.txt')

## Clean Moby Dick file

In [3]:
# Load in stop words and regex for substitution
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

import re

In [4]:
# Remove stop words, grammar, make lowercase
moby_filtered = []
 
for w in moby:
    w = re.sub(r'[^\w\s]','',w)
    if w not in stop_words:
        moby_filtered.append(w.lower())

In [5]:
# Remove empty strings
while("" in moby_filtered) :
    moby_filtered.remove("")

In [6]:
# Import tokenizer
from nltk import word_tokenize

In [7]:
# Tokenize
moby_tokens = []

for w in moby_filtered:
    moby_tokens.append(word_tokenize(str(w)))

In [8]:
# Import frequency distribution
from nltk.probability import FreqDist

In [9]:
# Calculate word frequencies and print top 20
moby_fdist = FreqDist(moby_filtered)
moby_fdist.most_common(20)

[('i', 2124),
 ('whale', 1226),
 ('one', 921),
 ('the', 710),
 ('but', 705),
 ('like', 647),
 ('upon', 566),
 ('man', 527),
 ('ship', 518),
 ('ahab', 511),
 ('ye', 472),
 ('sea', 455),
 ('old', 450),
 ('would', 432),
 ('and', 406),
 ('though', 384),
 ('head', 345),
 ('yet', 345),
 ('boat', 336),
 ('time', 334)]

In [10]:
# Import gensim for dictionary
import gensim

In [11]:
# Create dictionary and BOW
dictionary_moby = gensim.corpora.Dictionary(moby_tokens)
bow_moby = [dictionary_moby.doc2bow(w) for w in moby_tokens]

## TF-IDF of Bush Speech

In [12]:
# Import SKlearn for TFIDF
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# Perform TF-IDF 
tfidf = TfidfVectorizer()
vecs = tfidf.fit_transform([str(bush_nohtml_filtered), str(moby_filtered)])

In [15]:
# Print correlation matrix
corr_matrix = ((vecs * vecs.T).A)
corr_matrix

array([[1.        , 0.15675144],
       [0.15675144, 1.        ]])

In [16]:
# Import collections and pandas for data frame
from collections import Counter
import pandas as pd

In [20]:
# Create data frame of TF-IDF vocabulary
columns = [k for (v, k) in sorted((v, k)
           for k, v in tfidf.vocabulary_.items())]
tfidfs = pd.DataFrame(vecs.todense(),
                      columns=columns)

In [21]:
# Sort by most frequent word and display
tfidfs = tfidfs.transpose()
bush_topwords = tfidfs.sort_values(0,ascending=False)
bush_topwords.head()

Unnamed: 0,0,1
applause,0.448445,0.0
terrorists,0.201027,0.0
tonight,0.201027,0.0
world,0.187042,0.038269
america,0.17604,0.002609


In [22]:
# Calculate top 50
bush_top50 = bush_topwords.index[:50]

In [23]:
# Calculate top 25
bush_top25 = bush_topwords.index[:25]

In [24]:
# Store top 50 words and top 25 words of Bush speech
%store bush_top50
%store bush_top25

Stored 'bush_top50' (Index)
Stored 'bush_top25' (Index)


## TF-IDF of Report

In [27]:
# Perform TF-IDF
vecs_commis = tfidf.fit_transform([str(commission_nohtml_filtered), str(moby_filtered)])

In [28]:
# Print correlation matrix
corr_matrix = ((vecs_commis * vecs_commis.T).A)
corr_matrix

array([[1.        , 0.13346274],
       [0.13346274, 1.        ]])

In [29]:
# Create data frame
columns = [k for (v, k) in sorted((v, k)
           for k, v in tfidf.vocabulary_.items())]
tfidfs = pd.DataFrame(vecs_commis.todense(),
                      columns=columns)

In [30]:
# Sort by most frequent word and display
tfidfs = tfidfs.transpose()
commis_topwords = tfidfs.sort_values(0,ascending=False)
commis_topwords.head()

Unnamed: 0,0,1
center,0.360955,0.0
aircraft,0.354585,0.0
faa,0.286641,0.0
flight,0.211501,0.002062
american,0.190351,0.007789


In [31]:
# Calculate top 50
commis_top50 = commis_topwords.index[:50]

In [32]:
# Calculate top 25
commis_top25 = commis_topwords.index[:25]

In [33]:
# Store top 50 words and top 25 words of report
%store commis_top50
%store commis_top25

Stored 'commis_top50' (Index)
Stored 'commis_top25' (Index)
