# <b>Manual TF-IDF</b>

In [1]:
import nltk

In [2]:
# Load the ten example documents into a dictionary keyed by document name.
# NOTE: the dictionary keys use an underscore ("text-example_1.txt") while the
# files on disk use a hyphen ("text-example-1.txt"); both spellings are kept
# exactly as they were because later cells look documents up by these keys.
dataset = {}
for doc_number in range(1, 11):
    # The `with` block closes each file as soon as it has been read, instead
    # of leaving ten open file handles behind.
    with open("text-example-{0}.txt".format(doc_number)) as text_file:
        dataset["text-example_{0}.txt".format(doc_number)] = text_file.read()

In [3]:
# Show the keys (document names) of the dataset dictionary.
dataset.keys()

['text-example_8.txt',
 'text-example_1.txt',
 'text-example_3.txt',
 'text-example_5.txt',
 'text-example_7.txt',
 'text-example_2.txt',
 'text-example_9.txt',
 'text-example_4.txt',
 'text-example_10.txt',
 'text-example_6.txt']

In [5]:
# Show the raw text stored under one document name.
dataset["text-example_4.txt"]



In [6]:
# Term frequency (TF) is simply the number of times a word appears in a given document.
# NOTE: this is no different from the "frequency distribution" method we have used in past lessons.
# This function looks up a document in our dictionary of documents and returns its frequency distribution.

def tf(dataset, file_name):
    """Return the term-frequency distribution of one document.

    dataset   -- dictionary mapping a document name to its raw text
    file_name -- key of the document to analyse

    The text is tokenized with nltk.word_tokenize and the resulting tokens
    are counted with nltk.FreqDist, which is returned.
    """
    return nltk.FreqDist(nltk.word_tokenize(dataset[file_name]))

In [7]:
# Here we see the count for each word of the document "text-example_4.txt" (quick test).
tf(dataset, "text-example_4.txt")

FreqDist({',': 54, 'the': 44, 'and': 26, '.': 26, 'of': 25, 'in': 25, 'his': 14, 'Washington': 11, 'he': 10, 'to': 9, ...})

In [8]:
# Inverse document frequency (IDF) is based on the number of documents that contain a specific word, out of all the documents we have.
# In this case we have 10 documents in total.
from __future__ import division
import math
def idf(dataset, term):
    """Return the inverse document frequency of `term` across `dataset`.

    dataset -- dictionary mapping a document name to its raw text
    term    -- string to look for

    The result is log(N / df), where N is the number of documents and df is
    the number of documents whose text contains `term`.  When no document
    contains the term (including an empty dataset) 0.0 is returned instead of
    raising ZeroDivisionError.

    NOTE(review): membership is tested with `in` on the raw text, so this is a
    substring match ("he" is found inside "the"), not a token match.  Kept as
    is so that previously computed scores do not change.
    """
    # One boolean per document: True when the document text contains the term.
    contains_term = [term in dataset[file_name] for file_name in dataset]
    total_docs = len(contains_term)
    matching_docs = sum(contains_term)
    if matching_docs == 0:
        # No document mentions the term: avoid dividing by zero.
        return 0.0
    # float() keeps this a true division even where
    # `from __future__ import division` is not in effect on Python 2.
    return math.log(float(total_docs) / matching_docs)

In [9]:
# This is our IDF score for the word "world" across all our documents.
idf(dataset, "world")

0.5108256237659907

In [10]:
# The last step is to look for the words with the highest scores.
from __future__ import division
def tfidf(dataset, filename, n):
    """Return the `n` highest TF-IDF scoring terms of one document.

    dataset  -- dictionary mapping a document name to its raw text
    filename -- key of the document to score
    n        -- number of (term, score) pairs to return

    Only purely alphabetic tokens are scored.  Each score is tf * idf rounded
    to two decimals.  Returns a list of (term, score) tuples sorted from the
    highest score to the lowest.
    """
    term_scores = {}
    # Tokenize and count the document once; the frequency of every term is
    # then read from this distribution instead of calling tf() again (and
    # re-tokenizing the whole text) for each term.
    term_counts = tf(dataset, filename)
    for term in term_counts:
        if term.isalpha():
            tfidf_value = term_counts[term] * idf(dataset, term)
            # Store each score in the dictionary, rounded to two decimals.
            term_scores[term] = round(tfidf_value, 2)
    # Negating the score sorts from highest to lowest; the slice keeps only
    # the first n entries.
    return sorted(term_scores.items(), key=lambda x: -x[1])[:n]

# NOTE: sorted() takes the dictionary items and needs to be told how to sort them.
# The x in the lambda function represents one dictionary item at a time.
# The dictionary items look like (key, value) tuples when we call the .items() method,
# so x[1] refers to the value in that tuple. Placing the "-" in front negates the score, which reverses the order (highest first).

In [12]:
# These are the top ten scoring words as determined by TF-IDF.
# We can take a look at the words that come back and start to get a sense of what this document might be about.
tfidf(dataset,"text-example_4.txt",10)

[('Washington', 25.33),
 ('President', 6.44),
 ('Continental', 4.82),
 ('preservation', 4.61),
 ('federal', 4.61),
 ('militia', 4.61),
 ('opposition', 4.61),
 ('generals', 4.61),
 ('presided', 4.61),
 ('armies', 4.61)]

In [13]:
# Finally, loop over every document in the dataset and print its five
# highest-scoring TF-IDF terms.
for file_name in dataset:
    top_terms = tfidf(dataset, file_name, 5)
    print("{0}: \n {1} \n".format(file_name, top_terms))

text-example_8.txt: 
 [('Titanic', 18.42), ('passengers', 11.51), ('maritime', 9.21), ('safety', 9.21), ('aboard', 9.21)] 

text-example_1.txt: 
 [('Soviet', 20.72), ('Union', 18.42), ('Axis', 16.12), ('Japan', 11.27), ('Germany', 11.27)] 

text-example_3.txt: 
 [('Napoleon', 32.19), ('French', 16.86), ('Coalition', 11.51), ('Prussia', 6.91), ('military', 6.02)] 

text-example_5.txt: 
 [('Newton', 23.03), ('scientists', 6.91), ('motion', 4.83), ('developed', 4.61), ('Trinity', 4.61)] 

text-example_7.txt: 
 [('Leonardo', 18.42), ('Vinci', 9.21), ('painting', 6.91), ('Renaissance', 4.61), ('inventions', 4.61)] 

text-example_2.txt: 
 [('Module', 16.12), ('lunar', 13.82), ('Armstrong', 13.82), ('Apollo', 11.51), ('Moon', 9.21)] 

text-example_9.txt: 
 [('Rockefeller', 23.03), ('Standard', 6.91), ('business', 6.91), ('Oil', 6.91), ('University', 4.83)] 

text-example_4.txt: 
 [('Washington', 25.33), ('President', 6.44), ('Continental', 4.82), ('preservation', 4.61), ('federal', 4.61)] 

t