<a href="https://colab.research.google.com/github/andresvc21/TFIDFVectorizer/blob/main/Natural_Language_Processing_(NLP)_example_with_TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing (NLP) example with TF-IDF


## Bag of Words

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# The two toy documents that form the corpus for the rest of the notebook.
documentA = 'the man went out for a walk'
documentB = 'the children sat around the fire'

# Tokenize each document on single spaces; duplicates and word order are kept.
bagOfWordsA, bagOfWordsB = (text.split(' ') for text in (documentA, documentB))

print(bagOfWordsA, bagOfWordsB, sep='\n')

['the', 'man', 'went', 'out', 'for', 'a', 'walk']
['the', 'children', 'sat', 'around', 'the', 'fire']


In [3]:
# Vocabulary of the corpus: every distinct word that occurs in either document.
uniqueWords = set(bagOfWordsA) | set(bagOfWordsB)
print(uniqueWords)

{'went', 'the', 'out', 'a', 'fire', 'for', 'man', 'around', 'walk', 'sat', 'children'}


In [5]:
# Give both documents the same keys (the full vocabulary), all starting at zero,
# so their count dictionaries line up column-for-column later on.
numOfWordsA = {term: 0 for term in uniqueWords}
numOfWordsB = {term: 0 for term in uniqueWords}

# Tally how many times each token occurs in its own document.
for counts, bag in ((numOfWordsA, bagOfWordsA), (numOfWordsB, bagOfWordsB)):
    for token in bag:
        counts[token] = counts[token] + 1

print(numOfWordsA, numOfWordsB, sep='\n')

{'went': 1, 'the': 1, 'out': 1, 'a': 1, 'fire': 0, 'for': 1, 'man': 1, 'around': 0, 'walk': 1, 'sat': 0, 'children': 0}
{'went': 0, 'the': 2, 'out': 0, 'a': 0, 'fire': 1, 'for': 0, 'man': 0, 'around': 1, 'walk': 0, 'sat': 1, 'children': 1}


## Stop Words

In [8]:

# Download the NLTK 'stopwords' corpus so that nltk.corpus.stopwords can be
# loaded later in the notebook.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
from nltk.corpus import stopwords

# English stop-word list provided by the NLTK 'stopwords' corpus.
englishStopWords = stopwords.words('english')
print(englishStopWords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

## IDF

In [10]:
# Inverse Document Frequency (IDF)

def computeIDF(documents):
    """Compute the inverse document frequency of every word in a corpus.

    For each word, ``idf = log(N / df)`` where ``N`` is the number of
    documents and ``df`` is the number of documents in which the word's
    count is greater than zero.

    Args:
        documents: list of dicts mapping word -> count, one dict per document.

    Returns:
        dict mapping each word found in any document to its IDF (natural
        log). Keys appear in first-seen order. A word whose count is zero in
        every document gets 0.0, and an empty corpus yields an empty dict.
    """
    import math

    N = len(documents)
    if N == 0:
        # No documents: nothing to score (indexing documents[0] would fail).
        return {}

    # Document frequency per word, collected over the keys of *all* documents
    # so a word missing from the first document cannot raise KeyError.
    docFreq = {}
    for document in documents:
        for word, val in document.items():
            docFreq.setdefault(word, 0)
            if val > 0:
                docFreq[word] += 1

    # A word that never occurs has df == 0; log(N / 0) is undefined. Its count
    # is zero in every document, so 0.0 keeps any count * idf product at zero
    # instead of raising ZeroDivisionError.
    return {
        word: math.log(N / float(count)) if count else 0.0
        for word, count in docFreq.items()
    }

In [13]:
# IDF of every vocabulary word, computed over the two-document corpus.
corpusCounts = [numOfWordsA, numOfWordsB]
idfs = computeIDF(corpusCounts)
print(idfs)

{'went': 0.6931471805599453, 'the': 0.0, 'out': 0.6931471805599453, 'a': 0.6931471805599453, 'fire': 0.6931471805599453, 'for': 0.6931471805599453, 'man': 0.6931471805599453, 'around': 0.6931471805599453, 'walk': 0.6931471805599453, 'sat': 0.6931471805599453, 'children': 0.6931471805599453}


## TF-IDF

In [14]:
def computeTFIDF(tfBagOfWords, idfs):
    """Weight each word's term value by its inverse document frequency.

    Args:
        tfBagOfWords: dict mapping word -> term value for one document.
        idfs: dict mapping word -> IDF; must contain every key of
            ``tfBagOfWords`` (a missing word raises KeyError).

    Returns:
        dict mapping each word of ``tfBagOfWords`` to ``value * idfs[word]``,
        in the same key order as ``tfBagOfWords``.
    """
    return {term: weight * idfs[term] for term, weight in tfBagOfWords.items()}

In [17]:
# Score both documents against the shared IDF table.
tfidfA, tfidfB = (
    computeTFIDF(counts, idfs) for counts in (numOfWordsA, numOfWordsB)
)

# One row per document, one column per vocabulary word.
df = pd.DataFrame([tfidfA, tfidfB])
df

Unnamed: 0,went,the,out,a,fire,for,man,around,walk,sat,children
0,0.693147,0.0,0.693147,0.693147,0.0,0.693147,0.693147,0.0,0.693147,0.0,0.0
1,0.0,0.0,0.0,0.0,0.693147,0.0,0.0,0.693147,0.0,0.693147,0.693147


## Class in Sklearn

In [19]:
# Compute TF-IDF for the same two documents with scikit-learn's TfidfVectorizer.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([documentA, documentB])

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement and returns the vocabulary terms in
# column order.
feature_names = vectorizer.get_feature_names_out()

# toarray() yields a plain ndarray rather than the discouraged np.matrix that
# todense() returns; the nested list built from it is the same.
dense = vectors.toarray()
denselist = dense.tolist()

# One row per document, one column per vocabulary term.
df = pd.DataFrame(denselist, columns=feature_names)
df



Unnamed: 0,around,children,fire,for,man,out,sat,the,walk,went
0,0.0,0.0,0.0,0.42616,0.42616,0.42616,0.0,0.303216,0.42616,0.42616
1,0.407401,0.407401,0.407401,0.0,0.0,0.0,0.407401,0.579739,0.0,0.0
