# TF-IDF example


## Import libraries 


In [0]:
import pandas as pd  
from sklearn.feature_extraction.text import TfidfVectorizer

## Document initialization
Here we just initialize our documents.


In [0]:
# The two-document toy corpus used throughout this example.
documentA = "the man went out for a walk"
documentB = "the children sat around the fire"

## Converting documents to bag of words. 

#### ML models do not understand raw text, so we have to convert raw text into vectors of numbers.
#### In NLP, to extract features from a text, we put all the words of a document into a bucket. This is called a Bag of Words (BoW).

#### Please note that any information about the structure of the sentence is lost.
#### Also, bag of words doesn’t account for noise: certain words are needed to form a sentence but do not add any semantic meaning (e.g. "the").


In [0]:
# Tokenize each document on single spaces; the resulting word lists are the
# bags of words (duplicates are still present at this stage).
bagOfWordsA, bagOfWordsB = (
    text.split(' ') for text in (documentA, documentB)
)

## Converting bag of words to a set for avoiding duplicates

In [28]:
# Vocabulary of the corpus: every distinct word that occurs in either document.
# (Fixes the `babOfWordsB` typo, which raised NameError.)
UniqueWords = set(bagOfWordsA) | set(bagOfWordsB)
UniqueWords

{'a',
 'around',
 'children',
 'fire',
 'for',
 'man',
 'out',
 'sat',
 'the',
 'walk',
 'went'}

## Creating a dictionary of words and their occurrences for each document in the corpus

In [0]:
# Start every vocabulary word at a count of 0 for each document,
# e.g. {'fire': 0, 'for': 0, 'the': 0, 'man': 0, ...}, so that both
# dictionaries share the same set of keys.
numOfWordsA = {term: 0 for term in UniqueWords}
numOfWordsB = {term: 0 for term in UniqueWords}

# Tally how many times each word occurs in its own document.
for words in bagOfWordsA:
    numOfWordsA[words] = numOfWordsA[words] + 1
for words in bagOfWordsB:
    numOfWordsB[words] = numOfWordsB[words] + 1

## Removing stop words


In [0]:
## we can remove stop words if required.  
from nltk.corpus  import stopwords
#stopwords.words('english')

## Term Frequency (TF )
#### The number of times a word appears in a document divided by the total number of words in the document.


![alt text](https://miro.medium.com/proxy/1*HM0Vcdrx2RApOyjp_ZeW_Q.png)


In [0]:
def findTF(numOfWords, bagOfWords):
    """Compute the term frequency of every word for a single document.

    Args:
        numOfWords: Mapping of word -> number of occurrences in the document.
        bagOfWords: The document's tokens; only its length is used, as the
            total word count.

    Returns:
        A dict mapping each word in ``numOfWords`` to
        ``count / len(bagOfWords)``. If ``bagOfWords`` is empty, every
        frequency is 0.0 rather than raising ZeroDivisionError.
    """
    totalWords = len(bagOfWords)
    if totalWords == 0:
        # An empty document has no terms, so every frequency is zero.
        return {word: 0.0 for word in numOfWords}
    return {word: count / totalWords for word, count in numOfWords.items()}


## Term frequency of each document

In [0]:
tfA  = findTF(numOfWordsA, bagOfWordsA)
tfB = findTF(numOfWordsB, bagOfWordsB)

## Inverse Document Frequency (IDF)
#### The log of the total number of documents divided by the number of documents that contain a particular word.

![alt text](https://miro.medium.com/proxy/1*A5YGwFpcTd0YTCdgoiHFUw.png)


In [0]:
def computeIDF(documents):
    """Compute the inverse document frequency of every word in the corpus.

    Args:
        documents: Sequence of dicts, one per document, each mapping
            word -> number of occurrences in that document.

    Returns:
        A dict mapping each word that appears as a key in any document to
        ``log(N / df)``, where ``N`` is the number of documents and ``df`` is
        the number of documents in which the word's count is greater than 0.
        A word that occurs in no document (``df == 0``) gets 0.0 instead of
        raising ZeroDivisionError. An empty ``documents`` yields ``{}``.
    """
    import math

    N = len(documents)

    # Document frequency per word. Keys are gathered from every document,
    # not only the first, so a word that is missing from documents[0] no
    # longer raises KeyError.
    docFrequency = {}
    for document in documents:
        for word, count in document.items():
            docFrequency.setdefault(word, 0)
            if count > 0:
                docFrequency[word] += 1

    return {
        word: math.log(N / seenIn) if seenIn else 0.0
        for word, seenIn in docFrequency.items()
    }

In [0]:
  # Inverse document frequency of every vocabulary word across both documents.
  idfs = computeIDF(documents=[numOfWordsA, numOfWordsB])

## TF-IDF is simply the TF multiplied by IDF
![alt text](https://miro.medium.com/proxy/1*nSqHXwOIJ2fa_EFLTh5KYw.png)

In [0]:
def computeTf_Idf(tfs, idfs):
    """Combine term frequencies with inverse document frequencies.

    Args:
        tfs: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> inverse document frequency; it is indexed
            with every word in ``tfs``.

    Returns:
        A dict mapping each word in ``tfs`` to ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tfs.items()}

## Tf-Idf individual doc

In [55]:
# TF-IDF scores for each document, using the shared IDF table.
tfidfA, tfidfB = (computeTf_Idf(tf, idfs) for tf in (tfA, tfB))

# One row per document, one column per vocabulary word.
df = pd.DataFrame(data=[tfidfA, tfidfB])
df.head()

Unnamed: 0,fire,for,the,man,walk,children,sat,went,out,around,a
0,0.0,0.099021,0.0,0.099021,0.099021,0.0,0.0,0.099021,0.099021,0.0,0.099021
1,0.115525,0.0,0.0,0.0,0.0,0.115525,0.115525,0.0,0.0,0.115525,0.0


# We can do the same with sklearn  TfidfVectorizer 

##### The values differ because sklearn uses a smoothed version of IDF, L2-normalizes each row, and by default ignores single-character tokens such as "a"

In [61]:
# Fit sklearn's TF-IDF model on the same two documents; the result is a
# sparse matrix with one row per document and one column per feature.
Vectorizer = TfidfVectorizer()

dfidf_model = Vectorizer.fit_transform([documentA, documentB])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older versions.
if hasattr(Vectorizer, 'get_feature_names_out'):
    featur_name = list(Vectorizer.get_feature_names_out())
else:
    featur_name = Vectorizer.get_feature_names()
dense = dfidf_model.todense()

# Convert to plain lists so the scores can be shown with named columns.
tfidfs_list = dense.tolist()
df = pd.DataFrame(tfidfs_list, columns=featur_name)
df.head()

Unnamed: 0,around,children,fire,for,man,out,sat,the,walk,went
0,0.0,0.0,0.0,0.42616,0.42616,0.42616,0.0,0.303216,0.42616,0.42616
1,0.407401,0.407401,0.407401,0.0,0.0,0.0,0.407401,0.579739,0.0,0.0
