In [None]:
'''
Important Concepts under NLP
1.tf-idf 
2.N-grams
3.Stemming
4.Lemmatisation
5.Cosine Similarity
6.Bag of words
7.Word2Vec
8.LDA
9.Edit Distance

Exploring each of these concepts on a high level

'''

In [None]:
##### TF-IDF(TERM FREQUENCY-INVERSE DOCUMENT FREQUENCY) #########

'''
All about tf-idf:
Stands for Term Frequency - Inverse Document Frequency. The weight of a word increases with the number of times
it occurs in a document (term frequency), but it is offset by the number of documents in the corpus that contain
the word (inverse document frequency). This offset helps to distinguish important words from really common
words like 'a' and 'the'.

Roughly, tf-idf(t,d,D) = tf(t,d) * idf(t,D) where
   tf(t,d)  = f(t,d)/k   (number of times the word t appears in document d / total number of words k in document d)
   idf(t,D) = log(N/n_t) (total number of documents N in the corpus D / number of documents n_t that contain the word t)
   
   
   Say, for example, docA = "the sky is blue" and docB = "the sky is not blue" (for this corpus, intuitively, the
   important word differentiating the two documents is 'not', so we perform the calculation to prove this)
   
             tf-score                     idf-score          MultipliedFinalScore(tf.idf)
              A  |   B                 A     |      B         
    the      1/4    1/5              log(2/2)   log(2/2)              0
    sky      1/4    1/5               ----do------                  --do--
    is       1/4    1/5
    blue     1/4    1/5
    not       0     1/5              log(2/1)   log(2/1)           A: 0, B: (1/5)*log(2) ~ 0.139 (More Important!!)
    
    NAIVE APPROACH--
    -->Each document is considered as a bag of words (BOW) using bowA=docA.split(' '), bowB=docB.split(' ').
    -->Create a set of the words of both documents to remove duplicates: set(bowA).union(set(bowB)) => wordset
    -->create dictionary to keep the word count
        forA= dict.fromkeys(wordset,0)   initialize with 0
        forB= dict.fromkeys(wordset,0)
        
        count the words and increment
        for word in bowA:
            forA[word]+=1
        (and likewise for bowB / forB)
    -->convert the dictionaries into a dataframe to perform operations
    
    TF_IDF APPROACH-
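    -->the matrix formed above gives equally high counts to very common words such as 'the' and 'a'
    -->so each count is turned into a term frequency and multiplied by the word's inverse document
       frequency, which drives the score of words that occur in every document to 0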

'''

In [16]:
# Naive approach, step 1: split each document into its words.
doc1 = "the sky is blue"
doc2 = "the sky is not blue"

# Split on single spaces; both word lists are printed on one line.
wordl1, wordl2 = [text.split(' ') for text in (doc1, doc2)]
print(wordl1, wordl2)

['the', 'sky', 'is', 'blue'] ['the', 'sky', 'is', 'not', 'blue']


In [17]:
# Vocabulary: every distinct word that occurs in either document.
common = set(wordl1) | set(wordl2)
common

{'blue', 'is', 'not', 'sky', 'the'}

In [19]:
# One zeroed counter per document, keyed by every vocabulary word.
assign1 = {word: 0 for word in common}
assign2 = {word: 0 for word in common}
assign1

{'not': 0, 'blue': 0, 'the': 0, 'sky': 0, 'is': 0}

In [20]:
# Count how often each vocabulary word occurs in each document.
# The counts are rebuilt from the vocabulary instead of being incremented in
# place, so re-running this cell cannot double-count the words.
assign1 = {word: wordl1.count(word) for word in common}
assign2 = {word: wordl2.count(word) for word in common}

assign1

{'not': 0, 'blue': 1, 'the': 1, 'sky': 1, 'is': 1}

In [24]:
import pandas as pd
# Stack the two count dictionaries into a table: one row per document,
# one column per vocabulary word.
df = pd.DataFrame(data=[assign1, assign2])
df

Unnamed: 0,blue,is,not,sky,the
0,1,1,0,1,1
1,1,1,1,1,1


In [30]:
#TF-IDF STRATEGY IMPLEMENTATION
def computeTF(worddict,bow):
    """Return the term frequency of every word in one document.

    Parameters
    ----------
    worddict : dict
        Maps each word to the number of times it occurs in the document.
    bow : list
        The document's words (its "bag of words"). Only its length is used,
        as the total number of words in the document.

    Returns
    -------
    dict
        Maps each word of ``worddict`` to ``count / len(bow)``. For an empty
        ``bow`` every frequency is 0.0 instead of raising ZeroDivisionError.
    """
    bowcount = len(bow)
    if bowcount == 0:
        # An empty document has no words, so no term can have a frequency.
        return {word: 0.0 for word in worddict}
    return {word: count / float(bowcount) for word, count in worddict.items()}

# Term frequencies of both documents; show those of the first document.
tfbow1, tfbow2 = (
    computeTF(counts, words)
    for counts, words in ((assign1, wordl1), (assign2, wordl2))
)
tfbow1

{'not': 0.0, 'blue': 0.25, 'the': 0.25, 'sky': 0.25, 'is': 0.25}

In [33]:
def computeIDF(doclist):
    """Return the inverse document frequency of every word in a corpus.

    Parameters
    ----------
    doclist : list of dict
        One word -> count dictionary per document.

    Returns
    -------
    dict
        Maps every word that is a key of any document to ``log(N / df)``,
        where ``N`` is the number of documents and ``df`` is the number of
        documents in which the word's count is positive. A word whose count is
        0 in every document gets 0.0. An empty ``doclist`` gives an empty dict.
    """
    import math

    N = len(doclist)

    # Count the number of documents that contain each word. The vocabulary is
    # collected from every document (not only the first one), so a word that
    # is missing from doclist[0] cannot raise KeyError.
    idfdict = {}
    for doc in doclist:
        for word, val in doc.items():
            idfdict[word] = idfdict.get(word, 0) + (1 if val > 0 else 0)

    # Turn each document frequency into log(N / df). df == 0 would divide by
    # zero; such a word has a zero count everywhere, so it is scored 0.0.
    for word, val in idfdict.items():
        idfdict[word] = math.log(N / float(val)) if val else 0.0

    return idfdict

# Inverse document frequency of every word across the two documents.
idfs = computeIDF(doclist=[assign1, assign2])
idfs

{'not': 0.6931471805599453, 'blue': 0.0, 'the': 0.0, 'sky': 0.0, 'is': 0.0}

In [35]:
def computeTFIDF(tfbow,idfs):
    """Multiply every term frequency by the matching inverse document frequency.

    Parameters
    ----------
    tfbow : dict
        Word -> term frequency for one document.
    idfs : dict
        Word -> inverse document frequency. It must contain every word of
        ``tfbow``; a missing word raises KeyError.

    Returns
    -------
    dict
        Word -> ``tfbow[word] * idfs[word]`` for every word of ``tfbow``.
    """
    return {term: freq * idfs[term] for term, freq in tfbow.items()}

# tf-idf scores of both documents.
tfidfbow1 = computeTFIDF(tfbow1, idfs)
tfidfbow2 = computeTFIDF(tfbow2, idfs)
# End on an expression so the cell displays the first document's scores.
tfidfbow1

{'not': 0.0, 'blue': 0.0, 'the': 0.0, 'sky': 0.0, 'is': 0.0}

In [36]:
import pandas as pd
# One row of tf-idf scores per document, one column per word.
pd.DataFrame(data=[tfidfbow1, tfidfbow2])

Unnamed: 0,blue,is,not,sky,the
0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.138629,0.0,0.0
