# TF IDF

In [85]:
import pandas as pd

In [86]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [87]:
docA = 'the man want out for walk'
docB = 'the childern sat around the fire'

In [88]:
docA

'the man want out for walk'

In [89]:
docB

'the childern sat around the fire'

In [90]:
bagOfWordsA = docA.split(' ')
bagOfWordsB = docB.split(' ')

In [96]:
bagOfWordsA

['the', 'man', 'want', 'out', 'for', 'walk']

In [97]:
len(bagOfWordsA)

6

In [98]:
bagOfWordsB

['the', 'childern', 'sat', 'around', 'the', 'fire']

In [99]:
uniquewords = set(bagOfWordsA).union(set(bagOfWordsB))

In [100]:
print(uniquewords)

{'out', 'childern', 'sat', 'walk', 'around', 'fire', 'want', 'for', 'man', 'the'}


In [101]:
# Give both documents the same key set (the shared vocabulary), each word
# starting at a count of zero, then tally the tokens of each document.
numberOfWordsA = {term: 0 for term in uniquewords}
for word in bagOfWordsA:
    # Every occurrence of a token bumps its counter by one.
    numberOfWordsA[word] = numberOfWordsA[word] + 1

numberOfWordsB = {term: 0 for term in uniquewords}
for word in bagOfWordsB:
    numberOfWordsB[word] = numberOfWordsB[word] + 1

In [102]:
len(numberOfWordsA)

10

In [103]:
len(numberOfWordsB)

10

In [22]:
print(numberOfWordsA,'\n',numberOfWordsB)

{'out': 1, 'childern': 0, 'sat': 0, 'walk': 1, 'around': 0, 'fire': 0, 'want': 1, 'for': 1, 'man': 1, 'the': 1} 
 {'out': 0, 'childern': 1, 'sat': 1, 'walk': 0, 'around': 1, 'fire': 1, 'want': 0, 'for': 0, 'man': 0, 'the': 2}


In [70]:
from nltk.corpus import stopwords
"""The nltk.corpus package defines a collection of corpus reader classes, 
which can be used to access the contents of a diverse set of corpora"""
#stopwords.words('english')

# TERM FREQUENCY (TF)

Term Frequency (tf): gives us the frequency of a word in each document in the corpus. It is the ratio of the number of times the word appears in a document to the total number of words in that document. It increases as the number of occurrences of that word within the document increases. Each document has its own tf.

# formula
tf(t,d) = count of t in d / number of words in d

In [120]:
def computeTF(wordsDict, bagOfWords):
    """Compute the term frequency of every word for one document.

    tf(t, d) = (count of t in d) / (number of tokens in d)

    Args:
        wordsDict: Mapping of word -> raw count of that word in the document.
        bagOfWords: The document's tokens; only its length is used, as the
            denominator.

    Returns:
        A new dict mapping every key of ``wordsDict`` to its count divided by
        ``len(bagOfWords)``. When ``bagOfWords`` is empty every frequency is
        0.0 rather than raising ``ZeroDivisionError``.
    """
    bagOfWordsCount = len(bagOfWords)
    if bagOfWordsCount == 0:
        # An empty document contains no occurrences of any word.
        return {word: 0.0 for word in wordsDict}
    return {
        word: count / float(bagOfWordsCount)
        for word, count in wordsDict.items()
    }

In [117]:
numberOfWordsA

{'out': 1,
 'childern': 0,
 'sat': 0,
 'walk': 1,
 'around': 0,
 'fire': 0,
 'want': 1,
 'for': 1,
 'man': 1,
 'the': 1}

In [118]:
len(numberOfWordsA)

10

In [111]:
bagOfWordsA

['the', 'man', 'want', 'out', 'for', 'walk']

In [112]:
len(bagOfWordsA)

6

In [110]:
# len(numberOfWordsA) == 10 (shared vocabulary), len(bagOfWordsA) == 6 (tokens in docA)
tfA = computeTF(wordsDict = numberOfWordsA , bagOfWords = bagOfWordsA)

In [108]:
tfA

{'out': 0.16666666666666666,
 'childern': 0.0,
 'sat': 0.0,
 'walk': 0.16666666666666666,
 'around': 0.0,
 'fire': 0.0,
 'want': 0.16666666666666666,
 'for': 0.16666666666666666,
 'man': 0.16666666666666666,
 'the': 0.16666666666666666}

In [None]:
tfB = computeTF(wordsDict = numberOfWordsB , bagOfWords = bagOfWordsB)

In [115]:
numberOfWordsB

{'out': 0,
 'childern': 1,
 'sat': 1,
 'walk': 0,
 'around': 1,
 'fire': 1,
 'want': 0,
 'for': 0,
 'man': 0,
 'the': 2}

In [116]:
len(numberOfWordsB)

10

In [113]:
bagOfWordsB

['the', 'childern', 'sat', 'around', 'the', 'fire']

In [114]:
len(bagOfWordsB)

6

In [109]:
tfB

{'out': 0.0,
 'childern': 0.16666666666666666,
 'sat': 0.16666666666666666,
 'walk': 0.0,
 'around': 0.16666666666666666,
 'fire': 0.16666666666666666,
 'want': 0.0,
 'for': 0.0,
 'man': 0.0,
 'the': 0.3333333333333333}

# INVERSE DOCUMENT FREQUENCY (IDF)

Inverse Document Frequency (idf): used to calculate the weight of rare words across all documents in the corpus.
Words that occur rarely in the corpus have a high IDF score. It is given by the equation below.

# Formula :
idf(t) = log(N / df(t)), where N is the number of documents and df(t) is the number of documents that contain t

In [122]:
def computeIDF(documents):
    """Compute the inverse document frequency of every word in a corpus.

    idf(t) = log(N / df(t)), where N is the number of documents and df(t) is
    the number of documents in which t has a count greater than zero.

    Args:
        documents: Sequence of word -> count mappings, one per document.

    Returns:
        A dict mapping every word seen in any document to its idf. Words that
        occur in no document (df == 0) get 0.0 instead of raising
        ``ZeroDivisionError``; an empty ``documents`` yields an empty dict.
    """
    import math

    N = len(documents)

    # Document frequency per word. Keys are collected from every document
    # (first document's keys first), so a word that is missing from
    # documents[0] no longer raises KeyError.
    docFrequency = {}
    for document in documents:
        for word, val in document.items():
            docFrequency.setdefault(word, 0)
            if val > 0:
                docFrequency[word] += 1

    idfDict = {}
    for word, docCount in docFrequency.items():
        if docCount > 0:
            idfDict[word] = math.log(N / float(docCount))
        else:
            # log(N / 0) is undefined; such a word has tf == 0 everywhere, so
            # a zero weight keeps downstream tf * idf products at zero.
            idfDict[word] = 0.0
    return idfDict

In [128]:
idfs = computeIDF(documents=[numberOfWordsA,numberOfWordsB])
'''here i passed a list in computeIDF'''

'here i passed a list in computeIDF'

In [129]:
len([numberOfWordsA,numberOfWordsB])

2

In [124]:
len(numberOfWordsA)

10

In [125]:
idfs

{'out': 0.6931471805599453,
 'childern': 0.6931471805599453,
 'sat': 0.6931471805599453,
 'walk': 0.6931471805599453,
 'around': 0.6931471805599453,
 'fire': 0.6931471805599453,
 'want': 0.6931471805599453,
 'for': 0.6931471805599453,
 'man': 0.6931471805599453,
 'the': 0.0}

# Formula :
tf-idf(t, d) = tf(t, d) * log(N / df(t))

In [83]:
def computeTF_IDF(tfbagOfWords,idfs):
    """Combine term frequencies with idf weights for one document.

    Args:
        tfbagOfWords: Mapping of word -> term frequency in the document.
        idfs: Mapping of word -> inverse document frequency; it is indexed
            with every key of ``tfbagOfWords``.

    Returns:
        A new dict mapping each word to ``tf * idf``.
    """
    # tf-idf(t, d) = tf(t, d) * idf(t)
    return {term: freq * idfs[term] for term, freq in tfbagOfWords.items()}

In [130]:
tfidfA = computeTF_IDF(tfA,idfs)

In [131]:
tfidfA

{'out': 0.11552453009332421,
 'childern': 0.0,
 'sat': 0.0,
 'walk': 0.11552453009332421,
 'around': 0.0,
 'fire': 0.0,
 'want': 0.11552453009332421,
 'for': 0.11552453009332421,
 'man': 0.11552453009332421,
 'the': 0.0}

In [None]:
tfidfB = computeTF_IDF(tfB,idfs)

In [132]:
tfidfB

{'out': 0.0,
 'childern': 0.11552453009332421,
 'sat': 0.11552453009332421,
 'walk': 0.0,
 'around': 0.11552453009332421,
 'fire': 0.11552453009332421,
 'want': 0.0,
 'for': 0.0,
 'man': 0.0,
 'the': 0.0}

In [133]:
# NOTE: the second positional argument of pd.DataFrame is `index`, so the keys
# of tfidfB become the row labels and tfidfA's values are repeated on every
# row (see the output below). The following cell passes a list of both dicts
# to get one row per document.
df = pd.DataFrame(tfidfA,tfidfB)

In [134]:
df

Unnamed: 0,out,childern,sat,walk,around,fire,want,for,man,the
out,0.115525,0.0,0.0,0.115525,0.0,0.0,0.115525,0.115525,0.115525,0.0
childern,0.115525,0.0,0.0,0.115525,0.0,0.0,0.115525,0.115525,0.115525,0.0
sat,0.115525,0.0,0.0,0.115525,0.0,0.0,0.115525,0.115525,0.115525,0.0
walk,0.115525,0.0,0.0,0.115525,0.0,0.0,0.115525,0.115525,0.115525,0.0
around,0.115525,0.0,0.0,0.115525,0.0,0.0,0.115525,0.115525,0.115525,0.0
fire,0.115525,0.0,0.0,0.115525,0.0,0.0,0.115525,0.115525,0.115525,0.0
want,0.115525,0.0,0.0,0.115525,0.0,0.0,0.115525,0.115525,0.115525,0.0
for,0.115525,0.0,0.0,0.115525,0.0,0.0,0.115525,0.115525,0.115525,0.0
man,0.115525,0.0,0.0,0.115525,0.0,0.0,0.115525,0.115525,0.115525,0.0
the,0.115525,0.0,0.0,0.115525,0.0,0.0,0.115525,0.115525,0.115525,0.0


In [135]:
df = pd.DataFrame([tfidfA,tfidfB])

In [136]:
df

Unnamed: 0,out,childern,sat,walk,around,fire,want,for,man,the
0,0.115525,0.0,0.0,0.115525,0.0,0.0,0.115525,0.115525,0.115525,0.0
1,0.0,0.115525,0.115525,0.0,0.115525,0.115525,0.0,0.0,0.0,0.0


In [137]:
vectorizer = TfidfVectorizer()
'''The process of transforming text into a numerical feature is called text vectorization.
    TF-IDF is one of the most popular text vectorizers,
    the calculation is very simple and easy to understand. 
    It gives the rare term high weight and gives the common term low weight.'''

'The process of transforming text into a numerical feature is called text vectorization.\n    TF-IDF is one of the most popular text vectorizers,\n    the calculation is very simple and easy to understand. \n    It gives the rare term high weight and gives the common term low weight.'

In [138]:
vectors = vectorizer.fit_transform([docA,docB])
'''This fit_transform() method is basically the combination of fit method and transform method,
    it is equivalent to fit(). transform().
    This method performs fit and transform on the input data at a single time and converts the data points'''

In [141]:
print(vectors)

  (0, 8)	0.42615959880289433
  (0, 3)	0.42615959880289433
  (0, 5)	0.42615959880289433
  (0, 9)	0.42615959880289433
  (0, 4)	0.42615959880289433
  (0, 7)	0.3032160644503863
  (1, 2)	0.40740123733358447
  (1, 0)	0.40740123733358447
  (1, 6)	0.40740123733358447
  (1, 1)	0.40740123733358447
  (1, 7)	0.5797386715376657


In [142]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and returns an ndarray, so it is
# converted to a plain list to keep the same printed output and column labels.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    # Older scikit-learn releases only provide the original method.
    feature_names = vectorizer.get_feature_names()

In [144]:
print(feature_names)

['around', 'childern', 'fire', 'for', 'man', 'out', 'sat', 'the', 'walk', 'want']


In [149]:
dense = vectors.todense()
'''to_dense() function has returned the dense representation of the given series object.
    It has allocated memory to store even the missing values in the Series.
    Dense representation is not memory efficient when lots of data is missing.'''

'to_dense() function has returned the dense representation of the given series object.\n    It has allocated memory to store even the missing values in the Series.\n    Dense representation is not memory efficient when lots of data is missing.'

In [147]:
print(dense)

[[0.         0.         0.         0.4261596  0.4261596  0.4261596
  0.         0.30321606 0.4261596  0.4261596 ]
 [0.40740124 0.40740124 0.40740124 0.         0.         0.
  0.40740124 0.57973867 0.         0.        ]]


In [150]:
#it simple converting dense out into list
denselist = dense.tolist()

In [148]:
print(denselist)

[[0.0, 0.0, 0.0, 0.42615959880289433, 0.42615959880289433, 0.42615959880289433, 0.0, 0.3032160644503863, 0.42615959880289433, 0.42615959880289433], [0.40740123733358447, 0.40740123733358447, 0.40740123733358447, 0.0, 0.0, 0.0, 0.40740123733358447, 0.5797386715376657, 0.0, 0.0]]


In [68]:
df = pd.DataFrame(denselist,columns=feature_names)

In [69]:
df

Unnamed: 0,around,childern,fire,for,man,out,sat,the,walk,want
0,0.0,0.0,0.0,0.42616,0.42616,0.42616,0.0,0.303216,0.42616,0.42616
1,0.407401,0.407401,0.407401,0.0,0.0,0.0,0.407401,0.579739,0.0,0.0
