In [90]:
import latexify
import math
import pandas as pd

In [81]:
@latexify.with_latex
def TFDIFeq(tf,df,N):
    #tf is the term frequency
    #df is the document frequency
    #N is the number of documents
    return tf * math.log(N/df)

TFDIFeq
print(TFDIFeq)

<latexify.core.with_latex.<locals>._LatexifiedFunction at 0x1a1f3e95da0>

In [1]:
documentA = 'the man went out for a walk'
documentB = 'the children sat around the fire'

In [2]:
bagOfWordsA = documentA.split()
bagOfWordsB = documentB.split()

In [3]:
bagOfWordsA

['the', 'man', 'went', 'out', 'for', 'a', 'walk']

In [4]:
uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))
uniqueWords

{'a',
 'around',
 'children',
 'fire',
 'for',
 'man',
 'out',
 'sat',
 'the',
 'walk',
 'went'}

In [5]:
dictA = dict.fromkeys(uniqueWords, 0)
dictA

{'sat': 0,
 'a': 0,
 'fire': 0,
 'children': 0,
 'around': 0,
 'out': 0,
 'the': 0,
 'man': 0,
 'walk': 0,
 'for': 0,
 'went': 0}

In [6]:
for word in bagOfWordsA:
    dictA[word] += 1

dictA

{'sat': 0,
 'a': 1,
 'fire': 0,
 'children': 0,
 'around': 0,
 'out': 1,
 'the': 1,
 'man': 1,
 'walk': 1,
 'for': 1,
 'went': 1}

In [32]:
total = sum(dictA.values())

In [35]:
tfA = {key:val/total for key, val in dictA.items()}
tfA

{'sat': 0.0,
 'a': 0.14285714285714285,
 'fire': 0.0,
 'children': 0.0,
 'around': 0.0,
 'out': 0.14285714285714285,
 'the': 0.14285714285714285,
 'man': 0.14285714285714285,
 'walk': 0.14285714285714285,
 'for': 0.14285714285714285,
 'went': 0.14285714285714285}

In [85]:
def createTFs(docs:list) -> list:
    uniqueWords = set()
    bagsOfWords = []
    for doc in docs:
        words = doc.split()
        bagsOfWords.append(words)
        uniqueWords = uniqueWords.union(set(words))

    tflist = []
    for bagOfWords in bagsOfWords:
        dct = dict.fromkeys(uniqueWords, 0)

        for word in bagOfWords:
            dct[word] += 1

        total = sum(dct.values())
        tf = {key:val/total for key, val in dct.items()}
        tflist.append(tf)

    return tflist


In [48]:
tfs = createTFs([documentA, documentB])
tfs

[{'sat': 0.0,
  'a': 0.14285714285714285,
  'fire': 0.0,
  'children': 0.0,
  'around': 0.0,
  'out': 0.14285714285714285,
  'the': 0.14285714285714285,
  'man': 0.14285714285714285,
  'walk': 0.14285714285714285,
  'for': 0.14285714285714285,
  'went': 0.14285714285714285},
 {'sat': 0.16666666666666666,
  'a': 0.0,
  'fire': 0.16666666666666666,
  'children': 0.16666666666666666,
  'around': 0.16666666666666666,
  'out': 0.0,
  'the': 0.3333333333333333,
  'man': 0.0,
  'walk': 0.0,
  'for': 0.0,
  'went': 0.0}]

In [86]:
def createIDFs(tfs:list) -> dict:
    N = len(tfs)
    uniqueWords = dict.fromkeys(tfs[0].keys(), 0)
    for word in uniqueWords.keys():
        for tf in tfs:
            if tf[word] > 0:
                uniqueWords[word] += 1
    idf = {key: math.log(N/val) for key, val in uniqueWords.items()}
    return idf

idf = createIDFs(tfs)
print(idf)


{'sat': 0.6931471805599453, 'a': 0.6931471805599453, 'fire': 0.6931471805599453, 'children': 0.6931471805599453, 'around': 0.6931471805599453, 'out': 0.6931471805599453, 'the': 0.0, 'man': 0.6931471805599453, 'walk': 0.6931471805599453, 'for': 0.6931471805599453, 'went': 0.6931471805599453}


In [87]:
def ComputeTFIDF(docs:list):
    tfs = createTFs(docs=docs)
    idf = createIDFs(tfs=tfs)
    TFIDFs = []
    for tf in tfs:
        tfidf = {key: val * idf[key] for key, val in tf.items()}
        TFIDFs.append(tfidf)

    return TFIDFs

In [88]:
TFIDFs = ComputeTFIDF(docs=[documentA, documentB])
TFIDFs

[{'sat': 0.0,
  'a': 0.09902102579427789,
  'fire': 0.0,
  'children': 0.0,
  'around': 0.0,
  'out': 0.09902102579427789,
  'the': 0.0,
  'man': 0.09902102579427789,
  'walk': 0.09902102579427789,
  'for': 0.09902102579427789,
  'went': 0.09902102579427789},
 {'sat': 0.11552453009332421,
  'a': 0.0,
  'fire': 0.11552453009332421,
  'children': 0.11552453009332421,
  'around': 0.11552453009332421,
  'out': 0.0,
  'the': 0.0,
  'man': 0.0,
  'walk': 0.0,
  'for': 0.0,
  'went': 0.0}]

In [94]:
df = pd.DataFrame(TFIDFs)
df

Unnamed: 0,sat,a,fire,children,around,out,the,man,walk,for,went
0,0.0,0.099021,0.0,0.0,0.0,0.099021,0.0,0.099021,0.099021,0.099021,0.099021
1,0.115525,0.0,0.115525,0.115525,0.115525,0.0,0.0,0.0,0.0,0.0,0.0


In [95]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([documentA, documentB])
print(vectors)

  (0, 8)	0.42615959880289433
  (0, 3)	0.42615959880289433
  (0, 5)	0.42615959880289433
  (0, 9)	0.42615959880289433
  (0, 4)	0.42615959880289433
  (0, 7)	0.3032160644503863
  (1, 2)	0.40740123733358447
  (1, 0)	0.40740123733358447
  (1, 6)	0.40740123733358447
  (1, 1)	0.40740123733358447
  (1, 7)	0.5797386715376657


In [96]:
feature_names = vectorizer.get_feature_names()
feature_names

['around',
 'children',
 'fire',
 'for',
 'man',
 'out',
 'sat',
 'the',
 'walk',
 'went']

In [97]:
dense = vectors.todense()
dense

matrix([[0.        , 0.        , 0.        , 0.4261596 , 0.4261596 ,
         0.4261596 , 0.        , 0.30321606, 0.4261596 , 0.4261596 ],
        [0.40740124, 0.40740124, 0.40740124, 0.        , 0.        ,
         0.        , 0.40740124, 0.57973867, 0.        , 0.        ]])

In [100]:
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
df.head()

Unnamed: 0,around,children,fire,for,man,out,sat,the,walk,went
0,0.0,0.0,0.0,0.42616,0.42616,0.42616,0.0,0.303216,0.42616,0.42616
1,0.407401,0.407401,0.407401,0.0,0.0,0.0,0.407401,0.579739,0.0,0.0
