In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Toy corpus: each string is treated as one document.
corpus = [
    'data science is one of the most important fields of science',
    'this is one of the best data science courses',
    'data scientists analyze data',
]


In [3]:
# Build the vocabulary: every distinct token obtained by splitting each
# document on single spaces.
words_set = set()

for doc in corpus:
    words = doc.split(' ')
    words_set.update(words)

print('Number of words in the corpus:',len(words_set))
print('The words in the corpus: \n', words_set)


Number of words in the corpus: 14
The words in the corpus: 
 {'scientists', 'most', 'this', 'courses', 'data', 'best', 'analyze', 'one', 'the', 'of', 'important', 'science', 'fields', 'is'}


# Term Frequency
 Now we can create a DataFrame with one row per document in the corpus and one column per word in the word set, and use it to compute the term frequency (TF) of each word in each document:

In [4]:
n_docs = len(corpus)         # Number of documents in the corpus
n_words_set = len(words_set) # Number of unique words in the corpus

# One row per document, one column per unique word, initialised to 0.
# pandas 2.x rejects a set for `columns`, so materialise it as a list first
# (column order follows the set's iteration order).
df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), columns=list(words_set))

# Compute Term Frequency (TF): occurrences of a word in a document divided by
# the number of words in that document.
for i in range(n_docs):
    words = corpus[i].split(' ') # Words in the document
    for w in words:
        # Single .loc write: chained `df_tf[w][i] = ...` assignment is not
        # guaranteed to modify df_tf (it silently does nothing under
        # pandas Copy-on-Write).
        df_tf.loc[i, w] += 1 / len(words)

df_tf


Unnamed: 0,scientists,most,this,courses,data,best,analyze,one,the,of,important,science,fields,is
0,0.0,0.090909,0.0,0.0,0.090909,0.0,0.0,0.090909,0.090909,0.181818,0.090909,0.181818,0.090909,0.090909
1,0.0,0.0,0.111111,0.111111,0.111111,0.111111,0.0,0.111111,0.111111,0.111111,0.0,0.111111,0.0,0.111111
2,0.25,0.0,0.0,0.0,0.5,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Calculating Inverse Document Frequency


In [5]:
# Inverse Document Frequency: idf(w) = log10(N / k), where N is the number of
# documents and k is the number of documents that contain the word w.
print("IDF of: ")

# Tokenise every document once, with the same single-space split used to build
# words_set and the TF table. Using a different tokenisation here could leave a
# vocabulary word unmatched in every document (k == 0 -> division by zero).
doc_tokens = [set(doc.split(' ')) for doc in corpus]

idf = {}

for w in words_set:
    # number of documents in the corpus that contain this word
    k = sum(w in tokens for tokens in doc_tokens)

    idf[w] = np.log10(n_docs / k)

    print(f'{w:>15}: {idf[w]:>10}')


IDF of: 
     scientists: 0.47712125471966244
           most: 0.47712125471966244
           this: 0.47712125471966244
        courses: 0.47712125471966244
           data:        0.0
           best: 0.47712125471966244
        analyze: 0.47712125471966244
            one: 0.17609125905568124
            the: 0.17609125905568124
             of: 0.17609125905568124
      important: 0.47712125471966244
        science: 0.17609125905568124
         fields: 0.47712125471966244
             is: 0.17609125905568124


In [6]:
# TF-IDF = TF * IDF, computed one word (column) at a time.
df_tf_idf = df_tf.copy()

for w in words_set:
    # Assign the whole column at once. The element-wise chained form
    # `df_tf_idf[w][i] = ...` triggers pandas chained-assignment warnings and
    # does not write through to the frame under Copy-on-Write.
    df_tf_idf[w] = df_tf[w] * idf[w]

df_tf_idf


Unnamed: 0,scientists,most,this,courses,data,best,analyze,one,the,of,important,science,fields,is
0,0.0,0.043375,0.0,0.0,0.0,0.0,0.0,0.016008,0.016008,0.032017,0.043375,0.032017,0.043375,0.016008
1,0.0,0.0,0.053013,0.053013,0.0,0.053013,0.0,0.019566,0.019566,0.019566,0.0,0.019566,0.0,0.019566
2,0.11928,0.0,0.0,0.0,0.0,0.0,0.11928,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Same computation with scikit-learn: build a TfidfVectorizer with its default
# settings, then fit it on the corpus and transform the corpus in one call.
tr_idf_model  = TfidfVectorizer()
tf_idf_vector = tr_idf_model.fit_transform(corpus)

In [9]:
# Inspect the container type and dimensions of the vectorizer's result.
print(type(tf_idf_vector), tf_idf_vector.shape)

<class 'scipy.sparse.csr.csr_matrix'> (3, 14)


In [11]:
# Convert the vectorizer's result to a dense array so the whole matrix can be
# printed.
tf_idf_array = tf_idf_vector.toarray()

print(tf_idf_array)


[[0.         0.         0.         0.18952581 0.32089509 0.32089509
  0.24404899 0.32089509 0.48809797 0.24404899 0.48809797 0.
  0.24404899 0.        ]
 [0.         0.40029393 0.40029393 0.23642005 0.         0.
  0.30443385 0.         0.30443385 0.30443385 0.30443385 0.
  0.30443385 0.40029393]
 [0.54270061 0.         0.         0.64105545 0.         0.
  0.         0.         0.         0.         0.         0.54270061
  0.         0.        ]]


In [13]:
# Feature names reported by the fitted vectorizer.
# NOTE(review): this rebinds `words_set`, which an earlier cell built as a
# Python set of corpus tokens and which the manual TF / IDF cells iterate over.
# Re-running those cells after this one will pick up this value instead.
words_set = tr_idf_model.get_feature_names_out()

print(words_set)


['analyze' 'best' 'courses' 'data' 'fields' 'important' 'is' 'most' 'of'
 'one' 'science' 'scientists' 'the' 'this']


In [14]:
# Label the dense scikit-learn TF-IDF matrix with the vectorizer's feature
# names. This rebinds `df_tf_idf`, replacing the manually computed table.
df_tf_idf = pd.DataFrame(data=tf_idf_array, columns=words_set)

df_tf_idf


Unnamed: 0,analyze,best,courses,data,fields,important,is,most,of,one,science,scientists,the,this
0,0.0,0.0,0.0,0.189526,0.320895,0.320895,0.244049,0.320895,0.488098,0.244049,0.488098,0.0,0.244049,0.0
1,0.0,0.400294,0.400294,0.23642,0.0,0.0,0.304434,0.0,0.304434,0.304434,0.304434,0.0,0.304434,0.400294
2,0.542701,0.0,0.0,0.641055,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.542701,0.0,0.0
