In [1]:
import pandas as pd
import numpy as np

In [2]:
corpus = ['data science is one of the most important fields of science',
          'this is one of the best data science courses',
          'data scientists analyze data']

In [3]:
words_set = set()

for i, doc in enumerate(corpus):
    words = doc.split(' ')
    print(f"{i}\tdoc: {doc} \n \twords: {words}")
    words_set = words_set.union(set(words))
    

print('Number of words in the corpus:',len(words_set))
print('The words in the corpus: \n', words_set)

0	doc: data science is one of the most important fields of science 
 	words: ['data', 'science', 'is', 'one', 'of', 'the', 'most', 'important', 'fields', 'of', 'science']
1	doc: this is one of the best data science courses 
 	words: ['this', 'is', 'one', 'of', 'the', 'best', 'data', 'science', 'courses']
2	doc: data scientists analyze data 
 	words: ['data', 'scientists', 'analyze', 'data']
Number of words in the corpus: 14
The words in the corpus: 
 {'most', 'best', 'courses', 'the', 'data', 'important', 'of', 'analyze', 'science', 'this', 'scientists', 'fields', 'is', 'one'}


In [4]:
n_docs = len(corpus)         #·Number of documents in the corpus
n_words_set = len(words_set) #·Number of unique words in the
 
df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), columns=list(words_set))

# Compute Term Frequency (TF)
for i in range(n_docs):
    words = corpus[i].split(' ') # Words in the document
    for w in words:
        df_tf[w][i] = df_tf[w][i] + (1 / len(words))
         
df_tf

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_tf[w][i] = df_tf[w][i] + (1 / len(words))


Unnamed: 0,most,best,courses,the,data,important,of,analyze,science,this,scientists,fields,is,one
0,0.090909,0.0,0.0,0.090909,0.090909,0.090909,0.181818,0.0,0.181818,0.0,0.0,0.090909,0.090909,0.090909
1,0.0,0.111111,0.111111,0.111111,0.111111,0.0,0.111111,0.0,0.111111,0.111111,0.0,0.0,0.111111,0.111111
2,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.25,0.0,0.0,0.25,0.0,0.0,0.0


In [5]:
print("IDF of: ")
 
idf = {}
 
for w in words_set:
    k = 0    # number of documents in the corpus that contain this word
     
    for i in range(n_docs):
        if w in corpus[i].split():
            k += 1
             
    idf[w] =  np.log10(n_docs / k)
     
    print(f'{w:>15}: {idf[w]:>10}' )


IDF of: 
           most: 0.47712125471966244
           best: 0.47712125471966244
        courses: 0.47712125471966244
            the: 0.17609125905568124
           data:        0.0
      important: 0.47712125471966244
             of: 0.17609125905568124
        analyze: 0.47712125471966244
        science: 0.17609125905568124
           this: 0.47712125471966244
     scientists: 0.47712125471966244
         fields: 0.47712125471966244
             is: 0.17609125905568124
            one: 0.17609125905568124


In [6]:
df_tf_idf = df_tf.copy()
 
for w in words_set:
    for i in range(n_docs):
        df_tf_idf[w][i] = df_tf[w][i] * idf[w]
         
df_tf_idf

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_tf_idf[w][i] = df_tf[w][i] * idf[w]


Unnamed: 0,most,best,courses,the,data,important,of,analyze,science,this,scientists,fields,is,one
0,0.043375,0.0,0.0,0.016008,0.0,0.043375,0.032017,0.0,0.032017,0.0,0.0,0.043375,0.016008,0.016008
1,0.0,0.053013,0.053013,0.019566,0.0,0.0,0.019566,0.0,0.019566,0.053013,0.0,0.0,0.019566,0.019566
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11928,0.0,0.0,0.11928,0.0,0.0,0.0
