<a href="https://colab.research.google.com/github/Yadav-Roshan/NLP/blob/main/03_TFIDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Toy corpus used throughout this notebook: a list of three short documents.
# NOTE: the spelling 'scientis' in the third document is left untouched because
# the vocabulary and tables shown in the outputs below contain that exact token.
text = ['Data science is the study of data to extract meaningful insights for business',
        'Data science is a multidisciplinary approach that combines principles and practices from the fields',
        'A data scientis on the other hand earns more than USD 14000 per year']

In [None]:
import string
def wordset(data):
  """Collect the vocabulary of a corpus.

  Every document in ``data`` is lower-cased and split on single space
  characters; the distinct pieces found across all documents are returned
  together as one set. An empty corpus yields an empty set.
  """
  return {piece for document in data for piece in document.lower().split(" ")}

In [None]:
# Vocabulary of the corpus: the set of lower-cased, space-split tokens.
word_set = wordset(text)

In [None]:
# Display the vocabulary.
word_set

{'14000',
 'a',
 'and',
 'approach',
 'business',
 'combines',
 'data',
 'earns',
 'extract',
 'fields',
 'for',
 'from',
 'hand',
 'insights',
 'is',
 'meaningful',
 'more',
 'multidisciplinary',
 'of',
 'on',
 'other',
 'per',
 'practices',
 'principles',
 'science',
 'scientis',
 'study',
 'than',
 'that',
 'the',
 'to',
 'usd',
 'year'}

In [None]:
# Number of distinct tokens in the vocabulary.
len(word_set)

33

### Term Frequency (TF) = number of times a term appears in a document / total number of words in that document
### Inverse Document Frequency (IDF) = log(number of documents / number of documents in which the term appears)
### TF-IDF = TF * IDF

In [None]:
def term_freq(data, words):
  """Build the term-frequency table for a corpus.

  Each document is lower-cased and split on single spaces. The entry for a
  (document, word) pair is the number of times the word occurs in that
  document divided by the document's token count.

  Parameters
  ----------
  data : sequence of str
      The documents, one string each.
  words : iterable of str
      The vocabulary; one column is created per word, in iteration order.

  Returns
  -------
  pandas.DataFrame
      One row per document (labelled 0..len(data)-1) and one column per word.

  Raises
  ------
  KeyError
      If a document contains a token that is not in ``words``.
  """
  vocab = list(words)
  column_of = {word: idx for idx, word in enumerate(vocab)}

  # Accumulate into a plain NumPy array and wrap it in a DataFrame once at the
  # end. The chained form ``df[w][row] = ...`` assigns through an intermediate
  # Series and is not a reliable way to update a DataFrame.
  counts = np.zeros((len(data), len(vocab)))
  for row, doc in enumerate(data):
    tokens = doc.lower().split(' ')
    share = 1 / len(tokens)  # contribution of a single occurrence
    for token in tokens:
      counts[row, column_of[token]] += share

  return pd.DataFrame(counts, columns=vocab)

In [None]:
# Build the term-frequency table for the corpus and display it.
term_frequency = term_freq(text, word_set)
term_frequency

Unnamed: 0,of,science,scientis,business,than,a,14000,on,the,year,...,usd,data,more,insights,practices,principles,hand,multidisciplinary,combines,meaningful
0,0.076923,0.076923,0.0,0.076923,0.0,0.0,0.0,0.0,0.076923,0.0,...,0.0,0.153846,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.076923
1,0.0,0.071429,0.0,0.0,0.0,0.071429,0.0,0.0,0.071429,0.0,...,0.0,0.071429,0.0,0.0,0.071429,0.071429,0.0,0.071429,0.071429,0.0
2,0.0,0.0,0.071429,0.0,0.071429,0.071429,0.071429,0.071429,0.071429,0.071429,...,0.071429,0.071429,0.071429,0.0,0.0,0.0,0.071429,0.0,0.0,0.0


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


In [None]:
def inverse_doc_freq(data, words):
  """Compute the inverse document frequency of every vocabulary word.

  For each word ``w`` the result is ``log(N / df(w))`` where ``N`` is the
  number of documents in ``data`` and ``df(w)`` is the number of documents
  whose lower-cased, space-split tokens include ``w``.

  Parameters
  ----------
  data : sequence of str
      The documents, one string each.
  words : iterable of str
      The vocabulary to score.

  Returns
  -------
  dict
      Maps each word to its IDF value.

  Raises
  ------
  ZeroDivisionError
      If a word in ``words`` does not occur in any document.
  """
  num_doc = len(data)

  # Tokenise from the ``data`` argument (not a notebook-level variable), and do
  # it once per document instead of once per (word, document) pair.
  doc_tokens = [set(doc.lower().split(" ")) for doc in data]

  idf = {}
  for w in words:
    count = sum(w in tokens for tokens in doc_tokens)
    idf[w] = np.log(num_doc / count)

  return idf


In [None]:
# IDF weight for every vocabulary word, as a dict keyed by word.
idf = inverse_doc_freq(text, word_set)

In [None]:
# TF-IDF table: scale every term-frequency column by that term's IDF weight,
# assembling all columns in a single DataFrame construction.
tf_idf = pd.DataFrame({
    term: term_frequency[term] * idf[term]
    for term in term_frequency.columns
})

In [None]:
# Display the TF-IDF table.
tf_idf

Unnamed: 0,of,science,scientis,business,than,a,14000,on,the,year,...,usd,data,more,insights,practices,principles,hand,multidisciplinary,combines,meaningful
0,0.084509,0.03119,0.0,0.084509,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.084509,0.0,0.0,0.0,0.0,0.0,0.084509
1,0.0,0.028962,0.0,0.0,0.0,0.028962,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.078472,0.078472,0.0,0.078472,0.078472,0.0
2,0.0,0.0,0.078472,0.0,0.078472,0.028962,0.078472,0.078472,0.0,0.078472,...,0.078472,0.0,0.078472,0.0,0.0,0.0,0.078472,0.0,0.0,0.0


In [None]:
def tfidf(term_frequency, inv_df, words, data):
  """Weight a term-frequency table by inverse document frequency.

  Parameters
  ----------
  term_frequency : pandas.DataFrame
      Term-frequency table with one column per word and rows labelled
      0..len(data)-1. It is not modified.
  inv_df : mapping
      IDF weight for each word.
  words : iterable of str
      The words (columns) to scale.
  data : sequence
      The documents; only its length is used, to select the rows to scale.

  Returns
  -------
  pandas.DataFrame
      A new table in which, for every word in ``words``, the first
      ``len(data)`` rows of that column are multiplied by ``inv_df[word]``.

  Raises
  ------
  KeyError
      If a word is missing from ``term_frequency`` columns or from ``inv_df``.
  """
  rows = list(range(len(data)))

  # Work on a copy: a plain assignment would alias the caller's table, so the
  # input would be overwritten and re-running the call would apply IDF twice.
  df_tf_idf = term_frequency.copy()
  for w in words:
    # Single .loc assignment instead of chained ``df[w][i] = ...`` indexing.
    df_tf_idf.loc[rows, w] = df_tf_idf.loc[rows, w] * inv_df[w]

  return df_tf_idf

In [None]:
# Apply the IDF weights to the term-frequency table via tfidf() and display the result.
tf_idf = tfidf(term_frequency, idf, word_set, text)
tf_idf

Unnamed: 0,of,science,scientis,business,than,a,14000,on,the,year,...,usd,data,more,insights,practices,principles,hand,multidisciplinary,combines,meaningful
0,0.084509,0.03119,0.0,0.084509,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.084509,0.0,0.0,0.0,0.0,0.0,0.084509
1,0.0,0.028962,0.0,0.0,0.0,0.028962,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.078472,0.078472,0.0,0.078472,0.078472,0.0
2,0.0,0.0,0.078472,0.0,0.078472,0.028962,0.078472,0.078472,0.0,0.078472,...,0.078472,0.0,0.078472,0.0,0.0,0.0,0.078472,0.0,0.0,0.0


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# Fit scikit-learn's TfidfVectorizer, constructed with no arguments, on the same corpus.
# NOTE: assigning to `tfidf` rebinds the name of the tfidf() function defined
# earlier in this notebook, so that function cannot be called after this cell runs.
vec = TfidfVectorizer()

tfidf = vec.fit_transform(text)

In [None]:
# Convert the fitted result to a dense array and print it.
print(tfidf.toarray())

[[0.         0.         0.         0.30287826 0.         0.35776956
  0.         0.30287826 0.         0.30287826 0.         0.
  0.30287826 0.23034673 0.30287826 0.         0.         0.30287826
  0.         0.         0.         0.         0.         0.23034673
  0.         0.30287826 0.         0.         0.17888478 0.30287826
  0.         0.        ]
 [0.         0.30352608 0.30352608 0.         0.30352608 0.17926739
  0.         0.         0.30352608 0.         0.30352608 0.
  0.         0.23083941 0.         0.         0.30352608 0.
  0.         0.         0.         0.30352608 0.30352608 0.23083941
  0.         0.         0.         0.30352608 0.17926739 0.
  0.         0.        ]
 [0.29238198 0.         0.         0.         0.         0.17268551
  0.29238198 0.         0.         0.         0.         0.29238198
  0.         0.         0.         0.29238198 0.         0.
  0.29238198 0.29238198 0.29238198 0.         0.         0.
  0.29238198 0.         0.29238198 0.         

In [None]:
# Same values as a DataFrame, with the vectorizer's feature names as column labels.
pd.DataFrame(tfidf.toarray(), columns = vec.get_feature_names_out())

Unnamed: 0,14000,and,approach,business,combines,data,earns,extract,fields,for,...,principles,science,scientis,study,than,that,the,to,usd,year
0,0.0,0.0,0.0,0.302878,0.0,0.35777,0.0,0.302878,0.0,0.302878,...,0.0,0.230347,0.0,0.302878,0.0,0.0,0.178885,0.302878,0.0,0.0
1,0.0,0.303526,0.303526,0.0,0.303526,0.179267,0.0,0.0,0.303526,0.0,...,0.303526,0.230839,0.0,0.0,0.0,0.303526,0.179267,0.0,0.0,0.0
2,0.292382,0.0,0.0,0.0,0.0,0.172686,0.292382,0.0,0.0,0.0,...,0.0,0.0,0.292382,0.0,0.292382,0.0,0.172686,0.0,0.292382,0.292382
