### Corpus

In [None]:
# Toy corpus used throughout the notebook: a small list of plain-string documents.

corpus = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]

### SkLearn Implementation

In [None]:
# Reference implementation: scikit-learn's TfidfVectorizer with its default settings.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
# fit() learns the vocabulary and the per-term IDF values from the corpus ...
vectorizer.fit(corpus)
# ... and transform() turns the same documents into a sparse TF-IDF matrix.
skl_output = vectorizer.transform(corpus)

In [None]:
# sklearn feature names; they are sorted in alphabetic order by default.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer `get_feature_names_out()` and fall back only on older releases.
# `.tolist()` converts the returned array to a plain list of Python strings so the
# printed form matches what `get_feature_names()` used to show.

if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']




In [None]:
# Print the IDF values that the sklearn TfidfVectorizer learned during fit().
# After fitting on the corpus the vocabulary holds 9 words, each with its own IDF value
# (the values are listed in the same order as the feature names).

print(vectorizer.idf_)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [None]:
# Shape of the sklearn TF-IDF output after transform(): (number of documents, vocabulary size).

skl_output.shape

(4, 9)

In [None]:
# sklearn TF-IDF values for the first document of the corpus above.
# The output is a sparse matrix, so only the non-zero (row, column) entries are printed.

print(skl_output[0])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


In [None]:
# sklearn TF-IDF values for the first document of the corpus above.
# To make the output easier to read, convert the sparse row to a dense array before printing.
# Note that the row is L2-normalized; sklearn's TfidfVectorizer does this by default.

print(skl_output[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


## Custom implementation

In [None]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy as np

In [None]:
def fit(dataset):
    """Learn the vocabulary and smoothed IDF weights from a corpus.

    Each document is split on whitespace; tokens shorter than two characters
    are ignored. The IDF of a word is ``ln((1 + N) / (1 + df)) + 1`` where
    ``N`` is the number of documents and ``df`` the number of documents that
    contain the word.

    Parameters
    ----------
    dataset : list (or any other iterable) of str
        The documents to learn from. A bare string is rejected because
        iterating over it would treat every character as a document.

    Returns
    -------
    vocab : dict
        Maps each word to its column index; words are indexed in sorted order.
    idf_dict : dict
        Maps each word to its IDF value, with keys in the same sorted order.

    Raises
    ------
    TypeError
        If ``dataset`` is a single string.
    """
    if isinstance(dataset, str):
        raise TypeError("dataset must be a collection of strings, not a single string")

    # Materialise once so tuples and other iterables work as well as lists
    # (the previous version left `vocab` unbound for anything but a list).
    documents = list(dataset)
    n_docs = len(documents)

    # Document frequency: count every qualifying word at most once per document.
    doc_freq = Counter()
    for document in documents:
        doc_freq.update({token for token in document.split() if len(token) >= 2})

    uni_words = sorted(doc_freq)
    vocab = {word: index for index, word in enumerate(uni_words)}
    idf_dict = {
        word: math.log((1 + n_docs) / (1 + doc_freq[word])) + 1
        for word in uni_words
    }
    return vocab, idf_dict
 
      

In [None]:
# Learn the vocabulary (word -> column index) and the IDF value of every word from the corpus.
vocab, idf_of_vocab=fit(corpus)

In [None]:
# Both values returned by fit() are plain dictionaries.
print(type(vocab))
print(type(idf_of_vocab))

<class 'dict'>
<class 'dict'>


In [None]:
# NOTE(review): this re-declares the same four documents as the corpus cell at the top
# of the notebook; the cell is redundant unless `corpus` was rebound in between.
corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]


In [None]:
# Words known to the custom implementation, in the order fit() stored them.
print(list(idf_of_vocab.keys()))

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [None]:
# IDF value of each word, in the same order as the keys printed above.
print(list(idf_of_vocab.values()))

[1.916290731874155, 1.2231435513142097, 1.5108256237659907, 1.0, 1.916290731874155, 1.916290731874155, 1.0, 1.916290731874155, 1.0]


In [None]:
# IDF values in decreasing order (kept under its original name in case later cells use it).
sorted_idx=(sorted(list(idf_of_vocab.values()),reverse=True))
# Print the words from highest to lowest IDF. Sorting the keys directly avoids the
# value -> key reverse lookup, which always returned the first word with a given IDF
# and therefore printed the same word repeatedly whenever several words tied.
for word in sorted(idf_of_vocab, key=idf_of_vocab.get, reverse=True):
  print(word)

and
and
and
and
first
document
is
is
is


In [None]:
def transform(dataset,vocabulary,idf_values):
     """Convert documents into an L2-normalised sparse TF-IDF matrix.

     For every document, each word that appears in ``vocabulary`` gets the weight
     ``(occurrences of the word / number of tokens in the document) * idf_values[word]``.
     Rows are then scaled to unit L2 norm with ``sklearn.preprocessing.normalize``.

     Parameters
     ----------
     dataset : sequence of str
         Documents to transform; each is split on whitespace.
     vocabulary : dict
         Maps a word to its column index in the output matrix.
     idf_values : dict
         Maps a word to its IDF weight; must contain every vocabulary word
         that occurs in ``dataset`` (a missing word raises ``KeyError``).

     Returns
     -------
     Sparse matrix of shape ``(len(dataset), len(vocabulary))``, as returned by
     ``normalize``. The normalised matrix is also printed under a "NORM FORM" banner.
     """
     row_indices, col_indices, weights = [], [], []
     for row, document in enumerate(dataset):
         tokens = document.split()
         # Counter yields each distinct word once, so every (row, column) pair is unique.
         for word, occurrences in Counter(tokens).items():
             if word in vocabulary:
                 row_indices.append(row)
                 col_indices.append(vocabulary[word])
                 weights.append((occurrences/len(tokens))*(idf_values[word]))
     # Build the CSR matrix in one go from (value, (row, col)) triplets; assigning
     # single elements into an existing csr_matrix is slow and emits a
     # SparseEfficiencyWarning.
     sparse_matrix = csr_matrix(
         (weights, (row_indices, col_indices)),
         shape=(len(dataset), len(vocabulary)),
         dtype=np.float64,
     )
     output = normalize(sparse_matrix, norm='l2', axis=1, copy=True, return_norm=False)
     print("NORM FORM\n",output)
     return output

In [None]:
# Run the custom transform on the corpus; the shape should be (documents, vocabulary size),
# the same as the sklearn output above.
custom_final_output=transform(corpus,vocab,idf_of_vocab)
print(custom_final_output.shape) 

NORM FORM
   (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149
  (1, 1)	0.6876235979836937
  (1, 3)	0.2810886740337529
  (1, 5)	0.5386476208856762
  (1, 6)	0.2810886740337529
  (1, 8)	0.2810886740337529
  (2, 0)	0.511848512707169
  (2, 3)	0.267103787642168
  (2, 4)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 7)	0.511848512707169
  (2, 8)	0.267103787642168
  (3, 1)	0.4697913855799205
  (3, 2)	0.580285823684436
  (3, 3)	0.3840852409148149
  (3, 6)	0.3840852409148149
  (3, 8)	0.3840852409148149
(4, 9)


  self._set_intXint(row, col, x.flat[0])


In [None]:
# Dense view of the first document's row, for direct comparison with the sklearn row printed earlier.
print(custom_final_output[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


## Top 50 IDF terms from the `cleaned_strings` corpus

Here, we have a file `cleaned_strings`. We build a vocabulary from the 50 terms with the highest IDF values in its text corpus and print it.

In [None]:
# Load the provided `cleaned_strings` pickle file from Google Drive (Colab only).
# `corpus` is expected to be a list of strings -- the cells below call len() on it and
# .split() on each item. NOTE(review): confirm against the actual file contents.
# NOTE: this rebinds `corpus`, replacing the four-document toy corpus used above.
from google.colab import drive
drive.mount('/content/drive')
import pickle
# SECURITY: pickle.load can execute arbitrary code while unpickling; only load this
# file if it comes from a trusted source.
with open('/content/drive/My Drive/cleaned_strings', 'rb') as f:
    corpus = pickle.load(f)
    
# printing the length of the corpus loaded
print("Number of documents in corpus = ",len(corpus))

Mounted at /content/drive
Number of documents in corpus =  746


In [None]:
from math import log10

def idf(dataset,word):
  """Return the number of documents in ``dataset`` that contain ``word`` as a token.

  Despite its name this is the document frequency, not the IDF itself; callers
  turn it into an IDF with ``log10(len(dataset) / idf(dataset, word))``.

  Documents are split on whitespace and ``word`` must equal a whole token.
  A plain ``word in row`` test on the string would also match substrings
  (for example "is" inside "this") and inflate the count.
  """
  return sum(1 for row in dataset if word in row.split())

def fit(dataset, top_n=50):
  """Build a vocabulary from the ``top_n`` words with the highest IDF.

  IDF is ``log10(N / df)`` where ``N`` is the number of documents and ``df`` the
  number of documents containing the word as a whitespace-separated token.
  Tokens shorter than two characters are ignored.

  Parameters
  ----------
  dataset : sequence of str
      The documents to learn from.
  top_n : int or None, default 50
      How many words to keep; ``None`` keeps every word.

  Returns
  -------
  dict
      Maps each selected word to its rank (0 = highest IDF). Words with equal
      IDF keep the order in which they first appear in ``dataset``.
  """
  n_docs = len(dataset)

  # One pass over the corpus: count each qualifying word once per document.
  # dict.fromkeys de-duplicates a document's tokens while keeping their order, so
  # the counter's key order is the order of first appearance.
  doc_freq = Counter()
  for row in dataset:
    for word in dict.fromkeys(row.split()):
      if len(word) >= 2:
        doc_freq[word] += 1

  idf_by_word = {word: log10(n_docs / df) for word, df in doc_freq.items()}

  # sorted() is stable (also with reverse=True), so ties stay in first-appearance order.
  ranked_words = sorted(idf_by_word, key=idf_by_word.get, reverse=True)

  vocab = {word: rank for rank, word in enumerate(ranked_words[:top_n])}
  return vocab

# Build the vocabulary of the highest-IDF words from the loaded corpus and show it.
# NOTE: this rebinds `vocab`, replacing the vocabulary learned from the toy corpus.
vocab = fit(corpus)
print(vocab)                    

{'aimless': 0, 'distressed': 1, 'drifting': 2, 'nearly': 3, 'attempting': 4, 'artiness': 5, 'gerardo': 6, 'emptiness': 7, 'messages': 8, 'buffet': 9, 'science': 10, 'teacher': 11, 'owls': 12, 'florida': 13, 'muppets': 14, 'overdue': 15, 'screenplay': 16, 'post': 17, 'practically': 18, 'structure': 19, 'tightly': 20, 'constructed': 21, 'vitally': 22, 'occurs': 23, 'content': 24, 'dozen': 25, 'highest': 26, 'superlative': 27, 'require': 28, 'puzzle': 29, 'solving': 30, 'fit': 31, 'pulls': 32, 'punches': 33, 'graphics': 34, 'insane': 35, 'massive': 36, 'unlockable': 37, 'properly': 38, 'rocks': 39, 'doomed': 40, 'conception': 41, 'minor': 42, 'changing': 43, 'confirm': 44, 'generic': 45, 'managed': 46, 'exaggerating': 47, 'trailer': 48, 'carrell': 49}


In [None]:
def transform(dataset,vocab):
  """Build an (un-normalised) sparse TF-IDF matrix restricted to ``vocab``.

  For every document and every distinct word of at least two characters that is
  present in ``vocab``, the stored value is
  ``(occurrences / number of tokens in the document) * log10(N / df)``,
  where ``N = len(dataset)`` and ``df`` is whatever the sibling ``idf(dataset, word)``
  helper returns (the number of documents it considers to contain the word).

  Parameters
  ----------
  dataset : sequence of str
      Documents to transform; each is split on whitespace.
  vocab : dict
      Maps a word to its column index. A word mapped to ``-1`` is treated as absent.

  Returns
  -------
  scipy.sparse.csr_matrix
      Matrix of shape ``(len(dataset), len(vocab))``.
  """
  n_docs = len(dataset)
  idf_cache = {}  # word -> log10(N / df); avoids rescanning the corpus for repeated words
  rows, columns, values = [], [], []
  for idx, row in enumerate(dataset):
    tokens = row.split()
    for word, freq in Counter(tokens).items():
      if len(word) < 2:
        continue
      col_index = vocab.get(word, -1)
      if col_index == -1:
        continue
      if word not in idf_cache:
        idf_cache[word] = log10(n_docs / idf(dataset, word))
      rows.append(idx)
      columns.append(col_index)
      values.append((freq / len(tokens)) * idf_cache[word])

  return csr_matrix((values, (rows, columns)), shape=(n_docs, len(vocab)))

In [None]:
# Rebuild the top-IDF vocabulary, then compute the TF-IDF matrix restricted to those words.
vocab = fit(corpus)
tf_idf_vect = transform(corpus, vocab)
# Printing a sparse matrix lists its non-zero entries as (row, column) value.
print(tf_idf_vect)

  (0, 0)	0.3590923534340836
  (0, 1)	0.3590923534340836
  (0, 2)	0.3590923534340836
  (1, 3)	0.3191932030525187
  (2, 4)	0.15119678039329834
  (2, 5)	0.15119678039329834
  (4, 6)	0.2872738827472669
  (5, 7)	0.2872738827472669
  (7, 8)	0.3191932030525187
  (9, 9)	0.47878980457877807
  (9, 10)	0.47878980457877807
  (9, 11)	0.47878980457877807
  (10, 12)	0.9575796091575561
  (11, 13)	0.3590923534340836
  (12, 14)	0.7181847068681672
  (16, 15)	0.2209799098055899
  (17, 16)	0.20519563053376202
  (17, 17)	0.20519563053376202
  (18, 18)	0.41039126106752405
  (19, 19)	0.006543824208365988
  (19, 20)	0.006543824208365988
  (19, 21)	0.006543824208365988
  (19, 22)	0.006543824208365988
  (19, 23)	0.006543824208365988
  (19, 24)	0.006543824208365988
  (19, 25)	0.006543824208365988
  (19, 26)	0.006543824208365988
  (19, 27)	0.006543824208365988
  (19, 28)	0.006543824208365988
  (19, 29)	0.006543824208365988
  (19, 30)	0.006543824208365988
  (19, 31)	0.013087648416731976
  (19, 32)	0.006543824208365



*   Here, we have compared the custom implementation to sklearn's.
*   We have also printed out the vocabulary of the 50 terms with the highest IDF values.

