In [1]:
# Report the platform and the library versions this notebook was run with.
import platform
print(platform.platform())
import sys
print("Python", sys.version)
import numpy
print("NumPy", numpy.__version__)
import scipy
print("SciPy", scipy.__version__)
import sklearn
print("Scikit-Learn", sklearn.__version__)

Windows-10-10.0.18362-SP0
Python 3.6.5 |Anaconda, Inc.| (default, Mar 29 2018, 13:32:41) [MSC v.1900 64 bit (AMD64)]
NumPy 1.14.3
SciPy 1.1.0
Scikit-Learn 0.19.1


In [2]:
# Collection of string documents: the toy corpus used by the cells below.

corpus = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]

### SkLearn Implementation

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the corpus, then encode the same
# documents as a TF-IDF matrix. fit() returns the vectorizer, so the two
# calls can be chained.
vectorizer = TfidfVectorizer()
skl_output = vectorizer.fit(corpus).transform(corpus)

In [4]:
# sklearn feature names; they are sorted in alphabetical order by default.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(), so use whichever this version provides.
# The names are converted to plain str so the printed list looks the same
# either way.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = [str(name) for name in vectorizer.get_feature_names_out()]
else:
    feature_names = vectorizer.get_feature_names()

print(feature_names)

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [5]:
# Print the IDF values held by the scikit-learn vectorizer after fit().
# Fitting on the corpus produced a 9-word vocabulary, and idf_ holds one value per word.

print(vectorizer.idf_)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [6]:
# Shape of the matrix returned by the scikit-learn vectorizer's transform():
# one row per corpus document and one column per vocabulary word.

skl_output.shape

(4, 9)

In [7]:
# scikit-learn TF-IDF values for the first document of the corpus above.
# The output is a sparse matrix, so only the non-zero entries are printed,
# each as "(row, column) value".

print(skl_output[0])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


In [8]:
# scikit-learn TF-IDF values for the first document of the corpus above.
# To make the output easier to read, the sparse row is converted to a dense array before printing.
# Note that the values are L2-normalized; scikit-learn does this by default.

print(skl_output[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


### Your custom implementation

In [9]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy

In [10]:
def fit(corpus):
    '''Learn the vocabulary and the smoothed IDF weights of a corpus.

    Each document is split on whitespace. The vocabulary is the sorted set of
    distinct tokens, and every token gets the smoothed inverse document
    frequency used by scikit-learn:

        idf(t) = 1 + ln((1 + N) / (1 + df(t)))

    where N is the number of documents and df(t) is the number of documents
    containing t.

    Parameters
    ----------
    corpus : sequence of str
        The documents. It must support len() and iteration.

    Returns
    -------
    vocab : dict
        Maps each word to its column index (its position in sorted order).
    df_vocab : dict
        Maps each word to its IDF value, in the same order as vocab.
    '''
    # Document frequency: each word is counted at most once per document, so
    # every document is tokenised exactly once instead of once per word.
    doc_freq = Counter()
    for row in corpus:
        doc_freq.update(set(row.split()))

    unique_words = sorted(doc_freq)
    vocab = {word: column for column, word in enumerate(unique_words)}

    # The "+ 1" on both counts is scikit-learn's smoothing; it acts as one
    # extra document containing every word and prevents division by zero.
    td = len(corpus) + 1
    df_vocab = {word: 1 + math.log(td / (doc_freq[word] + 1)) for word in unique_words}

    return vocab, df_vocab

In [11]:
# Fit on the corpus: vocab maps each word to its column index, df_vocab maps each word to its IDF.
vocab,df_vocab=fit(corpus)

In [12]:
# list() over a dict yields its keys, so this prints the vocabulary words (not the corpus).
print(list(vocab))

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [13]:
# Print the values of the df_vocab dict, i.e. the IDF of each vocabulary word.
print(list(df_vocab.values()))

[1.916290731874155, 1.2231435513142097, 1.5108256237659907, 1.0, 1.916290731874155, 1.916290731874155, 1.0, 1.916290731874155, 1.0]


In [14]:
def transform(corpus,vocab,idf=None):
    '''Encode a corpus as an L2-normalized sparse TF-IDF matrix.

    Parameters
    ----------
    corpus : sequence of str
        The documents to encode; each one is split on whitespace.
    vocab : dict
        Maps each word to its column index, as returned by fit().
    idf : dict, optional
        Maps each word to its IDF value, as returned by fit(). When omitted,
        the module-level df_vocab is used, which keeps existing two-argument
        calls working.

    Returns
    -------
    The sparse matrix produced by normalize(), with one row per document and
    one column per vocabulary word.
    '''
    if idf is None:
        idf = df_vocab  # fall back to the table created by the fit() cell

    # rows[k], columns[k] and values[k] together describe the k-th non-zero entry.
    rows = []
    columns = []
    values = []
    for i, row in enumerate(tqdm(corpus)):
        tokens = row.split()
        l = len(tokens)  # number of tokens in this document
        for word, freq in Counter(tokens).items():
            column = vocab.get(word)
            if column is None:
                # The word was not seen during fit, so it has no column and no IDF; ignore it.
                continue
            rows.append(i)
            columns.append(column)
            t = freq / l  # term frequency
            t = t * idf[word]
            values.append(t)
    tfidf = csr_matrix((values, (rows, columns)), shape=(len(corpus), len(vocab)))
    # L2-normalize, as scikit-learn does by default. Dividing by l only
    # rescales a whole row, so after this step it changes nothing beyond
    # floating-point rounding.
    tfidf = normalize(tfidf)
    return tfidf

In [15]:
# Build the TF-IDF matrix of the corpus with the custom transform function.
tfidf=transform(corpus,vocab)

100%|██████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 4006.02it/s]


In [16]:
# Print the custom TF-IDF values of the first document in the corpus.
print(tfidf[0])

  (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149


In [17]:
# scikit-learn's TF-IDF values for the same first document, for comparison.
print(skl_output[0])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


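The result differs from the scikit-learn result only in the last printed digit (a difference of about 1e-16). This is floating-point rounding, not a bug in scikit-learn: scikit-learn uses the raw term counts, whereas our function also divides every count in a document by the same number `l`, and that common factor cancels out under L2 normalization.<br>
If you change line 8 of the function to `l=len(row)`, the rounding happens to fall differently and the first two printed values become identical to scikit-learn's.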

### Result after changing line 8

In [18]:
def transform(corpus,vocab,idf=None):
    '''Encode a corpus as an L2-normalized sparse TF-IDF matrix.

    This redefinition replaces the earlier transform(). The only difference
    in the computation is that each count is divided by len(row), the number
    of CHARACTERS in the document, instead of the number of tokens.

    Parameters
    ----------
    corpus : sequence of str
        The documents to encode; each one is split on whitespace.
    vocab : dict
        Maps each word to its column index, as returned by fit().
    idf : dict, optional
        Maps each word to its IDF value, as returned by fit(). When omitted,
        the module-level df_vocab is used, which keeps existing two-argument
        calls working.

    Returns
    -------
    The sparse matrix produced by normalize(), with one row per document and
    one column per vocabulary word.
    '''
    if idf is None:
        idf = df_vocab  # fall back to the table created by the fit() cell

    # rows[k], columns[k] and values[k] together describe the k-th non-zero entry.
    rows = []
    columns = []
    values = []
    for i, row in enumerate(tqdm(corpus)):
        # len() of a str is its character count, not its token count. Any
        # positive per-document constant works here because it cancels in the
        # L2 normalization below; it only affects floating-point rounding.
        l = len(row)
        for word, freq in Counter(row.split()).items():
            column = vocab.get(word)
            if column is None:
                # The word was not seen during fit, so it has no column and no IDF; ignore it.
                continue
            rows.append(i)
            columns.append(column)
            t = freq / l
            t = t * idf[word]
            values.append(t)
    tfidf = csr_matrix((values, (rows, columns)), shape=(len(corpus), len(vocab)))
    tfidf = normalize(tfidf)  # L2 normalization, as scikit-learn does by default
    return tfidf

In [19]:
# Recompute the TF-IDF matrix of the corpus with the redefined transform function.
tfidf=transform(corpus,vocab)

100%|██████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 4011.77it/s]


In [20]:
# Print the recomputed TF-IDF values of the first document in the corpus.
print(tfidf[0])

  (0, 1)	0.46979138557992045
  (0, 2)	0.5802858236844359
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149


In [21]:
# scikit-learn's TF-IDF values for the same first document, for comparison.
print(skl_output[0])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


Here we see that the two results match, up to floating-point rounding in the last digit.