### Corpus

In [53]:
## SkLearn# Collection of string documents

corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]

### SkLearn Implementation

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
skl_output = vectorizer.transform(corpus)

In [55]:
# sklearn feature names, they are sorted in alphabetic order by default.

print(vectorizer.get_feature_names())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [56]:
# Here we will print the sklearn tfidf vectorizer idf values after applying the fit method
# After using the fit function on the corpus the vocab has 9 words in it, and each has its idf value.

print(vectorizer.idf_)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [57]:
# shape of sklearn tfidf vectorizer output after applying transform method.

skl_output.shape

(4, 9)

In [58]:
# sklearn tfidf values for first line of the above corpus.
# Here the output is a sparse matrix

print(skl_output[0])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


In [59]:
# sklearn tfidf values for first line of the above corpus.
# To understand the output better, here we are converting the sparse output matrix to dense matrix and printing it.
# Notice that this output is normalized using L2 normalization. sklearn does this by default.

print(skl_output[0])
print('='*50)
print(skl_output[0].toarray())

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


### Your custom implementation

In [60]:
# Write your code here.
# Make sure its well documented and readble with appropriate comments.
# Compare your results with the above sklearn tfidf vectorizer
# You are not supposed to use any other library apart from the ones given below

from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy
import pandas as pd


In [61]:
corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]

In [62]:
def fit(data1):
    unq_words = set()
    if isinstance(data1, (list)):
        for row in data1:
            for wrd in row.split(" "):
                if len(wrd) < 2:
                    continue
                unq_words.add(wrd)
        unq_words = sorted(list(unq_words))
        worde = list(enumerate(unq_words))
        vocab_dict = {}
        for i in range(len(worde)):
            vocab_dict[worde[i][1]] = worde[i][0]
        return vocab_dict
    else:
        print("pass list of sentence")

In [63]:
vocab = fit(corpus)
print(vocab)

{'and': 0, 'document': 1, 'first': 2, 'is': 3, 'one': 4, 'second': 5, 'the': 6, 'third': 7, 'this': 8}


The obtained vocab is same as get_feature_names

In [64]:
import math
corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]
def idf(x):
    idf_val = {}
    count_dict = {}
    for w in list(vocab.keys()):
        count_d = 0
        for i in x:
            if w in i.split():
                count_d = count_d+1
        count_dict[w] = count_d
        #print(count_dict[w])
        idf_val[w] = 1 + math.log((1+len(x))/(1+count_dict[w]))
    #print(count_dict)
    return idf_val

print(idf(corpus))

def tf(y):
    tf_val = {}
    for row in y:
        wrd_frq = dict(Counter(row.split()))
        for w in list(vocab.keys()):
            if w in wrd_frq.keys():
                tf = wrd_frq[w]/(sum(wrd_frq.values()))
                tf_val[w] = tf
    #print(tf_val)
    return tf_val

#print(tf(corpus))
            
            



{'and': 1.916290731874155, 'document': 1.2231435513142097, 'first': 1.5108256237659907, 'is': 1.0, 'one': 1.916290731874155, 'second': 1.916290731874155, 'the': 1.0, 'third': 1.916290731874155, 'this': 1.0}


idf values are same as obtained from vectorizer.idf_

In [65]:
def transform(data1, vocab):
    srow = []
    scolumn = []
    svalue = []
    if isinstance(data1, list):
        for indx,row in enumerate(tqdm(data1)):
            wrd_frq = dict(Counter(row.split()))
            for wrd,frq in wrd_frq.items():
                if len(wrd)<2:
                    continue
                col_indx = vocab.get(wrd, -2)
                
                if col_indx != -2:
                    tf_idf = tf(corpus)[wrd]*idf(corpus)[wrd]
                    srow.append(indx)
                    scolumn.append(col_indx)
                    svalue.append(tf_idf)
        
        return csr_matrix((svalue, (srow, scolumn)), shape=(len(data1), len(vocab)))
    else:
        print("need to pass list of strings")                 
   

In [66]:
#strings = ["the method of lagrange multipliers is the economists workhorse for solving optimization problems",
#           "the technique is a centerpiece of economic theory but unfortunately its usually taught poorly"]
vocab = fit(corpus)
#print(list(vocab.keys()))
m = transform(corpus, vocab)
#print(m)
print(normalize(m, norm='l2')[0])
#print(m.toarray())
print('='*50)

print(normalize(m, norm='l2')[0].toarray())

100%|██████████| 4/4 [00:00<00:00, 1000.49it/s]

  (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]





the values are same as obtained from skl_output[0].toarray()

In [67]:
# Below is the code to load the cleaned_strings pickle file provided
# Here corpus is of list type

import pickle
with open(r'C:\Users\HP\OneDrive\Applied ai\Module 3\assignment\Implementing TFIDF vectorizer\cleaned_strings', 'rb') as f:
    corpus1 = pickle.load(f)
    
# printing the length of the corpus loaded
print("Number of documents in corpus = ",len(corpus1))

Number of documents in corpus =  746


In [68]:
# Write your code here.
# Try not to hardcode any values.
# Make sure its well documented and readble with appropriate comments.

In [69]:
def fit(data1):
    unq_words = set()
    if isinstance(data1, (list)):
        for row in data1:
            for wrd in row.split(" "):
                if len(wrd) < 2:
                    continue
                unq_words.add(wrd)
        unq_words = sorted(list(unq_words))
        worde = list(enumerate(unq_words))
        vocab_dict = {}
        for i in range(len(worde)):
            vocab_dict[worde[i][1]] = worde[i][0]
        return vocab_dict
    else:
        print("pass list of sentence")

In [70]:
vocab = fit(corpus1)
#print(vocab)

In [71]:
def idf(x):
    idf_val = {}
    count_dict = {}
    for w in list(vocab.keys()):
        count_d = 0
        for i in x:
            if w in i.split():
                count_d = count_d+1
        count_dict[w] = count_d
        #print(count_dict[w])
        idf_val[w] = 1 + math.log((1+len(x))/(1+count_dict[w]))
    #print(count_dict)
    return idf_val

#print(idf(corpus1))


            
            


In [72]:
def idf_50(k):
    vocab_50_sort = {}
    idf_sort_50 = {}
    d_initial = idf(k)
    idf_sort = sorted(d_initial.items(), key = lambda x:x[1], reverse= True)
    idf_sort_50lst = idf_sort[0:50]
    #print(idf_sort_50lst)
    for i in range(len(idf_sort_50lst)):
        vocab_50_sort[idf_sort[i][0]] = vocab[idf_sort[i][0]]
        idf_sort_50[idf_sort[i][0]] = idf_sort[i][1]
    return vocab_50_sort, idf_sort_50
vocab_50,idfnew = idf_50(vocab)
print(vocab_50)
print(idfnew)

{'aailiyah': 0, 'abandoned': 1, 'ability': 2, 'abroad': 3, 'absolutely': 4, 'abstruse': 5, 'abysmal': 6, 'academy': 7, 'accents': 8, 'accessible': 9, 'acclaimed': 10, 'accolades': 11, 'accurate': 12, 'accurately': 13, 'accused': 14, 'achievement': 15, 'achille': 16, 'ackerman': 17, 'act': 18, 'acted': 19, 'acting': 20, 'action': 21, 'actions': 22, 'actor': 23, 'actors': 24, 'actress': 25, 'actresses': 26, 'actually': 27, 'adams': 28, 'adaptation': 29, 'add': 30, 'added': 31, 'addition': 32, 'admins': 33, 'admiration': 34, 'admitted': 35, 'adorable': 36, 'adrift': 37, 'adventure': 38, 'advise': 39, 'aerial': 40, 'aesthetically': 41, 'affected': 42, 'affleck': 43, 'afraid': 44, 'africa': 45, 'afternoon': 46, 'age': 47, 'aged': 48, 'ages': 49}
{'aailiyah': 8.27482599910299, 'abandoned': 8.27482599910299, 'ability': 8.27482599910299, 'abroad': 8.27482599910299, 'absolutely': 8.27482599910299, 'abstruse': 8.27482599910299, 'abysmal': 8.27482599910299, 'academy': 8.27482599910299, 'accents':

In [73]:
def tf(y,vocab_50):
    tf_val = {}
    for row in y:
        wrd_frq = dict(Counter(row.split()))
        for w in list(vocab_50.keys()):
            if w in wrd_frq.keys():
                tf = wrd_frq[w]/(sum(wrd_frq.values()))
                tf_val[w] = tf
    #print(tf_val)
    return tf_val
tfnew = tf(corpus1,vocab_50)
print(tfnew)

{'acting': 0.1111111111111111, 'adorable': 0.09090909090909091, 'absolutely': 0.14285714285714285, 'actor': 0.14285714285714285, 'actors': 0.1, 'actually': 0.14285714285714285, 'addition': 0.09090909090909091, 'acted': 0.2, 'accused': 0.05555555555555555, 'afraid': 0.14285714285714285, 'advise': 0.5, 'affleck': 0.05555555555555555, 'age': 0.1, 'abstruse': 0.0014326647564469914, 'accurately': 0.0014326647564469914, 'action': 0.058823529411764705, 'actress': 0.25, 'admiration': 0.0014326647564469914, 'adrift': 0.0014326647564469914, 'aerial': 0.25, 'actresses': 0.1111111111111111, 'actions': 0.1, 'adventure': 0.3333333333333333, 'affected': 0.043478260869565216, 'abroad': 0.06666666666666667, 'admitted': 0.1111111111111111, 'admins': 0.16666666666666666, 'abandoned': 0.03225806451612903, 'afternoon': 0.125, 'aged': 0.07692307692307693, 'add': 0.08333333333333333, 'accolades': 0.07692307692307693, 'abysmal': 0.0024271844660194173, 'accents': 0.3333333333333333, 'africa': 0.0625, 'academy'

In [74]:
def transform(data1, vocab_50):
    srow = []
    scolumn = []
    svalue = []
    if isinstance(data1, (list,)):
        for indx,row in enumerate(tqdm(data1)):
            for wrd,frq in idfnew.items():
                if len(wrd)<2:
                    continue
                col_indx = vocab_50.get(wrd, -2)

                
                if col_indx != -2:
                    tf_idf = tfnew[wrd]*idfnew[wrd]
                    srow.append(indx)
                    scolumn.append(col_indx)
                    svalue.append(tf_idf)
        
        return normalize((csr_matrix((svalue, (srow, scolumn)), shape=(len(data1), len(vocab_50)))), norm = 'l2')
    else:
        print("need to pass list of strings")  
   

In [75]:
vocab = fit(corpus1)

n = transform(corpus1, vocab_50)

print(n[0].toarray())
print(n[0].toarray().shape)
print(n.shape)
#print(n)

100%|██████████| 746/746 [00:00<00:00, 14028.46it/s]

[[0.0898299  0.03187512 0.16468814 0.06587526 0.14116127 0.00141566
  0.00239837 0.07058063 0.32937629 0.19762577 0.06175805 0.07600991
  0.1097921  0.00141566 0.05489605 0.00239837 0.06587526 0.00239837
  0.1097921  0.19762577 0.1097921  0.05812523 0.09881289 0.14116127
  0.09881289 0.24703222 0.1097921  0.14116127 0.04705376 0.06175805
  0.08234407 0.1097921  0.0898299  0.16468814 0.00141566 0.1097921
  0.0898299  0.00141566 0.32937629 0.49406443 0.24703222 0.07600991
  0.04296212 0.05489605 0.14116127 0.06175805 0.12351611 0.09881289
  0.07600991 0.00239837]]
(1, 50)
(746, 50)



