In [1]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy

In [2]:
# Toy corpus for Task 1: four short, lower-case, space-separated sentences.
corpus = [
    'this is the first document',
    'this document is the second document',
    'and this is the third one',
    'is this the first document',
]

In [3]:
def fit(dataset):
    """Build the vocabulary of a corpus.

    Every document is tokenised by splitting on a single space (the same
    tokenisation ``computeIDF`` and ``transform`` use), the distinct tokens
    are sorted, and each token is mapped to its position in that sorted order.

    Parameters
    ----------
    dataset : list of str
        Documents to learn the vocabulary from.

    Returns
    -------
    dict
        ``{token: column_index}``, with indices ``0 .. len(vocab) - 1``
        assigned in sorted token order.

    Raises
    ------
    TypeError
        If ``dataset`` is not a list. Failing here is clearer than printing a
        message and returning ``None``, which only breaks later when the
        caller uses the result as a dict.
    """
    if not isinstance(dataset, list):
        raise TypeError("you need to pass a list of sentences")
    # A set keeps each token once, however many documents contain it.
    unique_words = {word for document in dataset for word in document.split(" ")}
    return {word: index for index, word in enumerate(sorted(unique_words))}

# Learn the vocabulary, then keep its words (in the dict's insertion order) as a list.
vocab = fit(corpus)
vocab_list = list(vocab)
print(vocab_list)

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [4]:
def computeIDF(vocabulary, dataset):
    """Compute the smoothed inverse document frequency of each vocabulary word.

    The value for a word ``t`` is ``1 + ln((1 + N) / (1 + df(t)))``, where
    ``N`` is the number of documents and ``df(t)`` is the number of documents
    containing ``t``. The formula is the one adjusted to match scikit-learn.

    Parameters
    ----------
    vocabulary : iterable of str
        Words to score.
    dataset : list of str
        Documents; each is tokenised by splitting on a single space.

    Returns
    -------
    dict
        ``{word: idf}`` with one entry per vocabulary word, in the iteration
        order of ``vocabulary``.
    """
    # One pass over the corpus instead of re-splitting every document for
    # every word. set() makes a repeated word count once per document.
    doc_freq = Counter()
    for doc in dataset:
        doc_freq.update(set(doc.split(' ')))

    n_docs = len(dataset)
    # A Counter yields 0 for words that occur in no document.
    return {
        word: 1 + math.log((1 + n_docs) / (1 + doc_freq[word]))
        for word in vocabulary
    }

# IDF of every vocabulary word over the toy corpus.
IDF = computeIDF(vocabulary=vocab_list, dataset=corpus)
print(IDF)

{'and': 1.916290731874155, 'document': 1.2231435513142097, 'first': 1.5108256237659907, 'is': 1.0, 'one': 1.916290731874155, 'second': 1.916290731874155, 'the': 1.0, 'third': 1.916290731874155, 'this': 1.0}


In [6]:
def transform(dataset, vocabulary, idf_values):
    """Turn documents into an L2-normalised sparse TF-IDF matrix.

    Parameters
    ----------
    dataset : list of str
        Documents; each is tokenised by splitting on a single space.
    vocabulary : dict
        ``{word: column_index}``. Words not in it are ignored.
    idf_values : dict
        ``{word: idf}``; must hold an entry for every vocabulary word that
        occurs in ``dataset``.

    Returns
    -------
    scipy.sparse matrix
        Shape ``(len(dataset), len(vocabulary))``, each row scaled by
        ``sklearn.preprocessing.normalize`` with ``norm='l2'``.
    """
    rows = []
    columns = []
    values = []
    for row_idx, doc in enumerate(dataset):
        words = doc.split(' ')
        n_words = len(words)  # number of tokens in this document
        for word, count in Counter(words).items():
            # Tokens shorter than two characters (including '') are skipped.
            if len(word) < 2:
                continue
            col_index = vocabulary.get(word, -1)
            if col_index != -1:
                rows.append(row_idx)
                columns.append(col_index)
                # term frequency * inverse document frequency
                values.append((count / n_words) * idf_values[word])
    # Width comes from the `vocabulary` argument, not from a global `vocab`,
    # so the result depends only on what the caller passes in.
    tfidf = csr_matrix((values, (rows, columns)),
                       shape=(len(dataset), len(vocabulary)))
    return normalize(tfidf, norm='l2')


# Dense TF-IDF matrix of the toy corpus; show the row of the first document.
output = transform(corpus, vocab, IDF).toarray()
print(output[0])

[0.         0.46979139 0.58028582 0.38408524 0.         0.
 0.38408524 0.         0.38408524]


In [7]:
# Shape of the TF-IDF matrix for the toy corpus.
print(transform(dataset=corpus, vocabulary=vocab, idf_values=IDF).shape)

(4, 9)


In [8]:
# Keep the sparse result this time and print the first document's row.
output = transform(corpus, vocab, IDF)
print(output[0])

  (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149


# TASK 2

Implement max features functionality:

As part of this task, you have to modify your fit and transform functions so that your vocab contains only the 50 terms with the highest IDF scores.

This task is similar to your previous task, except that here your vocabulary is limited to only the top 50 feature names based on their IDF values. Basically, your output will have exactly 50 columns, and the number of rows will depend on the number of documents you have in your corpus.

Here you will be given a pickle file named cleaned_strings. You have to load the corpus from this file and use it as input to your TF-IDF vectorizer.

Steps to approach this task:

You would have to write both fit and transform methods for your custom implementation of tfidf vectorizer, just like in the previous task. Additionally, here you have to limit the number of features generated to 50 as described above.

Now sort your vocab in descending order of IDF values and print out the words in the sorted vocab after you fit your data. Here you should get only 50 terms in your vocab. Also make sure to print the IDF value for each term in your vocab.

Make sure the output of your implementation is a sparse matrix. Before generating the final output, you need to normalize your sparse matrix using L2 normalization. You can refer to this link https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.normalize.html

Now check the output of a single document in your collection of documents, you can convert the sparse matrix related only to that document into dense matrix and print it. And this dense matrix should contain 1 row and 50 columns.

In [9]:
import pickle

# NOTE(review): unpickling can execute arbitrary code; only load
# 'cleaned_strings' when it comes from a trusted source.
with open('cleaned_strings', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)

# Report how many documents were loaded.
print("Number of documents in corpus = ", len(corpus))

Number of documents in corpus =  746


In [10]:
def fit(dataset):
    """Map every distinct token of the corpus to a column index.

    Parameters
    ----------
    dataset : iterable of str
        Documents; each is tokenised by splitting on a single space.

    Returns
    -------
    dict
        ``{token: index}``, where ``index`` is the token's position among the
        sorted distinct tokens.
    """
    unique_tokens = {token for document in dataset for token in document.split(" ")}
    return {token: position for position, token in enumerate(sorted(unique_tokens))}

# Learn the vocabulary of the loaded corpus. Only its size is reported:
# printing the whole mapping for a 746-document corpus produces very long
# output that buries the rest of the notebook.
vocab = fit(corpus)
print("Vocabulary size =", len(vocab))
vocab_list = list(vocab)



In [11]:
def computeIDF(vocabulary, dataset):
    """Compute the smoothed inverse document frequency of each vocabulary word.

    For a word ``t`` the value is ``1 + ln((1 + N) / (1 + df(t)))``, where
    ``N = len(dataset)`` and ``df(t)`` is the number of documents whose
    space-separated tokens include ``t``.

    Parameters
    ----------
    vocabulary : iterable of str
        Words to score.
    dataset : list of str
        Documents; each is tokenised by splitting on a single space.

    Returns
    -------
    dict
        ``{word: idf}`` in the iteration order of ``vocabulary``.
    """
    # Document frequencies are gathered in one pass. Re-splitting every
    # document for every word costs (vocabulary size x corpus size) splits.
    # set() ensures a word is counted once per document.
    doc_freq = Counter(
        token for doc in dataset for token in set(doc.split(' '))
    )
    total_docs = len(dataset)

    idf_val = {}
    for word in vocabulary:
        # doc_freq[word] is 0 when the word appears in no document.
        idf_val[word] = 1 + math.log((1 + total_docs) / (1 + doc_freq[word]))
    return idf_val

# Maximum number of vocabulary terms to keep, as required by the task.
MAX_FEATURES = 50

IDF = computeIDF(vocab_list, corpus)
# Keep the MAX_FEATURES words with the highest IDF. sorted() is stable even
# with reverse=True, so words with equal IDF keep the order they have in `IDF`.
sorted_idf = dict(
    sorted(IDF.items(), key=lambda item: item[1], reverse=True)[:MAX_FEATURES]
)
print(sorted_idf)

{'aailiyah': 6.922918004572872, 'abandoned': 6.922918004572872, 'abroad': 6.922918004572872, 'abstruse': 6.922918004572872, 'academy': 6.922918004572872, 'accents': 6.922918004572872, 'accessible': 6.922918004572872, 'acclaimed': 6.922918004572872, 'accolades': 6.922918004572872, 'accurate': 6.922918004572872, 'accurately': 6.922918004572872, 'achille': 6.922918004572872, 'ackerman': 6.922918004572872, 'actions': 6.922918004572872, 'adams': 6.922918004572872, 'add': 6.922918004572872, 'added': 6.922918004572872, 'admins': 6.922918004572872, 'admiration': 6.922918004572872, 'admitted': 6.922918004572872, 'adrift': 6.922918004572872, 'adventure': 6.922918004572872, 'aesthetically': 6.922918004572872, 'affected': 6.922918004572872, 'affleck': 6.922918004572872, 'afternoon': 6.922918004572872, 'aged': 6.922918004572872, 'ages': 6.922918004572872, 'agree': 6.922918004572872, 'agreed': 6.922918004572872, 'aimless': 6.922918004572872, 'aired': 6.922918004572872, 'akasha': 6.922918004572872, '

In [12]:
def transform(dataset, vocabulary, idf_values):
    """Build the L2-normalised sparse TF-IDF matrix for a restricted vocabulary.

    Parameters
    ----------
    dataset : list of str
        Documents; each is tokenised by splitting on a single space.
    vocabulary : sized iterable of str
        Terms that become the columns; a term's column is its position in the
        iteration order.
    idf_values : dict
        ``{term: idf}``; must contain every term of ``vocabulary``.

    Returns
    -------
    scipy.sparse matrix
        Shape ``(len(dataset), len(vocabulary))``, rows normalised with
        ``normalize(..., norm='l2')``.
    """
    # Column position and IDF for each vocabulary term.
    column_of = {term: position for position, term in enumerate(vocabulary)}
    idf_of = {term: idf_values[term] for term in vocabulary}

    row_ids, col_ids, weights = [], [], []
    for row, doc in enumerate(dataset):
        tokens = doc.split(' ')
        total = len(tokens)  # token count of this document
        for token, occurrences in Counter(tokens).items():
            # Only tokens of length >= 2 that belong to the vocabulary contribute.
            if len(token) >= 2 and token in column_of:
                row_ids.append(row)
                col_ids.append(column_of[token])
                weights.append((occurrences / total) * idf_of[token])

    tfidf = csr_matrix((weights, (row_ids, col_ids)),
                       shape=(len(dataset), len(vocabulary)))
    return normalize(tfidf, norm='l2')


# TF-IDF matrix whose columns are the keys of `sorted_idf`.
output = transform(corpus, sorted_idf, IDF)
print(output)

  (0, 30)	1.0
  (68, 24)	1.0
  (72, 29)	1.0
  (74, 31)	1.0
  (119, 33)	1.0
  (135, 3)	0.37796447300922725
  (135, 10)	0.37796447300922725
  (135, 18)	0.37796447300922725
  (135, 20)	0.37796447300922725
  (135, 36)	0.37796447300922725
  (135, 40)	0.37796447300922725
  (135, 41)	0.37796447300922725
  (176, 49)	1.0
  (181, 13)	1.0
  (192, 21)	1.0
  (193, 23)	1.0
  (216, 2)	1.0
  (222, 47)	1.0
  (225, 19)	1.0
  (227, 17)	1.0
  (241, 44)	1.0
  (270, 1)	1.0
  (290, 25)	1.0
  (333, 26)	1.0
  (334, 15)	1.0
  (341, 43)	1.0
  (344, 42)	1.0
  (348, 8)	1.0
  (377, 37)	1.0
  (409, 5)	1.0
  (430, 39)	1.0
  (457, 45)	1.0
  (461, 4)	1.0
  (465, 38)	1.0
  (475, 35)	1.0
  (493, 6)	1.0
  (500, 48)	1.0
  (548, 0)	0.7071067811865475
  (548, 32)	0.7071067811865475
  (608, 14)	1.0
  (612, 11)	1.0
  (620, 46)	1.0
  (632, 7)	1.0
  (644, 12)	0.7071067811865475
  (644, 27)	0.7071067811865475
  (664, 28)	1.0
  (667, 22)	1.0
  (691, 34)	1.0
  (697, 9)	1.0
  (722, 16)	1.0


In [13]:
# Shape of a single document's row, then document 135 as a dense array.
print(output[0].shape, output[135].toarray(), sep="\n")

(1, 50)
[[0.         0.         0.         0.37796447 0.         0.
  0.         0.         0.         0.         0.37796447 0.
  0.         0.         0.         0.         0.         0.
  0.37796447 0.         0.37796447 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.37796447 0.         0.         0.         0.37796447 0.37796447
  0.         0.         0.         0.         0.         0.
  0.         0.        ]]
