## Task-1

The task is implementing TFIDF vectorizer on a collection of text documents. I will compare the results of my own implementation of TFIDF vectorizer with that of sklearns implemenation TFIDF vectorizer.

### Corpus

In [2]:
# Collection of string documents

corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]

### SkLearn Implementation

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
skl_output = vectorizer.transform(corpus)

In [4]:
# sklearn feature names, they are sorted in alphabetic order by default.

print(vectorizer.get_feature_names())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [5]:
print(vectorizer.idf_)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [0]:
# shape of sklearn tfidf vectorizer output after applying transform method.

skl_output.shape

(4, 9)

In [0]:
# Here the output is a sparse matrix

print(skl_output[0])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


In [0]:
print(skl_output[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


### My custom implementation

In [3]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy


**COMPUTE IDF**

In [4]:
def compidf(uniq,d, data):
    c=0
    lstc=[]
    for u in uniq:    
        for i in d:
            for j in i.keys():
                if u== j:
                    c+=1 #count of a word in the whole corpus
        lstc.append(c) # list of count of a word in the whole corpus
        c=0
    wordcount=dict(zip(uniq,lstc)) # dict of word and their count
    idf_val=[]
    for i, j in wordcount.items():
        idf= 1+ math.log((1+len(data))/(1+j)) #formula for IDF
        idf_val.append(idf)
    idfdict= dict(zip(uniq,idf_val))
    return idf_val, idfdict

In [5]:
def fit(data):
    d=[]
    unique_words=set()
    if isinstance(data,list):
        for i in data:
            d.append(dict(Counter(i.split()))) # d has dicts with words in the data as keys and freq as their values
            for j in i.split(" "):
                if len(j)<2:
                    continue
                unique_words.add(j)
        unique_words= sorted(list(unique_words)) # it is a set of all the unique words
        vocab= {j:i for i,j in enumerate(unique_words)} # it is a dict with keys as words and values as indexes
        idf_val, idfdict= compidf(unique_words,d, data)
        return unique_words, idf_val, idfdict, vocab,d
    else:
        print("wrong datatype")
    

In [12]:
#renaming corpus to data
data = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]

In [7]:
uniq, idf_val, idfdict, vocab,d= fit(data)

**Function for Computing TF (this function is called in transform function)**

In [8]:
def comptf(d):
    l=[]
    listi=[]
    alpha=[]
    alphai=[]
    for k in range(len(d)):    
        for i,j in d[k].items():
            tf= j/sum(d[k].values()) #formula for term frequency
            l.append(tf) #appending tf values
            listi.append(i) # appending words
        alpha.append(l) #appending list of tf values
        alphai.append(listi) #appending list of words
        l=[]
        listi=[]
    return alphai, alpha

In [13]:
def transform(dataset,vocab,d):
    rows = []
    columns = []
    values = []
    if isinstance(dataset, (list,)):
        a,b=comptf(d)
        tflist=d.copy()
        for idx, val in enumerate(b):
            zipper= list(zip(a[idx], val))
            for i in zipper:
                tflist[idx][i[0]]=i[1]
        for idx,row in enumerate(tflist):# for each dict in the tflist
            for word, tfval in row.items():
            # it will return a dict type object where key is the word and values is its TF Value, {word:TF Value}                
                if len(word) < 2:
                    continue
                # we will check if its there in the vocabulary that we build in fit() function
                # dict.get() function will return the values, if the key doesn't exits it will return -1
                col_index = vocab.get(word, -1) # retreving the dimension number of a word
                # if the word exists
                if col_index !=-1:
                    # we are storing the index of the document
                    rows.append(idx)
                    # we are storing the dimensions of the word
                    columns.append(col_index)
                    # we are storing the Mult of TF and IDF of the word
                    tfidf= tfval*idfdict[word]
                    values.append(tfidf)
        return normalize(csr_matrix((values, (rows,columns)), shape=(len(dataset),len(vocab))))
    else:
        print("you need to pass list of strings")

In [10]:
yup= transform(data,vocab,d)

In [11]:
print(yup[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


## Task-2

Implementing max features functionality: I will modify my fit and transform functions so that my vocab will contain only 50 terms with top idf scores. Here i will make use of a pickle file to load the corpus from this file and use it as input to your tfidf vectorizer.

In [6]:
import pickle
with open('cleaned_strings', 'rb') as f:
    corpus = pickle.load(f)
print("Number of documents in corpus = ",len(corpus))

Number of documents in corpus =  746


**COMPUTE IDF**

In [14]:
def compidf(uniq,d,data):
    c=0
    lstc=[]
    for u in uniq:    
        for i in d:
            for j in i.keys():
                if u== j:
                    c+=1 #count of a word in the whole corpus
        lstc.append(c) # list of count of a word in the whole corpus
        c=0
    wordcount=dict(zip(uniq,lstc)) # dict of word and their count
    idf_val=[]
    for i, j in wordcount.items():
        idf= 1+ math.log((1+len(data))/(1+j)) #formula for IDF
        idf_val.append(idf)
    idfdict= dict(zip(uniq,idf_val))
    idfordered=sorted(idfdict.items(), key=lambda x: x[1], reverse=True)
    idfordered= idfordered[:50]
    idfordered= dict(idfordered)
    
    return idf_val, idfordered

In [15]:
def fit(data):
    d=[]
    unique_words=set()
    if isinstance(data,list):
        for i in data:
            d.append(dict(Counter(i.split()))) # d has dicts with words in the data as keys and freq as their values
            for j in i.split(" "):
                if len(j)<2:
                    continue
                unique_words.add(j)
        unique_words= sorted(list(unique_words)) # it is a set of all the unique words
        vocab= {j:i for i,j in enumerate(unique_words)} # it is a dict with keys as words and values as indexes
        idf_val, idfordered= compidf(unique_words,d, data)
        return unique_words, idf_val, idfordered,vocab,d
    else:
        print("wrong datatype")

In [18]:
unique_words, idf_val, idfordered,vocab,d= fit(corpus)

**Function for Computing TF (this function is called in transform function)**

In [19]:
def comptf(d):
    l=[]
    listi=[]
    alpha=[]
    alphai=[]
    for k in range(len(d)):    
        for i,j in d[k].items():
            tf= j/sum(d[k].values()) #formula for term frequency
            l.append(tf) #appending tf values
            listi.append(i) # appending words
        alpha.append(l) #appending list of tf values
        alphai.append(listi) #appending list of words
        l=[]
        listi=[]
    return alphai, alpha

In [29]:
def transform(dataset,idfordered,d):
    rows = []
    columns = []
    values = []
    if isinstance(dataset, (list,)):
        a,b=comptf(d)
        tflist=d.copy()
        for idx, val in enumerate(b):
            zipper= list(zip(a[idx], val))
            for i in zipper:
                tflist[idx][i[0]]=i[1]
        for idx,row in enumerate(tflist):# for each dict in the tflist
            for word, tfval in row.items():
            # it will return a dict type object where key is the word and values is its TF Value, {word:TF Value}                
                if len(word) < 2:
                    continue
                # we will check if its there in idfordered keys that we build in fit() function
                # dict.get() function will return the values, if the key doesn't exits it will return -1
                col_index = idfordered.get(word, -1) # retreving the dimension number of a word
                # if the word exists
                if col_index !=-1:
                    # we are storing the index of the document
                    rows.append(idx)
                    # we are storing the dimensions of the word
                    columns.append(col_index)
                    # we are storing the Mult of TF and IDF of the word
                    tfidf= tfval*idfordered[word]
                    values.append(tfidf)
        return normalize(csr_matrix((values, (rows,columns)), shape=(len(dataset),len(idfordered))))
    else:
        print("you need to pass list of strings")

In [24]:
spar= transform(corpus,idfordered,d)

In [28]:
print(spar[0].toarray())

[[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]]
