# TASK-1

In [72]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy

# SkLearn Implementation

In [96]:
# Toy corpus of four short documents; it is the input to both the scikit-learn
# vectorizer and the custom fit/idf/transform cells of TASK-1.
corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]

# Using SKLEARN IMPLEMENTATION

In [85]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Reference run: fit scikit-learn's TfidfVectorizer on the toy corpus and print its
# vocabulary, the first row of the TF-IDF matrix, the matrix shape and the IDF vector,
# so the custom implementation further down can be checked against these numbers.
vect=TfidfVectorizer() # vectorizer created with its default settings
final=vect.fit_transform(corpus) # learns vocabulary + IDF and returns the document-term matrix (its type is printed below)
print('         {}'.format('Vacabulory'))
print('')
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of get_feature_names_out() - confirm against the installed version.
print(vect.get_feature_names()) # the learned vocabulary (unique words)
print('')
print(type(final)) # type of the returned TF-IDF matrix (not of the corpus)
print('')
print('{}     {}'.format('row-coulmn','TFIDF value'))
print(final[0]) # non-zero entries of the first document as (row, column) value pairs
print('')
print('         {}'.format('SHAPE of the matrix'))
print('')
print(final.shape) # (number of documents, vocabulary size)


print(final[0].toarray()) # same first row, expanded to a dense array
print('the IDF values are:',vect.idf_) # one IDF value per vocabulary column


         Vacabulory

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

<class 'scipy.sparse.csr.csr_matrix'>

row-coulmn     TFIDF value
  (0, 8)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045

         SHAPE of the matrix

(4, 9)
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]
the IDF values are: [1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


# Your Custom Implementation

# FIT FUNCTION

In [86]:
import warnings
warnings.filterwarnings('ignore')

def fit(corps):
    """Build a vocabulary that maps every unique word to a column index.

    Parameters
    ----------
    corps : list of str
        Documents to scan. Each document is split on whitespace.

    Returns
    -------
    dict or None
        ``{word: index}`` where the words are sorted alphabetically and numbered
        from 0. Tokens shorter than two characters are ignored. When ``corps`` is
        not a list, a message is printed and ``None`` is returned.
    """
    # Work on the argument itself; the previous body read the notebook-level
    # `corpus` variable and therefore ignored whatever was passed in.
    if not isinstance(corps, list):
        print("please pass a list of sentenses")
        return None

    words = set()
    for row in corps:
        for word in row.split():
            # Single-character tokens are deliberately left out of the vocabulary.
            if len(word) < 2:
                continue
            words.add(word)

    # Alphabetical order fixes the column index of each word.
    return {word: index for index, word in enumerate(sorted(words))}
# Build the vocabulary for the toy corpus and print the word -> column-index mapping.
voc=fit(corpus)
print(voc)


{'and': 0, 'document': 1, 'first': 2, 'is': 3, 'one': 4, 'second': 5, 'the': 6, 'third': 7, 'this': 8}


In [76]:
# Show only the vocabulary words (the dictionary keys), without their indices.
print(voc.keys())

dict_keys(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this'])


# For each of the above keys, count how many documents in the corpus contain it (its document frequency) and derive the inverse document frequency (IDF) from that count

# COMPUTE IDF

In [77]:
idf=[]
total=len(corpus) #total number of documents in the corpus
def idfreq(corp,vocabulory):
    """Compute a smoothed inverse document frequency for every vocabulary word.

    For each word the value is ``log((N + 1) / (1 + df)) + 1`` where ``N`` is the
    number of documents in ``corp`` and ``df`` is the number of documents that
    contain the word as a whole space-separated token.

    Parameters
    ----------
    corp : list of str
        The documents.
    vocabulory : dict
        Vocabulary whose keys are the words to score.

    Returns
    -------
    list of float
        One IDF value per key of ``vocabulory``, in the dictionary's key order.
        A new list is returned on every call.
    """
    # Use the arguments only: the previous body iterated over an undefined
    # `unique` name, read the notebook-level `corpus`, and appended to a shared
    # module-level list, so re-running the cell duplicated the values.
    total_docs = len(corp)

    # Split every document once instead of once per vocabulary word.
    doc_tokens = [set(row.split(' ')) for row in corp]

    idf_values = []
    for word in vocabulory.keys():
        # Number of documents containing the word.
        count = sum(1 for tokens in doc_tokens if word in tokens)
        idf_values.append(math.log((total_docs+1)/(1+count))+1)
    return idf_values
# Compute the IDF values for the toy corpus and print them
# (end='' suppresses the trailing newline of print).
x=idfreq(corpus,voc)
print('the IDF values are',x,end='')


the IDF values are [1.916290731874155, 1.2231435513142097, 1.5108256237659907, 1.0, 1.916290731874155, 1.916290731874155, 1.0, 1.916290731874155, 1.0]

# TRANSFORM FUNCTION

In [88]:
def transform(dataset,vocab,idf):
    """Turn a list of documents into a sparse (un-normalised) TF-IDF matrix.

    Parameters
    ----------
    dataset : list of str
        Documents to vectorise.
    vocab : dict
        ``{word: column_index}`` mapping; words missing from it are ignored.
    idf : sequence of float
        IDF value for each column index of ``vocab``.

    Returns
    -------
    scipy.sparse.csr_matrix or None
        Matrix of shape ``(len(dataset), len(vocab))`` whose entry for a word is
        ``(count of word in document / number of tokens in document) * idf``.
        When ``dataset`` is not a list, a message is printed and ``None`` is returned.
    """
    # Everything below works on the arguments; the previous body read the
    # notebook-level names `corpus`, `unique` and `voc` instead.
    if not isinstance(dataset, list):
        print("you need to pass list of strings")
        return None

    rows = []
    columns = []
    values = []
    for idx, row in enumerate(tqdm(dataset)):  # one pass per document
        # Denominator of the term frequency.
        # NOTE(review): this counts tokens from split(" ") while the word counts
        # below come from split(); the two differ for repeated or non-space
        # whitespace - confirm which is intended.
        TF_cnt = len(row.split(" "))
        word_freq = Counter(row.split())  # {word: occurrences in this document}
        for word, freq in word_freq.items():
            # Single-character tokens are not part of the vocabulary.
            if len(word) < 2:
                continue
            # -1 marks a word that is not in the vocabulary.
            col_index = vocab.get(word, -1)
            if col_index != -1:
                rows.append(idx)
                columns.append(col_index)
                values.append((freq/TF_cnt)*idf[col_index])
    return csr_matrix((values, (rows,columns)), shape=(len(dataset),len(vocab)))

# Build the raw TF-IDF matrix for the toy corpus from the custom vocabulary and IDF list.
tnf = transform(corpus,voc,x)

print(tnf.shape)

# L2-normalise every row (axis=1); the first row is then printed in sparse and
# dense form for comparison with the TfidfVectorizer output earlier in the notebook.
res = normalize(tnf, norm='l2', axis=1)
print(res[0])
print('')
print(res[0].toarray())

100%|████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<?, ?it/s]


(4, 9)
  (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


# TASK-2

In [95]:
import warnings
warnings.filterwarnings("ignore")
import math
from tqdm import tqdm
from collections import Counter
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix
import pickle

# Load the pickled corpus for TASK-2 from the working directory.
# NOTE(review): pickle.load can execute arbitrary code while deserialising; only
# open 'cleaned_strings' if it comes from a trusted source.
with open('cleaned_strings', 'rb') as f:
    corpus1 = pickle.load(f)
    
# printing the length of the corpus loaded
print("Number of documents in corpus = ",len(corpus1))
#Fit function
def fit(dataset):
    """Create a word -> column-index vocabulary from a list of documents.

    Each document is split on single spaces and tokens shorter than two
    characters are dropped. The surviving unique tokens are sorted
    alphabetically and numbered from 0.

    Returns the ``{word: index}`` dictionary, or ``None`` (after printing a
    message) when ``dataset`` is not a list.
    """
    # Guard clause: reject anything that is not a list.
    if not isinstance(dataset, list):
        print("you need to pass list of sentence")
        return None

    # Collect every token of length >= 2 across all documents.
    tokens = {
        token
        for document in dataset
        for token in document.split(" ")
        if len(token) >= 2
    }

    # Position in the sorted order becomes the column index.
    return {token: position for position, token in enumerate(sorted(tokens))}
#Computing IDF
def computeIDF(dataset, fv):
    """Compute a smoothed inverse document frequency for every vocabulary word.

    The value for a word is ``log((N + 1) / (1 + df)) + 1`` where ``N`` is
    ``len(dataset)`` and ``df`` is the number of documents that contain the word
    as a whole space-separated token.

    Parameters
    ----------
    dataset : list of str
        The documents.
    fv : dict
        Vocabulary whose keys are the words to score.

    Returns
    -------
    list of float
        One IDF value per key of ``fv``, in the dictionary's key order.
    """
    Total = len(dataset)

    # Tokenise each document once up front; splitting every document again for
    # every vocabulary word repeated the same work len(fv) times.
    doc_tokens = [set(row.split(" ")) for row in dataset]

    idf = []
    for word in fv.keys():
        # Number of documents in which the word occurs.
        count = sum(1 for tokens in doc_tokens if word in tokens)
        idf.append(math.log((Total+1)/(1+count))+1)
    return idf
#Transform function
def transform(dataset,vocab,idf):
    """Vectorise documents into a sparse, un-normalised TF-IDF matrix.

    For every document, each word that is at least two characters long and is
    present in ``vocab`` contributes
    ``(occurrences / number of space-separated tokens) * idf[column]`` at
    ``(document index, vocab[word])``.

    Returns a ``csr_matrix`` of shape ``(len(dataset), len(vocab))``, or ``None``
    (after printing a message) when ``dataset`` is not a list.
    """
    # Guard clause for unsupported input.
    if not isinstance(dataset, list):
        print("you need to pass list of strings")
        return None

    row_ids, col_ids, weights = [], [], []
    for doc_id, document in enumerate(tqdm(dataset)):
        n_tokens = len(document.split(" "))   # term-frequency denominator
        counts = Counter(document.split())    # occurrences of each word
        for token, occurrences in counts.items():
            if len(token) >= 2:
                # -1 is the sentinel for "word not in the vocabulary".
                column = vocab.get(token, -1)
                if column != -1:
                    row_ids.append(doc_id)
                    col_ids.append(column)
                    weights.append((occurrences/n_tokens)*idf[column])

    matrix_shape = (len(dataset), len(vocab))
    return csr_matrix((weights, (row_ids, col_ids)), shape=matrix_shape)

# TASK-2 driver: restrict the vocabulary to the 50 words with the highest IDF and
# vectorise the loaded corpus over those 50 columns only.
# Learn the full vocabulary and one IDF value per vocabulary word.
voc1=fit(corpus1)
idf1=computeIDF(corpus1,voc1)
# Replace each word's column index with its IDF value so the dict can be ranked by IDF.
# idf1[voc1[i]] still reads the original index of the word that is being overwritten.
for i in voc1.keys():
    voc1[i]=idf1[voc1[i]]
#https://stackoverflow.com/questions/20577840/python-dictionary-sorting-in-descending-order-based-on-values
# voc1 is now a list of (word, idf) pairs ordered by descending IDF.
voc1 = sorted(voc1.items(), key=lambda kv: kv[1],reverse=True)
# Keep at most the 50 highest-IDF pairs.
voc1=voc1[:50]
#https://stackoverflow.com/questions/10777271/python-using-enumerate-inside-list-comprehension
# Rebuild a {word: column_index} dict; the index is the word's rank in the list above.
voc1={j[0]:i for i,j in enumerate(voc1)}
# Sort the IDF values in the same descending order and keep the same number of
# entries, so idf1[k] is the IDF value of the word whose column index is k.
idf1=sorted(idf1,reverse=True)
idf1=idf1[:50]
print("\n unique words:\n",voc1)
print("--------------------------------------------------------------")
print("\n IDF values:\n",idf1)
print("--------------------------------------------------------------")
# Vectorise the corpus over the reduced vocabulary; words outside it are skipped by transform().
tnf1=transform(corpus1,voc1,idf1)
print(tnf1.shape)
print("\n")
# L2-normalise each row. Note that `res` rebinds the name already used for the TASK-1 result.
res=normalize(tnf1,norm='l2',axis=1)
print("\n result \n",res[0])
print("--------------------------------------------------------------")
print("\n result to  array:\n",res[0].toarray())
print("--------------------------------------------------------------")

Number of documents in corpus =  746

 unique words:
 {'aailiyah': 0, 'abandoned': 1, 'abroad': 2, 'abstruse': 3, 'academy': 4, 'accents': 5, 'accessible': 6, 'acclaimed': 7, 'accolades': 8, 'accurate': 9, 'accurately': 10, 'achille': 11, 'ackerman': 12, 'actions': 13, 'adams': 14, 'add': 15, 'added': 16, 'admins': 17, 'admiration': 18, 'admitted': 19, 'adrift': 20, 'adventure': 21, 'aesthetically': 22, 'affected': 23, 'affleck': 24, 'afternoon': 25, 'aged': 26, 'ages': 27, 'agree': 28, 'agreed': 29, 'aimless': 30, 'aired': 31, 'akasha': 32, 'akin': 33, 'alert': 34, 'alike': 35, 'allison': 36, 'allow': 37, 'allowing': 38, 'alongside': 39, 'amateurish': 40, 'amaze': 41, 'amazed': 42, 'amazingly': 43, 'amusing': 44, 'amust': 45, 'anatomist': 46, 'angel': 47, 'angela': 48, 'angelina': 49}
--------------------------------------------------------------

 IDF values:
 [6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.92291800

100%|█████████████████████████████████████████████████████████████████████████████| 746/746 [00:00<00:00, 82927.85it/s]


(746, 50)



 result 
   (0, 30)	1.0
--------------------------------------------------------------

 result to  array:
 [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]]
--------------------------------------------------------------
