# ASSIGNMENT 3

## Task 1: Implementing a TF-IDF vectorizer

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
from tqdm import tqdm
import os
from collections import Counter
from scipy.sparse import csr_matrix
import math

In [2]:
import pickle
# Load the corpus from the local 'cleaned_strings' pickle file into `data`.
# NOTE(review): presumably a list of cleaned text strings (fit/transform below
# only accept a list) -- confirm against whatever produced the file.
# SECURITY: pickle.load can execute arbitrary code while unpickling; only load
# this file if it comes from a trusted source.
with open('cleaned_strings', 'rb') as f:
    data = pickle.load(f)

In [3]:
def fit(dataset):
    """Build a word -> column-index vocabulary from a list of documents.

    Every document is split on single spaces, tokens shorter than two
    characters are dropped, and the remaining distinct tokens are sorted
    and numbered from 0.

    Returns the vocabulary dict.  If ``dataset`` is not a list, a message
    is printed and None is returned instead.
    """
    # guard clause: reject anything that is not a list up front
    if not isinstance(dataset, list):
        print("you need to pass list of sentance")
        return None

    # collect every distinct token of length >= 2 across all documents
    tokens = {
        token
        for document in dataset
        for token in document.split(" ")
        if len(token) >= 2
    }
    # sorted order fixes a deterministic column index for each token
    return {token: position for position, token in enumerate(sorted(tokens))}
        

In [4]:
def term_freq(word, doc):
    """Return the relative frequency of ``word`` in ``doc``.

    ``doc`` is split on whitespace; the result is the number of tokens
    equal to ``word`` divided by the total number of tokens.

    Returns 0.0 when ``word`` does not occur in ``doc`` or when ``doc``
    has no tokens.  (A negative sentinel here would turn into a negative
    TF-IDF weight once multiplied by an IDF value.)
    """
    # split and count once instead of re-splitting for the membership test
    counts = Counter(doc.split())
    total_terms = sum(counts.values())
    if total_terms == 0:
        # empty document: avoid dividing by zero
        return 0.0
    # Counter yields 0 for a missing key, so absent words come out as 0.0
    return counts[word] / total_terms

In [5]:
def inv_doc_freq(w, dataset):
    """Return the inverse document frequency of ``w`` over ``dataset``.

    A document contains ``w`` when ``w`` is one of its whitespace-separated
    tokens.  The value is ``log(N / (df + 1))`` where ``N`` is the number of
    documents and ``df`` the number of documents containing ``w``; a word
    found in every document gets 0.
    """
    # document frequency: how many documents have w as a token
    docs_with_term = sum(1 for document in dataset if w in document.split())
    total_docs = len(dataset)
    if docs_with_term == total_docs:
        return 0
    # the +1 keeps the denominator non-zero for words that never occur
    return math.log(total_docs / (docs_with_term + 1))
 

In [6]:
def transform(dataset,vocab):
    """Turn a list of documents into a sparse TF-IDF matrix.

    Parameters:
        dataset: list of strings, one document per entry.
        vocab: dict mapping word -> column index (as built by fit()).

    Returns:
        A scipy csr_matrix of shape (len(dataset), len(vocab)) whose entry
        (i, j) is term_freq(word_j, dataset[i]) * inv_doc_freq(word_j, dataset)
        for every vocabulary word occurring in document i.  If ``dataset``
        is not a list, a message is printed and None is returned.
    """
    if not isinstance(dataset, list):
        print("you need to pass list of strings")
        return None

    rows = []
    columns = []
    values = []
    # inv_doc_freq scans the whole corpus, and its value depends only on the
    # word, so compute it once per distinct word rather than per occurrence
    idf_cache = {}
    for idx, row in enumerate(tqdm(dataset)):  # for each document in the dataset
        # Counter gives each distinct word of the document exactly once
        for word in Counter(row.split()):
            if len(word) < 2:
                continue
            # dict.get returns None when the word is not in the vocabulary
            col_index = vocab.get(word)
            if col_index is None:
                continue
            if word not in idf_cache:
                idf_cache[word] = inv_doc_freq(word, dataset)
            rows.append(idx)            # document index
            columns.append(col_index)   # vocabulary dimension of the word
            values.append(term_freq(word, row) * idf_cache[word])  # tf-idf weight
    return csr_matrix((values, (rows, columns)), shape=(len(dataset), len(vocab)))

In [7]:
# Build the word -> column-index vocabulary from the loaded corpus.
vocab=fit(data)

In [8]:
# Compute the sparse TF-IDF matrix: one row per document, one column per vocabulary word.
X = transform(data,vocab)

100%|██████████| 746/746 [00:05<00:00, 147.86it/s]


In [9]:
type(X[0].toarray())

numpy.ndarray

In [10]:
X[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [11]:
type(X[0])

scipy.sparse.csr.csr_matrix

In [12]:
X[0]

<1x2886 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

## Task 2: Top 100 words with the highest IDF values

In [13]:
def fit(dataset, top_k=100):
    """Build a vocabulary from the ``top_k`` words with the highest IDF.

    This definition replaces the earlier ``fit`` in this file.

    Parameters:
        dataset: iterable of document strings; it is also passed to
            inv_doc_freq, which needs len() on it.
        top_k: how many of the highest-IDF words to keep (default 100).

    Returns:
        dict mapping each selected word to its column index, with the
        selected words numbered in sorted order.  Ties in IDF are resolved
        in favour of the word that appears first in the corpus.
    """
    # NOTE(review): unlike the first fit, tokens shorter than two characters
    # are not filtered here, while transform() skips them -- such a token, if
    # selected, would get a column that is never filled.  Confirm intent.
    idf_by_word = {}
    for row in dataset:
        for word in row.split():
            # IDF depends only on the word, so compute it once per distinct
            # word instead of once per occurrence (each call scans the corpus)
            if word not in idf_by_word:
                idf_by_word[word] = inv_doc_freq(word, dataset)

    # sorted() is stable, so equal-IDF words keep first-appearance order
    ranked = sorted(idf_by_word.items(), key=lambda item: item[1], reverse=True)
    top_words = sorted(word for word, _ in ranked[:top_k])
    return {word: index for index, word in enumerate(top_words)}

In [14]:
# Rebuild the vocabulary with the redefined fit (100 highest-IDF words).
vocab=fit(data)

In [15]:
# Recompute the sparse TF-IDF matrix restricted to the reduced vocabulary.
X= transform(data, vocab)

100%|██████████| 746/746 [00:00<00:00, 9117.47it/s]


In [16]:
type(X[0].toarray())

numpy.ndarray

In [17]:
X[0].toarray()

array([[0.7401973, 0.       , 0.       , 0.       , 0.       , 0.       ,
        0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
        0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
        0.       , 0.       , 0.       , 0.7401973, 0.       , 0.       ,
        0.7401973, 0.       , 0.       , 0.       , 0.       , 0.       ,
        0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
        0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
        0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
        0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
        0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
        0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
        0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
        0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
        0.       , 0.       , 0.      

In [18]:
type(X[0])

scipy.sparse.csr.csr_matrix

In [19]:
X[0]

<1x100 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>