<a href="https://colab.research.google.com/github/abpopal/Resume_Recommendation_Classification/blob/main/Implementing_TFIDF_vectorizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TASK: 1 - Build a TF-IDF Vectorizer

In [None]:
# Toy corpus of four short sentences used to develop the hand-written
# TF-IDF vectorizer and to check it against scikit-learn's output.
corpus = [
          'this is the first document',
          'this document is the second document',
          'and this is the third one',
          'is this the first document',
         ]

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import os
import operator
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
from sklearn.preprocessing import normalize

In [None]:
def fit(dataset):
  """Learn the vocabulary of a corpus.

  Args:
    dataset: iterable of documents, each a whitespace-separated string.

  Returns:
    dict mapping every distinct word of two or more characters to its
    column index, with indices assigned in alphabetical order of the words.
  """
  unique_words = set()
  for sentence in dataset:
    # split() with no argument breaks on any run of whitespace, which is the
    # same tokenisation transform() applies; split(" ") would leave tabs and
    # newlines glued to words and produce vocabulary entries that never match.
    for word in sentence.split():
      if len(word) >= 2:
        unique_words.add(word)

  return {word: index for index, word in enumerate(sorted(unique_words))}


In [None]:
# Learn the word -> column-index mapping from the toy corpus.
vocab = fit(corpus)

In [None]:
# Show the learned vocabulary (word -> column index).
vocab

{'and': 0,
 'document': 1,
 'first': 2,
 'is': 3,
 'one': 4,
 'second': 5,
 'the': 6,
 'third': 7,
 'this': 8}

In [None]:
from math import log
from collections import Counter
from scipy.sparse import csr_matrix

def counter(dataset, word):
  """Return the document frequency of `word`.

  Args:
    dataset: iterable of documents, each a whitespace-separated string.
    word: the token to look for.

  Returns:
    Number of documents that contain `word` as a whole token.
  """
  # Test membership in the token list, not in the raw string: `word in sentence`
  # on a str is a substring search, so "is" would also be counted for any
  # document containing "this", inflating the document frequency.
  return sum(1 for sentence in dataset if word in sentence.split())

def transform(dataset,vocab):
  """Convert a corpus into an L2-normalised TF-IDF matrix.

  For every in-vocabulary word of a document the weight is
  tf * idf, where tf = count / (number of tokens in the document) and
  idf = 1 + ln((1 + N) / (1 + df)), with N the number of documents and df the
  value returned by counter(). Each row is then scaled to unit L2 norm.

  Args:
    dataset: sequence of documents (whitespace-separated strings); must
      support len().
    vocab: dict mapping word -> column index, as produced by fit().

  Returns:
    Sparse matrix of shape (len(dataset), len(vocab)) as returned by
    sklearn's normalize().
  """
  rows = []
  columns = []
  values = []

  n_docs = len(dataset)
  # IDF depends only on the word, not on the document being processed, so it
  # is computed once per distinct word instead of rescanning the whole corpus
  # for every (document, word) pair.
  idf_cache = {}

  for idx, row in enumerate(tqdm(dataset)):
    tokens = row.split()
    n_tokens = len(tokens)
    for word, freq in Counter(tokens).items():
      if len(word) < 2:
        continue
      col_index = vocab.get(word)
      if col_index is None:
        # Word was not seen during fit(); it has no column.
        continue

      if word not in idf_cache:
        idf_cache[word] = 1 + log((1 + n_docs) / (1 + counter(dataset, word)))

      rows.append(idx)
      columns.append(col_index)
      values.append((freq / n_tokens) * idf_cache[word])

  tfidf = csr_matrix((values, (rows, columns)), shape=(n_docs, len(vocab)))
  return normalize(tfidf, norm='l2')



In [None]:
# Vectorise the toy corpus: one row per document, one column per vocabulary word.
tf_idf_vector = transform(corpus,vocab)

100%|██████████| 4/4 [00:00<00:00, 10936.91it/s]


In [None]:
# Non-zero TF-IDF weights of the first document, printed as (row, column) value.
print(tf_idf_vector[0])

  (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149


# TASK: 2 - Compare with scikit-learn's TfidfVectorizer()

In [None]:
# Reference implementation: fit scikit-learn's vectorizer on the same corpus
# so its weights can be compared with the hand-written transform().
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf_vectorizer = TfidfVectorizer()
method_tfidf = tf_idf_vectorizer.fit_transform(corpus)

# A bare expression in the middle of a cell is evaluated and discarded, so the
# shape has to be printed explicitly to appear in the output.
print(method_tfidf.shape)

# Non-zero weights of the first document, for comparison with the custom result.
print(method_tfidf[0])

  (0, 1)	0.46979138557992045
  (0, 2)	0.5802858236844359
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 8)	0.38408524091481483


# TASK: 3 - Consider only 100 words with top IDF scores

In [None]:
# Colab-only step: files.upload() prompts for a local file and saves it into
# the runtime's working directory (here the pickled 'cleaned_strings' corpus).
from google.colab import files
uploaded = files.upload()

Saving cleaned_strings to cleaned_strings


In [None]:
import pickle
# Replace the toy corpus with the uploaded one; note that this rebinds `corpus`.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open 'cleaned_strings' if it comes from a trusted source.
with open('cleaned_strings', 'rb') as f:
    corpus = pickle.load(f)

In [None]:
# Sanity check: report how many documents were loaded from the pickle.
print("Number of documents in corpus = ",len(corpus))

Number of documents in corpus =  746


In [None]:
# Preview only the first few documents; echoing the whole list (746 entries)
# floods the cell output and bloats the saved notebook.
corpus[:10]

['slow moving aimless movie distressed drifting young man',
 'not sure lost flat characters audience nearly half walked',
 'attempting artiness black white clever camera angles movie disappointed became even ridiculous acting poor plot lines almost non existent',
 'little music anything speak',
 'best scene movie gerardo trying find song keeps running head',
 'rest movie lacks art charm meaning emptiness works guess empty',
 'wasted two hours',
 'saw movie today thought good effort good messages kids',
 'bit predictable',
 'loved casting jimmy buffet science teacher',
 'baby owls adorable',
 'movie showed lot florida best made look appealing',
 'songs best muppets hilarious',
 'cool',
 'right case movie delivers everything almost right face',
 'average acting main person low budget clearly see',
 'review long overdue since consider tale two sisters single greatest film ever made',
 'put gem movie terms screenplay cinematography acting post production editing directing aspect film makin

In [None]:
def fit(dataset, top_n=100):
  """Learn a vocabulary restricted to the words with the highest IDF scores.

  IDF is computed as 1 + ln((1 + N) / (1 + df)), where N is the number of
  documents and df the number of documents containing the word as a token.
  The `top_n` words with the largest IDF are kept; ties are broken
  alphabetically so the selection is deterministic.

  Args:
    dataset: iterable of documents, each a whitespace-separated string.
    top_n: maximum vocabulary size (default 100). None keeps every word.

  Returns:
    dict mapping each selected word (two or more characters) to its column
    index, with indices assigned in alphabetical order of the selected words.
  """
  from math import log  # local import so this cell does not rely on an earlier one

  # Document frequency: each document contributes at most once per word,
  # hence the set() around the token list.
  n_docs = 0
  doc_freq = {}
  for sentence in dataset:
    n_docs += 1
    for word in set(sentence.split()):
      if len(word) < 2:
        continue
      doc_freq[word] = doc_freq.get(word, 0) + 1

  idf = {word: 1 + log((1 + n_docs) / (1 + df)) for word, df in doc_freq.items()}

  # Rank by IDF, highest first. Slicing the alphabetically sorted word list
  # instead would keep the first `top_n` words of the alphabet regardless of
  # their score.
  ranked = sorted(idf, key=lambda word: (-idf[word], word))
  selected = sorted(ranked[:top_n])

  return {word: index for index, word in enumerate(selected)}


In [None]:
# Rebuild the vocabulary from the loaded corpus; this fit() keeps at most 100 words.
vocab = fit(corpus)

In [None]:
# Show the selected vocabulary (word -> column index).
vocab

{'aailiyah': 0,
 'abandoned': 1,
 'ability': 2,
 'abroad': 3,
 'absolutely': 4,
 'abstruse': 5,
 'abysmal': 6,
 'academy': 7,
 'accents': 8,
 'accessible': 9,
 'acclaimed': 10,
 'accolades': 11,
 'accurate': 12,
 'accurately': 13,
 'accused': 14,
 'achievement': 15,
 'achille': 16,
 'ackerman': 17,
 'act': 18,
 'acted': 19,
 'acting': 20,
 'action': 21,
 'actions': 22,
 'actor': 23,
 'actors': 24,
 'actress': 25,
 'actresses': 26,
 'actually': 27,
 'adams': 28,
 'adaptation': 29,
 'add': 30,
 'added': 31,
 'addition': 32,
 'admins': 33,
 'admiration': 34,
 'admitted': 35,
 'adorable': 36,
 'adrift': 37,
 'adventure': 38,
 'advise': 39,
 'aerial': 40,
 'aesthetically': 41,
 'affected': 42,
 'affleck': 43,
 'afraid': 44,
 'africa': 45,
 'afternoon': 46,
 'age': 47,
 'aged': 48,
 'ages': 49,
 'ago': 50,
 'agree': 51,
 'agreed': 52,
 'aimless': 53,
 'air': 54,
 'aired': 55,
 'akasha': 56,
 'akin': 57,
 'alert': 58,
 'alexander': 59,
 'alike': 60,
 'allison': 61,
 'allow': 62,
 'allowing': 

In [None]:
# Confirm the vocabulary size after the 100-word cap.
len(vocab)

100

In [None]:
def counter(dataset, word):
  """Count the documents in `dataset` that contain `word` as a whole token.

  Args:
    dataset: iterable of documents, each a whitespace-separated string.
    word: the token whose document frequency is wanted.

  Returns:
    The document frequency of `word` as an int.
  """
  count = 0
  for sentence in dataset:
    # Tokenise before testing membership. On a plain string `in` performs a
    # substring search, which would count "act" for documents that only
    # contain "acting" or "actor".
    if word in sentence.split():
      count += 1
  return count

def transform(dataset,vocab):
  """Convert a corpus into an L2-normalised TF-IDF matrix over `vocab`.

  The weight of an in-vocabulary word in a document is tf * idf, where
  tf = count / (number of tokens in the document) and
  idf = 1 + ln((1 + N) / (1 + df)); N is the number of documents and df is
  the value returned by counter(). Words missing from `vocab` are ignored.
  Each row is finally scaled to unit L2 norm.

  Args:
    dataset: sequence of documents (whitespace-separated strings); must
      support len().
    vocab: dict mapping word -> column index, as produced by fit().

  Returns:
    Sparse matrix of shape (len(dataset), len(vocab)) as returned by
    sklearn's normalize().
  """
  rows = []
  columns = []
  values = []

  n_docs = len(dataset)
  # Memoise IDF per word: it is independent of the current document, and each
  # counter() call scans the whole corpus, so computing it once per distinct
  # word avoids repeating that scan for every occurrence.
  idf_cache = {}

  for idx, row in enumerate(tqdm(dataset)):
    tokens = row.split()
    n_tokens = len(tokens)
    for word, freq in Counter(tokens).items():
      if len(word) < 2:
        continue
      col_index = vocab.get(word)
      if col_index is None:
        # Out-of-vocabulary word: no column to write to.
        continue

      if word not in idf_cache:
        idf_cache[word] = 1 + log((1 + n_docs) / (1 + counter(dataset, word)))

      rows.append(idx)
      columns.append(col_index)
      values.append((freq / n_tokens) * idf_cache[word])

  tfidf = csr_matrix((values, (rows, columns)), shape=(n_docs, len(vocab)))
  return normalize(tfidf, norm='l2')

In [None]:
# Vectorise the loaded corpus against the restricted vocabulary: one row per
# document, len(vocab) columns.
tf_idf_vector = transform(corpus,vocab)

100%|██████████| 746/746 [00:00<00:00, 5613.22it/s]


In [None]:
tf_idf_vec