### Input

In [120]:
import numpy as np
from collections import Counter

In [121]:
inp_sentences = '''I am an Angel. My name is Angelina. I look good on Sunday. I look ordinary on weekdays. \
I am into Programming. I am a good person'''

In [122]:
print(inp_sentences)

I am an Angel. My name is Angelina. I look good on Sunday. I look ordinary on weekdays. I am into Programming. I am a good person


In [49]:
# split into individual docs on the basis of '.'
docs = inp_sentences.split('.')
docs = [doc.strip().lower() for doc in docs]

In [50]:
# These are the sentences(documents) for which we need to calculate tfidf, i.e., generate numeric features
docs

['i am an angel',
 'my name is angelina',
 'i look good on sunday',
 'i look ordinary on weekdays',
 'i am into programming',
 'i am a good person']

### Preparing vocabulary(features)

In [57]:
# unqiue words
vocab = list()
for i in inp_sentences.split():
    i = i.replace('.', '').lower()
    if i not in vocab and i not in [' ', '.']:
        vocab.append(i)
vocab

['i',
 'am',
 'an',
 'angel',
 'my',
 'name',
 'is',
 'angelina',
 'look',
 'good',
 'on',
 'sunday',
 'ordinary',
 'weekdays',
 'into',
 'programming',
 'a',
 'person']

In [119]:
# for each sentence we will generate the word vectors, following the final output shape will be where each cell will consist of tfidf score:
print(str(len(docs)) + "*" + str(len(vocab)))

6*18


### Calculating Term Frequency

In [134]:
class TfIdfGenerator:
    def __init__(self, docs):
        self.docs = docs
        self.vocab = list()
        self.tfidf = None
        
    def prepare_vocab(self):
        for doc in self.docs:
            for i in doc.split():
                i = i.replace('.', '').lower()
                if i not in self.vocab and i not in [' ', '.']:
                    self.vocab.append(i)
    
    def get_tfidf(self):
        # generate vocab of unique words
        self.prepare_vocab()
        
        # Term-frequency = (# of times in current doc)/(# of terms in the current doc)
        tf = np.zeros((len(self.docs), len(self.vocab)))
        for i in range(len(self.docs)):
            doc_len = len(self.docs[i])
            count_dict = Counter(self.docs[i].split())
            for j in range(len(self.vocab)):
                tf[i][j] = count_dict[self.vocab[j]]/doc_len
        
        # inverse document frequency = log(# of docs/ # of docs in which term appears)
        total_docs = len(self.docs)
        idf = np.zeros(len(self.vocab))

        for j in range(len(self.vocab)):
            word_count = 0
            for i in range(len(self.docs)):
                count_dict = Counter(self.docs[i].split())
                if self.vocab[j] in count_dict:
                    word_count += 1
            idf[j] = np.log(total_docs/word_count)   
        
        
        # calculating tfidf
        self.tfidf = tf * idf
        return self.tfidf
    
    def get_feature_names(self):
        return self.vocab

In [135]:
TfIdf = TfIdfGenerator(docs)

In [138]:
TfIdf.get_tfidf()

array([[0.01402474, 0.05331901, 0.13782765, 0.13782765, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.09430313,
        0.09430313, 0.09430313, 0.09430313, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.00868198, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.05231487, 0.05231487,
        0.05231487, 0.08532188, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.00675265, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.04068934, 0.        ,
        0.04068934, 0.        , 0.06636146, 0.06636146, 0.        ,
        0.        , 0.        , 0.        ],
    

In [139]:
TfIdf.get_feature_names()

['i',
 'am',
 'an',
 'angel',
 'my',
 'name',
 'is',
 'angelina',
 'look',
 'good',
 'on',
 'sunday',
 'ordinary',
 'weekdays',
 'into',
 'programming',
 'a',
 'person']

In [116]:
tf

array([[0.07692308, 0.07692308, 0.07692308, 0.07692308, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.05263158,
        0.05263158, 0.05263158, 0.05263158, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.04761905, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.04761905, 0.04761905,
        0.04761905, 0.04761905, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.03703704, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.03703704, 0.        ,
        0.03703704, 0.        , 0.03703704, 0.03703704, 0.        ,
        0.        , 0.        , 0.        ],
    

In [102]:
idf

array([0.18232156, 0.69314718, 1.79175947, 1.79175947, 1.79175947,
       1.79175947, 1.79175947, 1.79175947, 1.09861229, 1.09861229,
       1.09861229, 1.79175947, 1.79175947, 1.79175947, 1.79175947,
       1.79175947, 1.79175947, 1.79175947])