# 1. TF-IDF from Scratch

In [1]:
import numpy as np
import pandas as pd

text_data = ["good movie", "bad movie", "good bad movie"]

# Tokenize each document on whitespace. split() with no argument collapses
# repeated spaces, so no empty-string tokens leak into the vocabulary.
word_data = [sentence.split() for sentence in text_data]

# Vocabulary: the unique words across all documents. Sorting makes the column
# order deterministic -- iterating a bare set of strings can change order
# between kernel restarts because of string hash randomization.
vocab = sorted({word for sentence in word_data for word in sentence})

# Term frequency: occurrences of the word in the document / tokens in the document.
# max(..., 1) keeps an empty document from dividing by zero (its counts are all 0).
tf_data = [
    [sentence.count(word) / max(len(sentence), 1) for word in vocab]
    for sentence in word_data
]

# Inverse document frequency: log(N / number of documents containing the word).
# Every vocab word comes from at least one document, so the denominator is never
# zero. A word present in every document gets idf = log(1) = 0.
n_documents = len(text_data)
idf_data = []
for word in vocab:
    n_appearances = sum(word in sentence for sentence in word_data)
    idf_data.append(np.log(n_documents / n_appearances))

# TF-IDF: elementwise product of each document's TF row with the IDF vector.
tfidf_data = [
    [tf * idf for tf, idf in zip(tf_sentence, idf_data)]
    for tf_sentence in tf_data
]

# One row per document, one column per vocab word, plus the original text.
df = pd.DataFrame(tfidf_data, columns=vocab)
df['full_sent'] = text_data

df

Unnamed: 0,good,movie,bad,full_sent
0,0.202733,0.0,0.0,good movie
1,0.0,0.0,0.202733,bad movie
2,0.135155,0.0,0.135155,good bad movie


# 2. TF-IDF with Sklearn

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

text_data = ["good movie", "bad movie", "good bad movie"]

# Learn the vocabulary and IDF weights and vectorize the corpus in one call.
# Note: TfidfVectorizer's defaults use a smoothed IDF and L2-normalize each row,
# so these values differ from the plain tf * log(N/df) table computed by hand.
vec = TfidfVectorizer()
tfidf_matrix = vec.fit_transform(text_data)

# Densify the sparse result and label the columns with the learned feature names,
# then attach the original text alongside the scores.
df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=vec.get_feature_names_out(),
).assign(full_sent=text_data)

df

Unnamed: 0,bad,good,movie,full_sent
0,0.0,0.789807,0.613356,good movie
1,0.789807,0.0,0.613356,bad movie
2,0.619805,0.619805,0.481334,good bad movie


In [4]:
# Feature names held by the fitted vectorizer; the same call supplies the
# column labels of the TF-IDF DataFrame built in the previous cell.
vec.get_feature_names_out()

array(['bad', 'good', 'movie'], dtype=object)