In [1]:
from collections import Counter
import re
import pandas as pd
import math

In [2]:
# Sample data: a single text used for the term-frequency walkthrough
document = "The quick brown fox jumps over the lazy dog. The lazy dog was tired."

# 1. TF (Term Frequency)

TF mengukur seberapa sering suatu kata muncul dalam sebuah dokumen dibandingkan dengan jumlah kata total dalam dokumen tersebut.

In [5]:
# Step 1: tokenize the text.
# Lower-case the document and keep only runs of word characters, which drops
# the punctuation.
tokens = re.compile(r'\b\w+\b').findall(document.lower())
tokens

['the',
 'quick',
 'brown',
 'fox',
 'jumps',
 'over',
 'the',
 'lazy',
 'dog',
 'the',
 'lazy',
 'dog',
 'was',
 'tired']

In [6]:
# Step 2: count how many times each token occurs
word_counts = Counter()
word_counts.update(tokens)
word_counts

Counter({'the': 3,
         'lazy': 2,
         'dog': 2,
         'quick': 1,
         'brown': 1,
         'fox': 1,
         'jumps': 1,
         'over': 1,
         'was': 1,
         'tired': 1})

In [7]:
# Copy the counts into a plain dict, one entry per distinct token
counter_dict = {word: count for word, count in word_counts.items()}
counter_dict

{'the': 3,
 'quick': 1,
 'brown': 1,
 'fox': 1,
 'jumps': 1,
 'over': 1,
 'lazy': 2,
 'dog': 2,
 'was': 1,
 'tired': 1}

In [8]:
# One row per distinct word: raw count, total token count of the document,
# and the resulting term frequency (count / total).
df = (
    pd.DataFrame(list(counter_dict.items()), columns=['Word', 'Frequency'])
    .assign(Total=len(tokens))
    .assign(tf=lambda frame: frame['Frequency'] / frame['Total'])
)

In [9]:
# Display the term-frequency table built in the previous cell
df

Unnamed: 0,Word,Frequency,Total,tf
0,the,3,14,0.214286
1,quick,1,14,0.071429
2,brown,1,14,0.071429
3,fox,1,14,0.071429
4,jumps,1,14,0.071429
5,over,1,14,0.071429
6,lazy,2,14,0.142857
7,dog,2,14,0.142857
8,was,1,14,0.071429
9,tired,1,14,0.071429


# 2. IDF (Inverse Document Frequency)

IDF mengukur seberapa jarang atau unik sebuah kata di seluruh koleksi dokumen. Tujuan IDF adalah memberikan bobot lebih besar pada kata-kata yang hanya muncul di sedikit dokumen (karena kata-kata ini dianggap lebih informatif) dan bobot lebih rendah pada kata-kata yang muncul di banyak dokumen (misalnya kata-kata umum seperti "the", "is", "and", dll.). Pada notebook ini IDF dihitung sebagai $\mathrm{idf}(t) = \ln\left(\frac{N}{\mathrm{df}(t)}\right)$, dengan $N$ adalah jumlah dokumen dan $\mathrm{df}(t)$ adalah jumlah dokumen yang memuat kata $t$.

In [12]:
# Document collection used for the IDF and TF-IDF examples
documents = [
    "the quick brown fox jumps over the lazy dog",
    "the lazy dog was tired",
    "the quick brown dog runs fast"
]

In [13]:
# Document frequency: for each word, the number of documents that contain it
freq_accross_doc = {}
for doc in documents:
    # set() collapses repeats so a word counts at most once per document
    for word in set(doc.split()):
        freq_accross_doc.setdefault(word, 0)
        freq_accross_doc[word] += 1

In [14]:
# Display the per-word document counts
freq_accross_doc

{'fox': 1,
 'the': 3,
 'quick': 2,
 'jumps': 1,
 'brown': 2,
 'dog': 3,
 'lazy': 2,
 'over': 1,
 'tired': 1,
 'was': 1,
 'fast': 1,
 'runs': 1}

In [15]:
# IDF table: document frequency per word, the collection size N, the inverse
# proportion N / freq, and its natural logarithm as the IDF.
df_freq_accross_doc = (
    pd.DataFrame(list(freq_accross_doc.items()), columns=['Word', 'freq'])
    .assign(N=len(documents))
    .assign(inv_proporsi=lambda frame: frame['N'] / frame['freq'])
    .assign(idf=lambda frame: frame['inv_proporsi'].map(math.log))
)
df_freq_accross_doc

Unnamed: 0,Word,freq,N,inv_proporsi,idf
0,fox,1,3,3.0,1.098612
1,the,3,3,1.0,0.0
2,quick,2,3,1.5,0.405465
3,jumps,1,3,3.0,1.098612
4,brown,2,3,1.5,0.405465
5,dog,3,3,1.0,0.0
6,lazy,2,3,1.5,0.405465
7,over,1,3,3.0,1.098612
8,tired,1,3,3.0,1.098612
9,was,1,3,3.0,1.098612
10,fast,1,3,3.0,1.098612
11,runs,1,3,3.0,1.098612


# 3. TF-IDF (Term Frequency - Inverse Document Frequency)

Nilai TF-IDF memberikan gambaran tentang seberapa penting sebuah kata dalam suatu dokumen relatif terhadap seluruh koleksi dokumen. Semakin tinggi nilai TF-IDF suatu kata, semakin penting kata tersebut untuk dokumen tertentu, dengan dua komponen utama yang berpengaruh pada nilai ini:

Term Frequency (TF): Mengukur seberapa sering suatu kata muncul dalam dokumen. Semakin sering suatu kata muncul, semakin tinggi nilai TF-nya.

Inverse Document Frequency (IDF): Mengukur seberapa jarang suatu kata muncul di seluruh koleksi dokumen. Kata yang muncul di banyak dokumen akan memiliki nilai IDF rendah, sedangkan kata yang jarang muncul akan memiliki nilai IDF tinggi.

In [18]:
# Tokenize the first three documents the same way as the single-document
# example: lower-case, then keep runs of word characters.
tokens_doc1, tokens_doc2, tokens_doc3 = (
    re.findall(r'\b\w+\b', text.lower()) for text in documents[:3]
)

In [19]:
# Per-document token counts
word_counts_doc1, word_counts_doc2, word_counts_doc3 = map(
    Counter, (tokens_doc1, tokens_doc2, tokens_doc3)
)

In [20]:
# Plain-dict copies of the per-document counters
counter_dict_doc1, counter_dict_doc2, counter_dict_doc3 = map(
    dict, (word_counts_doc1, word_counts_doc2, word_counts_doc3)
)

In [21]:
# One (Word, Frequency) table per document
df_tf_doc1, df_tf_doc2, df_tf_doc3 = (
    pd.DataFrame(list(counts.items()), columns=['Word', 'Frequency'])
    for counts in (counter_dict_doc1, counter_dict_doc2, counter_dict_doc3)
)

In [22]:
# Attach each document's total token count to its table
for frame, doc_tokens in (
    (df_tf_doc1, tokens_doc1),
    (df_tf_doc2, tokens_doc2),
    (df_tf_doc3, tokens_doc3),
):
    frame['Total'] = len(doc_tokens)

In [23]:
# Term frequency = occurrences of the word / total tokens in that document
for frame in (df_tf_doc1, df_tf_doc2, df_tf_doc3):
    frame['tf'] = frame['Frequency'] / frame['Total']

In [24]:
# Display the term-frequency table of the first document
df_tf_doc1

Unnamed: 0,Word,Frequency,Total,tf
0,the,2,9,0.222222
1,quick,1,9,0.111111
2,brown,1,9,0.111111
3,fox,1,9,0.111111
4,jumps,1,9,0.111111
5,over,1,9,0.111111
6,lazy,1,9,0.111111
7,dog,1,9,0.111111


In [25]:
# Join each document's TF table with the collection-wide IDF table on the word
df_tf_idf_doc1, df_tf_idf_doc2, df_tf_idf_doc3 = (
    tf_frame.merge(df_freq_accross_doc, on='Word', how='inner')
    for tf_frame in (df_tf_doc1, df_tf_doc2, df_tf_doc3)
)

In [26]:
# TF-IDF score = term frequency * inverse document frequency
for merged in (df_tf_idf_doc1, df_tf_idf_doc2, df_tf_idf_doc3):
    merged['tf_idf'] = merged['tf'] * merged['idf']

In [27]:
# Display the TF, IDF and TF-IDF columns for the first document
df_tf_idf_doc1[['Word','tf','idf','tf_idf']]

Unnamed: 0,Word,tf,idf,tf_idf
0,the,0.222222,0.0,0.0
1,quick,0.111111,0.405465,0.045052
2,brown,0.111111,0.405465,0.045052
3,fox,0.111111,1.098612,0.122068
4,jumps,0.111111,1.098612,0.122068
5,over,0.111111,1.098612,0.122068
6,lazy,0.111111,0.405465,0.045052
7,dog,0.111111,0.0,0.0


In [28]:
# Display the TF, IDF and TF-IDF columns for the second document
df_tf_idf_doc2[['Word','tf','idf','tf_idf']]

Unnamed: 0,Word,tf,idf,tf_idf
0,the,0.2,0.0,0.0
1,lazy,0.2,0.405465,0.081093
2,dog,0.2,0.0,0.0
3,was,0.2,1.098612,0.219722
4,tired,0.2,1.098612,0.219722


In [29]:
# Display the TF, IDF and TF-IDF columns for the third document
df_tf_idf_doc3[['Word','tf','idf','tf_idf']]

Unnamed: 0,Word,tf,idf,tf_idf
0,the,0.166667,0.0,0.0
1,quick,0.166667,0.405465,0.067578
2,brown,0.166667,0.405465,0.067578
3,dog,0.166667,0.0,0.0
4,runs,0.166667,1.098612,0.183102
5,fast,0.166667,1.098612,0.183102


# 4. Menghitung via Fungsi

In [31]:
import math
from collections import Counter

# Document collection (same three texts as in the IDF section)
documents = [
    "the quick brown fox jumps over the lazy dog",
    "the lazy dog was tired",
    "the quick brown dog runs fast"
]

# 1. Compute Term Frequency (TF)
def compute_tf(doc):
    """Return the relative frequency of each whitespace-separated word in ``doc``.

    Args:
        doc: the document text as a single string.

    Returns:
        dict mapping each distinct word to its occurrence count divided by
        the total number of words in ``doc``.  A document with no words
        yields an empty dict.
    """
    words = doc.split()
    n_words = len(words)
    frequencies = {}
    for word, occurrences in Counter(words).items():
        frequencies[word] = occurrences / n_words
    return frequencies

# 2. Compute Inverse Document Frequency (IDF)
def compute_idf(documents):
    """Return the inverse document frequency of every word in ``documents``.

    Words are obtained by whitespace splitting, with no case folding or
    punctuation stripping.  A word is counted at most once per document.

    Args:
        documents: a sized collection of document strings.

    Returns:
        dict mapping each word to ``math.log(N / df)``, where ``N`` is the
        number of documents and ``df`` is the number of documents that
        contain the word.  Keys appear in first-occurrence order.  An empty
        collection yields ``{}``.
    """
    N = len(documents)  # total number of documents
    doc_freq = {}
    for doc in documents:
        # dict.fromkeys de-duplicates like set() but keeps the words in the
        # order they appear, so the key order of the result no longer varies
        # with string-hash randomisation between interpreter runs.
        for word in dict.fromkeys(doc.split()):
            doc_freq[word] = doc_freq.get(word, 0) + 1
    # IDF(t) = log(N / df(t))
    return {word: math.log(N / count) for word, count in doc_freq.items()}

# 3. Compute TF-IDF
def compute_tfidf(documents):
    """Return one ``{word: tf * idf}`` dict per document, in input order.

    The IDF table is computed once over the whole collection with
    ``compute_idf``; each document's term frequencies come from
    ``compute_tf``.  Only words present in a document appear in its dict.
    """
    idf = compute_idf(documents)
    scores_per_doc = []
    for doc in documents:
        doc_scores = {}
        for word, tf_value in compute_tf(doc).items():
            # A word missing from the IDF table contributes a factor of 0
            doc_scores[word] = tf_value * idf.get(word, 0)
        scores_per_doc.append(doc_scores)
    return scores_per_doc

# Compute the TF-IDF scores for the whole document collection
tfidf = compute_tfidf(documents)

# Print every document's scores, one word per line with four decimal places
for i, word_scores in enumerate(tfidf):
    print(f"Document {i+1}:")
    for word, score in word_scores.items():
        print(f"  {word}: {score:.4f}")

Document 1:
  the: 0.0000
  quick: 0.0451
  brown: 0.0451
  fox: 0.1221
  jumps: 0.1221
  over: 0.1221
  lazy: 0.0451
  dog: 0.0000
Document 2:
  the: 0.0000
  lazy: 0.0811
  dog: 0.0000
  was: 0.2197
  tired: 0.2197
Document 3:
  the: 0.0000
  quick: 0.0676
  brown: 0.0676
  dog: 0.0000
  runs: 0.1831
  fast: 0.1831


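# 5. Menghitung via package

Catatan: nilai yang dihasilkan `TfidfVectorizer` tidak sama dengan hasil perhitungan manual di atas. Secara default scikit-learn memakai TF berupa jumlah kemunculan mentah, IDF yang dihaluskan, $\mathrm{idf}(t) = \ln\left(\frac{1 + N}{1 + \mathrm{df}(t)}\right) + 1$, lalu menormalkan setiap baris dengan norma L2. Akibatnya kata yang muncul di semua dokumen (misalnya "the") tetap memiliki bobot lebih besar dari nol.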

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Document collection (same three texts as in the earlier sections)
documents = [
    "the quick brown fox jumps over the lazy dog",
    "the lazy dog was tired",
    "the quick brown dog runs fast"
]

# Initialise TfidfVectorizer with no arguments, so the library defaults apply.
# NOTE(review): the values below differ from the hand-computed scores earlier
# in the notebook (e.g. "the" is non-zero here) -- presumably because the
# defaults use a smoothed IDF and L2 row normalisation; confirm against the
# scikit-learn documentation.
vectorizer = TfidfVectorizer()

# Fit on the documents and transform them to compute TF-IDF
X = vectorizer.fit_transform(documents)

# Convert the TF-IDF result into a DataFrame so it is easy to read
df_tfidf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Display the TF-IDF result
df_tfidf

Unnamed: 0,brown,dog,fast,fox,jumps,lazy,over,quick,runs,the,tired,was
0,0.298778,0.232028,0.0,0.392857,0.392857,0.298778,0.392857,0.298778,0.0,0.464056,0.0,0.0
1,0.0,0.32631,0.0,0.0,0.0,0.420183,0.0,0.0,0.0,0.32631,0.55249,0.55249
2,0.387376,0.300832,0.509353,0.0,0.0,0.0,0.0,0.387376,0.509353,0.300832,0.0,0.0


# Kesimpulan:
- Semakin tinggi nilai TF-IDF, semakin penting kata tersebut untuk dokumen tertentu karena kata itu jarang muncul di seluruh koleksi dokumen dan sering muncul di dokumen yang sedang dianalisis.
- Semakin rendah nilai TF-IDF, semakin tidak penting kata tersebut, karena kata tersebut muncul di banyak dokumen (IDF rendah) atau jarang muncul di dokumen yang sedang dianalisis (TF rendah), sehingga tidak memberikan banyak informasi tambahan untuk dokumen tersebut.