# Pembobotan menggunakan TF-IDF

In [1]:
import nltk
from google.colab import drive
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import os
import math

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
drive.mount('/gdrive')
%cd /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


## Memuat dokumen

Setiap dokumen berasal dari file `.txt` yang berisi teks data berita dalam bahas Inggris. `docs` menampung semua dokumen yang telah dimuat.

In [3]:
folder = '/gdrive/My Drive/Colab Notebooks/dataset/txt'
filenames = [
    'Ukraine ammunition depot reportedly hit in wave of Russian missile attacks.txt',
    'More than 20,000 Russian soldiers killed in five months in Ukraine, US says.txt',
    'Dutch police arrest fake ‘Boris Johnson’ for suspected drink-driving.txt'
]

docs = []
for name in filenames:
    src = os.path.join(folder, name)
    file = open(src, 'r')
    docs.append(file.read())
    file.close()

In [4]:
print('docs:')
for d in docs:
    print(repr(d))

docs:
'Ukraine ammunition depot reportedly hit in wave of Russian missile attacks\n\nRussian missile strikes have injured 34 civilians and apparently damaged railway infrastructure and an ammunition depot in south-eastern Ukraine, hours before an explosion inside Russia derailed a freight train.\n\nThe attacks on both sides of the border on Monday apparently aimed to disrupt military logistics before a significant Ukrainian counteroffensive against occupying Russian troops, expected to start shortly in the south or the east.\n\nThe Russian strike in the Ukrainian city of Pavlohrad was part of the second wave of missile attacks in just three days; on Friday, 23 people were killed when a missile hit an apartment block in central Uman city, and a woman and her daughter died in Dnipro.\n\nWith Kyiv’s allies saying that equipment and newly trained troops promised for the next Ukrainian campaign are in place, Moscow has revived its winter tactics of attempting to orchestrate bombing campaign

## Mengubah teks menjadi token

Semua teks yang terkandung di dalam dokumen akan diubah menjadi token. Token diperoleh dari pemotongan teks berdasarkan karakter spasi, dengan demikian satu token mewakili satu kata di dalam teks. Adapun metode lemmatization yang diterapkan kepada token-token tersebut untuk mengubah bentuk kata ke kata dasar agar lebih relevan.

In [5]:
lemmatizer = WordNetLemmatizer()

def tokenize(text):
    sentences = text.split('.')
    sentences = [re.sub(r"[^a-zA-Z ]", ' ', s) for s in sentences]
    tokens = [t for t in ' '.join(sentences).split(' ') if len(t) > 0]
    tokens = [t for t in tokens if t not in stopwords.words('english')]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    tokens = [t.lower() for t in tokens]
    return tokens

In [6]:
def vocab_from_tokenized_docs(docs):
    vocab = []
    for d in docs:
        vocab.extend(d)
    return list(dict.fromkeys(vocab))

In [7]:
docs = [tokenize(d) for d in docs]

In [8]:
print('docs:')
for d in docs:
    print(d)

docs:
['ukraine', 'ammunition', 'depot', 'reportedly', 'hit', 'wave', 'russian', 'missile', 'attack', 'russian', 'missile', 'strike', 'injured', 'civilian', 'apparently', 'damaged', 'railway', 'infrastructure', 'ammunition', 'depot', 'south', 'eastern', 'ukraine', 'hour', 'explosion', 'inside', 'russia', 'derailed', 'freight', 'train', 'the', 'attack', 'side', 'border', 'monday', 'apparently', 'aimed', 'disrupt', 'military', 'logistics', 'significant', 'ukrainian', 'counteroffensive', 'occupying', 'russian', 'troop', 'expected', 'start', 'shortly', 'south', 'east', 'the', 'russian', 'strike', 'ukrainian', 'city', 'pavlohrad', 'part', 'second', 'wave', 'missile', 'attack', 'three', 'day', 'friday', 'people', 'killed', 'missile', 'hit', 'apartment', 'block', 'central', 'uman', 'city', 'woman', 'daughter', 'died', 'dnipro', 'with', 'kyiv', 'ally', 'saying', 'equipment', 'newly', 'trained', 'troop', 'promised', 'next', 'ukrainian', 'campaign', 'place', 'moscow', 'revived', 'winter', 'tacti

## Implementasi TF-IDF

Berikut ini implementasi TF-IDF untuk memberikan bobot terhadap token yang telah diperoleh.

### Term frequence

\begin{align}
    w_{t,d} = tf_{t,d}
\end{align}

Di mana $tf_{t,d} = $ jumlah kemunculan $t$ di dalam $d$.

In [9]:
def term_freq(terms, docs):
    freq = {}
    for t in vocab_from_tokenized_docs(docs):
        freq[t] = 0
    for t in terms:
        freq[t] += 1
    return freq

In [10]:
w_td = [term_freq(d, docs) for d in docs]

In [11]:
print('w_td:')
for tf in w_td:
    print(tf)

w_td:
{'ukraine': 11, 'ammunition': 4, 'depot': 4, 'reportedly': 1, 'hit': 5, 'wave': 2, 'russian': 10, 'missile': 8, 'attack': 6, 'strike': 5, 'injured': 2, 'civilian': 1, 'apparently': 2, 'damaged': 2, 'railway': 3, 'infrastructure': 2, 'south': 2, 'eastern': 1, 'hour': 2, 'explosion': 2, 'inside': 1, 'russia': 7, 'derailed': 3, 'freight': 3, 'train': 4, 'the': 6, 'side': 2, 'border': 3, 'monday': 3, 'aimed': 2, 'disrupt': 1, 'military': 4, 'logistics': 1, 'significant': 2, 'ukrainian': 9, 'counteroffensive': 2, 'occupying': 1, 'troop': 3, 'expected': 1, 'start': 1, 'shortly': 1, 'east': 1, 'city': 3, 'pavlohrad': 3, 'part': 1, 'second': 2, 'three': 1, 'day': 2, 'friday': 2, 'people': 1, 'killed': 2, 'apartment': 2, 'block': 1, 'central': 1, 'uman': 2, 'woman': 2, 'daughter': 1, 'died': 1, 'dnipro': 2, 'with': 1, 'kyiv': 4, 'ally': 2, 'saying': 1, 'equipment': 2, 'newly': 1, 'trained': 1, 'promised': 1, 'next': 1, 'campaign': 2, 'place': 1, 'moscow': 2, 'revived': 1, 'winter': 1, 'ta

### Log frequency

\begin{align}
w_{t,d} = \left\{
\begin{array}{cl}
1 + log(tf_{t,d}) & tf_{t,d} > 0 \\
0
\end{array}
\right.
\end{align}

In [12]:
def log_freq(tf):
    freq = {}
    for t,f in tf.items():
        if f > 0:
            freq[t] = 1 + math.log(f, 10)
        else:
            freq[t] = 0
    return freq

In [13]:
w_td = [log_freq(tf) for tf in w_td]

In [14]:
print('w_td:')
for tf in w_td:
    print(tf)

w_td:
{'ukraine': 2.041392685158225, 'ammunition': 1.6020599913279623, 'depot': 1.6020599913279623, 'reportedly': 1.0, 'hit': 1.6989700043360187, 'wave': 1.3010299956639813, 'russian': 2.0, 'missile': 1.9030899869919433, 'attack': 1.7781512503836434, 'strike': 1.6989700043360187, 'injured': 1.3010299956639813, 'civilian': 1.0, 'apparently': 1.3010299956639813, 'damaged': 1.3010299956639813, 'railway': 1.4771212547196624, 'infrastructure': 1.3010299956639813, 'south': 1.3010299956639813, 'eastern': 1.0, 'hour': 1.3010299956639813, 'explosion': 1.3010299956639813, 'inside': 1.0, 'russia': 1.8450980400142567, 'derailed': 1.4771212547196624, 'freight': 1.4771212547196624, 'train': 1.6020599913279623, 'the': 1.7781512503836434, 'side': 1.3010299956639813, 'border': 1.4771212547196624, 'monday': 1.4771212547196624, 'aimed': 1.3010299956639813, 'disrupt': 1.0, 'military': 1.6020599913279623, 'logistics': 1.0, 'significant': 1.3010299956639813, 'ukrainian': 1.9542425094393248, 'counteroffensiv

### Document frequency

In [15]:
def doc_freq(w_td):
    freq = {}
    for t in w_td[0]:
        freq[t] = 0
    for tf in w_td:
        for t,f in tf.items():
            if f > 0:
                freq[t] += 1
    return freq

In [16]:
df = doc_freq(w_td)

In [17]:
print('df:', df)

df: {'ukraine': 3, 'ammunition': 2, 'depot': 2, 'reportedly': 2, 'hit': 2, 'wave': 2, 'russian': 2, 'missile': 2, 'attack': 2, 'strike': 2, 'injured': 2, 'civilian': 1, 'apparently': 1, 'damaged': 1, 'railway': 1, 'infrastructure': 1, 'south': 1, 'eastern': 2, 'hour': 1, 'explosion': 1, 'inside': 2, 'russia': 3, 'derailed': 1, 'freight': 1, 'train': 1, 'the': 3, 'side': 2, 'border': 1, 'monday': 3, 'aimed': 1, 'disrupt': 1, 'military': 2, 'logistics': 1, 'significant': 2, 'ukrainian': 3, 'counteroffensive': 1, 'occupying': 1, 'troop': 2, 'expected': 1, 'start': 1, 'shortly': 2, 'east': 1, 'city': 3, 'pavlohrad': 1, 'part': 1, 'second': 2, 'three': 2, 'day': 2, 'friday': 1, 'people': 2, 'killed': 2, 'apartment': 1, 'block': 1, 'central': 1, 'uman': 1, 'woman': 1, 'daughter': 1, 'died': 1, 'dnipro': 1, 'with': 1, 'kyiv': 2, 'ally': 2, 'saying': 2, 'equipment': 1, 'newly': 1, 'trained': 1, 'promised': 1, 'next': 1, 'campaign': 2, 'place': 1, 'moscow': 2, 'revived': 1, 'winter': 1, 'tactic

### Inverse document frequency

\begin{align}
    idf = log\left(\frac{N}{df}\right)
\end{align}

In [18]:
def inverse_doc_freq(df, N):
    freq = {}
    for t,f in df.items():
        freq[t] = math.log(N/f, 10)
    return freq

In [19]:
idf = inverse_doc_freq(df, len(docs))

In [20]:
print('idf:', idf)

idf: {'ukraine': 0.0, 'ammunition': 0.17609125905568124, 'depot': 0.17609125905568124, 'reportedly': 0.17609125905568124, 'hit': 0.17609125905568124, 'wave': 0.17609125905568124, 'russian': 0.17609125905568124, 'missile': 0.17609125905568124, 'attack': 0.17609125905568124, 'strike': 0.17609125905568124, 'injured': 0.17609125905568124, 'civilian': 0.47712125471966244, 'apparently': 0.47712125471966244, 'damaged': 0.47712125471966244, 'railway': 0.47712125471966244, 'infrastructure': 0.47712125471966244, 'south': 0.47712125471966244, 'eastern': 0.17609125905568124, 'hour': 0.47712125471966244, 'explosion': 0.47712125471966244, 'inside': 0.17609125905568124, 'russia': 0.0, 'derailed': 0.47712125471966244, 'freight': 0.47712125471966244, 'train': 0.47712125471966244, 'the': 0.0, 'side': 0.17609125905568124, 'border': 0.47712125471966244, 'monday': 0.0, 'aimed': 0.47712125471966244, 'disrupt': 0.47712125471966244, 'military': 0.17609125905568124, 'logistics': 0.47712125471966244, 'significa

### TF-IDF

\begin{align}
    w_{t,d} = tf \cdot idf
\end{align}

In [21]:
def tf_idf(tf, idf):
    return {t: tf[t] * idf[t] for t in tf}

In [22]:
w_td = [tf_idf(tf, idf) for tf in w_td]

In [23]:
print('w_td:')
for tf in w_td:
    print(tf)

w_td:
{'ukraine': 0.0, 'ammunition': 0.28210876095567466, 'depot': 0.28210876095567466, 'reportedly': 0.17609125905568124, 'hit': 0.29917376716136573, 'wave': 0.22910001000567795, 'russian': 0.3521825181113625, 'missile': 0.33511751190567135, 'attack': 0.31311689247148966, 'strike': 0.29917376716136573, 'injured': 0.22910001000567795, 'civilian': 0.47712125471966244, 'apparently': 0.6207490639591157, 'damaged': 0.6207490639591157, 'railway': 0.7047659464249274, 'infrastructure': 0.6207490639591157, 'south': 0.6207490639591157, 'eastern': 0.17609125905568124, 'hour': 0.6207490639591157, 'explosion': 0.6207490639591157, 'inside': 0.17609125905568124, 'russia': 0.0, 'derailed': 0.7047659464249274, 'freight': 0.7047659464249274, 'train': 0.7643768731985688, 'the': 0.0, 'side': 0.22910001000567795, 'border': 0.7047659464249274, 'monday': 0.0, 'aimed': 0.6207490639591157, 'disrupt': 0.47712125471966244, 'military': 0.28210876095567466, 'logistics': 0.47712125471966244, 'significant': 0.22910

In [24]:
freq = {t: [] for t in w_td[0]}
for t in freq:
    for tf in w_td:
        freq[t].append(tf[t])
freq

{'ukraine': [0.0, 0.0, 0.0],
 'ammunition': [0.28210876095567466, 0.17609125905568124, 0.0],
 'depot': [0.28210876095567466, 0.17609125905568124, 0.0],
 'reportedly': [0.17609125905568124, 0.17609125905568124, 0.0],
 'hit': [0.29917376716136573, 0.22910001000567795, 0.0],
 'wave': [0.22910001000567795, 0.17609125905568124, 0.0],
 'russian': [0.3521825181113625, 0.3521825181113625, 0.0],
 'missile': [0.33511751190567135, 0.17609125905568124, 0.0],
 'attack': [0.31311689247148966, 0.22910001000567795, 0.0],
 'strike': [0.29917376716136573, 0.17609125905568124, 0.0],
 'injured': [0.22910001000567795, 0.260108141521493, 0.0],
 'civilian': [0.47712125471966244, 0.0, 0.0],
 'apparently': [0.6207490639591157, 0.0, 0.0],
 'damaged': [0.6207490639591157, 0.0, 0.0],
 'railway': [0.7047659464249274, 0.0, 0.0],
 'infrastructure': [0.6207490639591157, 0.0, 0.0],
 'south': [0.6207490639591157, 0.0, 0.0],
 'eastern': [0.17609125905568124, 0.22910001000567795, 0.0],
 'hour': [0.6207490639591157, 0.0, 