# Bag of Words

## Dataset

In [56]:
corpus = [
    'Linux has been around since the mid-1990s',
    'Linux distributions include the linux kernel',
    'Linux is one of the most prominent open-source software'
]

corpus

['Linux has been around since the mid-1990s',
 'Linux distributions include the linux kernel',
 'Linux is one of the most prominent open-source software']

## Bag of Words model dengan CountVectorizer

In [57]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

vectorizer = CountVectorizer()
vectorized_X = vectorizer.fit_transform(corpus).toarray().reshape(1, -1)
vectorized_X

array([[1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
        1, 0, 1, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1]], dtype=int64)

In [50]:
vectorizer.get_feature_names_out().reshape(-1, 1)

array([['1990s'],
       ['around'],
       ['been'],
       ['distributions'],
       ['has'],
       ['include'],
       ['is'],
       ['kernel'],
       ['linux'],
       ['mid'],
       ['most'],
       ['of'],
       ['one'],
       ['open'],
       ['prominent'],
       ['since'],
       ['software'],
       ['source'],
       ['the']], dtype=object)

## Euclidean Distance untuk mengukur kedekatan/jarak antar dokumen(vector)

In [61]:
from sklearn.metrics.pairwise import euclidean_distances

for i in range(len(vectorized_X)):
    for j in range(i, len(vectorized_X)):
        if i == j:
            continue
        jarak = euclidean_distances(vectorized_X[i], vectorized_X[j])
        print(f'Jarak dokumen {i+1} dan {j+1}: {jarak}')

In [62]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances
import numpy as np

# Contoh korpus teks
corpus = [
    'Linux has been around since the mid-1990s',
    'Linux distributions include the linux kernel',
    'Linux is one of the most prominent open-source software'
]

# Membuat objek CountVectorizer
vectorizer = CountVectorizer()

# Mengubah korpus teks menjadi matriks fitur (2D array)
vectorized_X = vectorizer.fit_transform(corpus).toarray()

# Menghitung dan mencetak jarak Euclidean antara dokumen
for i in range(len(vectorized_X)):
    for j in range(i + 1, len(vectorized_X)):  # Menggunakan range(i + 1, ...) untuk menghindari pengulangan
        jarak = euclidean_distances([vectorized_X[i]], [vectorized_X[j]])
        print(f'Jarak dokumen {i+1} dan {j+1}: {jarak[0][0]}')


Jarak dokumen 1 dan 2: 3.1622776601683795
Jarak dokumen 1 dan 3: 3.7416573867739413
Jarak dokumen 2 dan 3: 3.4641016151377544


In [63]:
vectorizer.get_feature_names_out().reshape(-1, 1)

array([['1990s'],
       ['around'],
       ['been'],
       ['distributions'],
       ['has'],
       ['include'],
       ['is'],
       ['kernel'],
       ['linux'],
       ['mid'],
       ['most'],
       ['of'],
       ['one'],
       ['open'],
       ['prominent'],
       ['since'],
       ['software'],
       ['source'],
       ['the']], dtype=object)

## Stop Words Filtering dengan Count Vectorizer

In [66]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words='english')
vectorized_X = vectorizer.fit_transform(corpus).todense()
vectorized_X

matrix([[1, 0, 0, 0, 1, 1, 0, 0, 0, 0],
        [0, 1, 1, 1, 2, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 1, 1, 1, 1]], dtype=int64)

In [67]:
vectorizer.get_feature_names_out().reshape(-1, 1)

array([['1990s'],
       ['distributions'],
       ['include'],
       ['kernel'],
       ['linux'],
       ['mid'],
       ['open'],
       ['prominent'],
       ['software'],
       ['source']], dtype=object)