In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import numpy as np
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [2]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()
stemmer.stem("Halo hai ada asiknya kerennya adalah mengapa begitulah pokoknya dilakukan")

'halo hai ada asiknya keren adalah mengapa begitu pokok laku'

In [29]:
def cosine_similarity(a: np.ndarray, b: np.ndarray):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)

In [30]:
def predict_with_knn(new_vector: np.ndarray, tfidf_matrix: np.ndarray, labels: np.ndarray, k=3):
    similarities = [cosine_similarity(new_vector, tfidf_vector) for tfidf_vector in tfidf_matrix]
    top_indices = np.argsort(similarities)[-k:]
    label_votes = [labels[i] for i in top_indices]
    print(label_votes)
    prediction = max(set(label_votes), key=label_votes.count)
    return prediction

In [31]:
documents = [
    "The cat sat on the mat",
    "The dog jumped over the fence",
    "The cat and the dog are friends",
    "Cat and dog live in the same house",
    "They both got angry"
]
labels = ['cat', 'dog', 'both', 'both', 'both']


In [32]:
df = pd.DataFrame(documents, columns=['texts'])

In [33]:
df['labels'] = labels

In [34]:
df.head()

Unnamed: 0,texts,labels
0,The cat sat on the mat,cat
1,The dog jumped over the fence,dog
2,The cat and the dog are friends,both
3,Cat and dog live in the same house,both
4,They both got angry,both


In [39]:
porter = PorterStemmer()
stop_words = set(stopwords.words('english')) # indonesian for Bahasa

word_set = set()
preprocessed_documents = [] 

for document in documents:
    # Tokenize the document
    words = word_tokenize(document.lower())
    
    # Remove stop words and stem the remaining words
    stemmed_words = [porter.stem(word) for word in words if word not in stop_words]
    word_set.update(stemmed_words)
    
    preprocessed_documents.append(' '.join(stemmed_words))

preprocessed_documents

['cat sat mat',
 'dog jump fenc',
 'cat dog friend',
 'cat dog live hous',
 'got angri']

In [42]:
word_to_index = {word: i for i, word in enumerate(word_set)}

tfidf_matrix = np.zeros((len(preprocessed_documents), len(word_set)))

for i, document in enumerate(preprocessed_documents):
    words = document.split()
    word_count = {word: words.count(word) for word in set(words)}
    for word, count in word_count.items():
        tf = count / len(words)
        idf = np.log(len(documents) / (1 + sum(word in document for document in preprocessed_documents)))
        tfidf_matrix[i, word_to_index[word]] = tf * idf

In [43]:
word_to_index

{'jump': 0,
 'friend': 1,
 'cat': 2,
 'sat': 3,
 'hous': 4,
 'dog': 5,
 'mat': 6,
 'got': 7,
 'fenc': 8,
 'live': 9,
 'angri': 10}

In [59]:
tfidf_df = pd.DataFrame(tfidf_matrix, columns=[item[0] for item in list(word_to_index.items())])

In [60]:
tfidf_df

Unnamed: 0,jump,friend,cat,sat,hous,dog,mat,got,fenc,live,angri
0,0.0,0.0,0.074381,0.30543,0.0,0.0,0.30543,0.0,0.0,0.0,0.0
1,0.30543,0.0,0.0,0.0,0.0,0.074381,0.0,0.0,0.30543,0.0,0.0
2,0.0,0.30543,0.074381,0.0,0.0,0.074381,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.055786,0.0,0.229073,0.055786,0.0,0.0,0.0,0.229073,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.458145,0.0,0.0,0.458145


In [37]:
input_text = "The cat and the dog are playing together" # Text to predict
new_words = word_tokenize(input_text.lower())
stemmed_new_words = [porter.stem(word) for word in new_words if word not in stop_words]
new_tfidf_vector = np.zeros(len(word_set))

for word in stemmed_new_words:
    if word in word_to_index:
        tf = stemmed_new_words.count(word) / len(stemmed_new_words)
        idf = np.log(len(documents) / (1 + sum(word in document for document in preprocessed_documents)))
        new_tfidf_vector[word_to_index[word]] = tf * idf

In [38]:
predicted_label = predict_with_knn(new_tfidf_vector, tfidf_matrix, labels)

print("Predicted label:", predicted_label)

['dog', 'both', 'both']
Predicted label: both
