In [55]:
import numpy as np
import pandas as pd

## Word Weighting

In [56]:
# Documents
documents = [
    "The cat sat on the mat",
    "The dog jumped over the fence",
    "The cat and the dog are friends",
    "Cat and dog live in the same house",
    "They both got angry"
]

In [57]:
labels = ['cat', 'dog', 'both', 'both', 'both'] # Set the labels for each array

In [58]:
data = pd.DataFrame(documents, columns=['text'])
data.head()

Unnamed: 0,text
0,The cat sat on the mat
1,The dog jumped over the fence
2,The cat and the dog are friends
3,Cat and dog live in the same house
4,They both got angry


In [59]:
data['labels'] = labels

In [60]:
data.head()

Unnamed: 0,text,labels
0,The cat sat on the mat,cat
1,The dog jumped over the fence,dog
2,The cat and the dog are friends,both
3,Cat and dog live in the same house,both
4,They both got angry,both


In [61]:
word_set = set()
for doc in documents:
    words = doc.casefold().split()
    word_set.update(words)

In [62]:
word_set

{'and',
 'angry',
 'are',
 'both',
 'cat',
 'dog',
 'fence',
 'friends',
 'got',
 'house',
 'in',
 'jumped',
 'live',
 'mat',
 'on',
 'over',
 'same',
 'sat',
 'the',
 'they'}

In [63]:
word_to_index = {word: i for i, word in enumerate(word_set)}
word_to_index

{'they': 0,
 'house': 1,
 'both': 2,
 'jumped': 3,
 'and': 4,
 'live': 5,
 'cat': 6,
 'in': 7,
 'angry': 8,
 'sat': 9,
 'are': 10,
 'friends': 11,
 'got': 12,
 'over': 13,
 'the': 14,
 'dog': 15,
 'on': 16,
 'same': 17,
 'fence': 18,
 'mat': 19}

In [64]:
tfidf_matrix = np.zeros((len(documents), len(word_set)))
tfidf_matrix

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]])

In [65]:
for i, doc in enumerate(documents):
    words = doc.casefold().split()
    word_count = {word: words.count(word) for word in set(words)}
    for word, count in word_count.items():
        tf = count / len(words)
        idf = np.log(len(documents) / (1 + sum(word in document for document in documents)))
        tfidf_matrix[i, word_to_index[word]] = tf * idf

In [66]:
tfidf_matrix

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.0851376 , 0.        , 0.        , 0.15271512,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.15271512, 0.        , 0.        , 0.15271512],
       [0.        , 0.        , 0.        , 0.15271512, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.15271512, 0.        ,
        0.03719059, 0.        , 0.        , 0.15271512, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.07297509,
        0.        , 0.07297509, 0.        , 0.        , 0.        ,
        0.13089868, 0.13089868, 0.        , 0.        , 0.        ,
        0.03187765, 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.11453634, 0.        , 0.        , 0.0638532 ,
        0.11453634, 0.0638532 , 0.11453634, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0

## KNN

In [67]:
def cosine_similarity(a: np.ndarray, b: np.ndarray):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)

In [68]:
new_document = "The cat and dog are playing together"

In [69]:
new_words = new_document.casefold().split()

In [70]:
new_word_count = {word: new_words.count(word) for word in set(new_words)}
new_word_count

{'are': 1, 'the': 1, 'cat': 1, 'together': 1, 'dog': 1, 'and': 1, 'playing': 1}

In [71]:
new_tfidf_vector = np.zeros(len(word_set))
new_tfidf_vector

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.])

In [72]:
for word, count in new_word_count.items():
    if word in word_to_index:
        tf = count / len(new_words)
        idf = np.log(len(documents) / (1 + sum(word in document for document in documents)))
        new_tfidf_vector[word_to_index[word]] = tf * idf
new_tfidf_vector

array([0.        , 0.        , 0.        , 0.        , 0.07297509,
       0.        , 0.07297509, 0.        , 0.        , 0.        ,
       0.13089868, 0.        , 0.        , 0.        , 0.        ,
       0.03187765, 0.        , 0.        , 0.        , 0.        ])

In [73]:
def predict_with_knn(new_vector: np.ndarray, tfidf_matrix: np.ndarray, labels: np.ndarray, k=3):
    similarities = [cosine_similarity(new_vector, tfidf_vector) for tfidf_vector in tfidf_matrix]
    top_indices = np.argsort(similarities)[-k:]
    label_votes = [labels[i] for i in top_indices]
    print(label_votes)
    prediction = max(set(label_votes), key=label_votes.count)
    return prediction

In [74]:
prediction = predict_with_knn(new_tfidf_vector, tfidf_matrix, data['labels'].to_numpy())

['cat', 'both', 'both']


In [75]:
prediction

'both'