In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from sklearn.metrics import accuracy_score
import numpy as np

In [2]:
# distance matrix is test x train 
def knn(k, distance_matrix, train_labels, test_labels):
    """Classify every test sample by k-nearest-neighbour vote and print the accuracy.

    Parameters
    ----------
    k : int
        Number of neighbours that vote. Must satisfy
        ``1 <= k <= distance_matrix.shape[1]``.
    distance_matrix : 2-D array
        Test x train distances: row ``i`` holds the distances from test
        sample ``i`` to every training sample.
    train_labels : sequence
        Label of each training sample, indexed like the columns of
        ``distance_matrix``.
    test_labels : list or ndarray
        True label of each test sample, indexed like the rows of
        ``distance_matrix``.

    Prints ``k=<k>: <accuracy in percent> %``. Nothing is returned.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of training
        samples (previously an opaque IndexError / empty-``max`` error).
    """
    n_train = distance_matrix.shape[1]
    if not 1 <= k <= n_train:
        raise ValueError(
            'k must be between 1 and the number of training samples (%d), got %r'
            % (n_train, k)
        )

    predicted_labels = []       # One predicted label per test row
    for i in range(distance_matrix.shape[0]):

        # Training indices of the k closest samples, nearest first
        nearest = np.argsort(distance_matrix[i])[:k]
        k_nearest_labels = [train_labels[j] for j in nearest]

        # Majority vote; on a tie the label that appears first
        # (i.e. belongs to the nearer neighbour) wins
        predicted_labels.append(max(k_nearest_labels, key=k_nearest_labels.count))

    if isinstance(test_labels, np.ndarray):
        accuracy = accuracy_score(test_labels, np.asarray(predicted_labels))
    else:
        try:
            accuracy = accuracy_score(test_labels, predicted_labels)
        except (TypeError, ValueError) as err:
            # Only input-related failures are reported here; anything else
            # (NameError, KeyboardInterrupt, ...) is allowed to propagate
            # instead of being hidden behind this message.
            print('ndarray and list are the only accepted types for test_labels')
            print('accuracy_score failed with: %r' % (err,))
            return

    print('k=%d:' % (k), str(round(accuracy * 100, 2)), '%')

In [3]:
# distance matrix is test x train 
def find_k_nearest_neighbors(idx, k, distance_matrix, train_labels, test_labels):
    """Print the k nearest training labels, the voted label and the true label of one test sample.

    Parameters
    ----------
    idx : int
        Row of ``distance_matrix`` (and position in ``test_labels``) to inspect.
    k : int
        Number of neighbours that vote. Must satisfy
        ``1 <= k <= len(distance_matrix[idx])``.
    distance_matrix : 2-D array
        Test x train distances: row ``idx`` holds the distances from the
        chosen test sample to every training sample.
    train_labels : sequence
        Label of each training sample, indexed like the columns of
        ``distance_matrix``.
    test_labels : sequence
        True label of each test sample.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of training
        samples in the selected row.
    """
    # Training indices ordered from nearest to farthest
    neighbors = np.argsort(distance_matrix[idx])

    if not 1 <= k <= len(neighbors):
        raise ValueError(
            'k must be between 1 and the number of training samples (%d), got %r'
            % (len(neighbors), k)
        )

    # Labels of the k nearest neighbours, nearest first
    k_nearest_labels = [train_labels[j] for j in neighbors[:k]]

    # Majority vote; on a tie the label seen first (nearer neighbour) wins
    predicted_label = max(k_nearest_labels, key=k_nearest_labels.count)

    print('\nLabels of neighbors')
    print(k_nearest_labels)
    print('Predicted Label')
    print(predicted_label)
    print('Correct Label')
    print(test_labels[idx])

In [4]:
# Load the predefined train / test splits of the 20 Newsgroups corpus
ng_train = fetch_20newsgroups(subset='train')
ng_test = fetch_20newsgroups(subset='test')

# Raw documents of each split
X_train, X_test = ng_train.data, ng_test.data

# For reference:
#   ng_*.target_names -> list of label names
#   ng_*.target       -> one integer per document, an index into target_names

# Human-readable labels: map every integer target to its name
y_train = [ng_train.target_names[class_id] for class_id in ng_train.target]
y_test = [ng_test.target_names[class_id] for class_id in ng_test.target]

# Sanity check: documents and labels must have matching lengths per split
print(len(X_train), len(X_test), len(y_train), len(y_test), sep='\n')

11314
7532
11314
7532


In [5]:
# Converting text to TF-IDF vectors.
# The vocabulary and idf weights are learned from the training documents only;
# the test documents are projected onto that same vocabulary.
# fit_transform learns and transforms in a single pass, so the training corpus
# is not processed twice as it was with fit() followed by transform().
tfidf = TfidfVectorizer()
vect_X_train = tfidf.fit_transform(X_train)
vect_X_test = tfidf.transform(X_test)

# Both matrices must share the same number of columns (vocabulary size)
print(vect_X_train.shape)
print(vect_X_test.shape)

(11314, 130107)
(7532, 130107)


In [6]:
# Pairwise Euclidean distances, laid out test x train:
# entry [i, j] is the distance from test document i to training document j
ng_euclidean_distances = euclidean_distances(X=vect_X_test, Y=vect_X_train)
print(ng_euclidean_distances.shape)

(7532, 11314)


In [7]:
# Pairwise cosine distances, laid out test x train:
# entry [i, j] is the cosine distance from test document i to training document j.
# NOTE: TfidfVectorizer() L2-normalises each row by default, and for unit-length
# vectors squared Euclidean distance = 2 * cosine distance. Both metrics
# therefore rank neighbours the same way on these vectors.
ng_cosine_distances = cosine_distances(X=vect_X_test, Y=vect_X_train)
print(ng_cosine_distances.shape)

(7532, 11314)


In [8]:
# Evaluate k-NN accuracy for several k under both distance metrics
k_list = [3, 5, 10]

experiments = (
    ('Using Euclidean distances ....', ng_euclidean_distances),
    ('Using Cosine distances ....', ng_cosine_distances),
)

for heading, distances in experiments:
    # Banner announcing which metric the following accuracies belong to
    print('-------------------------------------------------')
    print(heading)
    print('-------------------------------------------------')
    for k in k_list:
        knn(k, distance_matrix=distances, train_labels=y_train, test_labels=y_test)

-------------------------------------------------
Using Euclidean distances ....
-------------------------------------------------
k=3: 67.91 %
k=5: 67.55 %
k=10: 66.8 %
-------------------------------------------------
Using Cosine distances ....
-------------------------------------------------
k=3: 67.91 %
k=5: 67.55 %
k=10: 66.8 %


In [9]:
# Inspect the neighbours of one randomly chosen test document
k_list = [5]

# np.random.randint excludes the upper bound, so this draws uniformly from the
# valid positions 0 .. len(X_test) - 1. The previous bounds (1, len(X_test) + 1)
# could return len(X_test), which is out of range, and could never return 0.
# Call np.random.seed(...) beforehand if a reproducible pick is needed.
idx = np.random.randint(0, len(X_test))   # Random index from test data

print('-------------------------------------------------')
print('The text ....')
print('-------------------------------------------------')
print(X_test[idx])

experiments = (
    ('Using Euclidean distances ....', ng_euclidean_distances),
    ('Using Cosine distances ....', ng_cosine_distances),
)

for heading, distances in experiments:
    print('-------------------------------------------------')
    print(heading)
    print('-------------------------------------------------')
    for k in k_list:
        find_k_nearest_neighbors(idx, k, distance_matrix=distances, train_labels=y_train, test_labels=y_test)

-------------------------------------------------
The text ....
-------------------------------------------------
From: berryh@huey.udel.edu (John Berryhill, Ph.D.)
Subject: Re: The earth also pollutes......
Nntp-Posting-Host: huey.udel.edu
Organization: little scraps of paper, mostly
Lines: 13


People *die* of natural causes, too.  We hear all this bellyaching over
things like murder and war while Mother Nature is killing people all of
the time.

In fact, more people die of natural causes than due to the conscious
actions of other people.  So, what's a few murders here and there?


-- 

                                              John Berryhill


-------------------------------------------------
Using Euclidean distances ....
-------------------------------------------------

Labels of neighbors
['sci.med', 'sci.electronics', 'sci.electronics', 'talk.politics.misc', 'alt.atheism']
Predicted Label
sci.electronics
Correct Label
talk.politics.misc
-------------------------------------