In [4]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from scipy.stats import mode

In [4]:
from sklearn.datasets import fetch_20newsgroups

# Newsgroups to keep; this list is handed to the loader as its `categories` filter.
categories = [
    "alt.atheism", "sci.med", "sci.electronics",
    "comp.graphics", "talk.politics.guns", "sci.crypt",
]

# Load every post (train and test subsets together) from the selected groups,
# shuffled with a fixed seed so the document order is repeatable.
newsgroups_data = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42)


In [5]:
# Pull out the documents (X) and their class labels (y) from the loaded bunch.
X, y = newsgroups_data.data, newsgroups_data.target


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the documents in X into a TF-IDF feature matrix using the vectorizer's
# default settings. `vectorizer` keeps the fitted vocabulary; `vectors` holds
# one row per document.
# NOTE(review): the vectorizer is fitted on all of X, and the train/test split
# is only made afterwards in this notebook, so the held-out documents also
# contribute to the fitted vocabulary/IDF weights -- confirm this is intended.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(X)


In [10]:
# Sanity check: (number of documents, number of TF-IDF features).
vectors.shape

(5647, 62583)

In [11]:
import numpy as np
from sklearn.model_selection import train_test_split

# Manual 80/20 train/test split of the feature matrix `vectors` and labels `y`.

# Row index at which the shuffled data is cut: the first 80% go to training.
split_index = int(vectors.shape[0] * 0.8)

# Seed NumPy's global RNG, then shuffle the row positions in place so the
# same permutation is produced on every run.
np.random.seed(42)
indices = np.arange(vectors.shape[0])
np.random.shuffle(indices)

# Apply the same permutation to features and labels so they stay aligned.
vectors_shuffled, y_shuffled = vectors[indices], y[indices]

# Cut both shuffled collections at the split index.
X_train, X_test = vectors_shuffled[:split_index], vectors_shuffled[split_index:]
y_train, y_test = y_shuffled[:split_index], y_shuffled[split_index:]

# Show the size of the training portion.
print(X_train.shape)


(4517, 62583)


In [15]:
from scipy.sparse import csr_matrix

def manual_cosine_similarity(matrix):
    """Compute pairwise cosine similarity between the rows of ``matrix``.

    Each row is scaled to unit Euclidean length and the similarity is then
    the dot product of every pair of scaled rows.

    Parameters
    ----------
    matrix : scipy sparse matrix or array-like of shape (n_rows, n_features)
        Row vectors to compare. Sparse inputs in other formats are converted
        to CSR; dense inputs (NumPy arrays, nested lists) are wrapped in a
        CSR matrix.

    Returns
    -------
    scipy.sparse matrix of shape (n_rows, n_rows)
        Entry (i, j) is the cosine similarity between row i and row j.
        Rows that are entirely zero have similarity 0 with every row,
        including themselves.
    """
    # Ensure the matrix is in CSR format; dense input has no .tocsr(), so it
    # is converted through the csr_matrix constructor instead.
    if not isinstance(matrix, csr_matrix):
        matrix = matrix.tocsr() if hasattr(matrix, "tocsr") else csr_matrix(matrix)

    # Euclidean norm of every row, flattened to a plain 1-D array.
    norms = np.sqrt(np.asarray(matrix.power(2).sum(axis=1))).ravel()

    # Reciprocal norms, leaving 0 for all-zero rows: this avoids the
    # divide-by-zero warning and the inf scale factors (which turn explicitly
    # stored zeros into NaN) that a plain 1.0 / norms would produce.
    inv_norms = np.zeros(norms.shape, dtype=float)
    np.divide(1.0, norms, out=inv_norms, where=norms > 0)

    # Scale each row by its reciprocal norm (column vector broadcasts per row).
    normalized_matrix = matrix.multiply(inv_norms[:, np.newaxis]).tocsr()

    # Cosine similarity is the dot product of the unit-length rows.
    return normalized_matrix @ normalized_matrix.T

# Pairwise cosine similarity between all rows of X_train (the training part
# of the TF-IDF matrix), computed with manual_cosine_similarity.
cosine_sim_manual = manual_cosine_similarity(X_train)

# Print the similarity matrix; it is displayed as "(row, col) value" entries.
print(cosine_sim_manual)


  (0, 4492)	0.003350647189648428
  (0, 3949)	0.007127312793135944
  (0, 3685)	0.03252263107640067
  (0, 3593)	0.012175881753406528
  (0, 3323)	0.03131988820594425
  (0, 3145)	0.009165790196926593
  (0, 2945)	0.009698642534557557
  (0, 2879)	0.0015630076102292763
  (0, 2697)	0.03518546366335293
  (0, 2293)	0.010475920306978723
  (0, 2002)	0.004776067750334571
  (0, 1260)	0.006319514189957893
  (0, 1257)	0.014915520491031608
  (0, 1069)	0.019427232533151123
  (0, 906)	0.0064815167558430285
  (0, 33)	0.006863852776260209
  (0, 3589)	0.024161220596529115
  (0, 3578)	0.010995453178862798
  (0, 3499)	0.020734466894671025
  (0, 3456)	0.032214599013786864
  (0, 3096)	0.042565932729921103
  (0, 2880)	0.01507003251099234
  (0, 2696)	0.018914592849813797
  (0, 2162)	0.03902415907002439
  (0, 2097)	0.01683361989203393
  :	:
  (4516, 481)	0.14441631958667075
  (4516, 480)	0.17267382990792615
  (4516, 471)	0.09724836671974504
  (4516, 466)	0.11644317825398529
  (4516, 436)	0.032902055533852385
  (45

In [18]:
from sklearn.metrics.pairwise import euclidean_distances

# Pairwise Euclidean distances between the rows of X_train. Only one argument
# is passed, so the rows of X_train are compared with each other.
dist_mat = euclidean_distances(X_train)

# Print the distance matrix (NumPy abbreviates the display with "...").
print(dist_mat)


[[0.         1.38549948 1.34140515 ... 1.32964123 1.39802425 1.31584736]
 [1.38549948 0.         1.38698005 ... 1.38907098 1.39907044 1.37120861]
 [1.34140515 1.38698005 0.         ... 1.35229645 1.40204639 1.34614505]
 ...
 [1.32964123 1.38907098 1.35229645 ... 0.         1.399969   1.34452709]
 [1.39802425 1.39907044 1.40204639 ... 1.399969   0.         1.39816546]
 [1.31584736 1.37120861 1.34614505 ... 1.34452709 1.39816546 0.        ]]


In [23]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import mode
import numpy as np

class KNNClassifier:
    """k-nearest-neighbours classifier based on cosine similarity.

    A sample is assigned the most common label among the ``k`` training
    samples whose cosine similarity to it is highest.

    Parameters
    ----------
    k : int
        Number of neighbours that vote; must be at least 1.

    Raises
    ------
    TypeError
        If ``k`` is not an integer.
    ValueError
        If ``k`` is smaller than 1.
    """

    def __init__(self, k):
        if not isinstance(k, (int, np.integer)):
            raise TypeError(f"k must be an integer, got {type(k).__name__}")
        # k == 0 would make the slice `-k:` select *every* column, and a
        # negative k would select the wrong end, so reject both up front.
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k}")
        self.k = k
        self.train_x = None
        self.train_y = None

    def fit(self, X, y):
        """Store the training samples and their labels.

        Parameters
        ----------
        X : sparse matrix or array-like of shape (n_samples, n_features)
            Training feature vectors.
        y : array-like of length n_samples
            Training labels; converted to a flat NumPy array so it can be
            indexed with neighbour-index arrays in ``predict``.

        Returns
        -------
        KNNClassifier
            The fitted classifier itself.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of samples.
        """
        labels = np.asarray(y).ravel()
        n_samples = X.shape[0] if hasattr(X, "shape") else len(X)
        if labels.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {labels.shape[0]} labels"
            )
        self.train_x = X
        self.train_y = labels
        return self

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Parameters
        ----------
        X : sparse matrix or array-like of shape (n_queries, n_features)
            Samples to classify.

        Returns
        -------
        numpy.ndarray of shape (n_queries,)
            Most common label among each sample's nearest neighbours. If
            ``k`` exceeds the number of training samples, all training
            samples vote.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.train_x is None or self.train_y is None:
            raise RuntimeError("KNNClassifier.predict called before fit")

        # Similarity of every query row to every training row.
        sim = cosine_similarity(X, self.train_x)

        # argsort is ascending, so the last k columns are the k most similar
        # training samples.
        top_k_indices = np.argsort(sim, axis=1)[:, -self.k:]

        # Labels of those neighbours, one row per query sample.
        top_k_labels = self.train_y[top_k_indices]

        # Majority vote per row (ties are resolved by scipy.stats.mode).
        return mode(top_k_labels, axis=1).mode.flatten()

    def score(self, X, y):
        """Return the accuracy on ``(X, y)`` as a percentage (0-100).

        Parameters
        ----------
        X : sparse matrix or array-like of shape (n_samples, n_features)
            Samples to classify.
        y : array-like of length n_samples
            True labels for ``X``.
        """
        predictions = self.predict(X)
        # Flatten y so a column-vector of labels is compared element-wise
        # rather than broadcast against the 1-D predictions.
        accuracy = np.mean(predictions == np.asarray(y).ravel())
        return accuracy * 100

# Usage: build a 10-neighbour classifier, give it the training split, and
# report its accuracy on the held-out test split (score() returns a
# percentage, hence the trailing "%").
knn_clf = KNNClassifier(k=10)
knn_clf.fit(X_train, y_train)
print(f"Accuracy: {knn_clf.score(X_test, y_test)}%")


Accuracy: 86.01769911504424%


In [22]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import mode
# Rebuild and refit the 10-neighbour classifier, then show its test accuracy
# (a percentage) as the cell's output value.
# NOTE(review): this repeats the fit/score steps of the preceding cell and
# relies on KNNClassifier already being defined there -- consider removing
# one of the two cells.
knn_clf = KNNClassifier(k=10)
knn_clf.fit(X_train, y_train)
knn_clf.score(X_test, y_test)

86.01769911504424