In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
class KNNGzipClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, k=5, metric='euclidean', vectorizer=None):
        self.k = k
        self.metric = metric
        self.vectorizer = vectorizer
    
    def fit(self, X, y):
        """Validate the training data, fit the vectorizer and store the samples.

        Parameters
        ----------
        X : array-like
            Training samples. They are handed to the vectorizer's ``fit``;
            with the default ``TfidfVectorizer`` this must be a 1-D
            collection of text documents.
        y : array-like
            Target labels, one per sample in ``X``.

        Returns
        -------
        self : KNNGzipClassifier
            The fitted estimator, exposing ``classes_``, ``X_``, ``y_`` and
            ``vectorizer_``.

        Raises
        ------
        ValueError
            Raised by ``check_X_y`` when ``X`` and ``y`` are inconsistent
            or otherwise invalid.
        """
        # The check_X_y defaults (dtype="numeric", ensure_2d=True) reject a
        # 1-D collection of strings, which is the only input the default
        # TfidfVectorizer accepts. Relax both so text documents pass through
        # while length consistency and the checks on y are still enforced.
        X, y = check_X_y(X, y, dtype=None, ensure_2d=False)
        classes = unique_labels(y)

        # NOTE: a user-supplied vectorizer is fitted in place (not cloned),
        # so the object passed to __init__ is mutated by this call.
        vectorizer = TfidfVectorizer() if self.vectorizer is None else self.vectorizer
        vectorizer.fit(X)

        # Publish fitted state only once every step above has succeeded, so a
        # failed fit never leaves an unfitted ``vectorizer_`` behind.
        self.classes_ = classes
        # NOTE(review): the samples are stored as validated, not vectorized;
        # the distance helpers that read ``X_`` are outside this view.
        self.X_ = X
        self.y_ = y
        self.vectorizer_ = vectorizer
        return self
    
    def predict(self, X):
        """Predict one label per sample in ``X``.

        The samples are validated, transformed with the fitted vectorizer,
        turned into per-sample distance entries by ``_get_distances`` and
        finally mapped to predictions by ``_predict_from_distances``.

        Parameters
        ----------
        X : array-like
            Samples to classify, in the same form that was given to ``fit``
            (text documents when the default ``TfidfVectorizer`` is used).

        Returns
        -------
        numpy.ndarray
            One prediction per row of the transformed ``X``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        # ``vectorizer_`` is dereferenced below, so require it explicitly
        # instead of failing later with a bare AttributeError.
        check_is_fitted(self, ['X_', 'y_', 'vectorizer_'])
        # The check_array defaults demand a 2-D numeric array and would
        # reject the 1-D text documents that the default vectorizer needs.
        X = check_array(X, dtype=None, ensure_2d=False)
        X = self.vectorizer_.transform(X)
        distances = self._get_distances(X)
        return self._predict_from_distances(distances)
    
    def _get_distances(self, X):
        """Collect the result of ``_get_distance`` for every row of ``X``.

        Rows are fetched by position (``X[i]``) for ``i`` in
        ``range(X.shape[0])`` and the per-row results are packed into a
        NumPy array in the same order.
        """
        n_rows = X.shape[0]
        per_row = [self._get_distance(X[row]) for row in range(n_rows)]
        return np.array(per_row)
    
    def _predict_from_distances(self, distances):
        """Turn each entry of ``distances`` into a prediction.

        Every ``distances[i]`` (for ``i`` over the first axis) is passed to
        ``_predict_one``; the results are returned as a NumPy array in the
        same order.
        """
        n_samples = distances.shape[0]
        labels = [self._predict_one(distances[idx]) for idx in range(n_samples)]
        return np.array(labels)