In [1]:
import datetime
import json
import os
import warnings

import numpy as np
import pandas as pd
from nlp import load_dataset
from scipy.sparse import csr_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsTransformer
from sklearn.pipeline import Pipeline

In [8]:
class TextCleanTransformer(BaseEstimator, TransformerMixin):
    """Encode the string cells of selected columns to UTF-8 ``bytes``.

    Parameters
    ----------
    text_cols : str or sequence of str
        Name(s) of the columns to process. A single name may be given as a
        plain string.
    encoding_utf8 : bool, default=True
        When False no encoding is performed and the data is returned as a
        copy with unchanged values.
    encoding_replace : bool, default=True
        Selects the error handler handed to ``str.encode``: ``'replace'``
        when True, ``'ignore'`` when False.
    verbose : bool, default=True
        When True, print every processed column name and the elapsed time
        in seconds.

    Notes
    -----
    Values are only encoded, never decoded back, so processed cells hold
    ``bytes`` objects rather than ``str``.
    """

    #the constructor
    def __init__(self, text_cols, encoding_utf8 = True, encoding_replace=True, verbose=True):

        self.text_cols = text_cols
        self.encoding_utf8 = encoding_utf8
        self.encoding_replace = encoding_replace
        self.verbose = verbose

    #estimator method
    def fit(self, X, y = None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    @staticmethod
    def _encode_value(value, errors):
        """Encode ``value`` to UTF-8 if it is a ``str``; pass anything else through."""
        if isinstance(value, str):
            return value.encode('utf-8', errors)
        return value

    #transformation
    def transform(self, X):
        """Return a copy of ``X`` with the configured columns UTF-8 encoded.

        Parameters
        ----------
        X : object supporting ``.copy()`` and column item access
            Typically a ``pandas.DataFrame``. It is not modified.

        Returns
        -------
        A copy of ``X`` in which every ``str`` cell of the configured
        columns is replaced by its UTF-8 ``bytes``. Non-string cells
        (e.g. ``None``/``NaN``) are left as they are.

        Warns
        -----
        RuntimeWarning
            If a column cannot be processed (for instance it does not
            exist). That column is skipped and the remaining columns are
            still processed.
        """
        start = datetime.datetime.now()

        # Accept a bare column name as well as a sequence of names; iterating
        # a str would otherwise walk over its characters.
        if isinstance(self.text_cols, str):
            columns = [self.text_cols]
        else:
            columns = list(self.text_cols)

        # Work on a copy so the caller's data is never mutated.
        X_out = X.copy()

        if self.encoding_utf8:
            errors = 'replace' if self.encoding_replace else 'ignore'
        else:
            errors = None  # encoding disabled: columns are only reported

        for col_name in columns:
            if self.verbose:
                print('Columns: ', col_name)
            if errors is None:
                continue
            try:
                X_out[col_name] = [self._encode_value(value, errors) for value in X_out[col_name]]
            except Exception as err:
                # Stay best-effort (skip the column) but never fail silently.
                warnings.warn(
                    f"TextCleanTransformer could not process column {col_name!r}: {err}",
                    RuntimeWarning,
                    stacklevel=2,
                )

        diff = datetime.datetime.now() - start
        if self.verbose:
            # total_seconds() keeps the sub-second part that .seconds drops.
            print(diff.total_seconds())
        return X_out

    def fit_transform(self, X, y = None):
        """Fit (a no-op) and transform ``X`` in a single call."""
        return self.fit(X, y).transform(X)

class NMSlibTransformer(TransformerMixin, BaseEstimator):
    """Wrapper for using nmslib as sklearn's KNeighborsTransformer

    Parameters
    ----------
    n_neighbors : int, default=5
        Number of neighbors requested per sample; one extra is queried
        because each sample counts as its own neighbor.
    metric : {"euclidean", "cosine", "l1", "l2"}, default="euclidean"
        Distance name, mapped to an nmslib space in ``fit``. Any other value
        raises ``KeyError``.
    method : str, default="sw-graph"
        Passed as ``method`` to ``nmslib.init``.
    n_jobs : int, default=1
        Passed as ``num_threads`` to ``knnQueryBatch``.
    """

    # NOTE(review): `nmslib` is not imported in the visible cells -- confirm
    # that an earlier cell imports it before this class is used.

    def __init__(self, n_neighbors=5, metric="euclidean", method="sw-graph", n_jobs=1):
        self.n_neighbors = n_neighbors
        self.method = method
        self.metric = metric
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Build the nmslib index over ``X``.

        ``y`` is accepted and ignored so the transformer can be fitted through
        ``fit_transform(X, y)`` or as part of a pipeline that passes targets.

        Returns
        -------
        self
        """
        self.n_samples_fit_ = X.shape[0]

        # see more metric in the manual
        # https://github.com/nmslib/nmslib/tree/master/manual
        space = {
            "euclidean": "l2",
            "cosine": "cosinesimil",
            "l1": "l1",
            "l2": "l2",
        }[self.metric]

        self.nmslib_ = nmslib.init(method=self.method, space=space)
        self.nmslib_.addDataPointBatch(X)
        self.nmslib_.createIndex()
        return self

    def transform(self, X):
        """Query the fitted index and return the neighbors graph.

        Returns
        -------
        scipy.sparse.csr_matrix of shape (X.shape[0], n_samples_fit_)
            Each row stores the distances returned for that sample, placed at
            the column positions of the returned neighbor indices.
        """
        n_samples_transform = X.shape[0]

        # For compatibility reasons, as each sample is considered as its own
        # neighbor, one extra neighbor will be computed.
        n_neighbors = self.n_neighbors + 1

        results = self.nmslib_.knnQueryBatch(X, k=n_neighbors, num_threads=self.n_jobs)
        indices, distances = zip(*results)
        indices, distances = np.vstack(indices), np.vstack(distances)

        # Fixed-width row pointer: assumes every query returned exactly
        # `n_neighbors` hits -- TODO confirm this holds for the chosen method.
        indptr = np.arange(0, n_samples_transform * n_neighbors + 1, n_neighbors)
        kneighbors_graph = csr_matrix(
            (distances.ravel(), indices.ravel(), indptr),
            shape=(n_samples_transform, self.n_samples_fit_),
        )

        return kneighbors_graph

#https://scikit-learn.org/stable/auto_examples/neighbors/approximate_nearest_neighbors.html#sphx-glr-auto-examples-neighbors-approximate-nearest-neighbors-py
class AnnoyTransformer(TransformerMixin, BaseEstimator):
    """Wrapper for using annoy.AnnoyIndex as sklearn's KNeighborsTransformer

    Parameters
    ----------
    n_neighbors : int, default=5
        Number of neighbors requested per sample; one extra is queried
        because each sample counts as its own neighbor.
    metric : str, default="euclidean"
        Passed as ``metric`` to ``annoy.AnnoyIndex``.
    n_trees : int, default=10
        Passed to ``AnnoyIndex.build``.
    search_k : int, default=-1
        Passed through to the ``get_nns_by_*`` queries.
    """

    # NOTE(review): `annoy` is not imported in the visible cells -- confirm
    # that an earlier cell imports it before this class is used.

    def __init__(self, n_neighbors=5, metric="euclidean", n_trees=10, search_k=-1):
        self.n_neighbors = n_neighbors
        self.n_trees = n_trees
        self.search_k = search_k
        self.metric = metric

    def fit(self, X, y=None):
        """Add every row of ``X`` to a new Annoy index and build it.

        ``y`` is accepted and ignored so the signature matches
        ``fit_transform(X, y=None)`` and pipelines that pass targets to ``fit``.

        Returns
        -------
        self
        """
        self.n_samples_fit_ = X.shape[0]
        self.annoy_ = annoy.AnnoyIndex(X.shape[1], metric=self.metric)
        for i, x in enumerate(X):
            self.annoy_.add_item(i, x.tolist())
        self.annoy_.build(self.n_trees)
        return self

    def transform(self, X):
        """Return the neighbors graph of ``X`` against the fitted index."""
        return self._transform(X)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its neighbors graph, querying by stored item."""
        return self.fit(X)._transform(X=None)

    def _transform(self, X):
        """As `transform`, but handles X is None for faster `fit_transform`."""

        n_samples_transform = self.n_samples_fit_ if X is None else X.shape[0]

        # For compatibility reasons, as each sample is considered as its own
        # neighbor, one extra neighbor will be computed.
        n_neighbors = self.n_neighbors + 1

        indices = np.empty((n_samples_transform, n_neighbors), dtype=int)
        distances = np.empty((n_samples_transform, n_neighbors))

        if X is None:
            # Query by the index of each stored item instead of by vector.
            for i in range(self.annoy_.get_n_items()):
                ind, dist = self.annoy_.get_nns_by_item(
                    i, n_neighbors, self.search_k, include_distances=True
                )

                indices[i], distances[i] = ind, dist
        else:
            for i, x in enumerate(X):
                indices[i], distances[i] = self.annoy_.get_nns_by_vector(
                    x.tolist(), n_neighbors, self.search_k, include_distances=True
                )

        # Every row holds exactly `n_neighbors` entries, so the CSR row
        # pointer is a plain arithmetic progression.
        indptr = np.arange(0, n_samples_transform * n_neighbors + 1, n_neighbors)
        kneighbors_graph = csr_matrix(
            (distances.ravel(), indices.ravel(), indptr),
            shape=(n_samples_transform, self.n_samples_fit_),
        )

        return kneighbors_graph

In [9]:
def test_case():
    """Smoke-test ``TextCleanTransformer`` on the GLUE MRPC training split.

    Loads the split with ``load_dataset``, assembles a frame with the columns
    ``sentence1``, ``sentence2`` and ``label``, prints it, runs it through a
    one-step pipeline and prints the transformed result. Returns nothing.
    """
    dataset = load_dataset('glue', 'mrpc', split='train')
    # Build the frame column-wise so each column keeps its own dtype; a
    # row-wise build followed by .T would coerce every column to object.
    eg_dataset_df = pd.DataFrame({
        'sentence1': dataset['sentence1'],
        'sentence2': dataset['sentence2'],
        'label': dataset['label'],
    })
    print(eg_dataset_df)
    #the text cleaning pipeline
    text_pipeline = Pipeline([
        ('text_clean', TextCleanTransformer(
            text_cols=['sentence1', 'sentence2'],
            encoding_utf8=True,
            encoding_replace=True,
        )),
    ])
    #perform the fit transform
    eg_dataset_df_clean = text_pipeline.fit_transform(eg_dataset_df)
    print(eg_dataset_df_clean)

In [10]:
test_case()

                                              sentence1  \
0     Amrozi accused his brother , whom he called " ...   
1     Yucaipa owned Dominick 's before selling the c...   
2     They had published an advertisement on the Int...   
3     Around 0335 GMT , Tab shares were up 19 cents ...   
4     The stock rose $ 2.11 , or about 11 percent , ...   
...                                                 ...   
3663  " At this point , Mr. Brando announced : ' Som...   
3664  Martin , 58 , will be freed today after servin...   
3665  " We have concluded that the outlook for price...   
3666  The notification was first reported Friday by ...   
3667  The 30-year bond US30YT = RR rose 22 / 32 for ...   

                                              sentence2 label  
0     Referring to him as only " the witness " , Amr...     1  
1     Yucaipa bought Dominick 's in 1995 for $ 693 m...     0  
2     On June 10 , the ship 's owners had published ...     1  
3     Tab shares jumped 20 cents , 