## Purpose 
Data transformations that can run on the GPU (using cudf).

In [4]:
from sklearn.base import TransformerMixin
import cudf
import pandas as pd
import numpy as np
from pathlib import Path
import pickle
from IPython.display import display

assert cudf.__version__ == '0.14.0'

class CuCategoryEncoder(TransformerMixin):
    """
    Encode categorical columns as integer codes. Runs on GPU using cudf.

    Why? Once ``fit`` has been called, ``sklearn.preprocessing.LabelEncoder`` cannot
    encode new categories. With this encoder ``fit`` can be called any number of
    times: categories it has not seen before are appended to the mapping, without
    changing the encoding of existing categories.

    The mapping lives in ``self.cats``, a dict of cudf DataFrames with a single
    ``cats`` column. The integer code of a category is its row position in that
    frame. The dict is keyed by column name, or by the single key ``'shared'``
    when ``share_cats`` is true.

    NOTE(review): ``transform`` and ``inverse_transform`` are implemented with
    merges; the input frame should not already contain columns named ``cats`` or
    ``index``, and row order of the result follows whatever cudf's merge returns
    -- confirm against the cudf version in use if ordering matters.
    """

    def __init__(self, cols, encodings_path='encodings.pkl', auto_fit=False, share_cats=False):
        """

        :param cols: List of columns to be encoded
        :param encodings_path: Load and save encodings from this path, if not none
        :param auto_fit: Fit is called before every transform. So we can handle previously unseen values
        :param share_cats: Use the same mapping across cols. So a category will get encoded to same value across columns
        :raises TypeError: if ``cols`` is not a list
        """
        # Explicit raise instead of ``assert`` so the check survives ``python -O``.
        if not isinstance(cols, list):
            raise TypeError(f'cols must be a list of column names, got {type(cols).__name__}')
        self.cols = cols
        self.auto_fit = auto_fit
        self.share_cats = share_cats
        self.encodings_path = encodings_path

        # Per-instance mapping. A class-level dict would be shared (and mutated)
        # by every encoder that did not load its mapping from disk.
        self.cats = {}
        if self.encodings_path and Path(self.encodings_path).is_file():
            # SECURITY: unpickling executes arbitrary code; only point
            # ``encodings_path`` at files this encoder wrote itself.
            with open(self.encodings_path, 'rb') as f:
                self.cats = pickle.load(f)

    def _cat_key(self, col):
        """Return the ``self.cats`` key that holds the mapping for ``col``."""
        return 'shared' if self.share_cats else col

    def fit(self, df, y=None):
        """
        Add the not-yet-seen values of every configured column to the mapping.

        Existing categories keep their codes; new ones are appended after them.
        If ``encodings_path`` is set, the updated mapping is pickled to it.

        :param df: cudf DataFrame containing all of ``self.cols``
        :param y: Ignored; accepted so ``fit_transform(X, y)`` from TransformerMixin works
        :return: self
        """
        for col in self.cols:
            cat_key = self._cat_key(col)

            # First time this key is seen: start an empty frame with the column's dtype
            if self.cats.get(cat_key) is None:
                self.cats[cat_key] = cudf.DataFrame({'cats': []}, dtype=df[col].dtype)

            # Left-join the column's values with the known categories; rows whose
            # 'cats' side is null are values that have no code yet. Only the
            # encoded column is needed, so the other columns are not merged.
            joined = df[[col]].merge(self.cats[cat_key], left_on=col, right_on='cats', how='left')
            new_cats = cudf.DataFrame({'cats': joined[joined.cats.isnull()][col].unique()})

            # Append new cats to existing cats; ignore_index keeps codes contiguous
            self.cats[cat_key] = cudf.concat([self.cats[cat_key], new_cats], ignore_index=True)

        # Persist once, after every column has been processed
        if self.encodings_path:
            with open(self.encodings_path, 'wb') as f:
                pickle.dump(self.cats, f)
        return self

    def transform(self, df):
        """
        Replace the values of every configured column with their integer codes.

        With ``auto_fit`` enabled, ``fit`` runs first so unseen values get a code.
        Otherwise values missing from the mapping come out as null (left join).
        Each encoded column is dropped and re-added from the merge result, so it
        ends up after the columns that were not encoded.

        :param df: cudf DataFrame containing all of ``self.cols``
        :return: new DataFrame with the configured columns encoded
        :raises KeyError: if a column's mapping has never been fitted or loaded
        """
        if self.auto_fit:
            self.fit(df)

        for col in self.cols:
            # reset_index() exposes the row position (the code) as column 'index'
            codes = self.cats[self._cat_key(col)].reset_index()
            df = df.merge(codes, left_on=col, right_on='cats', how='left') \
                .drop([col, 'cats'], axis=1).rename(columns={'index': col})
        return df

    def inverse_transform(self, df):
        """
        Replace integer codes in every configured column with the original categories.

        Codes that are not present in the mapping come out as null (left join).

        :param df: cudf DataFrame whose ``self.cols`` hold codes produced by ``transform``
        :return: new DataFrame with the configured columns decoded
        :raises KeyError: if a column's mapping has never been fitted or loaded
        """
        for col in self.cols:
            codes = self.cats[self._cat_key(col)].reset_index()
            # axis=1 made explicit (as in transform) so columns, not rows, are dropped
            df = df.merge(codes, how='left', left_on=col, right_on='index') \
                .drop(['index', col], axis=1).rename(columns={'cats': col})
        return df


#### Testing it

In [5]:
# Testing it
# Two random integer columns with disjoint value ranges, moved to the GPU.
pdf = pd.DataFrame({
    'sa': np.random.randint(1, 10, size=2),
    'da': np.random.randint(10, 20, size=2),
})
df = cudf.from_pandas(pdf)

# Both columns share a single mapping; transform() fits again before encoding.
encoder = CuCategoryEncoder(cols=['sa', 'da'], auto_fit=True, share_cats=True)

# The lines below can be re-run repeatedly.
encoder.fit(df)
display(df)

print('encoded categories:')
display(encoder.cats['shared'])

transformed = encoder.transform(df)
print('\ntransformed:')
display(transformed.head())

print('\ninv-transform:')
display(encoder.inverse_transform(transformed))

Unnamed: 0,sa,da
0,9,10
1,4,12


encoded categories:


Unnamed: 0,cats
0,4
1,9
2,10
3,12



transformed:


Unnamed: 0,sa,da
0,1,2
1,0,3



inv-transform:


Unnamed: 0,sa,da
0,9,10
1,4,12


### Playground

In [None]:
import cugraph

In [3]:
# Remove the pickled encodings (the default `encodings_path`) so that the next
# CuCategoryEncoder created with that path does not load the old mapping.
!rm encodings.pkl