----

## Setup

Load packages:

In [7]:
from RandomDataGenerators import *
from SparseMatrixRecommender import *

import random
import numpy as np
import pandas as pd
from typing import Dict, Any, Optional, Tuple, List

Testing routine (fits the categorizer on the training data frame and applies it to the testing data frame):

In [8]:
def transform_train_test(
    dfTraining: pd.DataFrame,
    dfTesting: pd.DataFrame,
    n_unique_threshold: int = 5,
    use_log: bool = True,
    n_bins: int = 10,
    use_quantile_bins: bool = False,
    keep_original_numeric: bool = True,
    log_threshold: float = 0.0,
    below_log_threshold_value: Any = np.nan,
) -> Tuple[pd.DataFrame, pd.DataFrame, DataFrameCategorizer]:
    """Categorize a training/testing pair of data frames with one shared transformer.

    A single ``DataFrameCategorizer`` is built from the keyword options,
    ``fit_transform`` is called on ``dfTraining`` and ``transform`` is called
    on ``dfTesting`` with that same object.

    Parameters
    ----------
    dfTraining : pd.DataFrame
        Frame passed to ``fit_transform``.
    dfTesting : pd.DataFrame
        Frame passed to ``transform`` after the fit.
    n_unique_threshold, use_log, n_bins, use_quantile_bins,
    keep_original_numeric, log_threshold, below_log_threshold_value
        Forwarded unchanged, by keyword, to the ``DataFrameCategorizer``
        constructor; see that class for their exact meaning.

    Returns
    -------
    tuple
        ``(transformed training frame, transformed testing frame, transformer)``.
    """
    categorizer_options = dict(
        n_unique_threshold=n_unique_threshold,
        use_log=use_log,
        n_bins=n_bins,
        use_quantile_bins=use_quantile_bins,
        keep_original_numeric=keep_original_numeric,
        log_threshold=log_threshold,
        below_log_threshold_value=below_log_threshold_value,
    )
    categorizer = DataFrameCategorizer(**categorizer_options)

    # Order matters: the training frame is fitted first; the testing frame is
    # only transformed (presumably so nothing is learned from the testing rows).
    train_categorized = categorizer.fit_transform(dfTraining)
    test_categorized = categorizer.transform(dfTesting)
    return train_categorized, test_categorized, categorizer

----

## Generate data frames

In [9]:
# Seed BOTH random sources used by the generators below: `random.seed` has no
# effect on NumPy's global generator, which produces the "time" column.
random.seed(1434)
np.random.seed(1434)

# 40 rows: two string columns, one low-cardinality integer column and one
# continuous column drawn from a normal distribution (mean 100, std 20).
dfData = random_data_frame(
    40,
    ["mark", "type", "count", "time"],
    generators={
        "mark": lambda size: random.choices(["A", "B", "C", "M"], k=size),
        "type": lambda size: random.choices(random_pet_name(8), k=size),
        "count": lambda size: random.choices(list(range(5)), k=size),
        # Use the notebook's own `np` alias; the bare name `numpy` is not
        # imported by this notebook.
        "time": lambda size: np.random.normal(loc=100, scale=20, size=size),
    },
)

dfData

Unnamed: 0,mark,type,count,time
0,A,Evie,0,104.721873
1,M,Nellie,4,108.386687
2,C,Suki,0,108.68676
3,M,Evie,3,94.133719
4,A,Millie,3,93.48759
5,M,Suki,0,124.397787
6,B,Millie,2,100.343696
7,M,Millie,2,71.454025
8,C,Evie,4,68.869094
9,A,Millie,3,100.938442


In [10]:
# Summary (count / unique / top / freq) of the two string-valued columns.
dfData[["mark", "type"]].describe()

Unnamed: 0,mark,type
count,40,40
unique,4,8
top,A,Nellie
freq,14,8


In [11]:
# On this mixed frame describe() summarizes only the numeric columns ("count", "time").
dfData.describe()

Unnamed: 0,count,time
count,40.0,40.0
mean,2.075,97.599456
std,1.525636,18.109831
min,0.0,56.262719
25%,0.75,86.065402
50%,2.0,99.697201
75%,3.0,108.461705
max,4.0,135.418024


In [12]:
# Ordered (non-shuffled) split: the first 30 rows are for training, the
# remaining rows for testing; each part gets a fresh 0-based index.
dfTraining, dfTesting = (
    part.reset_index(drop=True)
    for part in (dfData.iloc[:30], dfData.iloc[30:])
)

In [13]:
# Categorize both frames with one transformer, overriding two of the
# defaults of transform_train_test (quantile bins on, 0 below the log threshold).
dfTrainingCat, dfTestingCat, objTrans = transform_train_test(
    dfTraining=dfTraining,
    dfTesting=dfTesting,
    use_quantile_bins=True,
    below_log_threshold_value=0,
)

In [14]:
# Categorized training frame returned by transform_train_test.
dfTrainingCat

Unnamed: 0,mark,type,count,time,time_log
0,A,Evie,0,"(100.65533302863595, 106.33085702508657]","(2.002836054652645, 2.0266392142682146]"
1,M,Nellie,4,"(106.33085702508657, 116.45937002917282]","(2.0266392142682146, 2.066066635822243]"
2,C,Suki,0,"(106.33085702508657, 116.45937002917282]","(2.0266392142682146, 2.066066635822243]"
3,M,Evie,3,"(93.74504711564319, 96.13608037720655]","(1.97194823971173, 1.9828851506725909]"
4,A,Millie,3,"(85.79193662639088, 93.74504711564319]","(1.9332578466790409, 1.97194823971173]"
5,M,Suki,0,"(116.45937002917282, 135.41802427100282]","(2.066066635822243, 2.1316764732152045]"
6,B,Millie,2,"(100.07769732109456, 100.65533302863595]","(2.0003364835306705, 2.002836054652645]"
7,M,Millie,2,"(69.59327102255668, 79.40660011986954]","(1.8425646208769244, 1.8998544372282444]"
8,C,Evie,4,"(56.26271872467755, 69.59327102255668]","(1.75022071429554, 1.8425646208769244]"
9,A,Millie,3,"(100.65533302863595, 106.33085702508657]","(2.002836054652645, 2.0266392142682146]"


In [15]:
# Categorized testing frame returned by transform_train_test.
dfTestingCat

Unnamed: 0,mark,type,count,time,time_log
0,A,Chai,3,"(116.45937002917282, 135.41802427100282]","(2.066066635822243, 2.1316764732152045]"
1,B,Millie,0,"(116.45937002917282, 135.41802427100282]","(2.066066635822243, 2.1316764732152045]"
2,C,Nellie,0,"(100.65533302863595, 106.33085702508657]","(2.002836054652645, 2.0266392142682146]"
3,C,Evie,3,"(93.74504711564319, 96.13608037720655]","(1.97194823971173, 1.9828851506725909]"
4,B,Chai,3,"(116.45937002917282, 135.41802427100282]","(2.066066635822243, 2.1316764732152045]"
5,A,Millie,2,"(96.13608037720655, 100.07769732109456]","(1.9828851506725909, 2.0003364835306705]"
6,B,Silvie,4,"(85.79193662639088, 93.74504711564319]","(1.9332578466790409, 1.97194823971173]"
7,M,Tiberius,0,"(106.33085702508657, 116.45937002917282]","(2.0266392142682146, 2.066066635822243]"
8,A,Callie,3,"(79.40660011986954, 85.79193662639088]","(1.8998544372282444, 1.9332578466790409]"
9,M,Tiberius,3,"(116.45937002917282, 135.41802427100282]","(2.066066635822243, 2.1316764732152045]"


In [16]:
# Original (untransformed) testing frame, for comparison with dfTestingCat.
dfTesting

Unnamed: 0,mark,type,count,time
0,A,Chai,3,117.521738
1,B,Millie,0,121.661426
2,C,Nellie,0,102.994723
3,C,Evie,3,94.126934
4,B,Chai,3,118.802592
5,A,Millie,2,99.666823
6,B,Silvie,4,93.522427
7,M,Tiberius,0,109.5435
8,A,Callie,3,81.209938
9,M,Tiberius,3,119.586851


In [17]:
# Per-column metadata held by the transformer after fitting.
objTrans.columns_info_

{'mark': {'type': 'string',
  'categories': Index(['A', 'M', 'C', 'B'], dtype='string')},
 'type': {'type': 'string',
  'categories': Index(['Evie', 'Nellie', 'Suki', 'Millie', 'Silvie', 'Chai', 'Callie',
         'Tiberius'],
        dtype='string')},
 'count': {'type': 'numeric_low_cardinality',
  'categories': ['0', '1', '2', '3', '4']},
 'time': {'type': 'numeric_binned', 'binned': True},
 'time_log': {'type': 'numeric_binned', 'binned': True}}

In [18]:
# Bin intervals held by the transformer, one IntervalIndex per binned column.
objTrans.bins_

{'time': IntervalIndex([               (-inf, 56.26271872467755],
                  (56.26271872467755, 69.59327102255668],
                  (69.59327102255668, 79.40660011986954],
                  (79.40660011986954, 85.79193662639088],
                  (85.79193662639088, 93.74504711564319],
                  (93.74504711564319, 96.13608037720655],
                 (96.13608037720655, 100.07769732109456],
                (100.07769732109456, 100.65533302863595],
                (100.65533302863595, 106.33085702508657],
                (106.33085702508657, 116.45937002917282],
                (116.45937002917282, 135.41802427100282],
                               (135.41802427100282, inf]],
               dtype='interval[float64, right]'),
 'time_log': IntervalIndex([                (-inf, 1.75022071429554],
                  (1.75022071429554, 1.8425646208769244],
                (1.8425646208769244, 1.8998544372282444],
                (1.8998544372282444, 1.9332578466790409],
 