# Encoding categorical variables (MADlib v1.10+)
This is the new module that replaces create_indicator_variables() which has been deprecated as of MADlib v1.10

In [None]:
%load_ext sql

In [None]:
%sql postgresql://gpdbchina@10.194.10.68:55000/madlib
#%sql postgresql://fmcquillan@localhost:5432/madlib
#%sql postgresql://gpadmin@54.197.30.46:10432/gpadmin

In [19]:
%sql select madlib.version();
#%sql select version();

1 rows affected.


version
"MADlib version: 1.10.0-dev, git revision: rel/v1.9.1-47-g2d5a5ed, cmake configuration time: Tue Feb 7 19:45:19 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0"


## 1.  Load data set
Use a subset of the abalone dataset:

In [20]:
%%sql 
DROP TABLE IF EXISTS abalone;

CREATE TABLE abalone (
    id serial,
    sex character varying,
    length double precision,
    diameter double precision,
    height double precision,
    rings int
);

INSERT INTO abalone (sex, length, diameter, height, rings) VALUES
('M',    0.455,  0.365,  0.095,  15),
('M',    0.35,   0.265,  0.09,   7),
('F',    0.53,   0.42,   0.135,  9),
('M',    0.44,   0.365,  0.125,  10),
('I',    0.33,   0.255,  0.08,   7),
('I',    0.425,  0.3,    0.095,  8),
('F',    0.53,   0.415,  0.15,   20),
('F',    0.545,  0.425,  0.125,  16),
('M',    0.475,  0.37,   0.125,  9),
(NULL,   0.55,   0.44,   0.15,   19),
('F',    0.525,  0.38,   0.14,   14),
('M',    0.43,   0.35,   0.11,   10),
('M',    0.49,   0.38,   0.135,  11),
('F',    0.535,  0.405,  0.145,  10),
('F',    0.47,   0.355,  0.1,    10),
('M',    0.5,    0.4,    0.13,   12),
('I',    0.355,  0.28,   0.085,  7),
('F',    0.44,   0.34,   0.1,    10),
('M',    0.365,  0.295,  0.08,   7),
(NULL,   0.45,   0.32,   0.1,    9);

SELECT * FROM abalone ORDER BY id;

Done.
Done.
20 rows affected.
20 rows affected.


id,sex,length,diameter,height,rings
1,M,0.455,0.365,0.095,15
2,M,0.35,0.265,0.09,7
3,F,0.53,0.42,0.135,9
4,M,0.44,0.365,0.125,10
5,I,0.33,0.255,0.08,7
6,I,0.425,0.3,0.095,8
7,F,0.53,0.415,0.15,20
8,F,0.545,0.425,0.125,16
9,M,0.475,0.37,0.125,9
10,,0.55,0.44,0.15,19


## 2. Create new table with one-hot encoding.
The column 'sex' is replaced by three columns encoding the values 'F', 'M' and 'I'.  Null values are not encoded by default:

In [31]:
%%sql
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
        'abalone',                   -- Source table
        'abalone_out',               -- Output table
        'sex'                        -- Categorical columns
        );
SELECT * FROM abalone_out ORDER BY id;

Done.
1 rows affected.
20 rows affected.


id,length,diameter,height,rings,sex_F,sex_I,sex_M
1,0.455,0.365,0.095,15,0,0,1
2,0.35,0.265,0.09,7,0,0,1
3,0.53,0.42,0.135,9,1,0,0
4,0.44,0.365,0.125,10,0,0,1
5,0.33,0.255,0.08,7,0,1,0
6,0.425,0.3,0.095,8,0,1,0
7,0.53,0.415,0.15,20,1,0,0
8,0.545,0.425,0.125,16,1,0,0
9,0.475,0.37,0.125,9,0,0,1
10,0.55,0.44,0.15,19,0,0,0


## 3.  Encode null values
Now include NULL values in encoding (note the additional column 'sex_NULL'):

In [32]:
%%sql
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
        'abalone',                   -- Source table
        'abalone_out',               -- Output table
        'sex',                       -- Categorical columns
        NULL,                        -- Categorical columns to exclude
        NULL,                        -- Index columns
        NULL,                        -- Top values
        NULL,                        -- Value to drop for dummy encoding
        TRUE                         -- Encode nulls
        );
SELECT * FROM abalone_out ORDER BY id;

Done.
1 rows affected.
20 rows affected.


id,length,diameter,height,rings,sex_F,sex_I,sex_M,sex_NULL
1,0.455,0.365,0.095,15,0,0,1,0
2,0.35,0.265,0.09,7,0,0,1,0
3,0.53,0.42,0.135,9,1,0,0,0
4,0.44,0.365,0.125,10,0,0,1,0
5,0.33,0.255,0.08,7,0,1,0,0
6,0.425,0.3,0.095,8,0,1,0,0
7,0.53,0.415,0.15,20,1,0,0,0
8,0.545,0.425,0.125,16,1,0,0,0
9,0.475,0.37,0.125,9,0,0,1,0
10,0.55,0.44,0.15,19,0,0,0,1


## 4. Encode all categorical variables and specify an index
Encode all categorical variables in the source table. Also, specify the column 'id' as the index (primary key) - this changes the output table to only include the index and the encoded variables:

In [33]:
%%sql
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
        'abalone',                   -- Source table
        'abalone_out',               -- Output table
        '*',                         -- Categorical columns
        NULL,                        -- Categorical columns to exclude
        'id'                         -- Index columns
        );
SELECT * FROM abalone_out ORDER BY id;

Done.
1 rows affected.
20 rows affected.


id,sex_F,sex_I,sex_M,rings_7,rings_8,rings_9,rings_10,rings_11,rings_12,rings_14,rings_15,rings_16,rings_19,rings_20
1,0,0,1,0,0,0,0,0,0,0,1,0,0,0
2,0,0,1,1,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,1,0,0,0,0,0,0,0
5,0,1,0,1,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,1,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0,0,0,0,0,0,1
8,1,0,0,0,0,0,0,0,0,0,0,1,0,0
9,0,0,1,0,0,1,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,0,0,1,0


## 5. Encode top values
Now let's encode only the top values and group others into a miscellaneous bucket column. Top values can be global across all columns or specified by column.  As an example of the latter, here are the top 2 'sex' values and the top 50% of  'rings' values:

In [34]:
%%sql
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
        'abalone',                   -- Source table
        'abalone_out',               -- Output table
        '*',                         -- Categorical columns
        NULL,                        -- Categorical columns to exclude
        'id',                        -- Index columns
        'sex=2, rings=0.5'           -- Top values
        );
SELECT * FROM abalone_out ORDER BY id;

Done.
1 rows affected.
20 rows affected.


id,sex_M,sex_F,sex__MISC__,rings_10,rings_7,rings_9,rings__MISC__
1,1,0,0,0,0,0,1
2,1,0,0,0,1,0,0
3,0,1,0,0,0,1,0
4,1,0,0,1,0,0,0
5,0,0,1,0,1,0,0
6,0,0,1,0,0,0,1
7,0,1,0,0,0,0,1
8,0,1,0,0,0,0,1
9,1,0,0,0,0,1,0
10,0,0,0,0,0,0,1


## 6. Show raw variables and encoded variables together
If you want to see both the raw categorical variable and its encoded form in the output_table, then include the categorical variables 'sex' and 'rings' in the index parameter. (Remember that this will not work if you specify '*' for the parameter 'categorical_cols', because in this case 'row_id' columns will not be encoded at all.)

In [35]:
%%sql
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
        'abalone',                   -- Source table
        'abalone_out',               -- Output table
        'sex, rings',                -- Categorical columns
        NULL,                        -- Categorical columns to exclude
        'id, sex, rings'             -- Index columns
        );
SELECT * FROM abalone_out ORDER BY id;

Done.
1 rows affected.
20 rows affected.


id,sex,rings,sex_F,sex_I,sex_M,rings_7,rings_8,rings_9,rings_10,rings_11,rings_12,rings_14,rings_15,rings_16,rings_19,rings_20
1,M,15,0,0,1,0,0,0,0,0,0,0,1,0,0,0
2,M,7,0,0,1,1,0,0,0,0,0,0,0,0,0,0
3,F,9,1,0,0,0,0,1,0,0,0,0,0,0,0,0
4,M,10,0,0,1,0,0,0,1,0,0,0,0,0,0,0
5,I,7,0,1,0,1,0,0,0,0,0,0,0,0,0,0
6,I,8,0,1,0,0,1,0,0,0,0,0,0,0,0,0
7,F,20,1,0,0,0,0,0,0,0,0,0,0,0,0,1
8,F,16,1,0,0,0,0,0,0,0,0,0,0,1,0,0
9,M,9,0,0,1,0,0,1,0,0,0,0,0,0,0,0
10,,19,0,0,0,0,0,0,0,0,0,0,0,0,1,0


## 7. Dummy encoding
For dummy encoding, let's make the 'I' value from the 'sex' variable as the reference.
Here we use the 'value_to_drop' parameter:

In [36]:
%%sql
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
        'abalone',                   -- Source table
        'abalone_out',               -- Output table
        '*',                         -- Categorical columns
        'rings',                     -- Categorical columns to exclude
        'id',                        -- Index columns
        NULL,                        -- Top value
        'sex=I'                      -- Value to drop for dummy encoding              
        );
SELECT * FROM abalone_out ORDER BY id;

Done.
1 rows affected.
20 rows affected.


id,sex_F,sex_M
1,0,1
2,0,1
3,1,0
4,0,1
5,0,0
6,0,0
7,1,0
8,1,0
9,0,1
10,0,0


## 8. Array output
Create an array output for the two categorical variables in the source table:

In [42]:
%%sql
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
        'abalone',                   -- Source table
        'abalone_out',               -- Output table
        '*',                         -- Categorical columns
        NULL,                        -- Categorical columns to exclude
        'id',                        -- Index columns
        NULL,                        -- Top values
        NULL,                        -- Value to drop for dummy encoding
        NULL,                        -- Encode nulls
        NULL                         -- Output type
        );
SELECT * FROM abalone_out ORDER BY id;

Done.


InternalError: (psycopg2.InternalError) plpy.Error: Encoding categorical: Output type should be one of ('column', 'array', 'svec') (plpython.c:4648)
CONTEXT:  Traceback (most recent call last):
  PL/Python function "encode_categorical_variables", line 23, in <module>
    return encode_categorical.encode_categorical_variables(**globals())
  PL/Python function "encode_categorical_variables", line 585, in encode_categorical_variables
  PL/Python function "encode_categorical_variables", line 86, in __init__
  PL/Python function "encode_categorical_variables", line 232, in _validate_parameters
  PL/Python function "encode_categorical_variables", line 48, in _assert
PL/Python function "encode_categorical_variables"
 [SQL: "SELECT madlib.encode_categorical_variables (\n        'abalone',                   -- Source table\n        'abalone_out',               -- Output table\n        '*',                         -- Categorical columns\n        NULL,                        -- Categorical columns to exclude\n        'id',                        -- Index columns\n        NULL,                        -- Top values\n        NULL,                        -- Value to drop for dummy encoding\n        NULL,                        -- Encode nulls\n        NULL                         -- Output type\n        );"]

And here is the dictionary table that specifies the index into the array:

In [None]:
%sql SELECT * FROM abalone_out_dictionary;

## 9. Dictionary output
Create a dictionary:

In [None]:
%%sql
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
        'abalone',                   -- Source table
        'abalone_out',               -- Output table
        '*',                         -- Categorical columns
        NULL,                        -- Categorical columns to exclude
        'id',                        -- Index columns
        NULL,                        -- Top values
        NULL,                        -- Value to drop for dummy encoding
        NULL,                        -- Encode nulls
        FALSE,                       -- Array output
        TRUE                         -- Dictionary output 
        );
SELECT * FROM abalone_out ORDER BY id;

And here is the dictionary table that defines the columns in the output table:

In [None]:
%sql SELECT * FROM abalone_out_dictionary ORDER BY encoded_column_name;

## 10. Distribution policy
We can chose for various distribution policies, for examply RANDOMLY:

In [None]:
%%sql
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
        'abalone',                   -- Source table
        'abalone_out',               -- Output table
        '*',                         -- Categorical columns
        NULL,                        -- Categorical columns to exclude
        'id',                        -- Index columns
        NULL,                        -- Top values
        NULL,                        -- Value to drop for dummy encoding
        NULL,                        -- Encode nulls
        NULL,                        -- Array output
        NULL,                        -- Dictionary output
        'RANDOMLY'                   -- Distribution policy
        );

## 11. Encoding floats
If you have a float that you want to encode, you can cast it in the following way within the function call:

In [None]:
%%sql
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
        'abalone',                   -- Source table
        'abalone_out',               -- Output table
        'height>.05'               -- Categorical columns
        );

In [None]:
%%sql
SELECT * FROM abalone_out ORDER BY id;