# Neural Cats Embeddings

In [1]:
import yaml
import numpy as np
import pandas as pd
from pathlib import Path
import sys
sys.path.append('..')
from df4cats.frames import CodedDF
from df4cats.embedding.generators import SiameseGeneratorDF
from df4cats.embedding.models import Siamese, Embedder
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TensorBoard
# %load_ext autoreload
# %autoreload 2

Using TensorFlow backend.


## Load the data
### Load CSV

In [2]:
# Column names are supplied explicitly via `names=`, so every line of the files is read as data.
cols = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income-class',
]
sets = ['train', 'test']
data_folder = Path('../sample_data/adult_income')

# One raw DataFrame per split, keyed by split name.
data = {}
for kind in sets:
    # sep=',' + skipinitialspace=True parses the ", "-delimited files with pandas'
    # default C engine; a multi-character separator such as ', ' is interpreted as
    # a regex and makes pandas fall back to the Python engine with a ParserWarning.
    # na_values='?' converts the '?' missing-value marker to NaN while reading,
    # so no in-place replace on the loaded frame is needed.
    data[kind] = pd.read_csv(
        data_folder / f'adult.{kind}',
        names=cols,
        sep=',',
        skipinitialspace=True,
        na_values='?',
    )

  


### Create CodedDF
- Encode and normalize train and test (validation) data.
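- Hardcode the categories of the categorical variables using the lists in `adult.yaml`, adding explicit "other" and NaN categories.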

In [3]:
# Load the hard-coded category definitions; iterating `hard_cats` (via list(...))
# yields the names used as categorical columns below.
# The file is opened in a `with` block so the handle is closed after parsing.
# NOTE(review): FullLoader is kept as in the original; yaml.safe_load would be the
# stricter choice if adult.yaml only contains plain lists/dicts -- confirm.
with open(data_folder / 'adult.yaml', 'r') as yaml_file:
    hard_cats = yaml.load(yaml_file, Loader=yaml.FullLoader)

cdf = {}       # CodedDF per split, built directly from the raw data
cdf_hard = {}  # result of hardcode_categories() per split
for kind in sets:
    cdf[kind] = CodedDF(
        data[kind],
        categorical_columns=list(hard_cats),
        label_columns=['income-class'],
        normalize=True,
    )
    cdf_hard[kind] = cdf[kind].hardcode_categories(hard_cats, add_other=True, add_nan=True)
    

## Train embedder

### Get generators

In [5]:
# One Siamese batch generator per split, fed from the hard-coded frames.
dfgen = {}
for kind in sets:
    coded = cdf_hard[kind]
    dfgen[kind] = SiameseGeneratorDF(
        X=coded.data,
        y=coded.data['income-class'],
        batch_size=20,
        # categorical columns first, then the continuous ones
        columns=coded.categorical_columns + coded.continuous_columns,
    )

### Create Models

In [6]:
# The embedder is configured from the hard-coded training frame.
train_coded = cdf_hard['train']
embedder = Embedder(
    categorical_features=train_coded.category_dimensions(),
    continuous_features=train_coded.continuous_columns,
    n_dense=2,
    nodes_per_dense=1000,
    output_dim=1000,
    max_embedding_dim=600,
    dropout_rate=0.2,
    residual=False,
)

# Wrap the embedder's model in a Siamese network and compile its joint model
# with the Adam optimizer and a binary cross-entropy loss.
siamese = Siamese(embedder.model)
siamese.joint.compile(optimizer='Adam', loss='binary_crossentropy')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.




### Train models

In [7]:
# Stop once val_loss has not improved for 2 epochs and restore the best weights.
stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=2,
    restore_best_weights=True,
    verbose=1,
)

# Multiply the learning rate by 0.1 as soon as val_loss fails to improve (patience=0).
lr_schedule = ReduceLROnPlateau(
    monitor='val_loss',
    mode='min',
    factor=0.1,
    patience=0,
    verbose=1,
)

# Optional callbacks, currently disabled (not passed to training):
# tboard = TensorBoard(log_dir=str(data_folder))
# checkpoint_val_loss = ModelCheckpoint(str(data_folder / 'siamese_weights_file_e{epoch:03d}-l{val_loss:.2f}.hdf5'),
#             save_weights_only=True,
#             verbose=True,
#             save_best_only=True,
#             monitor='val_loss',
#             mode='min',
#         )


In [8]:
# Train the joint Siamese model on the train generator; the test generator is
# used as validation data, which is what the callbacks monitor (val_loss).
siamese.joint.fit_generator(
    generator=dfgen['train'],
    validation_data=dfgen['test'],
    epochs=35,
    callbacks=[stopping, lr_schedule],
)


Instructions for updating:
Use tf.cast instead.
Epoch 1/35
Epoch 2/35

Epoch 00002: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35

Epoch 00006: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 7/35
Restoring model weights from the end of the best epoch

Epoch 00007: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 00007: early stopping


<keras.callbacks.History at 0xb2b7a4710>

In [10]:
# Save the weights of the Siamese twin model to the data folder.
# The path is passed as str (as the commented-out callback paths in this notebook
# already do) because not every Keras release accepts pathlib.Path objects.
# TODO: check whether saving and re-loading the weights is actually needed.
siamese.get_twin().save_weights(str(data_folder / 'embedder_weights_file.hdf5'))

In [11]:
# Obtain the embedding model from the embedder; it is used below to compute
# embeddings for the data and for each individual category value.
embed = embedder.get_embedding_model()

In [12]:
# Load the saved twin weights into the embedding model. by_name=True matches
# weights to layers by layer name rather than by position.
# The path is passed as str (as the commented-out callback paths in this notebook
# already do) because not every Keras release accepts pathlib.Path objects.
embed.load_weights(str(data_folder / 'embedder_weights_file.hdf5'), by_name=True)

In [13]:
# Build the model inputs from the hard-coded training frame: the embedder's
# categorical dimensions come from cdf_hard['train'] and the generators trained
# on cdf_hard data, so prediction inputs must use the same category coding.
# NOTE(review): if hardcode_categories() modifies the CodedDF in place and returns
# it, cdf['train'] and cdf_hard['train'] are the same object and this is a no-op.
in_dict = embedder.get_input_dictionary(cdf_hard['train'].data)
pred = embed.predict(in_dict)

In [14]:
# Convert the predictions to a DataFrame and display its first five rows.
embedder.predictions_to_df(pred).head(5)

Unnamed: 0,workclass_emb_0,workclass_emb_1,workclass_emb_2,workclass_emb_3,workclass_emb_4,workclass_emb_5,education_emb_0,education_emb_1,education_emb_2,education_emb_3,...,native-country_emb_3,native-country_emb_4,native-country_emb_5,native-country_emb_6,native-country_emb_7,native-country_emb_8,native-country_emb_9,native-country_emb_10,native-country_emb_11,native-country_emb_12
0,0.020849,0.093935,-0.062451,-0.046488,-0.014605,0.047912,-0.066307,0.000257,-0.050018,-0.090214,...,-0.000127,0.005007,-0.004072,-0.003681,-0.002837,0.001852,0.001243,-0.001648,0.002049,-0.008546
1,-0.10108,-0.120845,-0.076645,0.006551,-0.016419,0.030585,-0.066307,0.000257,-0.050018,-0.090214,...,-0.000127,0.005007,-0.004072,-0.003681,-0.002837,0.001852,0.001243,-0.001648,0.002049,-0.008546
2,0.114564,0.098814,-0.029882,0.031097,0.097797,-0.100882,0.020089,-0.023439,0.002053,0.01067,...,-0.000127,0.005007,-0.004072,-0.003681,-0.002837,0.001852,0.001243,-0.001648,0.002049,-0.008546
3,0.114564,0.098814,-0.029882,0.031097,0.097797,-0.100882,-0.056337,0.073852,-0.071254,-0.066528,...,-0.000127,0.005007,-0.004072,-0.003681,-0.002837,0.001852,0.001243,-0.001648,0.002049,-0.008546
4,0.114564,0.098814,-0.029882,0.031097,0.097797,-0.100882,-0.066307,0.000257,-0.050018,-0.090214,...,-0.134608,-0.06159,-0.019102,-8.3e-05,0.205481,-0.042845,0.080238,0.044077,-0.08611,0.019458


## Extra: get the embedding of each category value and plot its first three components in 3D

In [15]:
from keras.models import Model

# For every categorical feature, build a sub-model that goes from the feature's
# input layer to its '<feature>_flat_emb' layer of the embedding model.
category_embeddings = {}
for cat in embedder.categorical_features:
    input_layer = embed.get_layer(cat)
    flat_layer = embed.get_layer(f'{cat}_flat_emb')
    category_embeddings[cat] = Model(
        inputs=input_layer.input,
        outputs=flat_layer.output,
    )

In [16]:
# maps[feature][category value] -> embedding vector of that category value.
# Codes are decoded with the hard-coded training frame's mapping, because the
# code range below (embedder.categorical_features[cat]) was taken from
# cdf_hard['train'].category_dimensions().
# NOTE(review): assumes cdf_hard['train'] exposes `categorical_mapping` like
# cdf['train'] does -- confirm against the hardcode_categories() return type.
maps = {}
for cat in embedder.categorical_features:
    inverse = cdf_hard['train'].categorical_mapping[cat].inverse_mapping
    maps[cat] = {}
    for v in range(embedder.categorical_features[cat]):
        # predict() returns one row per input; [0] takes the single row for code v
        maps[cat][inverse[v]] = category_embeddings[cat].predict([v])[0]

In [104]:
from mpl_toolkits.mplot3d import Axes3D
%matplotlib qt

import matplotlib.pyplot as plt
for cat in embedder.categorical_features:
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    for el in maps[cat]:
        point = maps[cat][el]
        ax.scatter(point[0], point[1], point[2], label=el)
        ax.text(point[0], point[1], point[2],  '%s' % (el), size=8, zorder=1, color='k') 
    plt.show()