In [1]:
from pytorch_tabnet.pretraining import TabNetPretrainer

import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity


import pandas as pd
import numpy as np
np.random.seed(0)
import random


import os
import pickle

from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
import time

%matplotlib inline

# Adult dataset

In [6]:
# | Variable Name    | Role       | Type      | Demographic         | Description          | Units | Missing Values |
# |------------------|------------|-----------|---------------------|----------------------|-------|----------------|
# | age              | Feature    | Integer   | Age                 | N/A                  |       | no             |
# | workclass        | Feature    | Categorical | Income            | Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked | | yes           |
# | fnlwgt           | Feature    | Integer   |                     |                      |       | no             |
# | education        | Feature    | Categorical | Education Level   | Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool | | no |
# | education-num    | Feature    | Integer   | Education Level   |                      |       | no             |
# | marital-status   | Feature    | Categorical | Other             | Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse | | no |
# | occupation       | Feature    | Categorical | Other             | Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces | | yes |
# | relationship     | Feature    | Categorical | Other             | Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried | | no |
# | race             | Feature    | Categorical | Race              | White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black | | no |
# | sex              | Feature    | Binary    | Sex                 | Female, Male         |       |                |



# Start of the wall-clock measurement reported at the end of this cell.
t0 = time.time()

# Column names for the header-less CSV, in file order.
columns = [
    "age", "workClass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "income",
]

# Load the file ('?' is read as missing), discard the two unused columns,
# drop every row that still has a missing value, and renumber rows from 0 so
# that index labels coincide with row positions.
train_data = (
    pd.read_csv('data/adult/adult.data', names=columns, sep=r' *, *', engine='python', na_values='?')
    .drop(['fnlwgt', 'education'], axis=1)
    .dropna()
    .reset_index(drop=True)
)

nunique = train_data.nunique()
types = train_data.dtypes

# mapping_dict[col]     : integer code -> original value (empty for columns left untouched)
# categorical_columns   : names of the columns that were label-encoded
# categorical_dims[col] : number of distinct classes found in that column
mapping_dict = {}
categorical_columns = []
categorical_dims = {}
for col in train_data.columns:
    mapping_dict[col] = {}
    # A column is encoded when it holds objects or has fewer than 200 distinct values.
    treat_as_categorical = types[col] == 'object' or nunique[col] < 200
    if not treat_as_categorical:
        continue

    l_enc = LabelEncoder()
    train_data[col] = train_data[col].fillna("VV_likely")
    train_data[col] = l_enc.fit_transform(train_data[col].values)
    categorical_columns.append(col)
    categorical_dims[col] = len(l_enc.classes_)
    # Remember how to translate each integer code back to the original value.
    mapping_dict[col].update(enumerate(l_enc.classes_))

# Every column of the frame is used as a model input.
features = list(train_data.columns)
# Positions and cardinalities of the encoded columns, both in feature order.
cat_idxs = [position for position, name in enumerate(features) if name in categorical_columns]
cat_dims = [categorical_dims[name] for name in features if name in categorical_columns]

# Hold out 10% of the rows for validation.
n = len(train_data)
m = int(0.1*n)

# Draw the row order from NumPy's global RNG rather than the stdlib `random`
# module: the notebook seeds only NumPy (np.random.seed(0) in the import cell),
# so this keeps the split reproducible on a fresh Restart & Run All.
idxs = np.random.permutation(n)

# Materialise the feature matrix once and index it for both partitions.
feature_matrix = train_data[features].values
X_train = feature_matrix[idxs[m:]]
X_valid = feature_matrix[idxs[:m]]

# TabNetPretrainer
# Constructor arguments are collected first so the configuration can be read in one place.
pretrainer_kwargs = dict(
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=10,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type='entmax',  # alternative: "sparsemax"
    n_shared_decoder=1,  # nb shared glu for decoding
    n_indep_decoder=1,  # nb independent glu for decoding
    # grouped_features=[[0, 1]],  # you can group features together here
    verbose=10,
)
unsupervised_model = TabNetPretrainer(**pretrainer_kwargs)

# Two epochs when the CI environment variable is set, 200 otherwise.
max_epochs = 2 if os.getenv("CI", False) else 200

fit_kwargs = dict(
    X_train=X_train,
    eval_set=[X_valid],
    max_epochs=max_epochs,
    patience=10,
    batch_size=2048,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False,
    pretraining_ratio=0.5,
)
unsupervised_model.fit(**fit_kwargs)

# Extract embeddings
# Width of one categorical embedding; must stay equal to the cat_emb_dim=10
# passed to TabNetPretrainer.
CAT_EMB_DIM = 10

# predict() returns a pair; only the second element (the embedded inputs) is used.
_, embedded_X = unsupervised_model.predict(train_data[features].values)

# Column span of every feature inside embedded_X.
# NOTE(review): assumes features are laid out in input order, with CAT_EMB_DIM
# columns per label-encoded feature and a single column for any other feature --
# confirm against the installed pytorch_tabnet version. The check below fails
# loudly instead of silently slicing the wrong columns.
feature_widths = [CAT_EMB_DIM if col in categorical_columns else 1 for col in features]
if sum(feature_widths) != embedded_X.shape[1]:
    raise ValueError(
        f"Embedded width {embedded_X.shape[1]} does not match the expected "
        f"{sum(feature_widths)} columns; check CAT_EMB_DIM and categorical_columns."
    )
feature_spans = {}
span_start = 0
for col, width in zip(features, feature_widths):
    feature_spans[col] = (span_start, span_start + width)
    span_start += width

# attribute_embedding[col][original value] -> embedding vector of that value.
# The 'income' column is left out.
attribute_embedding = {col: {} for col in train_data.columns if 'income' not in col}
for col in attribute_embedding:
    start, stop = feature_spans[col]
    # First row position at which each distinct code occurs, in order of
    # appearance. Resetting the index makes the labels positional, so they can
    # index embedded_X directly.
    first_rows = train_data[col].reset_index(drop=True).drop_duplicates()
    for row_pos, code in first_rows.items():
        # Translate the code back to the original value; columns that were not
        # label-encoded have no mapping, so the raw value is used as the key.
        label = mapping_dict[col].get(code, code)
        # Copy so the stored vector does not keep the whole matrix alive.
        attribute_embedding[col][label] = embedded_X[row_pos, start:stop].copy()

# Persist the embeddings. The dictionary is already in memory, so it is not
# read back from disk here.
file_path = 'data/adult/embeddings.pkl'
with open(file_path, 'wb') as file:
    pickle.dump(attribute_embedding, file)

# Attributes compared by numeric distance; all others are compared through their embeddings.
numrical_attributes = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

# Read the per-attribute value embeddings back from disk.
file_path = 'data/adult/embeddings.pkl'
with open(file_path, 'rb') as file:
    attribute_embedding = pickle.load(file)

# utility_dict[attribute] -> list of [value_a, value_b, utility], one entry per unordered pair.
utility_dict = {attribute: [] for attribute in attribute_embedding}

for attribute, value_vectors in attribute_embedding.items():
    values = list(value_vectors.keys())
    # Every index pair (a, b) with a < b, in row-major order.
    index_pairs = [(a, b) for a in range(len(values)) for b in range(a + 1, len(values))]

    if attribute in numrical_attributes:
        # Utility = 1 - |a - b| / max, with max taken over the attribute's values.
        array = np.array(values)
        max_a = np.max(array)
        utility_list = [
            [array[a], array[b], (max_a - abs(array[a] - array[b]))/max_a]
            for a, b in index_pairs
        ]
    else:
        # Utility = cosine similarity of the two embeddings, rescaled from [-1, 1] to [0, 1].
        utility_list = []
        for a, b in index_pairs:
            vec_a = value_vectors[values[a]].reshape(1, -1)
            vec_b = value_vectors[values[b]].reshape(1, -1)
            similarity = cosine_similarity(vec_a, vec_b)[0][0]
            utility_list.append([values[a], values[b], (similarity + 1)/2])

    utility_dict[attribute] = utility_list

# Persist the pairwise utilities.
file_path = 'data/adult/utilities.pkl'
with open(file_path, 'wb') as file:
    pickle.dump(utility_dict, file)

# Seconds elapsed since t0 was taken.
t = time.time() - t0
print('Embedding and utility calculation runtime:', t)

(32561, 15) (16282, 15)


In [7]:
# Scratch check: sum of the two row counts that appear in the shapes printed above
# (presumably the train and test files combined -- verify).
32561+16282

48843

In [None]:
# Embedding and utility calculation runtime: 167.43188667297363

In [None]:
# Visualize embeddings per attribute

# Attribute to plot. The embedding lookup and the figure title are both derived
# from this single name, so the title always describes the data that is drawn
# (previously 'marital-status' was plotted under an 'occupation' title).
attribute_name = 'marital-status'
value_embeddings = attribute_embedding[attribute_name]

# Stack the per-value vectors into one matrix, one row per attribute value.
value_names = list(value_embeddings.keys())
embeddings = np.array([value_embeddings[name] for name in value_names])

# Apply PCA and reduce to 2 dimensions
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embeddings)

# Plotting
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c='brown')

# Annotate each point with the attribute value it represents; str() keeps
# non-string values usable as labels.
for (pc1, pc2), name in zip(reduced_embeddings, value_names):
    ax.annotate(str(name), (pc1, pc2))

ax.set_xlabel('PC 1')
ax.set_ylabel('PC 2')
ax.set_title(f'{attribute_name} embeddings')
plt.show()

