# Embeddings for Tabular Data

In [1]:
! nvidia-smi

Wed Jan 25 08:59:19 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.103.01   Driver Version: 470.103.01   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            On   | 00000000:00:06.0 Off |                    0 |
| N/A   34C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
! pip install pytorch-tabnet



### Import Libs

In [3]:
import warnings
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy.matlib

import matplotlib.gridspec as gridspec
from matplotlib.ticker import MaxNLocator

from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold, train_test_split

import torch
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
from pytorch_tabnet.augmentations import RegressionSMOTE

### Options

In [4]:
warnings.filterwarnings("ignore")

### Load Dataset

In [5]:
# ! kaggle datasets download -d camnugent/california-housing-prices

In [6]:
df = pd.read_csv('../data/california-housing-prices.zip')

In [7]:
def housing_data_clean(input_df):
	input_df['rooms_per_household'] = input_df['total_rooms']/input_df['households']
	input_df['bedrooms_per_household'] = input_df['total_bedrooms']/input_df['households']
	input_df['bedrooms_per_room'] = input_df['total_bedrooms']/input_df['total_rooms']
	input_df['population_per_household'] = input_df['population']/input_df['households']
	input_df = input_df.drop(['total_bedrooms','total_rooms'], axis=1)
	return input_df

In [8]:
df = housing_data_clean(df)

In [9]:
# Divide by 1.5 to limit the number of income categories
df["income_cat"] = np.ceil(df["median_income"] / 1.5)
# Label those above 5 as 5
df["income_cat"].where(df["income_cat"] < 5, 5.0, inplace=True)
#this change below is because column names can't be passed to xgboost with a < symbol
df['ocean_proximity'][df['ocean_proximity'] == '<1H OCEAN'] = 'LessThan1h'

In [10]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_household,bedrooms_per_room,population_per_household,income_cat
0,-122.23,37.88,41.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,1.02381,0.146591,2.555556,5.0
1,-122.22,37.86,21.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.97188,0.155797,2.109842,5.0
2,-122.24,37.85,52.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,1.073446,0.129516,2.80226,5.0
3,-122.25,37.85,52.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,1.073059,0.184458,2.547945,4.0
4,-122.25,37.85,52.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,1.081081,0.172096,2.181467,3.0


In [11]:
df.shape

(20640, 13)

In [12]:
target = 'median_house_value'

In [13]:
if 'set' not in df.columns:
    df['set'] = np.random.choice(['train', 'valid', 'test'], p =[.8, .1, .1], size=(df.shape[0],))

In [14]:
train_indices = df[df.set=='train'].index
valid_indices = df[df.set=='valid'].index
test_indices = df[df.set=='test'].index

### Label Encoding and Filling NA

In [15]:
categorical_columns = []
categorical_dims =  {}

In [16]:
for col in df.columns[df.dtypes == object]:
    print(col, df[col].nunique())
    l_enc = LabelEncoder()
    df[col] = df[col].fillna('UNK')
    df[col] = l_enc.fit_transform(df[col].values)
    categorical_columns.append(col)
    categorical_dims[col] = len(l_enc.classes_)

ocean_proximity 5
set 3


In [17]:
for col in df.columns[df.dtypes == 'float64']:
    df.fillna(df.loc[train_indices, col].mean(), inplace=True)

In [18]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_household,bedrooms_per_room,population_per_household,income_cat,set
0,-122.23,37.88,41.0,322.0,126.0,8.3252,452600.0,3,6.984127,1.02381,0.146591,2.555556,5.0,0
1,-122.22,37.86,21.0,2401.0,1138.0,8.3014,358500.0,3,6.238137,0.97188,0.155797,2.109842,5.0,1
2,-122.24,37.85,52.0,496.0,177.0,7.2574,352100.0,3,8.288136,1.073446,0.129516,2.80226,5.0,1
3,-122.25,37.85,52.0,558.0,219.0,5.6431,341300.0,3,5.817352,1.073059,0.184458,2.547945,4.0,1
4,-122.25,37.85,52.0,565.0,259.0,3.8462,342200.0,3,6.281853,1.081081,0.172096,2.181467,3.0,1


In [19]:
df.dtypes

longitude                   float64
latitude                    float64
housing_median_age          float64
population                  float64
households                  float64
median_income               float64
median_house_value          float64
ocean_proximity               int64
rooms_per_household         float64
bedrooms_per_household      float64
bedrooms_per_room           float64
population_per_household    float64
income_cat                  float64
set                           int64
dtype: object

### Define categorical features for categorical embeddings

In [20]:
unused_feat = ['set']

features = [col for col in df.columns if col not in unused_feat+[target]] 

cat_idxs = [i for i, f in enumerate(features) if f in categorical_columns]

cat_dims = [categorical_dims[f] for i, f in enumerate(features) if f in categorical_columns]
print(cat_dims)

[5]


In [21]:
# define your embedding sizes : here just a random choice
cat_emb_dim = [6]

### TabNet Model

In [22]:
tabnet_params = {"cat_idxs": cat_idxs,
                 "cat_dims": cat_dims,
                 "cat_emb_dim": 2,
                 "optimizer_fn": torch.optim.Adam,
                 "optimizer_params": dict(lr=2e-2),
                 "scheduler_params": {"step_size":50, # how to use learning rate scheduler
                                      "gamma":0.9},
                 "scheduler_fn": torch.optim.lr_scheduler.StepLR,
                 "mask_type": 'entmax', # "sparsemax"
                 "verbose": 10,
                 "seed": 42
                }

In [23]:
model = TabNetRegressor(**tabnet_params)

In [24]:
X_train = df[features].values[train_indices]
y_train = df[target].values[train_indices].reshape(-1, 1)

X_valid = df[features].values[valid_indices]
y_valid = df[target].values[valid_indices].reshape(-1, 1)

X_test = df[features].values[test_indices]
y_test = df[target].values[test_indices].reshape(-1, 1)

In [25]:
max_epochs = 100 if not os.getenv("CI", False) else 2
max_epochs

100

In [26]:
aug = RegressionSMOTE(p=0.2)

In [None]:
model.fit(
    X_train=X_train, 
    y_train=y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_name=['train', 'valid'],
    eval_metric=['rmse'],
    patience=300, 
    max_epochs=1000,
    augmentations=aug,
    num_workers=8
) 

epoch 0  | loss: 55274484736.0| train_rmse: 237512.53778| valid_rmse: 231995.55654|  0:00:02s
epoch 10 | loss: 55026726400.0| train_rmse: 236279.46691| valid_rmse: 230797.0206|  0:00:26s
epoch 20 | loss: 54155904000.0| train_rmse: 233990.97375| valid_rmse: 228576.15413|  0:00:49s
epoch 30 | loss: 53011591424.0| train_rmse: 231591.7928| valid_rmse: 226246.93454|  0:01:11s
epoch 40 | loss: 51333659136.0| train_rmse: 229055.18881| valid_rmse: 223846.19613|  0:01:34s
epoch 50 | loss: 49715982080.0| train_rmse: 225344.79147| valid_rmse: 220226.30861|  0:01:57s
epoch 60 | loss: 48382377728.0| train_rmse: 219348.49538| valid_rmse: 214419.94114|  0:02:20s
epoch 70 | loss: 46460562688.0| train_rmse: 218574.16358| valid_rmse: 213665.31584|  0:02:43s
epoch 80 | loss: 44648083200.0| train_rmse: 212240.28674| valid_rmse: 207508.00077|  0:03:05s
epoch 90 | loss: 42737497856.00001| train_rmse: 208653.37262| valid_rmse: 204152.99448|  0:03:29s
epoch 100| loss: 40800419328.0| train_rmse: 204565.05624| 

In [None]:
preds = model.predict(X_test)

In [None]:
y_true = y_test

test_score = r2_score(y_pred=preds, y_true=y_true)

# print(f"BEST VALID SCORE: {model.best_cost}")
print(f"FINAL TEST SCORE: {test_score}")

In [None]:
model.feature_importances_