In [1]:
!pip install tabpfn -q
!pip install scikit-learn -q
!pip install pandas -q
!pip install matplotlib -q
!pip install pytorch-tabnet -q

[0m

In [11]:
import pandas as pd
from tabpfn import TabPFNClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc, RocCurveDisplay
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
import joblib
from pytorch_tabnet.tab_model import TabNetClassifier
import random
import torch
from torch.optim import AdamW
from math import log

In [12]:
rent_df = pd.read_csv('after_transformations.csv')

In [13]:
num_buckets = 6

# Create quantile-based buckets with integer labels
rent_df['rent_bucket'] = pd.qcut(rent_df['price'], q=num_buckets, labels=[i for i in range(num_buckets)])

# Display the bins and distribution
bin_edges = pd.qcut(rent_df['price'], q=num_buckets, retbins=True)[1]
distribution = rent_df['rent_bucket'].value_counts(normalize=True, sort=False)

label_to_range: dict[int, str] = {i: f"${bin_edges[i]:,.2f} - ${bin_edges[i+1]:,.2f}" for i in range(len(bin_edges) - 1)}

for i in range(len(bin_edges) - 1):
    print(f"Bucket {i+1}: ${bin_edges[i]:,.2f} - ${bin_edges[i+1]:,.2f} with {round(distribution.iloc[i]*100, ndigits=2)}% of data")

Bucket 1: $7.09 - $7.72 with 12.51% of data
Bucket 2: $7.72 - $7.88 with 12.49% of data
Bucket 3: $7.88 - $8.01 with 13.3% of data
Bucket 4: $8.01 - $8.10 with 11.79% of data
Bucket 5: $8.10 - $8.22 with 13.43% of data
Bucket 6: $8.22 - $8.38 with 11.71% of data
Bucket 7: $8.38 - $8.74 with 12.38% of data
Bucket 8: $8.74 - $11.74 with 12.39% of data


In [14]:
X = rent_df.drop(columns=['price', 'rent_bucket'])
y = rent_df['rent_bucket']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
class TabNetClassifierWrapper(TabNetClassifier):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def fit(self, X, y, *args, **kwargs):
        return super().fit(X, y, **kwargs)

    def predict(self, X):
        return super().predict(X)

    def predict_proba(self, X):
        return super().predict_proba(X)

In [38]:
param_grid = {
    'n_d': [8, 16, 32],                  # Dimension of decision prediction layer
    'n_a': [8, 16, 32],                  # Dimension of attention embedding
    'n_steps': [3, 5, 7],                # Number of decision steps
    'gamma': [1.0, 1.5, 2.0],            # Feature reuse coefficient
    'lambda_sparse': [1e-3, 1e-4, 1e-5], # Sparse regularization coefficient
    'clip_value': [1.0, 2.0, 3.0],       # Gradient clipping value
    'mask_type': ['sparsemax', 'entmax'], # Feature masking type
    'seed': [42, 123, 2024]              # Random seeds
}

In [19]:
random_params = {key: random.choice(value) for key, value in param_grid.items()}
print("Randomly Selected Parameters:")
print(random_params)

# Initialize TabNetClassifier with random parameters
tabnet = TabNetClassifier(
    n_d=random_params['n_d'],
    n_a=random_params['n_a'],
    n_steps=random_params['n_steps'],
    gamma=random_params['gamma'],
    lambda_sparse=random_params['lambda_sparse'],
    seed=random_params['seed'],
    verbose=1  # Enables detailed logging,
    sche
)

# Train the TabNet model
tabnet.fit(
    X_train.values, y_train.values,
    eval_set=[(X_test.values, y_test.values)],
    eval_name=['test'],
    eval_metric=['accuracy'],
    max_epochs=50,
    patience=10,
    batch_size=256,
    virtual_batch_size=128)

Randomly Selected Parameters:
{'n_d': 8, 'n_a': 8, 'n_steps': 3, 'gamma': 1.2, 'lambda_sparse': 0.001, 'optimizer_fn': None, 'seed': 42}
epoch 0  | loss: 2.21044 | test_accuracy: 0.14982 |  0:00:01s
epoch 1  | loss: 2.03935 | test_accuracy: 0.17061 |  0:00:01s
epoch 2  | loss: 1.88337 | test_accuracy: 0.14265 |  0:00:02s
epoch 3  | loss: 1.72949 | test_accuracy: 0.17491 |  0:00:02s
epoch 4  | loss: 1.63358 | test_accuracy: 0.16487 |  0:00:03s
epoch 5  | loss: 1.59346 | test_accuracy: 0.29606 |  0:00:03s
epoch 6  | loss: 1.54749 | test_accuracy: 0.30036 |  0:00:04s
epoch 7  | loss: 1.52815 | test_accuracy: 0.16774 |  0:00:04s
epoch 8  | loss: 1.51066 | test_accuracy: 0.17133 |  0:00:05s
epoch 9  | loss: 1.48425 | test_accuracy: 0.20502 |  0:00:05s
epoch 10 | loss: 1.47256 | test_accuracy: 0.25663 |  0:00:06s
epoch 11 | loss: 1.44544 | test_accuracy: 0.24373 |  0:00:06s
epoch 12 | loss: 1.42505 | test_accuracy: 0.24086 |  0:00:07s
epoch 13 | loss: 1.41948 | test_accuracy: 0.23584 |  0:00



In [39]:
tabnet = TabNetClassifier()

# Grid search
grid_search = GridSearchCV(
    estimator=tabnet,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,  # 3-fold cross-validation
    verbose=3,
    n_jobs=-1,
)

In [None]:
grid_search.fit(X_train.values, y_train.values)

In [None]:
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Predict and evaluate the best model
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Calculate ROC AUC (multi-class)
y_test_binarized = label_binarize(y_test, classes=np.unique(y_test))
roc_auc = roc_auc_score(y_test_binarized, y_proba, multi_class="ovr")
print(f"ROC AUC: {roc_auc:.4f}")

# Plot ROC curve
for i in range(y_test_binarized.shape[1]):
    fpr, tpr, _ = roc_curve(y_test_binarized[:, i], y_proba[:, i])
    plt.plot(fpr, tpr, label=f"Class {i}")

plt.plot([0, 1], [0, 1], "k--", label="Random Guessing")
plt.title("ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")
plt.grid()
plt.show()

In [15]:
bucket_ranges: dict[int, float] = {i: (bin_edges[i+1] - bin_edges[i]) * distribution.iloc[i] for i in range(num_buckets)}

max_range = sum(bucket_ranges.values())
normalized_bucket_ranges = {k: (log(v + 1, 1.04) / log(max_range + 1, 15))/10 for k, v in bucket_ranges.items()}

print(normalized_bucket_ranges)

{0: 1.1535623043232344, 1: 0.30386355968395157, 2: 0.24079466477302297, 3: 0.17169998416893634, 4: 0.2315410500793531, 5: 0.2843019134413132, 6: 0.6648802859498883, 7: 4.784252008243373}


In [16]:
def get_penalty(t, p, bucket_ranges, bucket_range_relaxation:float, max_range:float, penalty_weight:float):
    distance = abs(t - p) * penalty_weight
    # Average the true and predicted bucket ranges
    avg_range = avg_range = (bucket_ranges[t] + bucket_ranges[p]) / (bucket_range_relaxation + bucket_ranges[t] + bucket_ranges[p])
    # Compute penalty proportional to distance and relative range
    # The larger the distance and avg_range, the greater the penalty.
    penalty = (distance * (avg_range / max_range))
    return penalty

def relaxed_accuracy_with_weighted_ranges(true_labels, pred_labels, bucket_ranges, min_partial_credit:float=0.08, bucket_range_relaxation:float=1.2, grace_threshold:float = 0.08, penalty_weight:float=1.5, unpack_labels:bool=False, verbose:bool=False):
    """
    Calculate a "relaxed accuracy" that gives:
    - Full credit (1.0) if prediction == true class.
    - Partial credit if prediction is incorrect, with the score decreasing
      as the class distance and bucket ranges increase.
    - A minimum partial credit is guaranteed for any incorrect prediction 
      so that, on average, relaxed accuracy >= strict accuracy.
    Parameters
    ----------
    true_labels : array-like of shape (n_samples,)
        True class labels.
    pred_labels : array-like of shape (n_samples,)
        Predicted class labels.
    bucket_ranges : dict[int, float]
        A dictionary mapping each class label to its weighted bucket range.
        A higher float value means a larger bucket range.
        
    Returns
    -------
    float
        The mean relaxed accuracy across all predictions.
    """

    assert 0 <= min_partial_credit <= 1, 'min_partial_credit has to between 0 and 1'
    
    max_range = max(bucket_ranges.values()) 
    min_partial_credit = 0.08

    grace_count = 0
    tot = 0
    
    scores = []
    for t, p in zip(true_labels, pred_labels):
        tot += 1
        if unpack_labels:
            p = p[0]

        penalty = get_penalty(t, p, bucket_ranges, bucket_range_relaxation, max_range, penalty_weight)

        score = max(1 - penalty, 0)
        if score < grace_threshold:
            score += min_partial_credit
            grace_count += 1
        
        scores.append(score)
    if verbose:
        print(f"Grace was given to {grace_count*100/tot}% of predictions.")
        
    return np.mean(scores)

In [51]:
def calculate_err_field(bucket_ranges:dict[int, float]):
    err_field = np.zeros((num_buckets, num_buckets))
    max_range = max(bucket_ranges.values()) 
    for i in range(num_buckets):
        for j in range(num_buckets):
            err_field[i][j] = get_penalty(i, j, bucket_ranges, 300, max_range=max_range, penalty_weight=100)
    return err_field

err_field = calculate_err_field(normalized_bucket_ranges)
err_field_tensor = torch.from_numpy(err_field).to('cuda')

In [52]:
def my_loss_fn(y_pred, y_true):
    softmax_pred = torch.nn.Softmax(dim=-1)(y_pred)  # Compute softmax on GPU
    pred = torch.argmax(softmax_pred, dim=1)         # Compute argmax on GPU

    # Perform the lookup directly on the GPU
    loss = err_field_tensor[y_true, pred]

    # Return the sum of the resulting tensor
    return torch.sum(loss)

In [57]:
tabnet = TabNetClassifier(
    optimizer_fn=AdamW,  # Use AdamW optimizer
    optimizer_params=dict(lr=0.01, weight_decay=1e-2),  # Specify AdamW parameters
    n_d=128,
    n_a=128,
    n_steps=32,
    gamma=1.5,
    lambda_sparse=1e-3,
    mask_type="sparsemax",
    device_name='cuda',
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,  # Scheduler function
    scheduler_params={
        'mode': 'min',        # Minimize the monitored metric
        'factor': 0.8,        # Reduce learning rate by a factor of 0.1
        'patience': 5,        # Number of epochs with no improvement after which learning rate will be reduced
        'verbose': True       # Print a message when the learning rate is reduced
    },
)


tabnet.fit(
    X_train.values, y_train.values,
    eval_set=[(X_test.values, y_test.values)],
    eval_name=['test'],
    eval_metric=['accuracy'],
    max_epochs=1000,
    loss_fn=my_loss_fn,
    patience=100,
    batch_size=256,
    virtual_batch_size=128)



epoch 0  | loss: 80.21245| test_accuracy: 0.14194 |  0:00:03s
epoch 1  | loss: 82.70682| test_accuracy: 0.13118 |  0:00:06s
epoch 2  | loss: 81.70358| test_accuracy: 0.12832 |  0:00:10s
epoch 3  | loss: 85.02529| test_accuracy: 0.11828 |  0:00:13s
epoch 4  | loss: 85.9856 | test_accuracy: 0.14265 |  0:00:17s
epoch 5  | loss: 80.8981 | test_accuracy: 0.11039 |  0:00:20s
epoch 6  | loss: 79.87609| test_accuracy: 0.12832 |  0:00:24s
epoch 7  | loss: 74.3758 | test_accuracy: 0.11828 |  0:00:27s
epoch 8  | loss: 74.69125| test_accuracy: 0.1147  |  0:00:31s
epoch 9  | loss: 77.6974 | test_accuracy: 0.1319  |  0:00:34s
epoch 10 | loss: 77.4005 | test_accuracy: 0.1233  |  0:00:37s
epoch 11 | loss: 79.32454| test_accuracy: 0.11613 |  0:00:41s
epoch 12 | loss: 79.79361| test_accuracy: 0.119   |  0:00:44s
epoch 13 | loss: 78.5486 | test_accuracy: 0.14624 |  0:00:48s
epoch 14 | loss: 78.68648| test_accuracy: 0.12473 |  0:00:51s
epoch 15 | loss: 77.26858| test_accuracy: 0.119   |  0:00:55s
epoch 16



In [58]:
tabnet.save_model("tabnet_model")

Successfully saved model at tabnet_model.zip


'tabnet_model.zip'