https://medium.com/georgian-impact-blog/how-to-incorporate-tabular-data-with-huggingface-transformers-b70ac45fcfb4

In [None]:
#%pip install multimodal-transformers

In [None]:
import warnings

import pandas as pd
import multimodal_transformers.data as d
from multimodal_transformers.data import load_data
from transformers import AutoTokenizer, EvalPrediction

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Read the cleaned listings table and discard every row that contains a
# missing value in any column.
data_df = pd.read_csv('../data/clean_data/listings.csv').dropna()

# Preview of the first rows (a notebook only renders it when it is the last
# expression of the cell).
data_df.head()

# Print the library's own documentation for load_data.
help(d.load_data)

In [None]:
# Two positional slices of the cleaned table: the first 200 rows are used for
# training, everything from position 21250 onwards is held out for testing.
# NOTE(review): if data_df has 21250 rows or fewer, `test` is empty.
train, test = data_df.iloc[:200], data_df.iloc[21250:]

train.shape, test.shape

# Write both splits to the current working directory; with the default
# to_csv settings the DataFrame index is written as the first column.
train.to_csv('train.csv')
test.to_csv('test.csv')

In [None]:
# Column roles handed to load_data below.
# Columns passed as the text inputs.
text_cols = ['room_type', 'amenities']

# Column passed as the label (target).
label_col = 'price' 

# Columns passed as categorical tabular features.
categorical_cols = ['neighbourhood_cleansed', 'neighbourhood_group_cleansed']

# Columns passed as numerical tabular features.
numerical_cols = ['accommodates', 'bedrooms', 'beds']

# Tokenizer matching the 'bert-base-uncased' checkpoint.
# NOTE(review): `truncation` is normally an argument of the tokenizer call,
# not of from_pretrained — it presumably has no effect here; confirm.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', truncation=True)

# Build the training dataset; the tokenizer's own separator token is passed
# as sep_text_token_str.
train_data = load_data(train, text_cols, tokenizer, categorical_cols=categorical_cols,
                       numerical_cols=numerical_cols, sep_text_token_str=tokenizer.sep_token,
                       label_col=label_col)


# Build the test dataset with the same settings.
# NOTE(review): train and test go through two independent load_data calls; if
# load_data fits its categorical/numerical encoders per call, the two datasets
# may end up with different feature widths or scalings — confirm.
test_data = load_data(test, text_cols, tokenizer, categorical_cols=categorical_cols,
                       numerical_cols=numerical_cols, sep_text_token_str=tokenizer.sep_token,
                       label_col=label_col)

In [None]:
from multimodal_transformers.model import AutoModelWithTabular, TabularConfig
from transformers import AutoConfig

# Base transformer configuration for the 'bert-base-uncased' checkpoint.
config=AutoConfig.from_pretrained('bert-base-uncased')

# Settings for the tabular part of the model. The feature widths are read
# from the second axis of the training dataset's categorical and numerical
# feature arrays.
tabular_config=TabularConfig(
    num_labels=1,  # single output; presumably regression on price — confirm
    cat_feat_dim=train_data.cat_feats.shape[1],
    numerical_feat_dim=train_data.numerical_feats.shape[1],
    combine_feat_method='weighted_feature_sum_on_transformer_cat_and_numerical_feats',
)
# Attach the tabular settings to the transformer config so they are handed to
# the model through `config`.
config.tabular_config = tabular_config

# Instantiate the text + tabular model from the same checkpoint.
model = AutoModelWithTabular.from_pretrained('bert-base-uncased', config=config)

In [None]:
import torch

# Choose the compute device by availability instead of hard-coding Apple's MPS
# backend, so the notebook also runs on CUDA machines and CPU-only hosts.
# MPS is checked first to keep the original behaviour on Apple Silicon; the
# getattr guard covers torch builds that have no `torch.backends.mps` module.
_mps_backend = getattr(torch.backends, 'mps', None)
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration: checkpoints go to ./logs/model_name (overwriting any
# previous content), logs go to ./logs/runs, logging happens every 25 steps.
# NOTE(review): num_train_epochs=0.01 trains on only a tiny fraction of one
# epoch — presumably a quick smoke run; confirm before real training.
# NOTE(review): `evaluate_during_training` was deprecated in later transformers
# releases in favour of an evaluation-strategy argument — confirm against the
# installed version.
args=TrainingArguments(output_dir='./logs/model_name',
                       logging_dir='./logs/runs',
                       overwrite_output_dir=True,
                       per_device_train_batch_size=32,
                       num_train_epochs=0.01,
                       evaluate_during_training=True,
                       logging_steps=25)



# Wire the model (moved to the selected device), the arguments and both
# datasets into a Trainer. No compute_metrics callback is supplied; the
# commented-out line below is the disabled placeholder for it.
trainer=Trainer(model=model.to(device), 
                args=args, 
                train_dataset=train_data,
                eval_dataset=test_data,
                #compute_metrics=m()
               )

In [None]:
%%time

# Run the training loop with the arguments and datasets given to the Trainer.
trainer.train()

In [None]:
# Evaluate on the training dataset itself (not on the held-out test data).
trainer.evaluate(train_data)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error


def metrics(preds, labels):
    """Compute regression error metrics for a set of predictions.

    Args:
        preds: Predicted target values.
        labels: Ground-truth target values, aligned with ``preds``.

    Returns:
        dict: ``{'mse': ..., 'rmse': ..., 'mae': ...}``.
    """
    mse = mean_squared_error(labels, preds)
    # Derive RMSE from the MSE instead of passing ``squared=False``: that
    # keyword is deprecated and has been removed from recent scikit-learn
    # releases, while the square root works on every version.
    # For a single target this is the same value; for multi-output targets it
    # is the root of the averaged MSE rather than the average of per-output
    # RMSEs.
    rmse = mse ** 0.5
    mae = mean_absolute_error(labels, preds)

    return {'mse': mse,
            'rmse': rmse,
            'mae': mae}


In [None]:
%%time

# Predict on the training dataset (passed through the `test_dataset` keyword)
# and keep only the `predictions` field of the result.
y_pred=trainer.predict(test_dataset=train_data).predictions

In [None]:
#help(Trainer)

In [None]:
# Ground-truth prices. y_pred was produced from train_data, so the matching
# labels come from `train`, even though the variable is named y_test.
y_test = train.price.values

# Pass the arguments by keyword so they line up with metrics(preds, labels);
# the original positional call had them in the opposite order.
metrics(preds=y_pred, labels=y_test)

In [None]:
# Evaluate on the held-out test dataset.
trainer.evaluate(eval_dataset=test_data)