In [1]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from optimum.onnxruntime.configuration import AutoQuantizationConfig, OptimizationConfig
from sympy.printing.pytorch import torch
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModel, ORTQuantizer, ORTOptimizer
from pyarrow import parquet

In [2]:
# Read the review table from the project's data directory into a DataFrame.
reviews_csv_path = "data/amazon_review_xgboost.csv"
df_filtered = pd.read_csv(reviews_csv_path)

In [3]:
from datasets import Dataset

# Wrap the pandas frame in a Hugging Face Dataset. preserve_index=False asks
# from_pandas not to store the DataFrame index as an additional column.
dataset_with_rating_bins = Dataset.from_pandas(df=df_filtered, preserve_index=False)
dataset_with_rating_bins

Dataset({
    features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase', 'embedding_text', 'review_word_count', 'title_word_count', '__index_level_0__', 'rating_bins', 'total_review_word_count'],
    num_rows: 636206
})

In [5]:
# Display the declared feature type of every column in dataset_with_rating_bins.
dataset_with_rating_bins.features

{'rating': Value(dtype='float64', id=None),
 'title': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None),
 'images': Value(dtype='string', id=None),
 'asin': Value(dtype='string', id=None),
 'parent_asin': Value(dtype='string', id=None),
 'user_id': Value(dtype='string', id=None),
 'timestamp': Value(dtype='int64', id=None),
 'helpful_vote': Value(dtype='int64', id=None),
 'verified_purchase': Value(dtype='bool', id=None),
 'embedding_text': Value(dtype='string', id=None),
 'review_word_count': Value(dtype='int64', id=None),
 'title_word_count': Value(dtype='int64', id=None),
 '__index_level_0__': Value(dtype='int64', id=None),
 'rating_bins': Value(dtype='int64', id=None),
 'total_review_word_count': Value(dtype='int64', id=None)}

In [None]:
#region
# `Dataset.train_test_split(stratify_by_column=...)` only accepts a ClassLabel
# column, while `rating_bins` is a plain int64 Value in this dataset, so the
# column is class-encoded into a separate dataset first. The original
# `dataset_with_rating_bins` is left untouched.
# NOTE(review): inside the splits `rating_bins` then holds ClassLabel ids, which
# may differ from the raw integer bin values — confirm before using it as a label.
SPLIT_SEED = 42  # fixed seed so the splits are reproducible across kernel restarts
dataset_for_split = dataset_with_rating_bins.class_encode_column("rating_bins")

# 80% train / 20% held out, stratified on the rating bin.
split_dataset = dataset_for_split.train_test_split(
    test_size=0.2,
    shuffle=True,
    stratify_by_column="rating_bins",
    seed=SPLIT_SEED,
)
# Plain indexing; the previous trailing comma wrapped the split in a 1-tuple
# that then had to be unwrapped again.
train_set = split_dataset["train"]

# Split the held-out 20% evenly into validation and test sets.
test_validation_split = split_dataset["test"].train_test_split(
    test_size=0.5,
    shuffle=True,
    stratify_by_column="rating_bins",
    seed=SPLIT_SEED,
)
validation_set = test_validation_split["train"]
test_set = test_validation_split["test"]
print(train_set)
print(test_set)
print(validation_set)
# endregion

In [5]:
# Import torch directly. The notebook's import cell takes `torch` from
# `sympy.printing.pytorch`, where it is only sympy's handle to an optional
# dependency (it may be None and requires a sympy version that ships that
# module), so it should not be relied on for CUDA detection.
import torch

# Pick the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"device : {device}")

# Hugging Face checkpoint to export, and the directory holding the ONNX artifacts.
chkpt = "google-bert/bert-base-uncased"
onx_model_path = "onx_model/bert-base-uncased/3/"

device : cuda


In [1]:
# Step 1 - export: convert the checkpoint to ONNX (the result is model.onnx) and
# save both the exported model and its tokenizer into onx_model_path.
# The steps in this cell must run in order: each one reads files written by the
# previous one.
ort_model = ORTModel.from_pretrained(chkpt, export=True)
tokenizer = AutoTokenizer.from_pretrained(chkpt)
ort_model.save_pretrained(onx_model_path)
tokenizer.save_pretrained(onx_model_path)

# Step 2 - quantize the exported ONNX model; the result is model_quantized.onnx.
# is_static=False requests dynamic quantization (no calibration data is passed)
# and per_channel=False disables per-channel quantization.
# NOTE(review): this uses the AVX2 configuration preset, while the inference
# session later in this notebook requests CUDAExecutionProvider — confirm the
# quantized graph actually runs on the intended device.
qconfig = AutoQuantizationConfig.avx2(is_static=False, per_channel=False)
quantizer = ORTQuantizer.from_pretrained(onx_model_path, file_name="model.onnx")
# Apply dynamic quantization on the model
quantizer.quantize(save_dir=onx_model_path, quantization_config=qconfig)

# Step 3 - run the optimizer with optimization_level=1 on the quantized model and
# save the result into the same directory.
# NOTE(review): the inference session later in this notebook loads
# model_quantized.onnx, not the optimizer's output — confirm whether this step
# is still needed, and whether optimization should happen before quantization.
optim_config = OptimizationConfig(optimization_level=1)
optimizer = ORTOptimizer.from_pretrained(onx_model_path, file_names=["model_quantized.onnx"])
optimizer.optimize(save_dir=onx_model_path, optimization_config=optim_config)

NameError: name 'ORTModel' is not defined

In [6]:
# Load the tokenizer from onx_model_path, i.e. from the files saved next to the
# ONNX model, rather than from the hub checkpoint.
tokenizer = AutoTokenizer.from_pretrained(onx_model_path)
def tokenize_text(batch):
    """Tokenize the ``embedding_text`` column of a batch.

    Sequences are truncated and padded to ``max_length=400``, so every row in
    the result has the same length. Uses the notebook-level ``tokenizer``.

    Args:
        batch: Mapping that contains an ``"embedding_text"`` entry.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    return tokenizer(
        batch["embedding_text"],
        truncation=True,
        padding="max_length",
        max_length=400,
    )
# Columns kept after tokenization: the tokenizer outputs plus the rating.
req_cols = ['input_ids', 'token_type_ids', 'attention_mask', 'rating']

# Tokenize in batches, then drop every column that is not in req_cols.
tokenized_all_columns = dataset_with_rating_bins.map(tokenize_text, batched=True)
removed_cols = list(
    filter(lambda name: name not in req_cols, tokenized_all_columns.column_names)
)
dataset_tokenized = tokenized_all_columns.remove_columns(removed_cols)
dataset_tokenized.features

Map:   0%|          | 0/636206 [00:00<?, ? examples/s]

{'rating': Value(dtype='float64', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [8]:
# Fetch the whole tokenized dataset as a pyarrow Table and store it as Parquet.
# `with_format` returns a formatted view, so `dataset_tokenized` keeps its own
# format instead of being switched to "arrow" as a side effect that later cells
# would silently inherit.
arrow_table = dataset_tokenized.with_format("arrow")[:]
parquet.write_table(
    arrow_table,
    "data_tokenized.parquet",
    compression='snappy',
)

In [7]:
import os

import onnxruntime

# Derive the model file from onx_model_path so this cell cannot drift from the
# directory that the export/quantization cell and the tokenizer cell use.
quantized_model_path = os.path.join(onx_model_path, "model_quantized.onnx")

session = onnxruntime.InferenceSession(
    quantized_model_path,
    # Providers are listed in priority order; the CPU provider is named
    # explicitly so the fallback is visible in the code.
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)

In [2]:
def get_embdeddings(batch, ort_session=None, input_names=None):
    """Run one batch through the ONNX model and return its first-token embeddings.

    Args:
        batch: Mapping of column name -> array for a batch of tokenized rows.
            Only the model-input columns are forwarded; other keys are ignored.
        ort_session: Object exposing ``get_outputs()`` and
            ``run(output_names, feed)``. Defaults to the notebook-level
            ``session``.
        input_names: Names of the batch entries to feed to the model. Defaults
            to ``tokenizer.model_input_names``.

    Returns:
        ``{"cls_embeddings": array}`` where ``array`` is index 0 along the
        second axis of the first model output (the first token's vector,
        assuming that output is shaped (batch, sequence, hidden)).

    Raises:
        ValueError: If the batch contains none of the model-input columns.
    """
    # Fall back to the notebook globals only when no override is supplied, so
    # the function can also be exercised with a stand-in session.
    active_session = session if ort_session is None else ort_session
    wanted_names = tokenizer.model_input_names if input_names is None else input_names

    # The tokenized columns are stored with narrow integer types; cast
    # explicitly instead of relying on the dataset formatter's implicit upcast.
    # NOTE(review): int64 is what the BERT ONNX export is expected to declare
    # for its inputs — confirm if a different model is plugged in.
    inputs = {
        name: np.asarray(values, dtype=np.int64)
        for name, values in batch.items()
        if name in wanted_names
    }
    if not inputs:
        raise ValueError(
            f"batch has none of the expected model inputs {list(wanted_names)}; "
            f"got columns {list(batch.keys())}"
        )

    output_names = [output.name for output in active_session.get_outputs()]
    ort_outputs = active_session.run(output_names, inputs)

    # The output is typically a list, where the first element is the hidden states
    last_hidden_state = ort_outputs[0]
    cls_embeddings = last_hidden_state[:, 0, :]
    return {"cls_embeddings": cls_embeddings}
# Expose only the model-input columns as numpy arrays, then compute the
# embeddings in batches of 16 rows.
model_input_columns = ['input_ids', 'attention_mask', 'token_type_ids']
dataset_tokenized.set_format(type='numpy', columns=model_input_columns)
cls_hidden_state = dataset_tokenized.map(
    get_embdeddings,
    batched=True,
    batch_size=16,
)
print(cls_hidden_state)

NameError: name 'dataset_tokenized' is not defined

In [None]:
# From here on, read the embedding and rating columns back as numpy arrays.
cls_hidden_state.set_format("numpy", columns=["cls_embeddings", "rating"])

# Give the ratings a leading axis of length 1 so they can be transposed into a
# column further down.
rating_values = cls_hidden_state["rating"]
expanded_array = np.expand_dims(rating_values, 0)
expanded_array.shape

In [None]:
# Put the transposed ratings to the right of the embedding matrix so each row
# carries its embedding followed by its rating.
rating_column = expanded_array.T
embedding_matrix = cls_hidden_state["cls_embeddings"]
stacked_embedding_rating = np.hstack((embedding_matrix, rating_column))
stacked_embedding_rating.shape

In [None]:
# Persist the embeddings twice: once with the rating column appended and once
# as the bare embedding matrix.
np.save(file='data/bert_embeddings_ratings.npy', arr=stacked_embedding_rating)
np.save(file='data/bert_only_embeddings.npy', arr=cls_hidden_state["cls_embeddings"])

In [None]:
from sklearn.model_selection import train_test_split

# Read each column once and reuse it for both the data and the stratify argument.
all_embeddings = cls_hidden_state["cls_embeddings"]
all_ratings = cls_hidden_state["rating"]

# 80/20 split, stratified on the rating, with a fixed random_state.
x_train, x_test, y_train, y_test = train_test_split(
    all_embeddings,
    all_ratings,
    test_size=0.2,
    shuffle=True,
    stratify=all_ratings,
    random_state=42,
)

# One named column per embedding dimension.
feature_cols = [f"emb_{i}" for i in range(x_train.shape[1])]

# Training frame: embedding columns followed by the rating.
x_train_df = pd.DataFrame(x_train, columns=feature_cols)
y_train_df = pd.DataFrame(y_train, columns=["rating"])
train_df_combined = pd.concat((x_train_df, y_train_df), axis=1)

# Hold-out frame: the full 20% that is not used for training.
x_test_df = pd.DataFrame(x_test, columns=feature_cols)
y_test_df = pd.DataFrame(y_test, columns=["rating"])
test_df_combined = pd.concat((x_test_df, y_test_df), axis=1)

train_df_combined.to_parquet('train_data.parquet', index=False)
test_df_combined.to_parquet('val_test_data.parquet', index=False)

In [None]:
# Divide the hold-out set into validation and test parts, stratified on the rating.
# NOTE(review): test_size=0.1 leaves 90% of the hold-out for validation and only
# 10% for test — confirm this proportion is intended.
val_x, test_x, val_y, test_y = train_test_split(
    x_test,
    y_test,
    test_size=0.1,
    shuffle=True,
    stratify=y_test,
    random_state=42,
)
val_y.shape

In [None]:
# Validation frame: embedding columns followed by the rating, written to Parquet.
val_x_df = pd.DataFrame(val_x, columns=feature_cols)
val_y_df = pd.DataFrame(val_y, columns=["rating"])
val_df_combined = pd.concat((val_x_df, val_y_df), axis=1)
val_df_combined.to_parquet('validation_data.parquet', index=False)

# Test frame, built the same way. This rebinds test_df_combined, which an
# earlier cell used for the whole hold-out set.
test_x_df = pd.DataFrame(test_x, columns=feature_cols)
test_y_df = pd.DataFrame(test_y, columns=["rating"])
test_df_combined = pd.concat((test_x_df, test_y_df), axis=1)
test_df_combined.to_parquet('test_data.parquet', index=False)