### Загружаем модель для детекции уклончивости с Hugging Face

[Link to the English model](https://huggingface.co/alenaa/evasiveness)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Repository id of the evasiveness classifier; both objects are fetched by this id.
hub_repo = "alenaa/evasiveness"

tokenizer, model = (
    AutoTokenizer.from_pretrained(hub_repo),
    AutoModelForSequenceClassification.from_pretrained(hub_repo),
)

In [2]:
# Write the tokenizer and then the model into one local directory,
# passing safe_serialization=False to each save call.
checkpoint_dir = "local-pt-checkpoint"
for artifact in (tokenizer, model):
    artifact.save_pretrained(checkpoint_dir, safe_serialization=False)

### Конвертируем в формат ONNX

In [None]:
pip install 'transformers[onnx]'

In [None]:
!pip install onnxruntime_gpu

In [None]:
import subprocess
import sys

# Export the saved checkpoint to ONNX using the interpreter that runs this kernel;
# a bare "python" may resolve to a different environment on PATH.
# check=True raises CalledProcessError instead of silently ignoring a failed export.
subprocess.run(
    [
        sys.executable,
        "-m",
        "transformers.onnx",
        "--model=local-pt-checkpoint",
        "--feature=sequence-classification",
        "onnx/",
    ],
    check=True,
)

In [6]:
import onnxruntime 

# Open the exported graph, requesting the CUDA execution provider.
onnx_model_path = 'onnx/model.onnx'
execution_providers = ['CUDAExecutionProvider']
onnx_session = onnxruntime.InferenceSession(onnx_model_path, providers=execution_providers)

2024-05-30 16:03:44.858964400 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.
2024-05-30 16:03:44.858987186 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.


### Тестируем модели

Предсказываем с помощью ONNX модели

In [7]:
import numpy as np
import torch 
import time 
import tracemalloc

def predict_onnx(feed):
    """Run the global ONNX session on ``feed`` and return predictions with run statistics.

    Args:
        feed: Mapping of ONNX input names to arrays, passed unchanged to
            ``onnx_session.run``.

    Returns:
        A ``(predictions, elapsed, mem)`` tuple. ``predictions`` is a NumPy array
        with the argmax over the last axis of the first session output,
        ``elapsed`` is the duration of the ``run`` call in seconds and ``mem`` is
        the ``(current, peak)`` pair from ``tracemalloc.get_traced_memory()``.

    Raises:
        Whatever ``onnx_session.run`` raises; tracing is switched off first.
    """
    # NOTE(review): tracemalloc reports Python-level allocations; memory that the
    # runtime allocates natively is presumably not included - confirm before
    # comparing these numbers with the PyTorch run.
    tracemalloc.start()
    try:
        # Start the clock after tracing is enabled so that only the inference
        # call is timed, matching how ``predict`` measures the PyTorch model.
        started = time.perf_counter()
        output = onnx_session.run(None, feed)
        elapsed = time.perf_counter() - started
        mem = tracemalloc.get_traced_memory()
    finally:
        # Always switch tracing off, even when the session raises.
        tracemalloc.stop()

    # Softmax is monotonic, so the argmax of the raw scores is the predicted class.
    # Reducing over the last axis without squeezing keeps a batch of one working.
    logits = np.asarray(output[0])
    return np.argmax(logits, axis=-1), elapsed, mem


  from .autonotebook import tqdm as notebook_tqdm


Предсказываем с помощью обычной модели

In [8]:
def predict(inputs):
    """Run the global PyTorch ``model`` on ``inputs`` and return predictions with run statistics.

    The model is switched to eval mode and called under ``torch.no_grad()``.

    Args:
        inputs: Mapping of keyword arguments expanded into ``model(**inputs)``.

    Returns:
        A ``(predictions, elapsed, mem)`` tuple. ``predictions`` is the argmax of
        ``outputs.logits`` over the last dimension, ``elapsed`` is the duration of
        the forward call in seconds and ``mem`` is the ``(current, peak)`` pair
        from ``tracemalloc.get_traced_memory()``.

    Raises:
        Whatever the model call raises; tracing is switched off first.
    """
    model.eval()
    with torch.no_grad():
        tracemalloc.start()
        try:
            # perf_counter is monotonic, so the interval cannot be skewed by
            # wall-clock adjustments.
            started = time.perf_counter()
            outputs = model(**inputs)
            elapsed = time.perf_counter() - started
            mem = tracemalloc.get_traced_memory()
        finally:
            # Do not leave tracing enabled when the forward pass fails.
            tracemalloc.stop()

        predictions = torch.argmax(outputs.logits, dim=-1)

        return predictions, elapsed, mem

Проверим точность и время на сгенерированном датасете

In [9]:
import pandas as pd

# Read the evaluation set (the first CSV column is the index) and pull the
# question and answer columns out as plain Python lists.
data = pd.read_csv('english_data_200.csv', index_col=0)
Q, A = (data[column].values.tolist() for column in ('Question', 'Answer'))

In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Reload both objects from the local directory the checkpoint was saved to.
local_checkpoint = "local-pt-checkpoint"
tokenizer = AutoTokenizer.from_pretrained(local_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(local_checkpoint)



In [11]:
# Tokenize the question/answer pairs as PyTorch tensors with truncation and
# padding enabled (max_length=512).
inputs = tokenizer(Q, A, truncation=True, padding=True, max_length=512, return_tensors="pt")

# ONNX feed: the same ids and attention mask converted to int64 NumPy arrays.
feed = {
    key: np.array(inputs[key]).astype("int64")
    for key in ("input_ids", "attention_mask")
}

In [12]:
from sklearn.metrics import accuracy_score

test = data.Label.values.tolist()

def print_results(prediction_func, inp, model_type=None):
    """Run a prediction function on ``inp`` and print its accuracy, time and memory.

    Predictions are scored against ``data.Label`` with ``accuracy_score``.

    Args:
        prediction_func: Callable returning ``(predictions, elapsed, mem)``; the
            predictions object must provide ``tolist()``.
        inp: Argument forwarded unchanged to ``prediction_func``.
        model_type: Optional heading for the report. When omitted it is
            'Simple Model' if ``prediction_func`` is ``predict`` and
            'ONNX Model' otherwise.
    """
    labels = data.Label.values.tolist()
    preds, elapsed, mem = prediction_func(inp)
    # accuracy_score yields the share of correct labels, so the value is
    # reported as accuracy; it is not a mean squared error.
    accuracy = accuracy_score(labels, preds.tolist())
    if model_type is None:
        # Identity check: the heading depends on which function object was passed.
        model_type = 'Simple Model' if prediction_func is predict else 'ONNX Model'
    print(f'{model_type}')
    print('Accuracy: ', accuracy)
    print('Time: ', elapsed)
    print('Memory: ', mem)

In [13]:
# Benchmark the PyTorch model (`predict`) on the tokenized inputs.
print_results(predict, inputs)

Simple Model
MSE:  0.7548076923076923
Time:  0.928778886795044
Memory:  (679097, 706469)


In [14]:
# Benchmark the ONNX session (`predict_onnx`) on the int64 NumPy feed.
print_results(predict_onnx, feed)

ONNX Model
MSE:  0.7548076923076923
Time:  1.4862103462219238
Memory:  (4698, 7312)


Попробуем оптимизировать ONNX-модель

In [None]:
!pip install onnxruntime_tools

In [15]:
from onnxruntime_tools import optimizer

# Optimize the exported graph with model_type='bert' and store the result in the
# same onnx/ directory.
# NOTE(review): onnxruntime_tools appears to be superseded by
# onnxruntime.transformers - confirm before upgrading.
optimized_path = 'onnx/opt_model.onnx'
optimized_model = optimizer.optimize_model("onnx/model.onnx", model_type='bert')
optimized_model.save_model_to_file(optimized_path)

# Rebind the global session that predict_onnx reads, then rerun the benchmark.
onnx_session = onnxruntime.InferenceSession(optimized_path, providers=['CUDAExecutionProvider'])
print_results(predict_onnx, feed)


ONNX Model
MSE:  0.7548076923076923
Time:  0.07584977149963379
Memory:  (1993, 2080)


2024-05-30 16:04:24.539675713 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.
2024-05-30 16:04:24.539696471 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.
