This notebook was executed in Google Colab using an A100 GPU.

### Start of execution

In [1]:
import time

In [2]:
# Wall-clock start time; the final cell subtracts it from the end time to report total runtime.
start = time.time()

# 1. Setting the environment

In [3]:
!pip install -q datasets==2.20.0

# 2. Import libraries

In [4]:
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# deprecation notices raised by the libraries used below — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [5]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# 3. Preparation

In [6]:
# Mount Google Drive into the Colab filesystem; the dataset and model paths used below live under MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Project root on the mounted Drive. Anchored to the absolute mount point ('/content/drive')
# so every later load keeps working even if the working directory changes (e.g. after a %cd).
path_general = '/content/drive/MyDrive/Profesional_Academico/Github_Personal/ML_AI_Contents/09.Deep_Learning/52.BETO_HF_Trainer'

## 3.1. Load Dataset

In [8]:
# Training split; later cells read its 'text' and 'label' columns.
df_train = pd.read_csv(f'{path_general}/data/df_train.csv')

In [9]:
# Test split; later cells read its 'text' and 'label' columns.
df_test = pd.read_csv(f'{path_general}/data/df_test.csv')

# 4. Load model

In [11]:
# Directory holding the saved sequence-classification checkpoint loaded in the next cell.
path_model = f'{path_general}/model'

In [12]:
# Load the saved sequence-classification checkpoint from Drive and place it on the GPU.
# NOTE(review): offload_folder points at the checkpoint directory itself; with
# device_map='cuda' nothing should need offloading, so it is presumably unused — confirm.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path = path_model,
    device_map = 'cuda',
    offload_folder = path_model
    )

In [13]:
# The tokenizer is pulled from the Hugging Face Hub rather than from path_model.
# NOTE(review): presumably this is the same tokenizer the checkpoint was trained with — confirm.
tokenizer = AutoTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")

tokenizer_config.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/480k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

# 5. Inference

In [14]:
def predict_text(text, tokenizer, classifier=None):
  """Predict the binary class of a single text.

  Args:
    text: Input string to classify.
    tokenizer: Callable tokenizer invoked as
      ``tokenizer(text, return_tensors='pt', truncation=True)``; its result must
      support ``.to(device)`` and ``**`` unpacking.
    classifier: Optional model to run. When omitted, the notebook-level ``model``
      is used, so existing two-argument calls behave as before.

  Returns:
    int: ``0`` when the first logit is strictly greater than the second,
    otherwise ``1`` (ties resolve to ``1``).
  """
  import torch  # local import: the notebook's import cell does not bring torch into scope

  net = model if classifier is None else classifier

  # Send the inputs to wherever the model lives instead of assuming 'cuda'.
  encoded_input = tokenizer(text, return_tensors='pt', truncation = True).to(net.device)

  # Inference only: disabling autograd avoids building a graph for every row,
  # which saves memory and time when this is mapped over a whole DataFrame.
  with torch.no_grad():
    output = net(**encoded_input)

  # Single input, so only the first row of the logits is relevant.
  logits = output.logits[0].detach().to('cpu').tolist()

  return 0 if logits[0] > logits[1] else 1

In [15]:
# Run the classifier on every training text, one row at a time, and keep the result in 'pred'.
df_train['pred'] = df_train['text'].apply(predict_text, args=(tokenizer,))

In [16]:
# Run the classifier on every test text, one row at a time, and keep the result in 'pred'.
df_test['pred'] = df_test['text'].apply(predict_text, args=(tokenizer,))

# 6. Performance

In [17]:
from sklearn.metrics import recall_score, precision_score, f1_score

### a. Train

In [18]:
# Macro-averaged precision on the training split, as a percentage rounded to 2 decimals.
round(precision_score(df_train['label'], df_train['pred'], average = 'macro')*100, 2)

92.74

In [19]:
# Macro-averaged recall on the training split, as a percentage rounded to 2 decimals.
round(recall_score(df_train['label'], df_train['pred'], average = 'macro')*100, 2)

92.81

In [20]:
# Macro-averaged F1 on the training split, as a percentage rounded to 2 decimals.
round(f1_score(df_train['label'], df_train['pred'], average = 'macro')*100, 2)

92.75

In [21]:
# Accuracy on the training split: share of rows whose prediction equals the label, in percent.
train_hits = df_train['label'] == df_train['pred']
round(train_hits.mean() * 100, 2)

92.76

### b. Test

In [22]:
# Macro-averaged precision on the test split, as a percentage rounded to 2 decimals.
round(precision_score(df_test['label'], df_test['pred'], average = 'macro')*100, 2)

67.23

In [23]:
# Macro-averaged recall on the test split, as a percentage rounded to 2 decimals.
round(recall_score(df_test['label'], df_test['pred'], average = 'macro')*100, 2)

67.53

In [24]:
# Macro-averaged F1 on the test split, as a percentage rounded to 2 decimals.
round(f1_score(df_test['label'], df_test['pred'], average = 'macro')*100, 2)

67.26

In [25]:
# Accuracy on the test split: share of rows whose prediction equals the label, in percent.
test_hits = df_test['label'] == df_test['pred']
round(test_hits.mean() * 100, 2)

67.61

### End of execution

In [26]:
# Total wall-clock runtime since the `start` timestamp taken at the top of the notebook.
end = time.time()
delta = end - start

# Peel off whole hours first, then whole minutes from what is left, then whole seconds.
hours = int(delta / 3_600)
remainder = delta - hours*3_600
mins = int(remainder / 60)
secs = int(remainder - mins*60)

print(f'Hours: {hours}, Minutes: {mins}, Seconds: {secs}')

Hours: 0, Minutes: 1, Seconds: 50
