In [None]:
! pip install -U accelerate
! pip install -U transformers
! pip install evaluate

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

In [None]:
import numpy as np
import pandas as pd
import torch
import evaluate
import matplotlib.pyplot as plt

from tensorflow.python.summary.summary_iterator import summary_iterator
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from scipy.special import expit
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from tensorflow.python.summary.summary_iterator import summary_iterator
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; every dataset, model, log and plot path
# used in this notebook lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Seed every RNG the notebook relies on. NumPy alone is not enough: the
# classification head of the first model is initialised by torch before any
# Trainer has had a chance to set its own seed.
np.random.seed(42)
torch.manual_seed(42)

In [None]:
# Load the raw intent dataset from Google Drive (label typos are fixed below).
df = pd.read_csv("/content/drive/MyDrive/УИР/Интенциональности.csv")
def replace_wrong(s):
  """Map the one known misspelled label to its correct spelling.

  Any other value is returned unchanged.
  """
  return 'Вовлечь в диалог' if s == 'Воволечь в диалог' else s

# Correct the misspelled label value-by-value, then persist the cleaned
# dataset (without writing the DataFrame index) for the rest of the notebook.
df['label'] = df['label'].map(replace_wrong)
df.to_csv(
    '/content/drive/MyDrive/УИР/intents.csv',
    index=None,
)

In [None]:
# Reload the cleaned dataset saved as intents.csv and preview its first rows.
df = pd.read_csv('/content/drive/MyDrive/УИР/intents.csv')
df.head()

Unnamed: 0,text,label
0,Что ты думаешь о последних событиях? Хотелось ...,Вовлечь в диалог
1,"Мне интересно, как ты видишь эту ситуацию. Мож...",Вовлечь в диалог
2,Давай обсудим наши планы на следующий месяц. Ч...,Вовлечь в диалог
3,"Я заметил, что ты умеешь решать эту проблему. ...",Вовлечь в диалог
4,"Послушай, у меня возникло несколько идей по ул...",Вовлечь в диалог


In [None]:
# Distinct intent labels, in order of first appearance in the dataset.
labels = df.label.unique()

# One integer id per label. The range is derived from the data instead of a
# hard-coded count, so no label is silently dropped when the dataset changes.
# NOTE: `id` shadows the Python builtin; the name is kept because other cells
# may already reference it.
id = range(len(labels))

# Forward (label -> id) and reverse (id -> label) lookup tables.
labels2id = dict(zip(labels, id))
id2labels = dict(zip(id, labels))

In [None]:
def split_data(df, label):
  """Build a one-vs-rest train/test split for a single intent label.

  Rows whose 'label' equals `label` become the positive class (1); all other
  rows are negatives (0). Up to twice as many negatives as positives are
  sampled (fixed random_state, so the sample is reproducible), the two groups
  are concatenated, and the result is split 75/25 with stratification on the
  binary label.

  Args:
    df: DataFrame with at least 'text' and 'label' columns. It is not modified.
    label: the label value to treat as the positive class.

  Returns:
    The result of `train_test_split`: X_train, X_test, y_train, y_test, where
    X_* are arrays of texts and y_* are arrays of 0/1 labels.

  Raises:
    ValueError: if no row carries `label`.
  """
  # Derive a new frame instead of overwriting df['label'], so the caller's
  # DataFrame keeps its original string labels.
  binary = df.assign(label=(df['label'] == label).astype('int64'))
  true_labels = binary[binary['label'] == 1]
  negatives = binary[binary['label'] == 0]

  if true_labels.empty:
    raise ValueError(f"No rows found with label {label!r}")

  # Cap the request at the number of available negatives; asking `sample` for
  # more rows than exist (without replacement) raises.
  n_negatives = min(len(true_labels) * 2, len(negatives))
  false_labels = negatives.sample(n_negatives, random_state=42)

  data = pd.concat([true_labels, false_labels], ignore_index=True)
  return train_test_split(
      data['text'].values,
      data['label'].values,
      test_size=0.25,
      random_state=42,
      stratify=data['label'].values,
  )


In [None]:
class TorchSet(torch.utils.data.Dataset):
    """Dataset pairing per-example encodings with their labels.

    `encodings` is a mapping from field name to an indexable collection of
    per-example values; `labels` is an indexable collection with one label per
    example. Each item is a dict of tensors: one per encoding field plus a
    'labels' entry.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Evaluation results, one entry per label. Each metric entry is stored as a
# single-element list (the values of the dict returned by `compute`).
metrics = {
    'label':[],
    'accuracy': [],
    'f1': [],
    'roc_auc': [],
    'recall': [],
    'precision': []
}
# Output directories of the models that finished training.
done = []

# Loop-invariant resources: loaded once instead of once per label.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
roc_auc = evaluate.load("roc_auc")
recall = evaluate.load('recall')
precision = evaluate.load("precision")

for label in labels:
  # A fresh binary (label vs. rest) classifier is trained for every label.
  # NOTE(review): "distilbert-base-uncased" is an English checkpoint while the
  # dataset preview shows Russian text - confirm whether a multilingual
  # checkpoint was intended.
  model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

  # split_data receives a copy so the shared DataFrame keeps its string labels.
  X_train, X_test, y_train, y_test = split_data(df.copy(), label)
  train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True)
  test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True)
  train_dataset = TorchSet(train_encodings, y_train)
  test_dataset = TorchSet(test_encodings, y_test)

  # NOTE(review): the logged runs show only about 50 optimizer steps, so
  # warmup_steps=500 appears to keep the learning rate in warm-up for the whole
  # run; likewise no save_steps is set, so it looks like no checkpoint is
  # written for load_best_model_at_end to restore. Confirm and tune.
  # NOTE(review): the test split is used both for in-training evaluation and
  # for the final reported metrics.
  training_args = TrainingArguments(
      output_dir=f'/content/drive/MyDrive/УИР/intent_models/{label}',
      num_train_epochs=7,
      per_device_train_batch_size=16,
      per_device_eval_batch_size=64,
      warmup_steps=500,
      weight_decay=0.01,
      evaluation_strategy="steps",
      eval_steps=5,
      logging_dir=f'/content/drive/MyDrive/УИР/intent_logs/{label}',
      logging_steps=5,
      load_best_model_at_end=True
  )

  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset,
      eval_dataset=test_dataset,
  )

  trainer.train()
  trainer.save_model(f'/content/drive/MyDrive/УИР/intent_models/{label}')
  done.append(f"/content/drive/MyDrive/УИР/intent_models/{label}")

  predictions = trainer.predict(test_dataset)
  logits = predictions.predictions
  preds = np.argmax(logits, axis=-1)
  # ROC AUC needs a continuous score for the positive class; hard argmax
  # labels would collapse the curve to a single operating point. For a
  # two-class softmax, P(class 1) == sigmoid(logit_1 - logit_0).
  positive_scores = expit(logits[:, 1] - logits[:, 0])

  metrics['label'].append(label)
  metrics['accuracy'].append(list(accuracy.compute(predictions=preds, references=predictions.label_ids).values()))
  metrics['f1'].append(list(f1.compute(predictions=preds, references=predictions.label_ids).values()))
  metrics['roc_auc'].append(list(roc_auc.compute(prediction_scores=positive_scores, references=predictions.label_ids).values()))
  metrics['recall'].append(list(recall.compute(predictions=preds, references=predictions.label_ids).values()))
  metrics['precision'].append(list(precision.compute(predictions=preds, references=predictions.label_ids).values()))

  # Collect each loss together with the step it was logged at, so the x and y
  # series always have matching lengths regardless of the logging cadence.
  train_steps, train_loss = [], []
  eval_steps, eval_loss = [], []
  for e in trainer.state.log_history:
    if 'loss' in e:
      train_steps.append(e['step'])
      train_loss.append(e['loss'])
    if 'eval_loss' in e:
      eval_steps.append(e['step'])
      eval_loss.append(e['eval_loss'])

  # One figure per label, closed after saving so figures do not accumulate.
  fig, ax = plt.subplots()
  ax.plot(train_steps, train_loss, '-b', label='train loss')
  ax.plot(eval_steps, eval_loss, '-r', label='test loss')
  ax.set_xlabel("step")
  ax.set_ylabel("loss")
  ax.legend()

  fig.savefig(f'/content/drive/MyDrive/УИР/Plots/{label.replace(" ", "_")}.png')
  plt.close(fig)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
5,0.6612,0.677312
10,0.6832,0.672595
15,0.6768,0.664552
20,0.6676,0.653289
25,0.6731,0.639149
30,0.6243,0.619061
35,0.6201,0.591157
40,0.5827,0.562035
45,0.5885,0.529965
50,0.5572,0.489101


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
5,0.6973,0.68507
10,0.6843,0.682962
15,0.6838,0.678466
20,0.6706,0.671719
25,0.6747,0.664609
30,0.6583,0.656671
35,0.653,0.649073
40,0.6453,0.642977
45,0.6307,0.637817
50,0.6495,0.633711


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
5,0.686,0.687935
10,0.6829,0.682819
15,0.6813,0.675345
20,0.6926,0.668019
25,0.6704,0.660775
30,0.6473,0.654225
35,0.6364,0.646288
40,0.6631,0.638322
45,0.6274,0.632684
50,0.6319,0.624847


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
5,0.6863,0.685492
10,0.6798,0.680986
15,0.6807,0.674199
20,0.6814,0.666971
25,0.6769,0.660142
30,0.6507,0.652126
35,0.653,0.644478
40,0.6545,0.636657
45,0.6501,0.629419
50,0.6135,0.618958


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
5,0.6808,0.680178
10,0.677,0.676787
15,0.6733,0.670895
20,0.6953,0.664746
25,0.6575,0.658218
30,0.6414,0.649727
35,0.649,0.641836
40,0.6321,0.63436
45,0.6419,0.627678
50,0.6457,0.618246


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
5,0.6792,0.680848
10,0.6795,0.676836
15,0.6779,0.670051
20,0.6665,0.661763
25,0.6577,0.655446
30,0.642,0.648043
35,0.6752,0.642333
40,0.6388,0.636415
45,0.6659,0.630302
50,0.6342,0.622872


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
5,0.6889,0.687521
10,0.6935,0.683825
15,0.6825,0.677544
20,0.6733,0.669515
25,0.6548,0.660411
30,0.6553,0.650505
35,0.6629,0.640899
40,0.6467,0.631207
45,0.5886,0.619644
50,0.637,0.606


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
5,0.6831,0.680845
10,0.6824,0.677313
15,0.6679,0.670432
20,0.6663,0.661857
25,0.6781,0.653662
30,0.6347,0.644313
35,0.6375,0.634102
40,0.6508,0.625397
45,0.6217,0.616416
50,0.6043,0.60724


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
5,0.6679,0.661537
10,0.6592,0.657588
15,0.6402,0.650707
20,0.645,0.641935
25,0.6764,0.63211
30,0.6131,0.619544
35,0.6573,0.602243
40,0.6005,0.576706
45,0.5555,0.541561
50,0.5287,0.491092


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
5,0.6924,0.68523
10,0.6883,0.681612
15,0.6859,0.674436
20,0.6618,0.665769
25,0.6559,0.657903
30,0.6694,0.652278
35,0.6372,0.646383
40,0.6555,0.640561
45,0.611,0.635183
50,0.5902,0.628954


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
5,0.6868,0.685017
10,0.682,0.679448
15,0.6832,0.671713
20,0.6798,0.664455
25,0.6652,0.654162
30,0.6683,0.644265
35,0.6214,0.630832
40,0.6455,0.615238
45,0.5856,0.600406
50,0.5853,0.584401


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
5,0.6708,0.664519
10,0.6588,0.66191
15,0.6588,0.657432
20,0.6483,0.652317
25,0.6503,0.647227
30,0.6736,0.642401
35,0.6167,0.637807
40,0.6466,0.632801
45,0.6486,0.629631
50,0.6237,0.626059


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
5,0.6809,0.683843
10,0.6899,0.679833
15,0.6858,0.674293
20,0.686,0.66951
25,0.668,0.662447
30,0.6372,0.653767
35,0.6362,0.643966
40,0.6153,0.635792
45,0.6871,0.631365
50,0.6423,0.625998


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
5,0.6776,0.671633
10,0.6672,0.66718
15,0.6758,0.659475
20,0.6472,0.650318
25,0.6483,0.640544
30,0.6588,0.629971
35,0.6064,0.615948
40,0.6217,0.600608
45,0.612,0.583601
50,0.5885,0.565212


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
5,0.6881,0.684955
10,0.6765,0.680671
15,0.679,0.675883
20,0.6717,0.670517
25,0.6848,0.663961
30,0.655,0.655434
35,0.6587,0.647574
40,0.6707,0.642545
45,0.6776,0.638373
50,0.6263,0.632953


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
5,0.6873,0.68333
10,0.6804,0.679359
15,0.675,0.672796
20,0.6643,0.665202
25,0.6686,0.657889
30,0.6778,0.651022
35,0.6354,0.64375
40,0.6506,0.635467
45,0.6514,0.626385
50,0.6247,0.613596


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
5,0.6874,0.687047
10,0.6831,0.683864
15,0.6708,0.67712
20,0.6856,0.668575
25,0.6546,0.658653
30,0.6397,0.647657
35,0.6377,0.636842
40,0.6496,0.627704
45,0.6609,0.617382
50,0.5925,0.602679


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
5,0.703,0.698095
10,0.6989,0.69275
15,0.6867,0.683479
20,0.6618,0.671754
25,0.6683,0.658583
30,0.6541,0.645379
35,0.6658,0.635873
40,0.6286,0.626923
45,0.6299,0.616533
50,0.6378,0.601278


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
5,0.6668,0.674343
10,0.6729,0.671048
15,0.6882,0.665989
20,0.6617,0.660723
25,0.676,0.655527
30,0.6866,0.652492
35,0.6319,0.647318
40,0.5904,0.638076
45,0.6576,0.632515
50,0.647,0.629104


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
5,0.6566,0.66267
10,0.6528,0.660045
15,0.6732,0.65707
20,0.654,0.653373
25,0.6488,0.648864
30,0.6447,0.643754
35,0.65,0.638398
40,0.6289,0.632252
45,0.6743,0.62554
50,0.6076,0.614957


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
5,0.674,0.680305
10,0.6841,0.676706
15,0.6822,0.671067
20,0.6503,0.663437
25,0.6553,0.654965
30,0.6624,0.648388
35,0.6004,0.640537
40,0.6871,0.635445
45,0.6267,0.632934
50,0.6286,0.629929


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
5,0.6882,0.688077
10,0.6852,0.684145
15,0.6854,0.677947
20,0.672,0.669977
25,0.6637,0.661204
30,0.6657,0.65321
35,0.6166,0.643372
40,0.6738,0.634743
45,0.6323,0.629498
50,0.6232,0.625643


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
5,0.6872,0.688425
10,0.6928,0.684533
15,0.6862,0.678948
20,0.6742,0.671824
25,0.671,0.663125
30,0.6724,0.654469
35,0.6337,0.645307
40,0.6423,0.636156
45,0.653,0.629227
50,0.6201,0.619392


<Figure size 640x480 with 0 Axes>

In [None]:
def to_int(x):
  """Unwrap a single-element list and convert its element to int.

  Non-list values are returned unchanged. The conversion uses `int`, so a
  float element is truncated toward zero.

  Raises:
    TypeError: if `x` is a list that does not contain exactly one element
      (previously every list raised TypeError, because `int` was applied to
      the list itself).
  """
  if isinstance(x, list):
    if len(x) != 1:
      raise TypeError(
          f"to_int() expects a single-element list, got {len(x)} elements"
      )
    return int(x[0])
  return x
# Build the metrics table and unwrap single-element lists cell by cell.
# DataFrame.apply hands each whole column (a Series) to the function, so the
# list check has to be applied per element via Series.map to have any effect.
met = pd.DataFrame(metrics).apply(
    lambda column: column.map(lambda x: x[0] if type(x) == list else x)
)

In [None]:
# Replace list-valued metric cells with their first element (other values are
# left as they are), then write the table to Drive without the index.
for metric_name in ('accuracy', 'f1', 'roc_auc', 'recall', 'precision'):
  met[metric_name] = met[metric_name].apply(
      lambda value: value[0] if type(value) == list else value
  )
met.to_csv('/content/drive/MyDrive/УИР/intent_models/metrics.csv', index=None)