In [35]:
import psycopg2
import pandas as pd
from simpletransformers.classification import MultiLabelClassificationModel
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import optuna
from sklearn.metrics import label_ranking_average_precision_score
from simpletransformers.classification import ClassificationArgs

In [5]:
con = psycopg2.connect(
  database="db_dsa", 
  user="udsa", 
  password="P@ssword2021", 
  host="92.242.58.173", 
  port="1984"
)

In [6]:
cur = con.cursor()
cur.execute("SELECT * from PROJECTS")
rows = cur.fetchall()
column_names=["ID","NUMBER","VACANCIES","NAME","TYPE","TYPEDESC","TYPEID","STATUSID", "STATUSDESC","DIRECTIONHEAD","HEAD","DATACREATED","FACULTYID"]
df = pd.DataFrame(rows, columns=column_names)
df.head()

Unnamed: 0,ID,NUMBER,VACANCIES,NAME,TYPE,TYPEDESC,TYPEID,STATUSID,STATUSDESC,DIRECTIONHEAD,HEAD,DATACREATED,FACULTYID
0,205,,2,Численное и аналитическое исследование эффекта...,nir,НИР,1,1,Готов к работе,Данилов Владимир Григорьевич,Данилов Владимир Григорьевич,08.05.2020 21:25:02,
1,208,,3,"Исследование механизмов образования, способов ...",nir,НИР,1,1,Готов к работе,,Грачев Николай,11.05.2020 14:07:40,
2,215,,1,Разработка моста переменного тока,soft-hard,Прогр-аппарат.,3,1,Готов к работе,,Балакин Станислав,13.05.2020 16:41:39,
3,216,,3,Разработка нательных антенн и излучателей для ...,soft-hard,Прогр-аппарат.,3,1,Готов к работе,Елизаров Андрей Альбертович,Скуридин Андрей,13.05.2020 17:00:19,
4,218,,0,Разработка глоссария физических терминов для м...,soft,Прогр.,2,1,Готов к работе,,Гузенкова Александра Сергеевна,13.05.2020 19:38:43,


In [7]:
papers = pd.DataFrame({
    'title': df['NAME'].tolist(),
    'categories': df['TYPE'].tolist()
})

papers['title'] = papers['title'].apply(lambda x: x.replace("\n",""))
papers['title'] = papers['title'].apply(lambda x: x.strip())
papers['text'] = papers['title']

papers['categories'] = papers['categories'].apply(lambda x: tuple(x.split()))

shortlisted_categories = papers['categories'].value_counts().reset_index(name="count").query("count > 1")["index"].tolist()
papers = papers[papers["categories"].isin(shortlisted_categories)].reset_index(drop=True)

papers = papers.sample(frac=1).reset_index(drop=True)

papers = papers.groupby('categories').head(250).reset_index(drop=True)

multi_label_encoder = MultiLabelBinarizer()
multi_label_encoder.fit(papers['categories'])
papers['categories_encoded'] = papers['categories'].apply(lambda x: multi_label_encoder.transform([x])[0])

papers = papers[["text", "categories", "categories_encoded"]]
papers.head()

Unnamed: 0,text,categories,categories_encoded
0,Программно-аппаратный комплекс для распознаван...,"(soft-hard,)","[0, 0, 1]"
1,Равновесные конфигурации точечных вихрей на пл...,"(nir,)","[1, 0, 0]"
2,Мобильное приложение МИЭМ,"(soft,)","[0, 1, 0]"
3,Студия самозаписи,"(soft,)","[0, 1, 0]"
4,Разработка системы VR-визуализации для центра ...,"(soft,)","[0, 1, 0]"


In [8]:
train, test = train_test_split(papers, test_size=0.3, stratify=papers['categories'], random_state=42)

In [60]:
def objective(trial):
    learning_rate = trial.suggest_float("learning_rate", 1e-8, 1e-4, log = True)
    adam_epsilon = trial.suggest_float("adam_epilson", 1e-8, 1e-4, log = True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 1, 3)

    model_args = {
        'sliding_window':False,
        'learning_rate':learning_rate,
        'adam_epsilon':adam_epsilon,
        'train_batch_size':8,
        'eval_batch_size':4,
        'num_train_epochs':num_train_epochs,
        'do_lower_case':False,
        'max_seq_length':512,
        'overwrite_output_dir':True
    }
    model = MultiLabelClassificationModel('xlnet', 
                                      'xlnet-base-cased', 
                                      num_labels=len(shortlisted_categories), 
                                      args=model_args, use_cuda=True)

    model.train_model(train[['text', 'categories_encoded']])
    result, model_outputs, wrong_predictions = model.eval_model(test[['text', 'categories_encoded']])
    return result['LRAP']

In [61]:
study = optuna.create_study(study_name="xlnetmodel")
study.optimize(objective, n_trials=10)

[32m[I 2022-06-18 14:52:32,602][0m A new study created in memory with name: xlnetmodel[0m


Downloading:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForMultiLabelSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForMultiLabelSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'logits_proj.bias', 'sequence_summary.summary.weight', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream tas

Downloading:   0%|          | 0.00/779k [00:00<?, ?B/s]

  0%|          | 0/225 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/97 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/25 [00:00<?, ?it/s]

[32m[I 2022-06-18 14:53:34,357][0m Trial 0 finished with value: 0.7388316151202746 and parameters: {'learning_rate': 3.742331728111638e-05, 'adam_epilson': 3.6466366724719046e-05, 'num_train_epochs': 1}. Best is trial 0 with value: 0.7388316151202746.[0m
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForMultiLabelSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForMultiLabelSequenceClassification were

  0%|          | 0/225 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/29 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/97 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/25 [00:00<?, ?it/s]

[32m[I 2022-06-18 14:54:41,818][0m Trial 1 finished with value: 0.7439862542955324 and parameters: {'learning_rate': 3.1969535426092255e-05, 'adam_epilson': 6.59473176446127e-07, 'num_train_epochs': 2}. Best is trial 0 with value: 0.7388316151202746.[0m
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForMultiLabelSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForMultiLabelSequenceClassification were 

  0%|          | 0/225 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/29 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/29 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/97 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/25 [00:00<?, ?it/s]

[32m[I 2022-06-18 14:56:17,788][0m Trial 2 finished with value: 0.7439862542955324 and parameters: {'learning_rate': 5.2037093636245117e-05, 'adam_epilson': 3.1968296779076815e-05, 'num_train_epochs': 3}. Best is trial 0 with value: 0.7388316151202746.[0m
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForMultiLabelSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForMultiLabelSequenceClassification wer

  0%|          | 0/225 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/29 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/29 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/97 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/25 [00:00<?, ?it/s]

[32m[I 2022-06-18 14:57:53,244][0m Trial 3 finished with value: 0.6237113402061857 and parameters: {'learning_rate': 1.1037698100854913e-08, 'adam_epilson': 1.5512538265188066e-08, 'num_train_epochs': 3}. Best is trial 3 with value: 0.6237113402061857.[0m
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForMultiLabelSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForMultiLabelSequenceClassification wer

  0%|          | 0/225 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/29 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/29 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/97 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/25 [00:00<?, ?it/s]

[32m[I 2022-06-18 14:59:28,441][0m Trial 4 finished with value: 0.7388316151202746 and parameters: {'learning_rate': 1.1607446564559237e-06, 'adam_epilson': 8.368964713875688e-05, 'num_train_epochs': 3}. Best is trial 3 with value: 0.6237113402061857.[0m
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForMultiLabelSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForMultiLabelSequenceClassification were

  0%|          | 0/225 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/29 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/29 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/97 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/25 [00:00<?, ?it/s]

[32m[I 2022-06-18 15:01:03,743][0m Trial 5 finished with value: 0.7439862542955324 and parameters: {'learning_rate': 7.070097788420759e-06, 'adam_epilson': 6.76900787207264e-07, 'num_train_epochs': 3}. Best is trial 3 with value: 0.6237113402061857.[0m
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForMultiLabelSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForMultiLabelSequenceClassification were n

  0%|          | 0/225 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/29 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/29 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/97 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/25 [00:00<?, ?it/s]

[32m[I 2022-06-18 15:02:39,714][0m Trial 6 finished with value: 0.5704467353951894 and parameters: {'learning_rate': 1.553032690415717e-07, 'adam_epilson': 7.412643971702729e-07, 'num_train_epochs': 3}. Best is trial 6 with value: 0.5704467353951894.[0m
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForMultiLabelSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForMultiLabelSequenceClassification were 

  0%|          | 0/225 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/29 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/97 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/25 [00:00<?, ?it/s]

[32m[I 2022-06-18 15:03:46,646][0m Trial 7 finished with value: 0.6941580756013747 and parameters: {'learning_rate': 2.1847247175039256e-08, 'adam_epilson': 1.0137859065557214e-08, 'num_train_epochs': 2}. Best is trial 6 with value: 0.5704467353951894.[0m
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForMultiLabelSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForMultiLabelSequenceClassification wer

  0%|          | 0/225 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/29 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/97 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/25 [00:00<?, ?it/s]

[32m[I 2022-06-18 15:04:57,709][0m Trial 8 finished with value: 0.7439862542955324 and parameters: {'learning_rate': 1.1589031338307614e-07, 'adam_epilson': 2.5774128834795863e-05, 'num_train_epochs': 2}. Best is trial 6 with value: 0.5704467353951894.[0m
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForMultiLabelSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForMultiLabelSequenceClassification wer

  0%|          | 0/225 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/97 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/25 [00:00<?, ?it/s]

[32m[I 2022-06-18 15:05:36,507][0m Trial 9 finished with value: 0.5756013745704471 and parameters: {'learning_rate': 1.6008321914659737e-08, 'adam_epilson': 9.02504588469839e-07, 'num_train_epochs': 1}. Best is trial 6 with value: 0.5704467353951894.[0m


In [62]:
print("Best Params : {}".format(study.best_params))

print("Best LRAP : {}".format(study.best_value))

Best Params : {'learning_rate': 1.553032690415717e-07, 'adam_epilson': 7.412643971702729e-07, 'num_train_epochs': 3}
Best LRAP : 0.5704467353951894


In [63]:
study.best_trial

FrozenTrial(number=6, values=[0.5704467353951894], datetime_start=datetime.datetime(2022, 6, 18, 15, 1, 3, 746793), datetime_complete=datetime.datetime(2022, 6, 18, 15, 2, 39, 713731), params={'learning_rate': 1.553032690415717e-07, 'adam_epilson': 7.412643971702729e-07, 'num_train_epochs': 3}, distributions={'learning_rate': LogUniformDistribution(high=0.0001, low=1e-08), 'adam_epilson': LogUniformDistribution(high=0.0001, low=1e-08), 'num_train_epochs': IntUniformDistribution(high=3, low=1, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=6, state=TrialState.COMPLETE, value=None)