# Tests

In [1]:
from _lib.trainer import DocumentEmbeddingType
from _lib.trainer import load_corpus, init_document_embedding, train_model, test


# Document-embedding variants that the experiment loops below iterate over.
emb_types = [DocumentEmbeddingType.POOL, DocumentEmbeddingType.RNN]

In [4]:
# Run `test` on the '8_typeoftrip_20' corpus for every pairing of word-vector
# file ('8', '8_g') and document-embedding type, in the same order as a
# nested loop (vector file outermost).
for vec_fname, emb_type in [(v, e) for v in ('8', '8_g') for e in emb_types]:
    test('8_typeoftrip_20', vec_fname, emb_type)

[32m[I 2022-05-26 03:51:09,419][0m Trial 49 finished with value: 0.5431769722814499 and parameters: {'optimizer': 'Adagrad', 'lr': 0.0061190284559545565, 'min_batch_size': 192, 'anneal_factor': 0.1, 'warmup_fraction': 0.6960886177574981, 'hidden_size': 390, 'rnn_layers': 2, 'bidirectional': False, 'dropout': 0.20492127866343263, 'rnn_type': 'LSTM'}. Best is trial 44 with value: 0.5708955223880597.[0m


params: {'optimizer': 'Adagrad', 'lr': 0.030104675841443735, 'min_batch_size': 64, 'anneal_factor': 0.1, 'warmup_fraction': 0.37239158129342714, 'hidden_size': 382, 'rnn_layers': 2, 'bidirectional': False, 'dropout': 0.1594257108834852, 'rnn_type': 'LSTM'}
value: 0.5708955223880597


In [5]:
# Run `test` on the '9_typeoftrip_20' corpus for every pairing of word-vector
# file ('9', '9_g') and document-embedding type, in the same order as a
# nested loop (vector file outermost).
for vec_fname, emb_type in [(v, e) for v in ('9', '9_g') for e in emb_types]:
    test('9_typeoftrip_20', vec_fname, emb_type)

[32m[I 2022-05-27 07:39:09,305][0m Trial 49 finished with value: 0.5678571428571428 and parameters: {'optimizer': 'Adam', 'lr': 0.00020309470413567372, 'min_batch_size': 160, 'anneal_factor': 0.1, 'warmup_fraction': 0.3537305091984672, 'hidden_size': 352, 'rnn_layers': 2, 'bidirectional': True, 'dropout': 0.3616598713302082, 'rnn_type': 'GRU'}. Best is trial 41 with value: 0.5974489795918367.[0m


params: {'optimizer': 'Adam', 'lr': 0.0015753731379671806, 'min_batch_size': 160, 'anneal_factor': 0.1, 'warmup_fraction': 0.018745561493664156, 'hidden_size': 312, 'rnn_layers': 1, 'bidirectional': True, 'dropout': 0.3200865960265571, 'rnn_type': 'GRU'}
value: 0.5974489795918367


In [5]:
# Run `test` on the '10_typeoftrip_20' corpus for every pairing of word-vector
# file ('10', '10_g') and document-embedding type, in the same order as a
# nested loop (vector file outermost).
for vec_fname, emb_type in [(v, e) for v in ('10', '10_g') for e in emb_types]:
    test('10_typeoftrip_20', vec_fname, emb_type)

[32m[I 2022-05-28 21:01:07,830][0m Trial 49 finished with value: 0.5058043117744611 and parameters: {'optimizer': 'SGD', 'lr': 0.0893505436811484, 'min_batch_size': 32, 'anneal_factor': 0.2, 'warmup_fraction': 0.024267567775304245, 'hidden_size': 190, 'rnn_layers': 5, 'bidirectional': False, 'dropout': 0.27912872014245704, 'rnn_type': 'GRU'}. Best is trial 46 with value: 0.5953565505804311.[0m


params: {'optimizer': 'Adam', 'lr': 0.0011428542164246822, 'min_batch_size': 32, 'anneal_factor': 0.2, 'warmup_fraction': 0.07554823779538865, 'hidden_size': 229, 'rnn_layers': 5, 'bidirectional': False, 'dropout': 0.37528418807671327, 'rnn_type': 'GRU'}
value: 0.5953565505804311


# Best of the Best

## 8_typeoftrip

In [1]:
import pandas as pd

from _lib.helper import get_file_paths
from _lib.settings import DATA_FLAIR_TESTS_DIR, DATA_FLAIR_CORPUS_DIR, DATA_W2V_KEYED_VECTORS_DIR
from _lib.trainer import DocumentEmbeddingType
from _lib.trainer import load_corpus, init_document_embedding, train_model, test
from flair.embeddings import WordEmbeddings
from IPython.display import clear_output


# Document-embedding variants whose best runs are collected by get_best_runs().
emb_types = [DocumentEmbeddingType.POOL, DocumentEmbeddingType.RNN]

In [2]:
def get_best_runs(resolution, top):
    """Collect the parameters of the best trials recorded for a resolution.

    For every embedding type in the module-level ``emb_types`` list, each CSV
    returned by ``get_file_paths(DATA_FLAIR_TESTS_DIR, includes=[emb_type,
    str(resolution)])`` is read, ordered by its ``value`` column (highest
    first) and cut down to its first ``top`` rows.

    Args:
        resolution: Resolution identifier; its string form is passed to
            ``get_file_paths`` as a filter.
        top: Number of highest-``value`` rows to keep from each file.

    Returns:
        pandas.DataFrame with a fresh ``0..n-1`` index. It holds only the
        columns whose name contains ``'params'``, plus ``params_emb_type``
        (the embedding type the file was found for) and ``params_graph``
        (``True`` when ``'_g_'`` occurs in the file path). An empty DataFrame
        is returned when no file matches.
    """
    # Frames are gathered in a list and concatenated once: DataFrame.append
    # was removed in pandas 2.0 and re-copied the accumulator on every call.
    best_frames = []

    for emb_type in emb_types:
        for fpath in get_file_paths(DATA_FLAIR_TESTS_DIR, includes=[emb_type, str(resolution)]):
            trials = pd.read_csv(fpath)
            param_columns = [column for column in trials.columns if 'params' in column]

            # Explicit copy so the column assignments below write to an
            # independent frame rather than to a slice of `trials`.
            best = (
                trials
                .sort_values(['value'], ascending=False)
                .iloc[:top][param_columns]
                .copy()
            )
            best['params_emb_type'] = emb_type
            best['params_graph'] = '_g_' in fpath
            best_frames.append(best)

    if not best_frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()

    return pd.concat(best_frames, ignore_index=True)


def run_final_tests(resolution, max_epochs, top=2):
    """Retrain the best configurations for a resolution and store their test scores.

    The rows returned by ``get_best_runs(resolution, top)`` are ordered by
    ``params_emb_type`` (descending) and each row is trained once through
    ``train_model``. The ``test_score`` entry of every training result is
    written to the ``value`` column of that row, and the resulting table is
    saved to ``{DATA_FLAIR_TESTS_DIR}/final_{resolution}_typeoftrip.csv``.

    Args:
        resolution: Resolution identifier; selects the corpus directory
            (``{resolution}_typeoftrip``) and the keyed-vectors file
            (``{resolution}.kv`` or ``{resolution}_g.kv``).
        max_epochs: Forwarded to ``train_model`` and logged in the wandb config.
        top: Number of best trials taken from each result file (default 2).

    Returns:
        None. The outcome is the CSV file written at the end.
    """
    # Loop-invariant settings. They are bound before the loop so that
    # `project_name` exists for the final `to_csv` even if there are no rows.
    corpus_dir = f'{resolution}_typeoftrip'
    project_name = f"final_{corpus_dir}"
    label_type = 'mylable'

    df_bfb_params = get_best_runs(resolution, top)
    df_bfb_params = df_bfb_params.sort_values(['params_emb_type'], ascending=False)

    for i, run in df_bfb_params.iterrows():
        # NOTE(review): the corpus is reloaded for every run, as before; it is
        # not visible here whether training mutates it, so it is not hoisted.
        corpus, label_dict = load_corpus(f"{DATA_FLAIR_CORPUS_DIR}/{corpus_dir}", label_type)
        glove_embedding = WordEmbeddings(f"{DATA_W2V_KEYED_VECTORS_DIR}/{resolution}{'_g' if run['params_graph'] else ''}.kv")

        document_embeddings, embed_params = init_document_embedding(
            embedding_type=run['params_emb_type'],
            glove_embedding=glove_embedding,
            fine_tune_mode=run['params_fine_tune_mode'],
            pooling=run['params_pooling'],
            hidden_size=int(run['params_hidden_size']),
            rnn_layers=int(run['params_rnn_layers']),
            bidirectional=run['params_bidirectional'],
            dropout=run['params_dropout'],
            rnn_type=run['params_rnn_type'],
        )

        # Built once and used both for logging and for the training call, so
        # the logged configuration cannot drift from what is actually trained.
        training_params = {
            'max_epochs': max_epochs,
            'learning_rate': run['params_lr'],
            'mini_batch_size': run['params_min_batch_size'],
            'optimizer': run['params_optimizer'],
            'anneal_factor': run['params_anneal_factor'],
            'warmup_fraction': run['params_warmup_fraction'],
        }

        wandb_config = {
            'emb_type': run['params_emb_type'],
            'graph': run['params_graph'],
            **training_params,
        }
        wandb_config.update(embed_params)

        # Only the result dict is needed; the trainer and model are discarded.
        result, _, _ = train_model(
            project_name=project_name,
            wandb_config=wandb_config,
            document_embeddings=document_embeddings,
            label_type=label_type,
            label_dict=label_dict,
            corpus=corpus,
            param_selection_mode=False,
            **training_params,
        )

        df_bfb_params.at[i, 'value'] = result['test_score']

        # Replace the previous run's notebook output once new output arrives.
        clear_output(wait=True)

    df_bfb_params.to_csv(f"{DATA_FLAIR_TESTS_DIR}/{project_name}.csv")

In [3]:
# Final training runs for resolution 8.
run_final_tests(resolution=8, max_epochs=100)

2022-05-30 01:00:20,371 Reading data from /media/yyeliseyenka/Zalman Data/lvrobi-gityy/data/flair/corpus/8_typeoftrip
2022-05-30 01:00:20,372 Train: /media/yyeliseyenka/Zalman Data/lvrobi-gityy/data/flair/corpus/8_typeoftrip/train.csv
2022-05-30 01:00:20,374 Dev: /media/yyeliseyenka/Zalman Data/lvrobi-gityy/data/flair/corpus/8_typeoftrip/dev.csv
2022-05-30 01:00:20,375 Test: /media/yyeliseyenka/Zalman Data/lvrobi-gityy/data/flair/corpus/8_typeoftrip/test.csv
2022-05-30 01:00:20,531 Computing label dictionary. Progress:


100%|██████████| 28146/28146 [00:07<00:00, 3867.04it/s]

2022-05-30 01:00:27,939 Corpus contains the labels: mylable (#28146)
2022-05-30 01:00:27,939 Created (for label 'mylable') Dictionary with 5 tags: <unk>, hometowork, leisure, other, hometoschool



[34m[1mwandb[0m: Currently logged in as: [33mbjuggler[0m. Use [1m`wandb login --relogin`[0m to force relogin


2022-05-30 01:00:37,438 ----------------------------------------------------------------------------------------------------
2022-05-30 01:00:37,439 Model: "TextClassifier(
  (loss_function): CrossEntropyLoss()
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings(
        '/media/yyeliseyenka/Zalman Data/lvrobi-gityy/data/word2vec/keyed_vectors/8.kv'
        (embedding): Embedding(33574, 50)
      )
    )
    (word_reprojection_map): Linear(in_features=50, out_features=50, bias=True)
    (rnn): GRU(50, 181, num_layers=3, batch_first=True)
    (dropout): Dropout(p=0.3216712931095337, inplace=False)
  )
  (decoder): Linear(in_features=181, out_features=5, bias=True)
  (weights): None
  (weight_tensor) None
)"
2022-05-30 01:00:37,440 ----------------------------------------------------------------------------------------------------
2022-05-30 01:00:37,441 Corpus: "Corpus: 28146 train + 9382 dev + 9381 test sentences



2022-05-30 01:00:39,697 epoch 1 - iter 11/110 - loss 0.00643392 - samples/sec: 1498.11 - lr: 0.009445
2022-05-30 01:00:41,511 epoch 1 - iter 22/110 - loss 0.00554513 - samples/sec: 1999.35 - lr: 0.009445
2022-05-30 01:00:43,371 epoch 1 - iter 33/110 - loss 0.00515484 - samples/sec: 1891.64 - lr: 0.009445
2022-05-30 01:00:45,343 epoch 1 - iter 44/110 - loss 0.00497453 - samples/sec: 2062.45 - lr: 0.009445
2022-05-30 01:00:47,005 epoch 1 - iter 55/110 - loss 0.00486850 - samples/sec: 2110.36 - lr: 0.009445
2022-05-30 01:00:48,847 epoch 1 - iter 66/110 - loss 0.00478727 - samples/sec: 1881.89 - lr: 0.009445
2022-05-30 01:00:50,615 epoch 1 - iter 77/110 - loss 0.00473282 - samples/sec: 1995.90 - lr: 0.009445
2022-05-30 01:00:52,540 epoch 1 - iter 88/110 - loss 0.00468157 - samples/sec: 1831.02 - lr: 0.009445
2022-05-30 01:00:54,418 epoch 1 - iter 99/110 - loss 0.00464144 - samples/sec: 1877.65 - lr: 0.009445
2022-05-30 01:00:56,263 epoch 1 - iter 110/110 - loss 0.00461206 - samples/sec: 19

In [None]:
# Final training runs for resolution 9.
run_final_tests(resolution=9, max_epochs=100)

In [None]:
# Final training runs for resolution 10.
run_final_tests(resolution=10, max_epochs=100)

In [None]:
# NOTE(review): bare reference, not a call — this only displays the function
# object. Looks like an unfinished cell; add arguments or remove it.
run_final_tests