In [1]:
from src.dataset import LINKSData
from globals import Paths

# Labelled abstracts spreadsheet, read with the 'ambiti tecnici' taxonomy type.
abstracts_xlsx = f'{Paths.JOANNE_DIR}/labelled_abstracts_Cerise&Joanne.xlsx'
data = LINKSData(abstracts_xlsx, taxtype='ambiti tecnici')

In [5]:
from src.zero_shooter import TaxZeroShot

# Build the taxonomy zero-shot classifier from the dataset's taxonomy tree.
# Both freeze flags are off for this run.
label_thresholds_json = f'{Paths.SAVE_DIR}/label_alphas_AmbitiTecnici.json'
tax_zero_shooter = TaxZeroShot(
    data.tax_tree,
    label_thresholds_file=label_thresholds_json,
    freeze_zstc=False,
    freeze_usp=False,
)

In [3]:
from src.few_shot.modeling import ExampleFewShot, FewShotTrainer

# Wrap every abstract in an ExampleFewShot carrying one label per taxonomy
# level, then cut the list (in dataset order) into train / validation parts.
perc_training = 0.7
n_train = int(perc_training * data.n_data)
examples = [
    ExampleFewShot(
        text=data.abstracts[idx],
        labels=[data.Y[level][idx] for level in range(data.tax_depth)],
    )
    for idx in range(data.n_data)
]
# First n_train examples train the model; the remainder are held out.
examples_train = examples[:n_train]
examples_valid = examples[n_train:]

In [6]:
# Train few-shots: fine-tune the zero-shot model on the training examples,
# evaluate it on the validation examples, and tag the result with the
# settings of this run.

# Only the first taxonomy level is used here: `labels_to_consider` is every
# level-0 label, `labels_train` the level-0 labels occurring in the training
# examples.
labels_to_consider = data.labels_levels[0]
labels_train = {example.labels[0] for example in examples_train}
fs_trainer = FewShotTrainer(
    labels_all=labels_to_consider,
    labels_train=labels_train
)

lr_zstc = 1e-3
lr_usp = 0.1
n_epochs = 4
# NOTE(review): `tax_zero_shooter` is rebound to whatever `train` returns, so
# re-running this cell presumably keeps training the already fine-tuned
# model -- re-run the model-loading cell first for a clean experiment.
tax_zero_shooter = fs_trainer.train(tax_zero_shooter, examples_train,
                                    examples_valid, lr_zstc=lr_zstc,
                                    lr_usp=lr_usp, n_epochs=n_epochs)
res = fs_trainer.evaluate(tax_zero_shooter, examples_valid)

# Record the run configuration next to the metrics.
res['n_shots'] = len(examples_train)
res['lr_zstc'] = lr_zstc
res['lr_usp'] = lr_usp
res['n_epochs'] = n_epochs
# These must mirror the freeze flags passed to TaxZeroShot(...) when the model
# was built (both False). They were previously recorded as True, which
# mislabelled the run.
res['freeze_zstc'] = False
res['freeze_usp'] = False

100%|██████████| 47/47 [00:11<00:00,  3.96it/s]
100%|██████████| 2/2 [00:24<00:00, 12.26s/it]
[17:22:06][modeling.py]- INFO: SEEN: prec: 0.250, rec: 0.188, f1: 0.199
[17:22:06][modeling.py]- INFO: UNSEEN: prec: 0.000, rec: 0.000, f1: 0.000


In [7]:
# Display the level-0 labels that occur in the training examples.
labels_train

{'Biotechnology and Biosystems Engineering',
 'Computer Science',
 'Earth System Science',
 'Environmental Biology, Ecology and Evolution',
 'Human Mobility, Environment, and Space',
 'Immunity, Infection and Immunotherapy',
 'Institutions, Governance and Legal Systems',
 'Integrative Biology: from Genes and Genomes to Systems',
 'Materials Engineering',
 'Prevention, Diagnosis and Treatment of Human Diseases',
 'Products and Processes Engineering',
 'Sciences and Humanities',
 'Systems and Communication Engineering',
 'The Social World and Its Diversity'}

In [8]:
# Level-0 labels that never occur in the training examples, in taxonomy order.
[
    label
    for label in labels_to_consider
    if label not in labels_train
]

['The Study of the Human Past',
 'The Human Mind and Its Complexity',
 'Mathematics',
 'Physiology in Health, Disease and Ageing',
 'Cellular, Developmental and Regenerative Biology',
 'Physical and Analytical Chemical Sciences',
 'Neuroscience and Disorders of the Nervous System',
 'Universe Sciences',
 'Computer Science and Informatics',
 'Synthetic Chemistry and Materials',
 'Individuals, Markets and Organisations',
 'Cultures and Cultural Production',
 'Fundamental Constituents of Matter',
 'Molecules of Life: Biological Mechanisms, Structures and Functions',
 'Condensed Matter Physics']