In [1]:
import logging

import pandas as pd
from simpletransformers.ner import NERModel, NERArgs
import re
import json
import wandb

import os
import glob
from sklearn.model_selection import train_test_split

In [2]:
NAME = "musicians_with_persons-1"
DATASET = '../data/musicians_dataset'
OUTPUTS = './outputs'
EXPERIMENTS = '../experiments'
LABELS = {"B": "B-MUS", "PB": "B-PER", "I": "I-MUS", "PI": "I-PER", "O": "O"}

In [3]:
BATCH_SIZE = 16
EPOCHS = 1
ARGS = {
        "seed": 42,
        "labels_list": list(LABELS.values()),
        "reprocess_input_data": True,
        "overwrite_output_dir": True,
        "train_batch_size": BATCH_SIZE,
        "eval_batch_size": BATCH_SIZE,
        "num_train_epochs": EPOCHS,
        "save_eval_checkpoints": False,
        "save_steps": -1,
        "use_multiprocessing": False,
        "use_multiprocessing_for_evaluation": False,
        "evaluate_during_training": True,
        "evaluate_during_training_steps": 50,
        "evaluate_during_training_verbose": True,
        "fp16": False,
        "wandb_project": NAME,
        "learning_rate": 0.0003,
        "warmup_ratio": 0.1,
        "logging_steps": 1,
    }

In [4]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

### data preparation

In [5]:
def ingest_data_to_df(filepath, label_map):
    tagged_data = []
    sentence_number = 0
    with open(filepath, 'r') as f:
        for line in f.readlines():
            for tagged_word in line.split():
                try:
                    word, tag = tagged_word.split("-[", 1)
                    old_tag = tag.split("]", 1)[0]
                    new_tag = label_map[old_tag]
                    tagged_data.append([sentence_number, word, new_tag])
                except Exception as e:
                    raise Exception((tagged_word, line), e)
            sentence_number += 1
    return pd.DataFrame(tagged_data, columns=["sentence_id", "words", "labels"])

def get_dataset_parts(dataset_path):
    devpath, trainpath, testpath = f"{dataset_path}/dev.txt", f"{dataset_path}/train.txt", f"{dataset_path}/test.txt"
    train = ingest_data_to_df(trainpath, LABELS)
    test = ingest_data_to_df(testpath, LABELS)
    dev = ingest_data_to_df(devpath, LABELS)
    return dev, train, test

def untag_dev_sentences(devpath):
    output_path = f"sentences_{devpath}"
    clean_sentences = []
    with open(devpath, "r") as fin:
        for line in fin.readlines():
            sentence = re.sub('\-\[[A-Z]*\]',  '', line)
            clean_sentences.append(sentence)
    return clean_sentences

def split_train_set():
    trainpath = f'{DATASET}/train.txt'
    df = pd.read_csv(trainpath, sep='\t', names=['sentence'])
    sp_train, sp_dev = train_test_split(df, test_size=0.2)
    
    # this is significantly slower, but ensures the output is exactly like the input, with no extra escaping and quotes.
    with open(f'{DATASET}/split_train.txt', "w") as ft:
        for i, row in sp_train.iterrows():
            ft.write(row['sentence']+"\n")
    with open(f'{DATASET}/split_dev.txt', "w") as fd:
        for i, row in sp_dev.iterrows():
            fd.write(row['sentence']+"\n")

### train and evaluate with simpletransformers

In [6]:
def configure_model(args, model_type="roberta", model_name="roberta-base"):
    model = NERModel(
        model_type, 
        model_name, 
        args=args
    )
    return model

def send_prediction_to_conll(predictions, experiments_path):
    with open(f"{experiments_path}/predictions/{NAME}.conll", "w") as f:
        for sent in predictions:
            for token_dict in sent:
                for k, v in token_dict.items():
                    f.write(f"{k} {v}\n")

def document_results(experiments_path, experiment_name, outputs):
    details = dict()
    with open(f"{experiments_path}/{experiment_name}.json", "w") as f:
        with open(f"{outputs}/best_model/model_args.json","r") as args:
            details["model_args"] = json.load(args)
        with open(f"{outputs}/best_model/config.json","r") as conf:
            details["best_model_conf"] = json.load(conf)
        with open(f"{outputs}/best_model/eval_results.txt","r") as res:
            details["eval_results"] = dict()
            for r in res.readlines():
                k, v = r.split (" = ")
                details["eval_results"][k] = v
        json.dump(details, f)

        
def main():
    dev_set, train_set, test_set = get_dataset_parts(DATASET)
    model = configure_model(args=ARGS)
    model.train_model(train_set, eval_data=dev_set)
    clean_sentences = untag_dev_sentences(f"{DATASET}/dev.txt")
    predictions, raw_outputs = model.predict(clean_sentences)
    document_results(EXPERIMENTS, NAME, OUTPUTS)
    send_prediction_to_conll(predictions, EXPERIMENTS)


In [7]:
main()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able

  0%|          | 0/27760 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

[34m[1mwandb[0m: Currently logged in as: [33mshovals[0m (use `wandb login --relogin` to force relogin)


Running Epoch 0 of 1:   0%|          | 0/1735 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.18900670632719993, 'precision': 0.5166051660516605, 'recall': 0.5054151624548736, 'f1_score': 0.5109489051094891}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.2117839962244034, 'precision': 0.4458204334365325, 'recall': 0.51985559566787, 'f1_score': 0.48}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.16933393329381943, 'precision': 0.4006024096385542, 'recall': 0.48014440433212996, 'f1_score': 0.43678160919540227}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.2184601306915283, 'precision': 0.4186046511627907, 'recall': 0.4548736462093863, 'f1_score': 0.4359861591695502}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.19760058224201202, 'precision': 0.4365781710914454, 'recall': 0.5342960288808665, 'f1_score': 0.4805194805194805}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.19164901822805405, 'precision': 0.4891304347826087, 'recall': 0.48736462093862815, 'f1_score': 0.4882459312839059}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.2600663222372532, 'precision': 0.38622754491017963, 'recall': 0.4657039711191336, 'f1_score': 0.42225859247135844}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.2327643722295761, 'precision': 0.5, 'recall': 0.4981949458483754, 'f1_score': 0.49909584086799275}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.18252695426344873, 'precision': 0.46645367412140576, 'recall': 0.5270758122743683, 'f1_score': 0.49491525423728816}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.254746999591589, 'precision': 0.42045454545454547, 'recall': 0.5342960288808665, 'f1_score': 0.4705882352941177}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.21549150645732879, 'precision': 0.46875, 'recall': 0.37906137184115524, 'f1_score': 0.41916167664670656}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.2447565283626318, 'precision': 0.45482866043613707, 'recall': 0.5270758122743683, 'f1_score': 0.48829431438127097}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.17046978175640107, 'precision': 0.559322033898305, 'recall': 0.5956678700361011, 'f1_score': 0.576923076923077}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.15923612788319588, 'precision': 0.558641975308642, 'recall': 0.6534296028880866, 'f1_score': 0.6023294509151415}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.16823448613286018, 'precision': 0.6118881118881119, 'recall': 0.631768953068592, 'f1_score': 0.6216696269982238}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.18439341634511947, 'precision': 0.49836065573770494, 'recall': 0.5487364620938628, 'f1_score': 0.5223367697594502}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.1999668963253498, 'precision': 0.5, 'recall': 0.5234657039711191, 'f1_score': 0.5114638447971782}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.6209555894136429, 'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.5216503947973251, 'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.5044460117816925, 'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.5111218631267548, 'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.49323450922966006, 'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.5117915898561478, 'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.5097633570432663, 'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.4994585007429123, 'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.508293554186821, 'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.5049131840467453, 'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.5143223494291306, 'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.49861293137073515, 'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.5008940607309341, 'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.4992155760526657, 'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.5012007802724838, 'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.5005532443523407, 'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.5015634924173356, 'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.5016241580247879, 'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0}
INFO:simpletransformers.ner.ner_model: Training of roberta model complete. Saved to outputs/.
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/10 [00:00<?, ?it/s]

In [13]:

sp_dev

Unnamed: 0,sentence
352,"He-[O] was-[O] a-[O] leader-[O] ,-[O] along-[O..."
6189,Best-[O] of-[O] the-[O] Soul-[O] Years-[O] is-...
24182,"However-[O] ,-[O] Leah-[PB] has-[O] kept-[O] s..."
19702,Robert-[B] James-[I] Smith-[I] (-[O] born-[O] ...
11918,In-[O] his-[O] early-[O] years-[O] in-[O] the-...
...,...
10713,She-[O] was-[O] the-[O] mother-[O] of-[O] the-...
16730,Temperley-[O] suffered-[O] relegation-[O] to-[...
26834,Highway-[O] 308-[O] begins-[O] at-[O] Highway-...
16425,"Founded-[O] in-[O] 1989-[O] ,-[O] by-[O] Frank..."


In [24]:
with open('../data/musicians_dataset/split_train.txt', "w") as ft:
    for i, row in sp_train.iterrows():
        ft.write(row['sentence']+"\n")
with open('../data/musicians_dataset/split_dev.txt', "w") as fd:
    for i, row in sp_dev.iterrows():
        fd.write(row['sentence']+"\n")
# sp_train['sentence'].to_csv('../data/musicians_dataset/split_train.txt', index=False, sep='\t', header=False)