# In[1]:
import logging

import pandas as pd
from simpletransformers.ner import NERModel, NERArgs
import re
import json

# In[4]:
# Filesystem locations referenced by the functions in this file.
dataset = './musicians_dataset'
outputs = './outputs'

# Root logging is configured at INFO; the "transformers" logger is raised to
# WARNING so that only warnings and errors from it are emitted.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# In[5]:
def ingest_data_to_df(filepath, with_person=True):
    """Load a tagged corpus file into a token-level DataFrame.

    Every line of the file is treated as one sentence made of
    whitespace-separated tokens of the form ``word-[TAG``..., where the tag
    follows the last ``-[`` in the token. Tags are mapped as follows:

    * tag starts with ``B``  -> ``"B-MUS"``
    * tag starts with ``I``  -> ``"I-MUS"``
    * tag starts with ``PB`` -> ``"B-PER"`` (only when ``with_person``)
    * tag starts with ``PI`` -> ``"I-PER"`` (only when ``with_person``)
    * anything else          -> ``"O"``

    Args:
        filepath: Path of the tagged text file (read as UTF-8).
        with_person: When false, ``PB``/``PI`` tags are collapsed to ``"O"``.

    Returns:
        A ``pandas.DataFrame`` with columns ``sentence_id`` (0-based line
        index in the file), ``words`` and ``labels``, one row per token.

    Raises:
        ValueError: If a token does not contain the ``-[`` separator.
    """
    tagged_data = []
    # Explicit UTF-8 so non-ASCII names do not depend on the platform locale.
    with open(filepath, "r", encoding="utf-8") as f:
        # enumerate keeps the original numbering: one id per physical line,
        # blank lines included.
        for sentence_number, line in enumerate(f):
            for tagged_word in line.split():
                # The tag is the trailing marker, so split from the right;
                # this keeps words that themselves contain "-[" intact.
                word, tag = tagged_word.rsplit("-[", 1)
                if tag.startswith("B"):
                    new_tag = "B-MUS"
                elif tag.startswith("I"):
                    new_tag = "I-MUS"
                elif with_person and tag.startswith("PB"):
                    new_tag = "B-PER"
                elif with_person and tag.startswith("PI"):
                    new_tag = "I-PER"
                else:
                    new_tag = "O"
                tagged_data.append([sentence_number, word, new_tag])
    return pd.DataFrame(tagged_data, columns=["sentence_id", "words", "labels"])

def get_dataset_parts(dataset_path):
    """Load the dev, train and test splits stored under ``dataset_path``.

    Args:
        dataset_path: Directory containing ``dev.txt``, ``train.txt`` and
            ``test.txt``.

    Returns:
        A ``(dev, train, test)`` tuple holding the result of
        ``ingest_data_to_df`` for each split file.
    """
    # Build the paths from the argument; previously the module-level
    # ``dataset`` was used and ``dataset_path`` was silently ignored.
    dev, train, test = (
        ingest_data_to_df(f"{dataset_path}/{split}.txt")
        for split in ("dev", "train", "test")
    )
    return dev, train, test

def untag_dev_sentences(devpath):
    """Return the lines of a tagged file with every ``-[TAG]`` marker removed.

    Args:
        devpath: Path of a text file (read as UTF-8) whose tokens carry
            markers of the form ``-[TAG]``, where ``TAG`` is zero or more
            uppercase ASCII letters.

    Returns:
        A list with one string per line of the file, in file order. Line
        endings are kept exactly as read.
    """
    # Raw string: "\-" and "\[" are invalid escape sequences in a normal
    # string literal and trigger a warning on recent Python versions.
    tag_marker = re.compile(r"-\[[A-Z]*\]")
    with open(devpath, "r", encoding="utf-8") as fin:
        return [tag_marker.sub("", line) for line in fin]


def configure_model(labels=None, batch_size=16, model_type="roberta", model_name="roberta-base"):
    """Create an ``NERModel`` with this script's training options applied.

    Args:
        labels: Value stored in the arguments' ``labels_list``.
        batch_size: Value stored in the arguments' ``train_batch_size``.
        model_type: First positional argument passed to ``NERModel``.
        model_name: Second positional argument passed to ``NERModel``.

    Returns:
        The constructed ``NERModel``; ``evaluate_during_training`` is always
        switched on in its arguments.
    """
    ner_args = NERArgs()
    overrides = {
        "train_batch_size": batch_size,
        "evaluate_during_training": True,
        "labels_list": labels,
    }
    for option, value in overrides.items():
        setattr(ner_args, option, value)
    return NERModel(model_type, model_name, args=ner_args)

def send_prediction_to_conll(prediction, filepath):
    """Write token-level predictions to ``filepath`` in a CoNLL-style layout.

    Each token is written as ``"<word> <label>"`` on its own line, and every
    sentence is followed by one blank line.

    Args:
        prediction: Iterable of sentences, where each sentence is an iterable
            of ``{word: label}`` dicts (presumably the first value returned
            by ``NERModel.predict`` -- confirm against the caller).
        filepath: Destination file; it is created or overwritten (UTF-8).
    """
    with open(filepath, "w", encoding="utf-8") as f:
        for sent in prediction:
            for token_dict in sent:
                # .items() is required: iterating the dict directly yields
                # only the keys, which cannot be unpacked into (word, label).
                for word, label in token_dict.items():
                    f.write(f"{word} {label}\n")
            # Blank line marks the sentence boundary.
            f.write("\n")

def document_results(experiment_name, outputs, results):
    """Open the JSON report file for an experiment (unfinished stub).

    At present this only creates or truncates
    ``../experiments/{experiment_name}.json`` and builds an empty ``details``
    dict. Nothing is written to the file, and ``outputs`` and ``results`` are
    not used yet.

    Args:
        experiment_name: Base name (without extension) of the report file.
        outputs: Currently unused.
        results: Currently unused.
    """
    report_path = f"../experiments/{experiment_name}.json"
    with open(report_path, "w") as f:
        details = {}
        # TODO: continue building results like in experiments/experiment1.
        # Paths to collect from:
        #   outputs/model_args.json
        #   outputs/best_model/config.json
        #   outputs/best_model/eval_results.txt (convert this to a dict), e.g.:
        #     eval_loss = 0.2069134594578492
        #     f1_score = 0.5806451612903225
        #     precision = 0.5765124555160143
        #     recall = 0.5848375451263538

def main():
    """Train the NER model on the musicians dataset and save dev predictions.

    Loads the dataset splits, strips the tags from the dev file to obtain raw
    sentences, trains on the train split with the dev split as evaluation
    data, predicts on the raw dev sentences and passes the predictions to
    ``send_prediction_to_conll`` with ``{outputs}/dev_predictions.conll`` as
    the destination.
    """
    # The test split is loaded by get_dataset_parts but not used here.
    dev_set, train_set, _test_set = get_dataset_parts(dataset)
    clean_sentences = untag_dev_sentences(f"{dataset}/dev.txt")
    musician_labels = ["B-MUS", "B-PER", "I-MUS", "I-PER", "O"]
    model = configure_model(labels=musician_labels)
    # Was ``train`` (undefined); the training frame is ``train_set``.
    model.train_model(train_set, eval_data=dev_set)
    predictions, _raw_outputs = model.predict(clean_sentences)

    # Was called with the undefined names ``prediction`` and ``filepath``.
    send_prediction_to_conll(predictions, f"{outputs}/dev_predictions.conll")