In [None]:
import logging

import pandas as pd
from simpletransformers.ner import NERModel, NERArgs
import re
import json
import wandb

import os
import glob
import numpy as np

In [None]:
# Experiment identity: the W&B project name and the tag derived from it.
WANDB_NAME = "sents_with_no_ents"
EXPERIMENT_NAME = WANDB_NAME + "-manual"

# Relative locations of the dataset, the model output directory and the
# directory where experiment records are written.
DATASET = '../data/musicians_dataset'
OUTPUTS = './outputs'
EXPERIMENTS = '../experiments'

# Translation from the short tags found in the tagged-text files to the
# label strings given to the model.
LABELS = dict(B="B-MUS", PB="B-PER", I="I-MUS", PI="I-PER", O="O")

# Base names (without extension) of the dataset split files.
DEV = "dev_converted"
TRAIN = "split_train_2"
TEST = "split_test_2"

In [None]:
BATCH_SIZE = 64
EPOCHS = 3
SEED = 42

# Options handed to simpletransformers' NERModel (see configure_model).
ARGS = {
    # simpletransformers seeds its RNGs from `manual_seed`; a plain `seed`
    # key is not one of its documented options, so on its own it leaves the
    # run unseeded. `seed` is kept only so existing readers of ARGS still
    # find it.
    "manual_seed": SEED,
    "seed": SEED,
    "labels_list": list(LABELS.values()),
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "train_batch_size": BATCH_SIZE,
    "eval_batch_size": BATCH_SIZE,
    "num_train_epochs": EPOCHS,
    "save_eval_checkpoints": False,
    "save_steps": -1,
    "use_multiprocessing": False,
    "use_multiprocessing_for_evaluation": False,
    # document_results() reads from `<outputs>/best_model`, so evaluation
    # during training has to stay enabled.
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 50,
    "evaluate_during_training_verbose": True,
    "fp16": False,
    "wandb_project": WANDB_NAME,
    "learning_rate": 0.0003,
    "warmup_ratio": 0.1,
    "logging_steps": 1,
}

In [None]:
# Keep the `transformers` logger down to warnings and above.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Root logging configuration for everything else: INFO and above.
logging.basicConfig(level=logging.INFO)

### data preparation

In [None]:
def ingest_data_to_df(filepath, label_map):
    """Read a tagged-text file into a one-token-per-row DataFrame.

    Every line of the file is treated as one sentence made of
    whitespace-separated tokens shaped like ``word-[TAG]``. The text between
    ``-[`` and the following ``]`` is looked up in ``label_map``.

    Args:
        filepath: Path of the tagged text file; it is read as UTF-8.
        label_map: Mapping from the tag found in the file to the label stored
            in the resulting frame.

    Returns:
        A DataFrame with columns ``sentence_id``, ``words`` and ``labels``.
        ``sentence_id`` is the zero-based line number, so a blank line
        contributes no rows but still consumes an id.

    Raises:
        ValueError: If a token has no ``-[`` marker or its tag is missing
            from ``label_map``. The offending token and line are included in
            the message and the original error is chained.
    """
    tagged_data = []
    # Explicit UTF-8 so non-ASCII names do not depend on the platform default.
    with open(filepath, 'r', encoding='utf-8') as f:
        # Iterate lazily instead of materialising the whole file.
        for sentence_number, line in enumerate(f):
            for tagged_word in line.split():
                try:
                    word, tag = tagged_word.split("-[", 1)
                    old_tag = tag.split("]", 1)[0]
                    new_tag = label_map[old_tag]
                except (ValueError, KeyError) as e:
                    raise ValueError(
                        f"Cannot parse token {tagged_word!r} in line {line!r}"
                    ) from e
                tagged_data.append([sentence_number, word, new_tag])
    return pd.DataFrame(tagged_data, columns=["sentence_id", "words", "labels"])

def get_dataset_parts(dataset_path, dev_name, train_name, test_name):
    """Load the dev, train and test splits from tagged-text files.

    Each split is read from ``<dataset_path>/<name>.txt`` through
    ``ingest_data_to_df`` using the module-level ``LABELS`` mapping.

    Returns:
        A ``(dev, train, test)`` tuple of DataFrames.
    """
    def _load_split(split_name):
        # All splits share the same directory, extension and label mapping.
        return ingest_data_to_df(f"{dataset_path}/{split_name}.txt", LABELS)

    train_df = _load_split(train_name)
    test_df = _load_split(test_name)
    dev_df = _load_split(dev_name)
    return dev_df, train_df, test_df

def untag_dev_sentences(devpath):
    """Return the lines of a tagged-text file with the ``-[TAG]`` suffixes removed.

    Args:
        devpath: Path of the tagged text file; it is read as UTF-8.

    Returns:
        A list with one string per line of the file. Every occurrence of
        ``-[`` + uppercase letters + ``]`` is deleted; everything else,
        including the trailing newline of each line, is left untouched.
    """
    # Raw string: the previous non-raw literal relied on invalid escape
    # sequences ('\-', '\['), which newer Python versions warn about.
    tag_pattern = re.compile(r'\-\[[A-Z]*\]')
    with open(devpath, "r", encoding="utf-8") as fin:
        return [tag_pattern.sub('', line) for line in fin]


# new data handling:
def import_jsons_to_df(dataset_path, filename):
    """Load ``<dataset_path>/<filename>.jsonl`` into a one-token-per-row DataFrame.

    Every non-blank line of the file must be a JSON object with an ``id`` and
    a ``sent_items`` list. The first two elements of each item in
    ``sent_items`` become the ``words`` and ``labels`` values.

    Args:
        dataset_path: Directory containing the file.
        filename: File name without the ``.jsonl`` extension.

    Returns:
        A DataFrame with columns ``sentence_id``, ``words`` and ``labels``.
    """
    fp = f"{dataset_path}/{filename}.jsonl"
    tagged_data = []
    # Parsed with the standard-library json module: `jsonlines` is never
    # imported in this notebook, so the earlier call raised NameError.
    with open(fp, 'r', encoding='utf-8') as f:
        for raw_line in f:
            if not raw_line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            record = json.loads(raw_line)
            sentence_id = record["id"]
            tagged_data.extend(
                [sentence_id, token[0], token[1]] for token in record["sent_items"]
            )
    return pd.DataFrame(tagged_data, columns=["sentence_id", "words", "labels"])

def untagged_sentences(fp):
    """Rebuild plain sentences from a JSON Lines file.

    Every non-blank line of the file must be a JSON object with a
    ``sent_items`` list. The first element of each item is taken as the word,
    and the words of one line are joined with single spaces.

    Args:
        fp: Path of the ``.jsonl`` file; it is read as UTF-8.

    Returns:
        A list with one space-joined sentence string per record.
    """
    clean = []
    # Parsed with the standard-library json module: `jsonlines` is never
    # imported in this notebook, so the earlier call raised NameError.
    with open(fp, "r", encoding="utf-8") as f:
        for raw_line in f:
            if not raw_line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            record = json.loads(raw_line)
            clean.append(" ".join(item[0] for item in record["sent_items"]))
    return clean

### train and evaluate with simpletransformers

In [None]:
def configure_model(args, model_type="roberta", model_name="roberta-base"):
    """Create the NER model used for training and prediction.

    Args:
        args: Model options forwarded as ``NERModel``'s ``args`` argument.
        model_type: First positional argument passed to ``NERModel``.
        model_name: Second positional argument passed to ``NERModel``.

    Returns:
        The newly constructed ``NERModel`` instance.
    """
    return NERModel(model_type, model_name, args=args)

def send_prediction_to_conll(predictions, experiments_path, experiment_name=None):
    """Write predictions to ``<experiments_path>/predictions/<experiment_name>.conll``.

    Args:
        predictions: Iterable of sentences; each sentence is an iterable of
            dicts whose items are written as ``key value`` lines.
        experiments_path: Directory under which ``predictions/`` lives; the
            sub-directory is created when missing.
        experiment_name: Base name of the output file. Defaults to the
            module-level ``EXPERIMENT_NAME``.

    The file holds one ``token label`` pair per line and a blank line after
    every sentence, so sentence boundaries survive in the output.
    """
    if experiment_name is None:
        experiment_name = EXPERIMENT_NAME

    predictions_dir = f"{experiments_path}/predictions"
    # A fresh checkout may not have the directory yet.
    os.makedirs(predictions_dir, exist_ok=True)

    with open(f"{predictions_dir}/{experiment_name}.conll", "w", encoding="utf-8") as f:
        for sent in predictions:
            for token_dict in sent:
                for k, v in token_dict.items():
                    f.write(f"{k} {v}\n")
            # CoNLL-style files separate sentences with an empty line.
            f.write("\n")

def document_results(experiments_path, experiment_name, outputs):
    """Collect the best model's settings and scores into one JSON record.

    Reads ``model_args.json``, ``config.json`` and ``eval_results.txt`` from
    ``<outputs>/best_model`` and writes them to
    ``<experiments_path>/<experiment_name>.json`` under the keys
    ``model_args``, ``best_model_conf`` and ``eval_results``.

    Args:
        experiments_path: Directory that receives the JSON record.
        experiment_name: Base name (without extension) of the record file.
        outputs: Directory containing the ``best_model`` sub-directory.

    Raises:
        FileNotFoundError: If one of the input files is missing. Nothing is
            written in that case.
        ValueError: If a non-blank line of ``eval_results.txt`` does not
            contain ``" = "``.
    """
    best_model_dir = f"{outputs}/best_model"
    details = dict()

    # Read every input before opening the output, so a missing or malformed
    # input cannot leave an empty or truncated record behind.
    with open(f"{best_model_dir}/model_args.json", "r", encoding="utf-8") as args:
        details["model_args"] = json.load(args)
    with open(f"{best_model_dir}/config.json", "r", encoding="utf-8") as conf:
        details["best_model_conf"] = json.load(conf)

    eval_results = dict()
    with open(f"{best_model_dir}/eval_results.txt", "r", encoding="utf-8") as res:
        for r in res:
            if not r.strip():
                continue  # skip blank lines instead of failing to unpack
            k, v = r.split(" = ", 1)
            # Drop the line terminator that used to be stored with the value.
            eval_results[k] = v.strip()
    details["eval_results"] = eval_results

    with open(f"{experiments_path}/{experiment_name}.json", "w", encoding="utf-8") as f:
        json.dump(details, f)

        
def main():
    """Train the NER model, predict on the dev sentences and record the run.

    Steps, in order: load the dev and train splits from JSONL, build and
    train the model with the dev split as evaluation data, predict on the
    untagged dev sentences, then write the experiment record and the
    prediction file.
    """
    validation_df = import_jsons_to_df(DATASET, DEV)
    training_df = import_jsons_to_df(DATASET, TRAIN)

    ner_model = configure_model(args=ARGS)
    ner_model.train_model(training_df, eval_data=validation_df)

    # NOTE(review): the evaluation frame above comes from `<DEV>.jsonl`, while
    # the sentences predicted on here come from `<DEV>.txt` - confirm both
    # files describe the same sentences.
    dev_sentences = untag_dev_sentences(f"{DATASET}/{DEV}.txt")
    predictions, _raw_outputs = ner_model.predict(dev_sentences)

    document_results(EXPERIMENTS, EXPERIMENT_NAME, OUTPUTS)
    send_prediction_to_conll(predictions, EXPERIMENTS)


In [None]:
# Run the whole pipeline defined in main(): training, prediction on the dev
# sentences, and writing the experiment record and prediction file.
main()