In [1]:
import logging

import pandas as pd
from simpletransformers.ner import NERModel, NERArgs
import re
import json
import wandb

import os
import glob
import numpy as np

In [7]:
# Experiment identifier; reused as the W&B project name and in output file names.
NAME = "musicians_with_hearst_split-1"

# Filesystem locations used by the notebook.
DATASET = "../data/musicians_dataset"
OUTPUTS = "./outputs"
EXPERIMENTS = "../experiments"

# Maps the short tags found in the dataset files to the labels given to the model.
LABELS = {
    "B": "B-MUS",
    "PB": "B-PER",
    "I": "I-MUS",
    "PI": "I-PER",
    "O": "O",
}

# Base names (without the ".txt" suffix) of the dataset split files.
DEV = "dev_set_with_hearst"
TRAIN = "train_set_with_hearst"
TEST = "test"

In [4]:
BATCH_SIZE = 64
EPOCHS = 3

# Options handed to NERModel through configure_model(args=ARGS).
ARGS = dict(
    # reproducibility / label inventory
    seed=42,
    labels_list=list(LABELS.values()),
    # input caching and output directory handling
    reprocess_input_data=True,
    overwrite_output_dir=True,
    # batch sizes and number of epochs
    train_batch_size=BATCH_SIZE,
    eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    # checkpointing
    save_eval_checkpoints=False,
    save_steps=-1,
    # multiprocessing switches
    use_multiprocessing=False,
    use_multiprocessing_for_evaluation=False,
    # evaluation while training
    evaluate_during_training=True,
    evaluate_during_training_steps=50,
    evaluate_during_training_verbose=True,
    # precision, experiment tracking, optimisation, logging
    fp16=False,
    wandb_project=NAME,
    learning_rate=0.0003,
    warmup_ratio=0.1,
    logging_steps=1,
)

In [5]:
# Cap the "transformers" logger at WARNING, then enable INFO-level output
# for the root logger.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

### Data preparation

In [6]:
def ingest_data_to_df(filepath, label_map):
    """Load a tagged-sentence file into a token-level DataFrame.

    Every line of the file is treated as one sentence and every
    whitespace-separated token is expected to look like ``word-[TAG]``.

    Args:
        filepath: Path of the tagged text file (read as UTF-8).
        label_map: Mapping from the tag found in the file to the label
            stored in the resulting frame.

    Returns:
        A DataFrame with the columns ``sentence_id`` (0-based line index,
        blank lines still consume an id), ``words`` and ``labels``.

    Raises:
        ValueError: If a token has no ``-[`` marker or its tag is missing
            from ``label_map``; the original error is chained as the cause.
    """
    tagged_data = []
    # Explicit encoding so the result does not depend on the platform locale.
    with open(filepath, "r", encoding="utf-8") as f:
        for sentence_number, line in enumerate(f):
            for tagged_word in line.split():
                try:
                    # The tag is always the last "-[" group, so split from the
                    # right: words that themselves contain "-[" stay intact.
                    word, tag = tagged_word.rsplit("-[", 1)
                    old_tag = tag.split("]", 1)[0]
                    new_tag = label_map[old_tag]
                except (ValueError, KeyError) as e:
                    raise ValueError(
                        f"cannot parse token {tagged_word!r} in sentence "
                        f"{sentence_number}: {line!r}"
                    ) from e
                tagged_data.append([sentence_number, word, new_tag])
    return pd.DataFrame(tagged_data, columns=["sentence_id", "words", "labels"])

def get_dataset_parts(dataset_path, dev_name, train_name, test_name):
    """Read the three dataset splits and return them as ``(dev, train, test)``.

    Each split is expected at ``{dataset_path}/{name}.txt`` and is parsed with
    ``ingest_data_to_df`` using the module-level ``LABELS`` mapping.
    """
    def split_file(split_name):
        return f"{dataset_path}/{split_name}.txt"

    dev_file = split_file(dev_name)
    train_file = split_file(train_name)
    test_file = split_file(test_name)

    # Files are read in train, test, dev order.
    train_df = ingest_data_to_df(train_file, LABELS)
    test_df = ingest_data_to_df(test_file, LABELS)
    dev_df = ingest_data_to_df(dev_file, LABELS)
    return dev_df, train_df, test_df

def untag_dev_sentences(devpath):
    """Return the lines of a tagged file with every ``-[TAG]`` suffix removed.

    Args:
        devpath: Path of the tagged text file (read as UTF-8), one sentence
            per line.

    Returns:
        A list with one string per input line. Line endings are left in
        place, so each sentence normally still ends with a newline.
    """
    # Raw string: "\[" in a plain literal is an invalid escape sequence.
    tag_suffix = re.compile(r"-\[[A-Z]*\]")
    with open(devpath, "r", encoding="utf-8") as fin:
        return [tag_suffix.sub("", line) for line in fin]



### Train and evaluate with simpletransformers

In [8]:
def configure_model(args, model_type="roberta", model_name="roberta-base"):
    """Create an ``NERModel`` for the given model type/name and argument set."""
    return NERModel(model_type, model_name, args=args)

def send_prediction_to_conll(predictions, experiments_path, experiment_name=None):
    """Write predictions to ``{experiments_path}/predictions/{name}.conll``.

    Args:
        predictions: Iterable of sentences; each sentence is an iterable of
            dicts whose items are written as ``key value`` lines.
        experiments_path: Directory that holds the ``predictions`` folder;
            the folder is created when it does not exist yet.
        experiment_name: Base name of the output file. Defaults to the
            module-level ``NAME``.

    Every sentence is followed by an empty line so that sentence boundaries
    can be recovered from the file.
    """
    if experiment_name is None:
        experiment_name = NAME
    predictions_dir = f"{experiments_path}/predictions"
    # Avoid a FileNotFoundError on the first run of a new experiment folder.
    os.makedirs(predictions_dir, exist_ok=True)
    with open(f"{predictions_dir}/{experiment_name}.conll", "w", encoding="utf-8") as f:
        for sent in predictions:
            for token_dict in sent:
                for k, v in token_dict.items():
                    f.write(f"{k} {v}\n")
            # Blank line = sentence separator.
            f.write("\n")

def document_results(experiments_path, experiment_name, outputs):
    """Collect the best model's metadata into one JSON file.

    Reads ``model_args.json``, ``config.json`` and ``eval_results.txt`` from
    ``{outputs}/best_model`` and writes them, under the keys ``model_args``,
    ``best_model_conf`` and ``eval_results``, to
    ``{experiments_path}/{experiment_name}.json``.

    Args:
        experiments_path: Directory the summary JSON is written to.
        experiment_name: Base name (without ``.json``) of the summary file.
        outputs: Directory that contains the ``best_model`` folder.

    Raises:
        FileNotFoundError: If one of the input files is missing. Nothing is
            written in that case.
        ValueError: If a non-blank line of ``eval_results.txt`` has no
            ``" = "`` separator.
    """
    best_model_dir = f"{outputs}/best_model"
    details = dict()

    # Read every input before touching the output file, so a missing input
    # cannot leave behind an empty or truncated summary.
    with open(f"{best_model_dir}/model_args.json", "r", encoding="utf-8") as args:
        details["model_args"] = json.load(args)
    with open(f"{best_model_dir}/config.json", "r", encoding="utf-8") as conf:
        details["best_model_conf"] = json.load(conf)

    eval_results = dict()
    with open(f"{best_model_dir}/eval_results.txt", "r", encoding="utf-8") as res:
        for r in res:
            if not r.strip():
                continue  # tolerate blank lines
            # partition splits on the first separator only, so a value that
            # itself contains " = " does not break the unpacking.
            k, separator, v = r.partition(" = ")
            if not separator:
                raise ValueError(f"malformed eval result line: {r!r}")
            # strip() drops the trailing newline kept by line iteration.
            eval_results[k.strip()] = v.strip()
    details["eval_results"] = eval_results

    with open(f"{experiments_path}/{experiment_name}.json", "w", encoding="utf-8") as f:
        json.dump(details, f)

        
def main():
    """Train on the train split, predict on the dev split and store the results.

    The model is trained with the dev split as evaluation data, then asked to
    predict the untagged dev sentences. The best model's metadata and the
    predictions are written under ``EXPERIMENTS``. The test split is loaded
    but not used here.
    """
    dev_df, train_df, _test_df = get_dataset_parts(DATASET, DEV, TRAIN, TEST)

    ner_model = configure_model(args=ARGS)
    ner_model.train_model(train_df, eval_data=dev_df)

    dev_sentences = untag_dev_sentences(f"{DATASET}/{DEV}.txt")
    predicted_tags, _raw_outputs = ner_model.predict(dev_sentences)

    document_results(EXPERIMENTS, NAME, OUTPUTS)
    send_prediction_to_conll(predicted_tags, EXPERIMENTS)


In [9]:
# Run the pipeline defined in main(): load the splits, train, predict on dev,
# then write the experiment summary and the predictions.
main()

FileNotFoundError: [Errno 2] No such file or directory: '../data/musicians_dataset/dev_set_with_hearst.txt'

In [11]:
# get_dataset_parts takes the three split names as required arguments;
# pass the configured ones, exactly as main() does.
dev_set, train_set, test_set = get_dataset_parts(DATASET, DEV, TRAIN, TEST)
train_set.head(5)

Unnamed: 0,sentence_id,words,labels
0,0,There,O
1,0,are,O
2,0,also,O
3,0,lesser,O
4,0,-,O


In [12]:
# Tokens of the training sentence with id 0.
train_set.loc[train_set["sentence_id"].eq(0)]

INFO:numexpr.utils:NumExpr defaulting to 8 threads.


Unnamed: 0,sentence_id,words,labels
0,0,There,O
1,0,are,O
2,0,also,O
3,0,lesser,O
4,0,-,O
5,0,known,O
6,0,regional,O
7,0,forms,O
8,0,",",O
9,0,such,O


In [21]:
# Training split: number of sentences, number of sentences with at least one
# B-MUS token, and the tokens of the first such sentence.
# NOTE: the variable is called `mus_dev` although it holds training sentence ids.
print(len(train_set.loc[:, "sentence_id"].unique()))
mus_dev = train_set.loc[train_set["labels"].eq("B-MUS"), "sentence_id"].unique()
print(len(mus_dev))
train_set.loc[train_set["sentence_id"].eq(mus_dev[0])]

27760
4436


Unnamed: 0,sentence_id,words,labels
149,6,Amy,B-MUS
150,6,Dickson,I-MUS
151,6,(,O
152,6,born,O
153,6,1982,O
154,6,),O
155,6,is,O
156,6,an,O
157,6,Australian,O
158,6,classical,O


In [22]:
# Load the dev part of the custom split. The file is produced by the cell that
# writes `sp_dev` to split_dev.txt, so that cell has to be run beforehand.
# The path is built from DATASET instead of repeating the directory literal.
split_dev = ingest_data_to_df(f"{DATASET}/split_dev.txt", LABELS)
split_dev

Unnamed: 0,sentence_id,words,labels
0,0,He,O
1,0,was,O
2,0,a,O
3,0,leader,O
4,0,",",O
...,...,...,...
158028,5538,Province,O
158029,5538,of,O
158030,5538,Lower,O
158031,5538,Silesia,O


In [24]:
# Dev split: number of sentences, number of sentences with at least one
# B-MUS token, and the tokens of the first such sentence.
print(len(dev_set.loc[:, "sentence_id"].unique()))
mus_dev = dev_set.loc[dev_set["labels"].eq("B-MUS"), "sentence_id"].unique()
print(len(mus_dev))
dev_set.loc[dev_set["sentence_id"].eq(mus_dev[0])]

150
50


Unnamed: 0,sentence_id,words,labels
1489,50,A,O
1490,50,large,O
1491,50,ensemble,O
1492,50,of,O
1493,50,trombonists,O
1494,50,would,O
1495,50,gather,O
1496,50,to,O
1497,50,play,O
1498,50,music,O


In [26]:
# Share of dev sentences that contain at least one B-MUS token, computed from
# the data instead of typed-in counts. Relies on `mus_dev` as assigned for
# `dev_set` in the preceding cell.
len(mus_dev) / len(dev_set["sentence_id"].unique())

0.3333333333333333

In [23]:
# Custom dev split: number of sentences, number of sentences with at least one
# B-MUS token, and the tokens of the first such sentence.
print(len(split_dev.loc[:, "sentence_id"].unique()))
mus_dev = split_dev.loc[split_dev["labels"].eq("B-MUS"), "sentence_id"].unique()
print(len(mus_dev))
split_dev.loc[split_dev["sentence_id"].eq(mus_dev[0])]

5539
908


Unnamed: 0,sentence_id,words,labels
32,1,Best,O
33,1,of,O
34,1,the,O
35,1,Soul,O
36,1,Years,O
37,1,is,O
38,1,a,O
39,1,2015,O
40,1,compilation,O
41,1,album,O


In [13]:

# NOTE(review): `sp_dev` is not assigned in any cell visible in this notebook;
# presumably it comes from a train/dev split cell that is no longer here.
# Confirm and restore that cell so the notebook runs on a fresh kernel.
sp_dev

Unnamed: 0,sentence
352,"He-[O] was-[O] a-[O] leader-[O] ,-[O] along-[O..."
6189,Best-[O] of-[O] the-[O] Soul-[O] Years-[O] is-...
24182,"However-[O] ,-[O] Leah-[PB] has-[O] kept-[O] s..."
19702,Robert-[B] James-[I] Smith-[I] (-[O] born-[O] ...
11918,In-[O] his-[O] early-[O] years-[O] in-[O] the-...
...,...
10713,She-[O] was-[O] the-[O] mother-[O] of-[O] the-...
16730,Temperley-[O] suffered-[O] relegation-[O] to-[...
26834,Highway-[O] 308-[O] begins-[O] at-[O] Highway-...
16425,"Founded-[O] in-[O] 1989-[O] ,-[O] by-[O] Frank..."


In [24]:
# Persist the custom train/dev split, one tagged sentence per line.
# A single loop replaces the two copy-pasted iterrows() blocks; the files are
# written as UTF-8 under DATASET (train first, then dev).
for split_frame, split_file_name in ((sp_train, "split_train"), (sp_dev, "split_dev")):
    with open(f"{DATASET}/{split_file_name}.txt", "w", encoding="utf-8") as split_out:
        split_out.writelines(sentence + "\n" for sentence in split_frame["sentence"])