In [1]:
import logging

import pandas as pd
from simpletransformers.ner import NERModel, NERArgs
import re
import json
import wandb

import os
import glob


In [14]:
# Experiment name: used as the wandb project and as the stem of the result files.
NAME = "musicians_with_hearst_manual_dev-1"

# Filesystem layout, relative to the notebook's working directory.
DATASET = '../data/musicians_dataset'
OUTPUTS = './outputs'
EXPERIMENTS = '../experiments'

# Short tags found in the dataset files -> label strings given to the model.
# The insertion order matters: it becomes the order of ARGS["labels_list"].
LABELS = {
    "B": "B-MUS",
    "PB": "B-PER",
    "I": "I-MUS",
    "PI": "I-PER",
    "O": "O",
}

# File stems (without the ".txt" extension) of the three dataset splits.
DEV = "dev"
TRAIN = "train_set_with_hearst"
TEST = "test"

In [5]:
BATCH_SIZE = 64
EPOCHS = 3

# Options passed to NERModel via configure_model(args=ARGS).
ARGS = {
    # Reproducibility and label space.
    "seed": 42,
    "labels_list": [*LABELS.values()],
    # Always rebuild features and overwrite earlier outputs.
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    # Batch sizes and training length.
    "train_batch_size": BATCH_SIZE,
    "eval_batch_size": BATCH_SIZE,
    "num_train_epochs": EPOCHS,
    # Checkpointing.
    "save_eval_checkpoints": False,
    "save_steps": -1,
    # Multiprocessing is switched off for both training and evaluation.
    "use_multiprocessing": False,
    "use_multiprocessing_for_evaluation": False,
    # Periodic evaluation on the dev set while training.
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 50,
    "evaluate_during_training_verbose": True,
    "fp16": False,
    # Experiment tracking: the wandb project is named after the experiment.
    "wandb_project": NAME,
    # Optimisation schedule.
    "learning_rate": 0.0003,
    "warmup_ratio": 0.1,
    "logging_steps": 1,
}

In [6]:
# Keep the transformers library itself down to warnings ...
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
# ... while every other logger (e.g. simpletransformers) reports at INFO level.
logging.basicConfig(level=logging.INFO)

### Data preparation

In [7]:
def ingest_data_to_df(filepath, label_map):
    """Read a tagged-sentence file into a one-token-per-row DataFrame.

    Every line of the file is treated as one sentence. Each whitespace-separated
    token must look like ``word-[TAG]``: the word is everything before the first
    ``-[`` and the tag is the text between that ``-[`` and the next ``]``.

    Args:
        filepath: Path of the tagged text file.
        label_map: Mapping from the tags found in the file to the labels that
            are stored in the returned frame.

    Returns:
        A DataFrame with the columns ``sentence_id`` (0-based index of the line
        the token came from), ``words`` and ``labels``.

    Raises:
        ValueError: If a token cannot be split into word and tag, or its tag
            cannot be looked up in ``label_map``. The original error is kept
            as ``__cause__``.
    """
    tagged_data = []
    with open(filepath, "r", encoding="utf-8") as f:
        # Blank lines produce no rows but still consume a sentence id, so the
        # ids always equal the line index in the file.
        for sentence_number, line in enumerate(f):
            for tagged_word in line.split():
                try:
                    word, tag = tagged_word.split("-[", 1)
                    old_tag = tag.split("]", 1)[0]
                    new_tag = label_map[old_tag]
                except Exception as e:
                    raise ValueError(
                        f"cannot parse token {tagged_word!r} on line "
                        f"{sentence_number + 1} of {filepath}: {line!r}"
                    ) from e
                tagged_data.append([sentence_number, word, new_tag])
    # Build the frame once (the original constructed it twice and discarded one).
    return pd.DataFrame(tagged_data, columns=["sentence_id", "words", "labels"])

def get_dataset_parts(dataset_path, dev_name, train_name, test_name):
    """Load the dev, train and test splits of a dataset directory.

    Each split is read from ``<dataset_path>/<name>.txt`` with
    ``ingest_data_to_df`` using the module-level ``LABELS`` mapping.

    Returns:
        The tuple ``(dev, train, test)`` of DataFrames.
    """
    def _load(split_name):
        return ingest_data_to_df(f"{dataset_path}/{split_name}.txt", LABELS)

    # Files are read in the order train, test, dev.
    train = _load(train_name)
    test = _load(test_name)
    dev = _load(dev_name)
    return dev, train, test

def untag_dev_sentences(devpath):
    """Return the lines of a tagged file with every ``-[TAG]`` suffix removed.

    A tag suffix is ``-[`` followed by zero or more uppercase ASCII letters and
    ``]``. Everything else on the line, including the trailing newline, is
    kept unchanged.

    Args:
        devpath: Path of the tagged text file.

    Returns:
        A list with one cleaned string per line of the file.
    """
    # Raw string: the original non-raw pattern relied on invalid escape
    # sequences ("\-", "\["), which newer Python versions warn about.
    tag_suffix = re.compile(r"-\[[A-Z]*\]")
    with open(devpath, "r", encoding="utf-8") as fin:
        return [tag_suffix.sub("", line) for line in fin]



### Train and evaluate with simpletransformers

In [11]:
def configure_model(args, model_type="roberta", model_name="roberta-base"):
    """Create the NERModel used for training and prediction.

    Args:
        args: Model options, forwarded as the ``args`` keyword of ``NERModel``.
        model_type: First positional argument of ``NERModel``.
        model_name: Second positional argument of ``NERModel``.

    Returns:
        The constructed ``NERModel`` instance.
    """
    return NERModel(model_type, model_name, args=args)

def send_prediction_to_conll(predictions, experiments_path, experiment_name=None):
    """Write predictions to ``<experiments_path>/predictions/<name>.conll``.

    Args:
        predictions: Iterable of sentences; each sentence is an iterable of
            dicts, and every key/value pair of every dict is written as one
            ``key value`` line. (In ``main`` this is the first value returned
            by ``model.predict`` - presumably one ``{word: label}`` dict per
            token; confirm against the simpletransformers docs.)
        experiments_path: Directory that holds the ``predictions`` folder.
        experiment_name: Stem of the output file. Defaults to the module-level
            ``NAME``, which is what the original implementation always used.
    """
    if experiment_name is None:
        experiment_name = NAME
    predictions_dir = f"{experiments_path}/predictions"
    # Create the folder on first use instead of failing with FileNotFoundError.
    os.makedirs(predictions_dir, exist_ok=True)
    with open(f"{predictions_dir}/{experiment_name}.conll", "w", encoding="utf-8") as f:
        # NOTE(review): no blank line is written between sentences, so sentence
        # boundaries are not recoverable from the file. Kept as-is so existing
        # consumers see the same bytes - confirm whether they expect separators.
        for sent in predictions:
            for token_dict in sent:
                for k, v in token_dict.items():
                    f.write(f"{k} {v}\n")

def document_results(experiments_path, experiment_name, outputs):
    """Collect the best model's metadata into ``<experiments_path>/<experiment_name>.json``.

    Reads three files from ``<outputs>/best_model``:

    * ``model_args.json``  -> stored under ``"model_args"``
    * ``config.json``      -> stored under ``"best_model_conf"``
    * ``eval_results.txt`` -> ``key = value`` lines, stored under
      ``"eval_results"`` as a dict of strings

    All inputs are read before the output file is opened, so a missing or
    malformed input no longer leaves an empty or truncated JSON file behind.
    Values from ``eval_results.txt`` are stored without surrounding whitespace
    (the original kept the trailing newline) and blank lines are skipped.

    Raises:
        FileNotFoundError: If one of the input files does not exist.
        ValueError: If a non-blank line of ``eval_results.txt`` has no ``" = "``.
    """
    best_model_dir = f"{outputs}/best_model"
    details = dict()
    with open(f"{best_model_dir}/model_args.json", "r", encoding="utf-8") as args:
        details["model_args"] = json.load(args)
    with open(f"{best_model_dir}/config.json", "r", encoding="utf-8") as conf:
        details["best_model_conf"] = json.load(conf)

    eval_results = dict()
    with open(f"{best_model_dir}/eval_results.txt", "r", encoding="utf-8") as res:
        for raw_line in res:
            entry = raw_line.strip()
            if not entry:
                continue
            key, separator, value = entry.partition(" = ")
            if not separator:
                raise ValueError(f"malformed eval_results line: {raw_line!r}")
            eval_results[key] = value
    details["eval_results"] = eval_results

    # Only now create the output file, once everything was read successfully.
    with open(f"{experiments_path}/{experiment_name}.json", "w", encoding="utf-8") as f:
        json.dump(details, f)

        
def main():
    """Train the model, predict on the dev split and store the results.

    Steps:
      1. Load the dev/train/test splits named by ``DEV``, ``TRAIN``, ``TEST``.
      2. Build the model from ``ARGS`` and train it, evaluating on the dev set.
      3. Strip the tags from the dev file and predict on the raw sentences.
      4. Write the best-model summary and the predictions under ``EXPERIMENTS``.
    """
    # The test split is loaded but not used by this pipeline.
    dev_set, train_set, _test_set = get_dataset_parts(DATASET, DEV, TRAIN, TEST)
    model = configure_model(args=ARGS)
    model.train_model(train_set, eval_data=dev_set)
    # Use the DEV constant (previously a hard-coded "dev.txt") so prediction
    # always runs on the same file that was used for evaluation.
    clean_sentences = untag_dev_sentences(f"{DATASET}/{DEV}.txt")
    predictions, _raw_outputs = model.predict(clean_sentences)
    document_results(EXPERIMENTS, NAME, OUTPUTS)
    send_prediction_to_conll(predictions, EXPERIMENTS)


In [15]:
# Run the whole pipeline defined in main(): load data, train, predict, record results.
main()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able

  0%|          | 0/33854 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Training loss,█▃▂▂▂▂▂▃▂▂▂▂▂▂▁▂▁▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval_loss,█▅▄▅▄▄▄▃▃▂▃▂▂▂▂▂▁▁▁▁▂▁▁▁▁▁▁▁
f1_score,▁▁▄▂▄▃▄▄▅▆▅▆▅▆▇▇▇▇▇▇████████
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
lr,▂▄▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
precision,▁▂▄▆▅▃▄▅▄▆▆▆▇▆▇▇█▇▇▇▇███████
recall,▅▄▅▁▅▆▆▆▇▇▆▇▄▇▇▇▇████▇█▇████
train_loss,█▆▇▆▄▅▄▅▃▅▃▂▅▂▄▄▅▂▁▂▃▃▁▂▄▁▂▂

0,1
Training loss,0.05199
eval_loss,0.064
f1_score,0.8595
global_step,1272.0
lr,0.0
precision,0.84915
recall,0.87011
train_loss,0.05199


Running Epoch 0 of 3:   0%|          | 0/529 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.13676458100477853, 'precision': 0.5421245421245421, 'recall': 0.5323741007194245, 'f1_score': 0.5372050816696914}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.11927997320890427, 'precision': 0.5451388888888888, 'recall': 0.564748201438849, 'f1_score': 0.5547703180212015}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.13688205182552338, 'precision': 0.45454545454545453, 'recall': 0.4856115107913669, 'f1_score': 0.46956521739130436}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.1395512049396833, 'precision': 0.5664335664335665, 'recall': 0.5827338129496403, 'f1_score': 0.5744680851063829}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.10300999631484349, 'precision': 0.5709342560553633, 'recall': 0.5935251798561151, 'f1_score': 0.582010582010582}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.10513282318909963, 'precision': 0.6237942122186495, 'recall': 0.697841726618705, 'f1_score': 0.6587436332767401}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.12516836573680243, 'precision': 0.6134751773049646, 'recall': 0.6223021582733813, 'f1_score': 0.6178571428571429}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.15973110496997833, 'precision': 0.59765625, 'recall': 0.5503597122302158, 'f1_score': 0.5730337078651685}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.10225262741247813, 'precision': 0.6021505376344086, 'recall': 0.60431654676259, 'f1_score': 0.6032315978456014}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.13875477264324823, 'precision': 0.5637583892617449, 'recall': 0.60431654676259, 'f1_score': 0.5833333333333334}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.15257864445447922, 'precision': 0.5833333333333334, 'recall': 0.60431654676259, 'f1_score': 0.5936395759717316}


Running Epoch 1 of 3:   0%|          | 0/529 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.13399567703406015, 'precision': 0.6037735849056604, 'recall': 0.5755395683453237, 'f1_score': 0.5893186003683242}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.12571342786153158, 'precision': 0.5714285714285714, 'recall': 0.60431654676259, 'f1_score': 0.5874125874125874}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.12951169908046722, 'precision': 0.6006711409395973, 'recall': 0.6438848920863309, 'f1_score': 0.6215277777777778}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.1325587679942449, 'precision': 0.5964285714285714, 'recall': 0.6007194244604317, 'f1_score': 0.5985663082437277}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.1217391590277354, 'precision': 0.5884476534296029, 'recall': 0.5863309352517986, 'f1_score': 0.5873873873873874}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.12349506964286168, 'precision': 0.5880281690140845, 'recall': 0.6007194244604317, 'f1_score': 0.594306049822064}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.1285926252603531, 'precision': 0.6571428571428571, 'recall': 0.6618705035971223, 'f1_score': 0.6594982078853047}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.1511418546239535, 'precision': 0.5827338129496403, 'recall': 0.5827338129496403, 'f1_score': 0.5827338129496403}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.14726938803990683, 'precision': 0.5827338129496403, 'recall': 0.5827338129496403, 'f1_score': 0.5827338129496403}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.15576126674811044, 'precision': 0.575, 'recall': 0.579136690647482, 'f1_score': 0.5770609318996415}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.1547993173201879, 'precision': 0.592057761732852, 'recall': 0.5899280575539568, 'f1_score': 0.5909909909909911}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.15652070691188177, 'precision': 0.5971731448763251, 'recall': 0.6079136690647482, 'f1_score': 0.6024955436720143}


Running Epoch 2 of 3:   0%|          | 0/529 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.15416246404250464, 'precision': 0.5958904109589042, 'recall': 0.6258992805755396, 'f1_score': 0.6105263157894737}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.16908670713504156, 'precision': 0.5463576158940397, 'recall': 0.5935251798561151, 'f1_score': 0.5689655172413793}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.14243133614460626, 'precision': 0.5833333333333334, 'recall': 0.60431654676259, 'f1_score': 0.5936395759717316}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.1826674242814382, 'precision': 0.5785953177257525, 'recall': 0.6223021582733813, 'f1_score': 0.5996533795493935}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.15672158946593603, 'precision': 0.604982206405694, 'recall': 0.6115107913669064, 'f1_score': 0.6082289803220036}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.1559179723262787, 'precision': 0.5944055944055944, 'recall': 0.6115107913669064, 'f1_score': 0.6028368794326241}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.16554076969623566, 'precision': 0.5915492957746479, 'recall': 0.60431654676259, 'f1_score': 0.5978647686832741}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.17670203993717828, 'precision': 0.5886524822695035, 'recall': 0.5971223021582733, 'f1_score': 0.5928571428571429}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.17072394986947378, 'precision': 0.5859649122807018, 'recall': 0.6007194244604317, 'f1_score': 0.5932504440497336}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.17373688022295633, 'precision': 0.5865724381625441, 'recall': 0.5971223021582733, 'f1_score': 0.5918003565062387}
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.16490041961272559, 'precision': 0.5853658536585366, 'recall': 0.60431654676259, 'f1_score': 0.5946902654867257}
INFO:simpletransformers.ner.ner_model: Training of roberta model complete. Saved to outputs/.
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/150 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
# get_dataset_parts requires the three split names in addition to the dataset
# path; calling it with DATASET alone raises TypeError on a fresh kernel.
dev_set, train_set, test_set = get_dataset_parts(DATASET, DEV, TRAIN, TEST)
train_set.head(5)

Unnamed: 0,sentence_id,words,labels
0,0,There,O
1,0,are,O
2,0,also,O
3,0,lesser,O
4,0,-,O


In [12]:
# All token rows that belong to the sentence with id 0 in the training split.
train_set.loc[train_set["sentence_id"].eq(0)]

INFO:numexpr.utils:NumExpr defaulting to 8 threads.


Unnamed: 0,sentence_id,words,labels
0,0,There,O
1,0,are,O
2,0,also,O
3,0,lesser,O
4,0,-,O
5,0,known,O
6,0,regional,O
7,0,forms,O
8,0,",",O
9,0,such,O


In [21]:
# Number of distinct sentences in the training split.
print(len(train_set["sentence_id"].unique()))
# Ids of training sentences containing at least one B-MUS token. Named
# `mus_train` because these ids come from the train split, not the dev split.
mus_train = train_set.loc[train_set["labels"] == "B-MUS", "sentence_id"].unique()
print(len(mus_train))
# Show the first such sentence.
train_set[train_set["sentence_id"] == mus_train[0]]

27760
4436


Unnamed: 0,sentence_id,words,labels
149,6,Amy,B-MUS
150,6,Dickson,I-MUS
151,6,(,O
152,6,born,O
153,6,1982,O
154,6,),O
155,6,is,O
156,6,an,O
157,6,Australian,O
158,6,classical,O


In [22]:
# Build the path from DATASET rather than repeating the directory literal.
# NOTE(review): split_dev.txt is written by a cell further down this notebook,
# so that cell has to run first on a fresh kernel.
split_dev = ingest_data_to_df(f"{DATASET}/split_dev.txt", LABELS)
split_dev

Unnamed: 0,sentence_id,words,labels
0,0,He,O
1,0,was,O
2,0,a,O
3,0,leader,O
4,0,",",O
...,...,...,...
158028,5538,Province,O
158029,5538,of,O
158030,5538,Lower,O
158031,5538,Silesia,O


In [24]:
# Number of distinct sentences in the dev split.
print(len(dev_set["sentence_id"].unique()))
# Ids of dev sentences that contain at least one B-MUS token.
mus_dev = dev_set.loc[dev_set["labels"].eq("B-MUS"), "sentence_id"].unique()
print(len(mus_dev))
# Show the first such sentence.
dev_set.loc[dev_set["sentence_id"].eq(mus_dev[0])]

150
50


Unnamed: 0,sentence_id,words,labels
1489,50,A,O
1490,50,large,O
1491,50,ensemble,O
1492,50,of,O
1493,50,trombonists,O
1494,50,would,O
1495,50,gather,O
1496,50,to,O
1497,50,play,O
1498,50,music,O


In [26]:
# Share of dev sentences containing a B-MUS token: 50 of 150, i.e. the two
# counts printed for dev_set.
50/150

0.3333333333333333

In [23]:
# Number of distinct sentences in the split_dev frame.
print(len(split_dev["sentence_id"].unique()))
# Ids of split_dev sentences that contain at least one B-MUS token.
mus_dev = split_dev.loc[split_dev["labels"].eq("B-MUS"), "sentence_id"].unique()
print(len(mus_dev))
# Show the first such sentence.
split_dev.loc[split_dev["sentence_id"].eq(mus_dev[0])]

5539
908


Unnamed: 0,sentence_id,words,labels
32,1,Best,O
33,1,of,O
34,1,the,O
35,1,Soul,O
36,1,Years,O
37,1,is,O
38,1,a,O
39,1,2015,O
40,1,compilation,O
41,1,album,O


In [13]:

# NOTE(review): `sp_dev` is not assigned in any cell visible in this notebook;
# the cell that created it must be restored for a fresh-kernel run to succeed.
sp_dev

Unnamed: 0,sentence
352,"He-[O] was-[O] a-[O] leader-[O] ,-[O] along-[O..."
6189,Best-[O] of-[O] the-[O] Soul-[O] Years-[O] is-...
24182,"However-[O] ,-[O] Leah-[PB] has-[O] kept-[O] s..."
19702,Robert-[B] James-[I] Smith-[I] (-[O] born-[O] ...
11918,In-[O] his-[O] early-[O] years-[O] in-[O] the-...
...,...
10713,She-[O] was-[O] the-[O] mother-[O] of-[O] the-...
16730,Temperley-[O] suffered-[O] relegation-[O] to-[...
26834,Highway-[O] 308-[O] begins-[O] at-[O] Highway-...
16425,"Founded-[O] in-[O] 1989-[O] ,-[O] by-[O] Frank..."


In [24]:
# Persist the train/dev split as plain text, one tagged sentence per line.
# Paths are built from DATASET instead of repeating the directory literal, and
# the "sentence" column is iterated directly (iterrows() was only used to read
# that one column).
# NOTE(review): `sp_train` and `sp_dev` are not assigned in any visible cell.
for split_path, split_frame in (
    (f"{DATASET}/split_train.txt", sp_train),
    (f"{DATASET}/split_dev.txt", sp_dev),
):
    with open(split_path, "w", encoding="utf-8") as fout:
        fout.writelines(sentence + "\n" for sentence in split_frame["sentence"])
# sp_train['sentence'].to_csv('../data/musicians_dataset/split_train.txt', index=False, sep='\t', header=False)