In [1]:
import pandas as pd
import numpy as np

from model.utils import clean
from model.dataset import SongDataset
from model.trainer import SongTrainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
model_name = "searle-j/kote_for_easygoing_people"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [3]:
df = pd.read_csv('./data/Lyrics_top100.csv')

In [4]:
fake_labels = [list(set(np.random.choice(44, 10))) for _ in range(100)]

In [5]:
fake_labels

[[32, 2, 36, 42, 16, 18, 22, 26, 28, 29],
 [35, 36, 37, 38, 10, 13, 22, 28, 31],
 [32, 3, 5, 7, 10, 13, 25, 27, 29],
 [3, 4, 36, 40, 41, 12, 17, 22, 26],
 [0, 33, 3, 35, 39, 8, 40, 42, 30],
 [0, 33, 8, 41, 15, 17, 18, 21, 26, 28],
 [32, 1, 9, 42, 12, 15, 16],
 [0, 33, 5, 6, 11, 13, 27, 28, 30, 31],
 [0, 32, 4, 37, 43, 18, 20, 23, 28],
 [3, 37, 5, 43, 13, 16, 18, 25, 27],
 [33, 37, 38, 10, 42, 17, 18, 22, 25, 29],
 [33, 2, 34, 5, 43, 18, 19, 21, 25],
 [32, 6, 39, 9, 42, 11, 22, 25, 28, 30],
 [0, 33, 7, 9, 43, 14, 17, 22, 23, 30],
 [0, 3, 10, 18, 23, 24, 26, 28, 29],
 [34, 38, 6, 9, 10, 23, 25, 27, 31],
 [33, 2, 3, 6, 8, 40, 10, 11, 26, 29],
 [1, 2, 34, 10, 11, 12, 14, 22, 26, 30],
 [1, 4, 6, 39, 41, 21, 25, 27],
 [1, 36, 4, 5, 10, 11, 12, 18, 22],
 [32, 2, 41, 10, 42, 9, 22, 26, 30, 31],
 [32, 0, 4, 40, 41, 13, 26, 28, 30, 31],
 [0, 32, 2, 35, 36, 3, 17, 22, 27],
 [0, 3, 4, 5, 38, 40, 14, 15, 21, 26],
 [32, 33, 1, 4, 38, 7, 40, 12, 26, 27],
 [2, 6, 7, 40, 13, 21, 24, 26, 29],
 [0, 2, 36

In [6]:
df['labels'] = fake_labels

In [7]:
texts = df['lyric']
labels = df['labels']

In [8]:
cleaned_text = [clean(text) for text in texts]

In [9]:
dataset_ = SongDataset(cleaned_text, labels, tokenizer)

In [10]:
from model.arguments import *
from transformers import HfArgumentParser
from transformers import TrainingArguments

parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_json_file('./args.json')

In [11]:
model_args

ModelArguments(model_name_or_path='searle-j/kote_for_easygoing_people', config_name='searle-j/kote_for_easygoing_people', tokenizer_name='searle-j/kote_for_easygoing_people')

In [12]:
trainer = SongTrainer(model=model,
                      args=training_args,
                      train_dataset=dataset_)

In [13]:
trainer.train()

***** Running training *****
  Num examples = 100
  Num Epochs = 50
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 650
  batch[k] = torch.tensor([f[k] for f in features])


Step,Training Loss
500,0.4311


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=650, training_loss=0.4116526500995343, metrics={'train_runtime': 180.1274, 'train_samples_per_second': 27.758, 'train_steps_per_second': 3.609, 'total_flos': 1316051374080000.0, 'train_loss': 0.4116526500995343, 'epoch': 50.0})