<a href="https://colab.research.google.com/github/anitanadvikova/huggingartists/blob/main/huggingartists_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets
!pip install transformers
!pip install pathlib
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from pathlib import Path
import pathlib
import os
import random
from IPython.utils import io
import wandb
import json
from IPython.display import HTML

In [3]:
from datasets import load_dataset, Dataset, DatasetDict
import numpy as np
datasets = load_dataset("huggingartists/eminem")
train_percentage = 0.9
validation_percentage = 0.07
test_percentage = 0.03
train, validation, test = np.split(datasets['train']['text'], [int(len(datasets['train']['text'])*train_percentage), int(len(datasets['train']['text'])*(train_percentage + validation_percentage))])
datasets = DatasetDict(
        {
             'train': Dataset.from_dict({'text': list(train)}),
             'validation': Dataset.from_dict({'text': list(validation)}),
             'test': Dataset.from_dict({'text': list(test)})
        }
)



  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
def tokenize_function(examples):
    return tokenizer(examples["text"],truncation=True , max_length=1024)

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result


In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
user, namespace = 'huggingartists-app', 'huggingartists'
model_name='Eminem'
try:
  tokenizer = AutoTokenizer.from_pretrained(f"{namespace}/{model_name}")
  model = AutoModelForCausalLM.from_pretrained(f"{namespace}/{model_name}", cache_dir=pathlib.Path('cache').resolve())
except:
  tokenizer = AutoTokenizer.from_pretrained("gpt2")
  model = AutoModelForCausalLM.from_pretrained("gpt2", cache_dir=pathlib.Path('cache').resolve())

# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# model.resize_token_embeddings(len(tokenizer))

tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=1, remove_columns=["text"])

#block_size = tokenizer.model_max_length
block_size = int(tokenizer.model_max_length / 4)

lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=1,
)

Map:   0%|          | 0/1156 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/39 [00:00<?, ? examples/s]

Map:   0%|          | 0/1156 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/39 [00:00<?, ? examples/s]

In [19]:
num_train_epochs=10
trainer_state_path = f'{model_name}/trainer_state.json'
if os.path.isfile(trainer_state_path):
  f = open (trainer_state_path, "r")
  trainer_state = json.loads(f.read()) 
  f.close()
  epoch = trainer_state['epoch']
  num_train_epochs += epoch

seed_data = random.randint(0,2**32-1)
# Set-up Trainer
os.environ['WANDB_WATCH'] = 'false'  # used in Trainer
training_args = TrainingArguments(
    f"output/{model_name}",
    overwrite_output_dir=True,
    evaluation_strategy = "epoch",
    learning_rate=1.372e-4,
    weight_decay=0.01,
    num_train_epochs=num_train_epochs,
    save_total_limit=10,
    save_strategy='epoch',
    save_steps=1,
    report_to=None,
    seed=seed_data,
    logging_steps=5,
    do_eval=True,
    eval_steps=1,
    load_best_model_at_end=True
    # disable_tqdm=True
    # load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    # tokenizer=tokenizer,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"]
)

from transformers import get_cosine_schedule_with_warmup
train_dataloader = trainer.get_train_dataloader()
num_train_steps = len(train_dataloader)
trainer.create_optimizer_and_scheduler(num_train_steps)
trainer.lr_scheduler = get_cosine_schedule_with_warmup(
      trainer.optimizer,
      num_warmup_steps=0,
      num_training_steps=num_train_steps
)

trainer.model.config.task_specific_params['text-generation'] = {
                    'do_sample': True,
                    'min_length': 100,
                    'max_length': 200,
                    'temperature': 1.,
                    'top_p': 0.95,
                    # 'prefix': '<|endoftext|>',
                    }

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [20]:
from torch import __version__ as torch_version
from transformers import __version__ as transformers_version

with io.capture_output() as captured:
  wandb.login(key='cd33331f97be3145253704fc38efef090ffe8151') # huggingartists service key

WANDB_PROJECT = 'huggingartists'
WANDB_NOTES = "Github repo: https://github.com/AlekseyKorshuk/huggingartists"
WANDB_ENTITY = 'huggingartists'
VERSION = 1.0
os.environ['WANDB_NOTEBOOK_NAME'] = 'huggingartists-demo.ipynb'  # used in wandb cli
model_card_settings = {}

def setup_wandb():
  global model_card_settings
  run = wandb.init(name=f"{model_name}-preprocess",
          job_type='preprocess',
          config={'huggingartists version':VERSION,
              'handle':model_name,
              'seed data':seed_data},
          project = WANDB_PROJECT,
          entity = WANDB_ENTITY,
          notes = WANDB_NOTES,
          reinit=True)
    
  # log raw tweets as input
  global metadata
  metadata={'handle':model_name,
        'huggingartists version': VERSION}
  artifact_input = wandb.Artifact(
      f"lyrics-{model_name}",
      type='raw-dataset',
      description=f"Lyrics from {model_name} downloaded with Genius",                            
      metadata=metadata)
  with artifact_input.new_file('lyrics.txt') as f:
    json.dump(datasets['train'].to_dict(), f, indent=0, ensure_ascii=False)
  run.use_artifact(artifact_input)
  # log dataset as output                        
  metadata={'handle':model_name,
        'seed data': seed_data,
        'epochs': num_train_epochs,
        'huggingartists version': VERSION}
  global artifact_dataset
  artifact_dataset = wandb.Artifact(
    f"dataset-{model_name}",
    type='train-dataset',
    description=f"Dataset created from lyrics of {model_name}",
    metadata=metadata)
  with open(f"data_{model_name}_train.txt", 'w', encoding='utf-8') as f:
    f.write('\n\n\n'.join(datasets['train']['text']))
  artifact_dataset.add_file(f"data_{model_name}_train.txt")
  run.log_artifact(artifact_dataset)

  # keep track of url
  wandb_url = wandb.run.get_url()
  model_card_settings['WANDB_PREPROCESS'] = str(wandb_url)

  combined_dict = {**model.config.to_dict(), **training_args.to_sanitized_dict()}
  run = wandb.init(name=f"{model_name}-train",
          job_type='train',
          config={'huggingartists version':VERSION,
              'pytorch version': torch_version,
              'transformers version': transformers_version,
              'handle':model_name,
              **combined_dict},
          project = WANDB_PROJECT,
          entity = WANDB_ENTITY,
          notes = WANDB_NOTES,
          reinit=True)


  # keep track of url
  wandb_url = wandb.run.get_url()
  model_card_settings['WANDB_TRAIN'] = wandb_url



  # log dataset and pretrained model
  artifact_dataset.wait()
  run.use_artifact(artifact_dataset)
  artifact_gpt2 = wandb.Artifact(
    f'gpt2',
    type='pretrained-model',
    description=f'Pretrained model from OpenAI downloaded from 🤗 Transformers: https://huggingface.co/gpt2',
    metadata={'huggingartists version': VERSION})
  artifact_gpt2.add_dir('cache', name='gpt2')
  run.use_artifact(artifact_gpt2)
  return run

with io.capture_output() as captured:
  run = setup_wandb()

In [21]:
import torch
torch.cuda.empty_cache()
if os.path.isfile(trainer_state_path):
  try:
    data = trainer.train(resume_from_checkpoint=model_name)
  except:
    data = trainer.train()
else:
  data = trainer.train()
# print(data)


***** Running training *****
  Num examples = 3419
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4280
  Number of trainable parameters = 124439808
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss
1,0.3774,1.025927
2,0.4758,1.070495
3,0.4244,1.088274
4,0.3455,1.117122
5,0.3029,1.15446
6,0.3977,1.218655
7,0.2733,1.213767
8,0.3167,1.286824
9,0.3298,1.282257
10,0.5134,1.35929


***** Running Evaluation *****
  Num examples = 175
  Batch size = 8
Saving model checkpoint to output/Eminem/checkpoint-428
Configuration saved in output/Eminem/checkpoint-428/config.json
Configuration saved in output/Eminem/checkpoint-428/generation_config.json
Model weights saved in output/Eminem/checkpoint-428/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 175
  Batch size = 8
Saving model checkpoint to output/Eminem/checkpoint-856
Configuration saved in output/Eminem/checkpoint-856/config.json
Configuration saved in output/Eminem/checkpoint-856/generation_config.json
Model weights saved in output/Eminem/checkpoint-856/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 175
  Batch size = 8
Saving model checkpoint to output/Eminem/checkpoint-1284
Configuration saved in output/Eminem/checkpoint-1284/config.json
Configuration saved in output/Eminem/checkpoint-1284/generation_config.json
Model weights saved in output/Eminem/checkpoint-1284/pytorch_mode

In [22]:
try:
  with open('data.txt') as json_file:
      evaluation = json.load(json_file)
  eval_loss = evaluation['eval_loss']
except:
  eval_loss = 9999999
  
evaluation = trainer.evaluate()


***** Running Evaluation *****
  Num examples = 175
  Batch size = 8


In [10]:
if evaluation['eval_loss'] < eval_loss:
  save_model = True
  with open(f'{model_name}/evaluation.txt', 'w') as outfile:
    json.dump(evaluation, outfile)
else:
  save_model = False


FileNotFoundError: ignored

In [23]:
start = "I am" #@param {type:"string"}
#@markdown Amount of generated texts:
num_sequences =  10 #@param {type:"integer"}
#@markdown Generation settings:
min_length =  100 #@param {type:"integer"}
max_length =   160#@param {type:"integer"}
temperature = 1 #@param {type:"slider", min:0, max:3, step:0.01}
top_p = 0.95 #@param {type:"slider", min:0, max:1, step:0.01}
top_k = 50 #@param {type:"integer"}
repetition_penalty =  1.0#@param {type:"number"}

encoded_prompt = tokenizer(start, add_special_tokens=False, return_tensors="pt").input_ids
encoded_prompt = encoded_prompt.to(trainer.model.device)
# prediction
output_sequences = trainer.model.generate(
                        input_ids=encoded_prompt,
                        max_length=max_length,
                        min_length=min_length,
                        temperature=float(temperature),
                        top_p=float(top_p),
                        top_k=int(top_k),
                        do_sample=True,
                        repetition_penalty=repetition_penalty,
                        num_return_sequences=num_sequences
                        )

Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.26.1"
}

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [24]:
def post_process(output_sequences):
    predictions = []
    generated_sequences = []

    max_repeat = 2

    # decode prediction
    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
        generated_sequence = generated_sequence.tolist()
        text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True, skip_special_tokens=True)
        generated_sequences.append(text.strip())
                    
    for i, g in enumerate(generated_sequences):
        res = str(g).replace('\n\n\n', '\n').replace('\n\n', '\n')
        lines = res.split('\n')
        # print(lines)
        i = max_repeat
        while i != len(lines):
          remove_count = 0
          for index in range(0, max_repeat):
            # print(i - index - 1, i - index)
            if lines[i - index - 1] == lines[i - index]:
              remove_count += 1
          if remove_count == max_repeat:
            lines.pop(i)
            i -= 1
          else:
            i += 1
        predictions.append('\n'.join(lines))

    return predictions
def stylize():
    "Handle dark mode"
    display(HTML('''
    <style>
    :root {
        --table_bg: #EBF8FF;
    }
    html[theme=dark] {
        --colab-primary-text-color: #d5d5d5;
        --table_bg: #2A4365;
    }
    .jupyter-widgets {
        color: var(--colab-primary-text-color);
    }
    table {
        border-collapse: collapse !important;
    }
    td {
        text-align:left !important;
        border: solid var(--table_bg) !important;
        border-width: 1px 0 !important;
        padding: 6px !important;
    }
    tr:nth-child(even) {
        background-color: var(--table_bg) !important;
    }
    .table_odd {
        background-color: var(--table_bg) !important;
        margin: 0 !important;
    }
    .table_even {
        border: solid var(--table_bg) !important;
        border-width: 1px 0 !important;
        margin: 0 !important;
    }
    .jupyter-widgets {
        margin: 6px;
    }
    .widget-html-content {
        font-size: var(--colab-chrome-font-size) !important;
        line-height: 1.24 !important;
    }
    </style>'''))



def get_table(table_data):
  html = ("</head>\r\n"
    "<body>\r\n\r\n"
    "<h2></h2>"
    "\r\n\r\n"
    "<table>\r\n"
    "    <colgroup>\r\n"
    "       <col span=\"1"
    "\" style=\"width: 10"
    "%;\">\r\n"
    "       <col span=\"1"
    "\" style=\"width: 10"
    "0%;\">\r\n"
    "    </colgroup>\r\n"
    f"{' '.join(table_data)}"
    "</table>\r\n\r\n"
    "</body>\r\n"
    "</html>")
  
  return html

In [25]:
# Post-processing
predictions = post_process(output_sequences)

wandb.log({'examples': wandb.Table(data=[(start, result) for result in predictions], columns=['Input', 'Prediction'])})
stylize()
table_data = []
for result in predictions:
  table_data.append('<tr><td>' + '</td><td>' + result.replace("\n", "<br>") + '</td></tr>')
display(HTML(get_table(table_data)))

0,1
,"I am on top of this thing, I got the top again Standin on top of this thing, Im a champion Im a champion, yeah, Im a champion History is like a quarter, you advance to the next King of the motherfuckin City, came from the underground Down to the corner, rapping hard as fuck Slammin the beat, and stop hammermin it, fuck that Yo, I got something for your ass, keep talking shit Imma give it to you! Fuck that!EmbedTil the World Collapse Lyrics Til the walls are going up, in smoke with all our memories It’s morning, you wake, the sun is up We lay in the wake of destruction Hush"
,"I am But I got wings on my ass And I will fly off of the top of convertibles Aint gon keep a motherfucker hurt But I am strong to the finish with me Valium spinach, Illa! I told you I dont get down, but still I fuck and I crash Like a crash landin in the middle of a cypher Riding down your potholes on a track But when Im drunk as hell, Ill be back at it With a fifth in my juice and two spirits Im attackin the track Killin anybody; even the nigga that came with us Ima wet your set like a setter And I dont even get charged in connection with battery We get all doped up"
,"I am what you used to be If you was listenin to 50, two minutes, your earsll be Right on time, minus the music in the room People steppin over people just to rush to the set Just to get to see an MC who breathes so freely Ease over these beats and be so breezy Jesus, how can shit be so easy? How can one Chandra be so Levy? Turn on these beats, MCs dont see me Believe me, BET and MTV Are gonna grieve when we leave, dog, fo sheezy Cant leave rap alone, the game needs me Til we grow beards, get weird and disappear Into the mountains, nothin but clowns down here"
,"I am I dont wanna go back They keep telling me the hood dont miss me and every lil nigga in the hood wanna diss me And I dont care, I dont care, I dont care.... These hoes used to act all prissy, diss me, now they all kissy kissy And I dont care, I dont care, I dont care.... I got class so you cant dismiss me and all you can do is just sit and be pissy! And I dont care, I dont care, I dont care, I dont care, I dont care, I dont care, I dont care, I dont care, I dont care, I dont caarrre, I dont careeee!!! Beat makes a strange turn, almost FRIGHT"
,"I am not Jasmine, I am Aladdin So far ahead, these bums is laggin See me in that new thing, bums is gaggin Im startin to feel like a dungeon dragon Rah, rah, like a dungeon dragon Im startin to feel like a dungeon dragon Look at my show footage, how these girls be spazzin So fuck I look like gettin back to a has-been? Yeah, I said it, has-been Hang it up, flatscreen Plasma Hey Nicki, hey Nicki, asthma I got the pumps, it aint got medicine I got bars, sentencin Im a bad bitch, Im a cunt And Ill kick that ho, punt"
,"I am the most hated, though granny I do Motherfuckers envy, man, they dont know me I do this shit for the love I dont want to not do this shit for the love not Dont even try, man, I dont even wanna try I stand in this booth and it hurts Just gonna stand there and hear me cry Thats alright because I love the way you lie I love the way you lie Now theres gravel in our voices Glass is shattered from the fight In this tug of war you always win Even when Im right Cause you feed me fables from your head With violent words and empty threats And its sick that all these battles Are what keeps me satisfied Just gonna stand there and watch me"
,"I am phenomenal With every ounce of my blood With every breath in my lungs Wont stop until Im phe-no-menal I am phenomenal However long that it takes Ill go to whatever lengths Its gonna make me a monster though I am phenomenal But Ill never say, ‘Oh, it’s impossible’ Cause Im born to be phenomenal Oh, step into the unknown and find yourself You’re floating freely, no emotion Got a fuckin’ mouth with no shut-off valve Cant even cut off power to it, but its what allowed Me to come up out from under the fuckin ground Cause I worked my butt off now Its a subject that I dont know how to shut up"
,"I am not Jasmine, I am Aladdin So far ahead, these bums is laggin See me in that new thing, bums is gaggin Im startin to feel like a dungeon dragon Rah, rah, like a dungeon dragon Im startin to feel like a dungeon dragon Look at my show footage, how these girls be spazzin So fuck I look like gettin back to a has-been? Yeah, I said it, has-been Hang it up, flatscreen Plasma Hey Nicki, hey Nicki, asthma I got the pumps, it aint got medicine I got bars, sentencin Im a bad bitch, Im a cunt And Ill kick that ho, punt"
,"I am happy, I am happy And unhurried, I am anxious To release anger, release anger Control stress, control stress All three of our six main girls Step in the door, one of us inside Snatch the.9 out her purse Two nights later, another one dies And thats pretty much the gist of it Papers, magazines, lyrics Bitch, be good or be good at it I do what I do, I dont care what you do Cause you can be just like me Picture me being crack, out of town, trips on the train, What! Its the last package I ever seen, ass out at the store Your little sister calling you stupid, reason why? Her and your mama"
,"I am like a walking, talking Ouija board Speaking in tongues, Ive never spoke this speech before Hhem-delle-la, ennich-me-noughh-mi-niche-mick-norrf Have you ever experienced spirits in lyrics when you hear em Til you scared to stare into any mirrors when you near em? Well if so, get ready to fight mirrors because theyre just kids As soon as you hear the lyrics to your hits You can look into em and see yourself as if youre just a kid Now every time you rap a lyric People are lookin for a way out of this world And its gettin less easy Now it seems like everybody is just startin to take a"
