<!--<badge>--><a href="https://colab.research.google.com/github/huggingface/workshops/blob/main/nlp-zurich/02-text-classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a><!--</badge>-->

## Install dependencies

In [7]:
! pip install datasets transformers sentencepiece
!apt install git-lfs
!pip install sklearn


Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.3.4-1).
0 upgraded, 0 newly installed, 0 to remove and 37 not upgraded.


##Fetch and prepare training dataset

In [8]:
%%bash
git clone https://github.com/danielhorizon/lyrics-genreation/
mv lyrics-genreation/genius_data .
rm -r lyrics-genreation

Cloning into 'lyrics-genreation'...
Checking out files:  97% (41/42)   Checking out files: 100% (42/42)   Checking out files: 100% (42/42), done.
mv: cannot move 'lyrics-genreation/genius_data' to './genius_data': Directory not empty


In [9]:
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import re
max_length = 100
train_path = 'train_dataset.txt'
test_path = 'test_dataset.txt'
test_frac = 0.15

def read_genre(genre):
  return pd.read_csv('genius_data/%s.csv'%genre)[['artist', 'genre', 'title', 'lyrics']]

def build_text_files(df, dest_path):
    text = ''
    for i, row in df.iterrows():
      lyrics = row['lyrics']
      if str(lyrics) == 'nan':
        continue
      text += lyrics + '. '
    text = text.replace('\n\n', '.\n')
    text = re.sub(r'\s', ' ', text)
    text = re.sub(r'\.+', '.', text)    
    text = re.sub(r' +', ' ', text)

    with open(dest_path, 'w') as f:
      f.write(text)
    print(dest_path + ' length: ' + str(len(df)))
    print(str(len(text)) + ' elements')

genres = ['pop','jazz','folk','soul']

df = pd.concat([read_genre(x) for x in genres]).sample(frac=1) # mix different genres

train, test = train_test_split(df, test_size=test_frac)
build_text_files(train, train_path)
build_text_files(test, test_path)

train_dataset.txt length: 19233
25433214 elements
test_dataset.txt length: 3395
4479939 elements


In [10]:
from transformers import TextDataset, DataCollatorForLanguageModeling, AutoTokenizer

model_name = "distilgpt2"
model_name_ft = "distilgpt2-finetuned"

tokenizer = AutoTokenizer.from_pretrained(model_name)

def load_dataset(train_path, test_path, tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=128)
     
    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=128)   
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, test_dataset, data_collator

!rm cached*
train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, tokenizer)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/distilgpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/f985248d2791fcff97732e4ee263617adec1edb5429a2b8421734c6d14e39bee.422318838d1ec4e061efb4ea29671cb2a044e244dc69229682bebd7cacc81631
Model config GPT2Config {
  "_name_or_path": "distilgpt2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_at

##Trainer

In [13]:
from huggingface_hub import notebook_login
notebook_login()

ValueError: ignored

In [14]:
from transformers import Trainer, TrainingArguments, AutoModelWithLMHead
batch_size = 32
model = AutoModelWithLMHead.from_pretrained(model_name)

training_args = TrainingArguments(
    output_dir="./" + model_name_ft, #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=batch_size, # batch size for training
    per_device_eval_batch_size=batch_size,  # batch size for evaluation
    eval_steps=400, # Number of update steps between two evaluations.
    save_steps=800, # after # steps model is saved 
    warmup_steps=500, # number of warmup steps for learning rate scheduler
    prediction_loss_only=True,
    push_to_hub=True
    )

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

loading configuration file https://huggingface.co/distilgpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/f985248d2791fcff97732e4ee263617adec1edb5429a2b8421734c6d14e39bee.422318838d1ec4e061efb4ea29671cb2a044e244dc69229682bebd7cacc81631
Model config GPT2Config {
  "_name_or_path": "distilgpt2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_p

Download file pytorch_model.bin:   0%|          | 3.48k/319M [00:00<?, ?B/s]

Download file runs/Jan13_18-32-01_a68d793480cc/1642098731.0676572/events.out.tfevents.1642098731.a68d793480cc.…

Download file runs/Jan13_18-14-45_a68d793480cc/events.out.tfevents.1642097694.a68d793480cc.74.2:  90%|########…

Download file runs/Jan13_18-14-45_a68d793480cc/1642097694.8503954/events.out.tfevents.1642097694.a68d793480cc.…

Download file runs/Jan13_18-28-00_a68d793480cc/1642098485.7129517/events.out.tfevents.1642098485.a68d793480cc.…

Download file runs/Jan13_18-26-40_a68d793480cc/events.out.tfevents.1642098406.a68d793480cc.74.4:  90%|########…

Download file runs/Jan13_18-26-40_a68d793480cc/1642098406.820102/events.out.tfevents.1642098406.a68d793480cc.7…

Download file runs/Jan13_18-45-10_a68d793480cc/1642099520.566475/events.out.tfevents.1642099520.a68d793480cc.1…

Clean file runs/Jan13_18-32-01_a68d793480cc/1642098731.0676572/events.out.tfevents.1642098731.a68d793480cc.117…

Clean file runs/Jan13_18-14-45_a68d793480cc/1642097694.8503954/events.out.tfevents.1642097694.a68d793480cc.74.…

Clean file runs/Jan13_18-14-45_a68d793480cc/events.out.tfevents.1642097694.a68d793480cc.74.2:  26%|##5       |…

Clean file runs/Jan13_18-26-40_a68d793480cc/events.out.tfevents.1642098406.a68d793480cc.74.4:  26%|##5       |…

Clean file runs/Jan13_18-28-00_a68d793480cc/1642098485.7129517/events.out.tfevents.1642098485.a68d793480cc.74.…

Clean file runs/Jan13_18-26-40_a68d793480cc/1642098406.820102/events.out.tfevents.1642098406.a68d793480cc.74.5…

Clean file runs/Jan13_18-45-10_a68d793480cc/1642099520.566475/events.out.tfevents.1642099520.a68d793480cc.1555…

Download file runs/Jan13_18-28-00_a68d793480cc/events.out.tfevents.1642098485.a68d793480cc.74.6:  90%|########…

Download file training_args.bin: 100%|##########| 2.86k/2.86k [00:00<?, ?B/s]

Clean file runs/Jan13_18-28-00_a68d793480cc/events.out.tfevents.1642098485.a68d793480cc.74.6:  26%|##5       |…

Download file runs/Jan13_18-32-01_a68d793480cc/events.out.tfevents.1642098731.a68d793480cc.1177.0:  90%|######…

Clean file training_args.bin:  35%|###4      | 1.00k/2.86k [00:00<?, ?B/s]

Download file runs/Jan13_18-45-10_a68d793480cc/events.out.tfevents.1642099520.a68d793480cc.1555.0:  90%|######…

Clean file runs/Jan13_18-32-01_a68d793480cc/events.out.tfevents.1642098731.a68d793480cc.1177.0:  26%|##5      …

Clean file runs/Jan13_18-45-10_a68d793480cc/events.out.tfevents.1642099520.a68d793480cc.1555.0:  26%|##5      …

Clean file pytorch_model.bin:   0%|          | 1.00k/319M [00:00<?, ?B/s]

In [15]:
trainer.train()
trainer.save_model()

***** Running training *****
  Num examples = 51140
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 4797


Step,Training Loss
500,3.6875
1000,3.4738
1500,3.3828
2000,3.3305
2500,3.2994
3000,3.2682
3500,3.2661
4000,3.2253
4500,3.2174


Saving model checkpoint to ./distilgpt2-finetuned/checkpoint-800
Configuration saved in ./distilgpt2-finetuned/checkpoint-800/config.json
Model weights saved in ./distilgpt2-finetuned/checkpoint-800/pytorch_model.bin
Saving model checkpoint to ./distilgpt2-finetuned/checkpoint-1600
Configuration saved in ./distilgpt2-finetuned/checkpoint-1600/config.json
Model weights saved in ./distilgpt2-finetuned/checkpoint-1600/pytorch_model.bin
Saving model checkpoint to ./distilgpt2-finetuned/checkpoint-2400
Configuration saved in ./distilgpt2-finetuned/checkpoint-2400/config.json
Model weights saved in ./distilgpt2-finetuned/checkpoint-2400/pytorch_model.bin
Saving model checkpoint to ./distilgpt2-finetuned/checkpoint-3200
Configuration saved in ./distilgpt2-finetuned/checkpoint-3200/config.json
Model weights saved in ./distilgpt2-finetuned/checkpoint-3200/pytorch_model.bin
Saving model checkpoint to ./distilgpt2-finetuned/checkpoint-4000
Configuration saved in ./distilgpt2-finetuned/checkpoint-

In [16]:
trainer.push_to_hub(commit_message="Training complete!")

Saving model checkpoint to ./distilgpt2-finetuned
Configuration saved in ./distilgpt2-finetuned/config.json
Model weights saved in ./distilgpt2-finetuned/pytorch_model.bin
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.38k/319M [00:00<?, ?B/s]

Upload file runs/Jan14_08-19-58_3ff0fc7b05f3/events.out.tfevents.1642148525.3ff0fc7b05f3.75.0:  64%|######4   …

To https://huggingface.co/begar/distilgpt2-finetuned
   a14148d..c18f76e  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
To https://huggingface.co/begar/distilgpt2-finetuned
   c18f76e..ff24289  main -> main



'https://huggingface.co/begar/distilgpt2-finetuned/commit/c18f76e93ca8a487b6d25aac88f2749d0f8d576e'

In [17]:
from transformers import pipeline, AutoModelForCausalLM
max_length = 50
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
text_generator_bsl = pipeline("text-generation", 
                              max_length=max_length,
#                              tokenizer=tokenizer, 
#                              model=model,
                              device=0) # device >= 0 places the model on the GPU
text_generation = pipeline('text-generation', 
                           model="./" + model_name_ft, 
                           tokenizer=tokenizer) #,                           config={'max_length':max_length}


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/distilgpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/f985248d2791fcff97732e4ee263617adec1edb5429a2b8421734c6d14e39bee.422318838d1ec4e061efb4ea29671cb2a044e244dc69229682bebd7cacc81631
Model config GPT2Config {
  "_name_or_path": "distilgpt2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_at

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

storing https://huggingface.co/gpt2/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
creating metadata file for /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_typ

Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

storing https://huggingface.co/gpt2/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925
creating metadata file for /root/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925
loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925
All model checkpoint weights were used when initializing GPT2LMHeadModel.

All the weights of GPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions wi

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

storing https://huggingface.co/gpt2/resolve/main/vocab.json in cache at /root/.cache/huggingface/transformers/684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
creating metadata file for /root/.cache/huggingface/transformers/684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
https://huggingface.co/gpt2/resolve/main/merges.txt not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpmxkq1ufp


Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

storing https://huggingface.co/gpt2/resolve/main/merges.txt in cache at /root/.cache/huggingface/transformers/c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
creating metadata file for /root/.cache/huggingface/transformers/c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
https://huggingface.co/gpt2/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpjbmmb_ug


Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

storing https://huggingface.co/gpt2/resolve/main/tokenizer.json in cache at /root/.cache/huggingface/transformers/16a2f78023c8dc511294f0c97b5e10fde3ef9889ad6d11ffaa2a00714e73926e.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0
creating metadata file for /root/.cache/huggingface/transformers/16a2f78023c8dc511294f0c97b5e10fde3ef9889ad6d11ffaa2a00714e73926e.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0
loading file https://huggingface.co/gpt2/resolve/main/vocab.json from cache at /root/.cache/huggingface/transformers/684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
loading file https://huggingface.co/gpt2/resolve/main/merges.txt from cache at /root/.cache/huggingface/transformers/c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/gpt2/resolve/main/tokenizer.jso

Upoload data

In [18]:
from google.colab import files

uploaded = files.upload()

lines = []
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  with open(fn, 'r') as f:
    lines.extend(f.readlines())

Saving Conversation.csv to Conversation.csv
User uploaded file "Conversation.csv" with length 721 bytes


##Compute and output results

In [19]:
from datetime import datetime

now = datetime.now() # current date and time
date_time = now.strftime("%m%d%Y-%H%M%S")

print("date and time:",date_time)	
gen_dict = {'bsl' : text_generator_bsl, 'ft': text_generation}

for gen_name, generator in gen_dict.items():
  filename = 'Results-%s_%s.txt'%(gen_name, date_time)
  results = generator([line.strip() for line in lines])
  with open(file=filename, mode='w') as f:
    for result in results:
      text = result[0]['generated_text'].replace("\n", "")
      f.write(text + '\n')
  files.download(filename)

Disabling tokenizer parallelism, we're using DataLoader multithreading already
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


date and time: 01142022-093225


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>