<!--<badge>--><a href="https://colab.research.google.com/github/huggingface/workshops/blob/main/nlp-zurich/02-text-classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a><!--</badge>-->

## Install dependencies

In [1]:
! pip install datasets transformers sentencepiece
!apt install git-lfs
!pip install sklearn


Collecting datasets
  Downloading datasets-1.17.0-py3-none-any.whl (306 kB)
[K     |████████████████████████████████| 306 kB 20.2 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 64.8 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 50.7 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.3 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.1.0-py3-none-any.whl (133 kB)
[K     |████████████████████████████████| 133 kB 66.9 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 65.8 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3

## Fetch and prepare training dataset

In [2]:
%%bash
# Abort at the first failing command instead of carrying on with a partial checkout.
set -e
# Skip the download when the data is already in place so the cell can be re-run.
if [ ! -d genius_data ]; then
  # Only the current snapshot of the repository is used, so a shallow clone suffices.
  git clone --depth 1 https://github.com/danielhorizon/lyrics-genreation/
  mv lyrics-genreation/genius_data .
  rm -rf lyrics-genreation
fi

Cloning into 'lyrics-genreation'...
Checking out files:  97% (41/42)   Checking out files: 100% (42/42)   Checking out files: 100% (42/42), done.


In [3]:
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import re
max_length = 100  # NOTE(review): not read by any visible code before the generation cell reassigns it — confirm it is still needed
train_path = 'train_dataset.txt'  # file that receives the concatenated training lyrics
test_path = 'test_dataset.txt'  # file that receives the concatenated held-out lyrics
test_frac = 0.15  # passed to train_test_split as test_size

def read_genre(genre):
  """Load ``genius_data/<genre>.csv`` and keep only the song columns.

  Args:
    genre: base name of the CSV file (without extension) inside ``genius_data``.

  Returns:
    DataFrame with the columns artist, genre, title and lyrics, in that order.
  """
  wanted_columns = ['artist', 'genre', 'title', 'lyrics']
  csv_path = 'genius_data/%s.csv' % genre
  songs = pd.read_csv(csv_path)
  return songs[wanted_columns]

def build_text_files(df, dest_path):
    """Flatten the ``lyrics`` column of ``df`` into a single text file.

    Every usable lyric is followed by '. '. Blank lines become sentence
    breaks, every whitespace character is turned into a space, runs of
    periods collapse to one period and runs of spaces to one space.

    Args:
        df: DataFrame with a ``lyrics`` column.
        dest_path: path of the file to (over)write with the resulting text.

    Prints the number of rows in ``df`` and the number of characters written.
    """
    # Missing lyrics show up as float NaN; only real strings can be
    # concatenated, so anything else is skipped rather than raising TypeError.
    songs = [lyrics for lyrics in df['lyrics'] if isinstance(lyrics, str)]
    # One join instead of growing a string inside a row-by-row loop.
    text = ''.join(lyrics + '. ' for lyrics in songs)

    text = text.replace('\n\n', '.\n')
    text = re.sub(r'\s', ' ', text)
    text = re.sub(r'\.+', '.', text)
    text = re.sub(r' +', ' ', text)

    # Explicit encoding: lyrics contain non-ASCII characters and the platform
    # default is not guaranteed to be UTF-8.
    with open(dest_path, 'w', encoding='utf-8') as f:
        f.write(text)
    print(dest_path + ' length: ' + str(len(df)))
    print(str(len(text)) + ' elements')

genres = ['pop','jazz','folk','soul']
# Fixed seed so the shuffle and the train/test split (and therefore the two
# text files) are identical on every run.
split_seed = 42

# Stack the per-genre frames and shuffle the rows so the genres are mixed.
df = pd.concat([read_genre(genre) for genre in genres]).sample(frac=1, random_state=split_seed)

train, test = train_test_split(df, test_size=test_frac, random_state=split_seed)
build_text_files(train, train_path)
build_text_files(test, test_path)

train_dataset.txt length: 19233
25530711 elements
test_dataset.txt length: 3395
4382442 elements


In [4]:
from transformers import TextDataset, DataCollatorForLanguageModeling, AutoTokenizer

model_name = "distilgpt2"  # checkpoint name handed to from_pretrained for the tokenizer and the model
model_name_ft = "distilgpt2-finetuned"  # training output directory is "./" + this name

# Tokenizer of the base checkpoint; it is reused to build the datasets below.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def load_dataset(train_path, test_path, tokenizer, block_size=128):
    """Build the train/test ``TextDataset`` objects and the matching collator.

    NOTE: this definition shadows the ``load_dataset`` imported from
    ``datasets`` earlier in the notebook.

    Args:
        train_path: text file with the training corpus.
        test_path: text file with the evaluation corpus.
        tokenizer: tokenizer handed to both datasets and to the collator.
        block_size: ``block_size`` given to both datasets (default 128, the
            value that used to be hard-coded).

    Returns:
        Tuple ``(train_dataset, test_dataset, data_collator)``.
    """
    def _text_dataset(file_path):
        # Both splits must be built with identical settings.
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=block_size)

    train_dataset = _text_dataset(train_path)
    test_dataset = _text_dataset(test_path)

    # mlm=False: the collator is created without masked-language-model masking.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, test_dataset, data_collator

!rm cached*
train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, tokenizer)

Downloading:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

rm: cannot remove 'cached*': No such file or directory


Token indices sequence length is longer than the specified maximum sequence length for this model (6568444 > 1024). Running this sequence through the model will result in indexing errors


## Trainer

In [5]:
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM
batch_size = 32
# AutoModelForCausalLM replaces the deprecated AutoModelWithLMHead and is the
# same class the generation cell uses to load this checkpoint.
model = AutoModelForCausalLM.from_pretrained(model_name)

training_args = TrainingArguments(
    output_dir="./" + model_name_ft, #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=batch_size, # batch size for training
    per_device_eval_batch_size=batch_size,  # batch size for evaluation
    # NOTE(review): presumably eval_steps only takes effect when a step-based
    # evaluation strategy is configured — confirm for the installed version.
    eval_steps=400, # Number of update steps between two evaluations.
    save_steps=800, # after # steps model is saved 
    warmup_steps=500, # number of warmup steps for learning rate scheduler
    prediction_loss_only=True,
    # NOTE(review): the recorded run of this cell ended in a ValueError;
    # presumably Hub credentials are needed as soon as the Trainer is built,
    # so notebook_login() has to be run before this cell — confirm.
    push_to_hub=True
    )

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)



Downloading:   0%|          | 0.00/336M [00:00<?, ?B/s]

ValueError: ignored

In [None]:
# Fine-tune with the Trainer configured above, then persist the resulting model.
trainer.train()
trainer.save_model()

In [None]:
from huggingface_hub import notebook_login
# Interactive Hugging Face Hub login.
# NOTE(review): the Trainer is created with push_to_hub=True in an earlier cell,
# so this login presumably has to happen before that cell — confirm.
notebook_login()

In [None]:
# Push the trainer's model to the Hub with the given commit message.
trainer.push_to_hub(commit_message="Training complete!")

In [None]:
from transformers import pipeline, AutoModelForCausalLM
import torch

max_length = 50
# GPU 0 when one is available, otherwise -1 (CPU), so the cell also runs
# on a runtime without an accelerator.
device = 0 if torch.cuda.is_available() else -1

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Baseline: the untouched base checkpoint. Passing model/tokenizer explicitly
# keeps the pipeline from falling back to its own default model, so the
# comparison with the fine-tuned generator is like-for-like.
text_generator_bsl = pipeline("text-generation",
                              model=model,
                              tokenizer=tokenizer,
                              max_length=max_length,
                              device=device)
# Fine-tuned model loaded from the training output directory, with the same
# generation length and device as the baseline.
text_generation = pipeline('text-generation',
                           model="./" + model_name_ft,
                           tokenizer=tokenizer,
                           max_length=max_length,
                           device=device)


## Upload data

In [None]:
from google.colab import files

# Opens Colab's upload dialog; the result is keyed by file name and each
# value's length is reported below.
uploaded = files.upload()

# Every line of every uploaded file becomes one generation prompt.
lines = []
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  # Explicit encoding so non-ASCII prompts are read the same way on any runtime.
  with open(fn, 'r', encoding='utf-8') as f:
    lines.extend(f.readlines())

## Compute and output results

In [None]:
from datetime import datetime

now = datetime.now() # current date and time
date_time = now.strftime("%m%d%Y-%H%M%S")

print("date and time:",date_time)
# 'bsl' is the baseline generator, 'ft' the fine-tuned one.
gen_dict = {'bsl' : text_generator_bsl, 'ft': text_generation}
# The prompts are the same for every generator, so strip them once.
prompts = [line.strip() for line in lines]

for gen_name, generator in gen_dict.items():
  # One timestamped result file per generator.
  filename = 'Results-%s_%s.txt'%(gen_name, date_time)
  results = generator(prompts)
  # Explicit encoding: generated text may contain non-ASCII characters.
  with open(file=filename, mode='w', encoding='utf-8') as f:
    for result in results:
      # Only the first entry per prompt is kept; newlines are removed so each
      # generation occupies exactly one line of the file.
      text = result[0]['generated_text'].replace("\n", "")
      f.write(text + '\n')
  files.download(filename)