## Section 0

In [1]:
# Import libraries
%matplotlib inline
import numpy as np
import pandas as pd
import os

from transformers import AutoTokenizer, AutoModelForCausalLM, TextDataset, DataCollatorForLanguageModeling, AutoModelWithLMHead
import tqdm as notebook_tqdm
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2Model, TrainingArguments, Trainer, GPT2LMHeadModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#Import data
album_path = 'data/input/Albums/'

album_list = [os.path.join(album_path, a) for a in os.listdir(album_path)]

In [3]:
album_dataframes = []

for track in album_list:
    # Step 3: List all .txt files within the subfolder
    tracks = [f.path for f in os.scandir(track) if f.is_file() and f.name.endswith('.txt')]
     
    # Step 4: Read each .txt file into a Pandas DataFrame and append to the list
    for track in tracks:
        df = pd.read_csv(track, sep='\t') #read each line, or each song ... (?)
        album_dataframes.append(df)

In [4]:
all_lyrics = pd.concat([df.iloc[:, 0] for df in album_dataframes], ignore_index=True)
final_df = pd.DataFrame({'all_lyrics': all_lyrics})

In [None]:
# try to filter bad quality data?

## Section 1 - Choosing a metric

Choosing a metric... BLEU or ROUGE?

## Section 2 - Model training

In [5]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2') # try nltk tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [6]:
train, test = train_test_split(all_lyrics,test_size=0.15)

In [7]:
print("Train dataset length: "+str(len(train)))
print("Test dataset length: "+ str(len(test)))

Train dataset length: 29854
Test dataset length: 5269


In [8]:
with open('data/output/trainTaylorSwift.txt', 'w', encoding='utf-8') as f:
  for t in train:
    f.write(t)
    f.write(' ')


with open('data/output/testTaylorSwift.txt', 'w', encoding='utf-8') as f:
  for t in test:
    f.write(t)
    f.write(' ')
    
train_path = 'data/output/trainTaylorSwift.txt'
test_path = 'data/output/testTaylorSwift.txt'

In [9]:
def load_dataset(train_path,test_path,tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=128)

    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=128)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator

train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer)



In [10]:
training_args = TrainingArguments(
    output_dir="./gpt2-taylorswift", 
    overwrite_output_dir=True,
    num_train_epochs=2, 
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    eval_steps = 100, 
    save_steps=800, 
    warmup_steps=500
    )

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [29]:
trainer.train()

Step,Training Loss
500,4.1028
1000,3.3684


TrainOutput(global_step=1080, training_loss=3.699252404106988, metrics={'train_runtime': 3484.5114, 'train_samples_per_second': 1.239, 'train_steps_per_second': 0.31, 'total_flos': 282064748544000.0, 'train_loss': 3.699252404106988, 'epoch': 2.0})

In [19]:
trainer

<transformers.trainer.Trainer at 0x25442f79ab0>

In [12]:
model.config.to_json_file("config.json")

In [18]:
from transformers import pipeline

taylor = pipeline('text-generation', model='./gpt2-taylorswift', tokenizer=tokenizer)
generator("Hello, I'm a language model,", max_length=30, num_return_sequences=5)


ValueError: Could not load model ./gpt2-taylorswift with any of the following classes: (<class 'transformers.models.auto.modeling_auto.AutoModelForCausalLM'>, <class 'transformers.models.auto.modeling_tf_auto.TFAutoModelForCausalLM'>, <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'>, <class 'transformers.models.gpt2.modeling_tf_gpt2.TFGPT2LMHeadModel'>). See the original errors:

while loading with AutoModelForCausalLM, an error is thrown:
Traceback (most recent call last):
  File "C:\Users\anmatos\AppData\Local\miniconda3\envs\condaEnv\lib\site-packages\transformers\pipelines\base.py", line 269, in infer_framework_load_model
    model = model_class.from_pretrained(model, **kwargs)
  File "C:\Users\anmatos\AppData\Local\miniconda3\envs\condaEnv\lib\site-packages\transformers\models\auto\auto_factory.py", line 563, in from_pretrained
    return model_class.from_pretrained(
  File "C:\Users\anmatos\AppData\Local\miniconda3\envs\condaEnv\lib\site-packages\transformers\modeling_utils.py", line 2740, in from_pretrained
    raise EnvironmentError(
OSError: Error no file named pytorch_model.bin, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory ./gpt2-taylorswift.

while loading with TFAutoModelForCausalLM, an error is thrown:
Traceback (most recent call last):
  File "C:\Users\anmatos\AppData\Local\miniconda3\envs\condaEnv\lib\site-packages\transformers\pipelines\base.py", line 269, in infer_framework_load_model
    model = model_class.from_pretrained(model, **kwargs)
  File "C:\Users\anmatos\AppData\Local\miniconda3\envs\condaEnv\lib\site-packages\transformers\models\auto\auto_factory.py", line 563, in from_pretrained
    return model_class.from_pretrained(
  File "C:\Users\anmatos\AppData\Local\miniconda3\envs\condaEnv\lib\site-packages\transformers\modeling_tf_utils.py", line 2740, in from_pretrained
    raise EnvironmentError(
OSError: Error no file named tf_model.h5 or pytorch_model.bin found in directory ./gpt2-taylorswift.

while loading with GPT2LMHeadModel, an error is thrown:
Traceback (most recent call last):
  File "C:\Users\anmatos\AppData\Local\miniconda3\envs\condaEnv\lib\site-packages\transformers\pipelines\base.py", line 269, in infer_framework_load_model
    model = model_class.from_pretrained(model, **kwargs)
  File "C:\Users\anmatos\AppData\Local\miniconda3\envs\condaEnv\lib\site-packages\transformers\modeling_utils.py", line 2740, in from_pretrained
    raise EnvironmentError(
OSError: Error no file named pytorch_model.bin, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory ./gpt2-taylorswift.

while loading with TFGPT2LMHeadModel, an error is thrown:
Traceback (most recent call last):
  File "C:\Users\anmatos\AppData\Local\miniconda3\envs\condaEnv\lib\site-packages\transformers\pipelines\base.py", line 269, in infer_framework_load_model
    model = model_class.from_pretrained(model, **kwargs)
  File "C:\Users\anmatos\AppData\Local\miniconda3\envs\condaEnv\lib\site-packages\transformers\modeling_tf_utils.py", line 2740, in from_pretrained
    raise EnvironmentError(
OSError: Error no file named tf_model.h5 or pytorch_model.bin found in directory ./gpt2-taylorswift.




## ----