In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, TextDataset,DataCollatorForLanguageModeling, AutoModelWithLMHead, TrainingArguments, Trainer

2023-10-21 15:37:07.074196: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-21 15:37:07.142719: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-21 15:37:07.143549: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df = pd.read_csv('song_lyrics_en.csv')

In [3]:
df['tag'].value_counts()

pop        1393559
rap         964605
rock        633308
rb          155082
misc        140986
country      86658
Name: tag, dtype: int64

# Data Preprocessing

## check for data types

In [4]:
df.dtypes

Unnamed: 0        int64
title            object
tag              object
artist           object
year              int64
views             int64
features         object
lyrics           object
id                int64
language_cld3    object
language_ft      object
language         object
dtype: object

## check for missing values

In [5]:
np.sum(pd.isnull(df))

Unnamed: 0         0
title            104
tag                0
artist             0
year               0
views              0
features           0
lyrics             0
id                 0
language_cld3      0
language_ft        0
language           0
dtype: int64

In [6]:
df.dropna(inplace=True)

In [7]:
np.sum(pd.isnull(df))

Unnamed: 0       0
title            0
tag              0
artist           0
year             0
views            0
features         0
lyrics           0
id               0
language_cld3    0
language_ft      0
language         0
dtype: int64

## remove duplicates 

In [8]:
# Create a new DataFrame with duplicate rows
subset_columns = [col for col in df.columns if col != 'lyrics']
len(df) - len(df.drop_duplicates(subset=subset_columns))

0

## filter on country music only

In [9]:
df_country = df[df['tag'] == 'country'].reset_index().drop(columns = 'index')

## reshape lyrics data to list

In [10]:
for i,item in enumerate(df_country['lyrics']):
    df_country['lyrics'][i] = df_country['lyrics'][i].split('\n')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_country['lyrics'][i] = df_country['lyrics'][i].split('\n')


## clean up lyrics data function

In [1]:
def make_lyrics_data(songs):
    text = []
    label = []
    sections = []
    for song in songs:
        section = 'missing'
        for i,line in enumerate(song):
            if i == (len(song) -1):
                break
            elif line == '':
                continue
            elif (line[0] == '['):
                if 'chorus' in line.lower():
                    section = 'Chorus'
                elif 'bridge' in line.lower():
                    section = 'Bridge'
                elif 'verse' in line.lower():
                    section = 'Verse'
            elif ((song[i+1] != '') and (song[i+1][0] != '[' )):
                text.append(line)
                label.append(song[i+1])
                sections.append(section)
            else:
                continue
    return text,label,sections

## sample song test function

In [12]:
sample_text, sample_label, sample_section = make_lyrics_data(df_country['lyrics'][1:2])

In [13]:
print (f"The first 2 lines in the sample song: {df_country['lyrics'][1][0:3]}")
print(f"text : {sample_text[0]} , label: {sample_label[0]}, section: {sample_section[0]}")

The first 2 lines in the sample song: ['[Verse 1]', 'They used to call me lightning', 'I was always quick to strike']
text : They used to call me lightning , label: I was always quick to strike, section: Verse


In [14]:
print(len(sample_text))
print(len(sample_label))
print(len(sample_section))

74
74
74


In [15]:
df_country['lyrics'][1]

['[Verse 1]',
 'They used to call me lightning',
 'I was always quick to strike',
 'Had everything I own',
 'In the saddles on my bike',
 'I had a reputation',
 'For never staying very long',
 'Just like a wild and restless drifter',
 'Like a cowboy in a song',
 '',
 '[Verse 2]',
 'I met a dark haired beauty',
 'Where they lay the whiskey down',
 'In southern Arizona',
 'In a little border town',
 'She had to dance for money',
 'In that dusty old saloon',
 'I dropped a dollar in the jukebox',
 'Played that girl a tune, yea',
 '',
 '[Chorus]',
 'Never see it coming',
 'It just hits you by surprise',
 "It's that cold place in your soul",
 'That fire in her eyes',
 'Makes you come together',
 'Like wild horses when they run',
 'Now the cards are on the table and',
 'The bullets in the gun',
 '[Verse 3]',
 'She was sitting on my lap',
 'We still had shots to kill',
 'When a man pulled up',
 'Who owned the bar',
 'In a Cadillac DeVille',
 'He grabbed her by her raven hair',
 'And threw her 

In [16]:
sample_text

['They used to call me lightning',
 'I was always quick to strike',
 'Had everything I own',
 'In the saddles on my bike',
 'I had a reputation',
 'For never staying very long',
 'Just like a wild and restless drifter',
 'I met a dark haired beauty',
 'Where they lay the whiskey down',
 'In southern Arizona',
 'In a little border town',
 'She had to dance for money',
 'In that dusty old saloon',
 'I dropped a dollar in the jukebox',
 'Never see it coming',
 'It just hits you by surprise',
 "It's that cold place in your soul",
 'That fire in her eyes',
 'Makes you come together',
 'Like wild horses when they run',
 'Now the cards are on the table and',
 'She was sitting on my lap',
 'We still had shots to kill',
 'When a man pulled up',
 'Who owned the bar',
 'In a Cadillac DeVille',
 'He grabbed her by her raven hair',
 'And threw her on the floor',
 'Said no free rides for the cowboys',
 'She jumped up and grabbed my pistol',
 "Stuck it in the fat man's back",
 'Said open up the safe'

In [17]:
sample_label

['I was always quick to strike',
 'Had everything I own',
 'In the saddles on my bike',
 'I had a reputation',
 'For never staying very long',
 'Just like a wild and restless drifter',
 'Like a cowboy in a song',
 'Where they lay the whiskey down',
 'In southern Arizona',
 'In a little border town',
 'She had to dance for money',
 'In that dusty old saloon',
 'I dropped a dollar in the jukebox',
 'Played that girl a tune, yea',
 'It just hits you by surprise',
 "It's that cold place in your soul",
 'That fire in her eyes',
 'Makes you come together',
 'Like wild horses when they run',
 'Now the cards are on the table and',
 'The bullets in the gun',
 'We still had shots to kill',
 'When a man pulled up',
 'Who owned the bar',
 'In a Cadillac DeVille',
 'He grabbed her by her raven hair',
 'And threw her on the floor',
 'Said no free rides for the cowboys',
 "That ain't what I pay you for, no",
 "Stuck it in the fat man's back",
 'Said open up the safe',
 'And put your money in the sack

## batch lyrics data preprocessing

In [18]:
text, label, section = make_lyrics_data(df_country['lyrics'])

In [19]:
print(len(text))
print(len(label))
print(len(section))

2237419
2237419
2237419


## split data into tmest, train, and validation

In [20]:
X = text
y = label

# Split your data into 80/20 train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [21]:
# Clear the content of train.txt
with open('train.txt', 'w') as f:
    pass  # An empty block does nothing, effectively clearing the file

# Clear the content of test.txt
with open('test.txt', 'w') as f:
    pass  # An empty block does nothing, effectively clearing the file

with open('train.txt', 'w') as f:
  for t,l in zip(X_train,y_train):
    f.write('Lyric:')
    f.write(t)
    f.write('Next:')
    f.write(l)
    f.write('\n')

with open('test.txt', 'w') as f:
  for t,l in zip(X_test,y_test):
    f.write('Lyric:')
    f.write(t)
    f.write('Next:')
    f.write(l)
    f.write('\n')

train_path = 'train.txt'
test_path = 'test.txt'

In [22]:
# Initialize GPT-2 tokenizer and model
model_name = "gpt2" 
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)



In [23]:
def load_dataset(train_path,test_path,tokenizer):
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_path,
        block_size=16,)
        #special_tokens=["Lyric:", "Next:"])
    
    test_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=test_path,
        block_size=16,)
        #special_tokens=["Lyric:", "Next:"])

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator

train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer)



In [24]:
model = AutoModelWithLMHead.from_pretrained(model_name)

training_args = TrainingArguments(
    output_dir="./gpt2", 
    overwrite_output_dir=True,
    num_train_epochs=1, 
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    eval_steps = 100, 
    save_steps=800, 
    warmup_steps=500
    )

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)



In [None]:
trainer.train()




Step,Training Loss
500,3.8563
1000,3.3623
1500,3.2907
2000,3.2681
2500,3.2341
3000,3.2173
3500,3.1933
4000,3.1757
4500,3.1757
5000,3.1669
