In [84]:
import json
import os
import re

import lyricsgenius
import pandas as pd
import spotipy
from langdetect import detect
from spotipy.oauth2 import SpotifyClientCredentials

In [63]:
# secret.txt is parsed as JSON and indexed by position:
# entry 0 is used for Genius, entry 1 for Spotify.
with open("secret.txt") as json_file:
    secrets = json.load(json_file)

# The file is only needed for parsing, so pick the entries once it is closed.
genius_secrets = secrets[0]
spotify_secrets = secrets[1]

# Getting the data

In [64]:
# Spotify API credentials, taken from the Spotify entry of the secrets loaded from secret.txt
client_id = spotify_secrets["client_id"]
client_secret = spotify_secrets["secret"]


# Spotify playlist IDs whose tracks supply the artist names
# (judging by the variable names: a rap-classics playlist and a newer-rap playlist)
rap_classics_playlist = "37i9dQZF1DXbYUVKgXZtWJ"
newer_rap = "37i9dQZF1DWWCXJuOvP8z4"


In [82]:
def get_artists_from_playlist(playlist_id):
    """Return the set of artist names credited on the tracks of a Spotify playlist.

    The playlist is read page by page until an empty page comes back, so
    playlists longer than a single API page are covered completely.

    Args:
        playlist_id: Spotify playlist ID (or URI/URL accepted by spotipy).

    Returns:
        A set of artist name strings. Entries without a track object and
        artists without a name are skipped.
    """
    # Initialize the Spotify client
    client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

    artists = set()
    offset = 0
    while True:
        # Request the next page; the page size is left at spotipy's default.
        page = sp.playlist_tracks(
            playlist_id,
            fields='items(track(name, artists(name)))',
            offset=offset,
        )
        items = (page or {}).get('items') or []
        if not items:
            # An empty page means we walked past the last track.
            break

        for item in items:
            # A playlist entry can come back without a track object; skip those.
            track = item.get('track') if item else None
            if not track:
                continue
            for artist in track.get('artists') or []:
                name = artist.get('name')
                if name:
                    artists.add(name)

        offset += len(items)

    return artists

classics_artists = get_artists_from_playlist(rap_classics_playlist)
newer_artists = get_artists_from_playlist(newer_rap)

# Pool the artists of both playlists into one set.
artists = classics_artists.union(newer_artists)
# Petri Nygård's lyrics are fetched separately later in the notebook, so keep him
# out of the general pool. discard() (unlike remove()) does not raise when the
# name is absent, which keeps this cell re-runnable if the playlists change.
artists.discard("Petri Nygård")
print(artists)

{'Cheek', 'A36', 'Anonymuz', 'El Migu', 'T Swoop', 'costee', 'william', 'Aksim', 'Kärkiryhmä', 'Paperi T', 'Ruudolf', 'Gasellit', 'Väinöväinö', 'Johanna Kurkela', 'MD$', 'Jami Faltin', 'Nutt-So', 'Blacflaco', 'Bizi', 'Mikael Gabriel', 'Jurassikki', 'Loost Koos', 'Haamu', 'Julma Henri', 'Heviteemu', 'Ezkimo', 'Asa', 'NCO', 'Nikke Ankara', 'TIPPA', 'Turisti', 'Sonia', 'KERZA', 'Mjay', 'Stanko Aloha', 'Herrasmiesliiga', 'Tommishock', 'Spekti', '$auli', 'Chebaleba', 'Ocyris', 'DJ Ibusal', 'Skandaali', 'Sexmane', 'Kalifornia-Keke', 'J€AN-MARC', 'Fintelligens', 'Cledos', 'Pesso', 'MKDMSK', 'Ts', 'Costi', 'Fabe', 'Puppa J', 'Brädi', 'Flegmaatikot', 'YB026', 'Don P', 'Petos', 'Tasis', 'Clever', 'Timo Pieni Huijaus', 'Gracias', 'Alexander Mack', 'Pyhimys', 'Iso H', 'MC Taakibörsta', 'RicoWamos', 'Xavier Weeks', 'Stepa', 'Averagekidluke', 'Päkä', 'Elias Gould', 'LEWI', 'VilleGalle', 'Ceebrolistics', 'Etta', 'Tuomas Kauhanen', 'Karim B', 'Tupla W', 'Yeboyah', 'ROXANA', 'Rich Brian', 'M//O', 'Perj

In [90]:
genius = lyricsgenius.Genius(genius_secrets["token"])
genius.skip_non_songs = True
genius.timeout = 10
genius.retries = 3

# Create the per-song output folder up front so the files below are actually written.
os.makedirs("data/rap_lyrics", exist_ok=True)


def _search_genius_artist(name, max_songs):
    """Run the Genius artist search used in this cell with a given song limit."""
    return genius.search_artist(
        name,
        max_songs=max_songs,
        include_features=False,
        get_full_info=False,
        allow_name_change=False,
    )


def _split_full_title(full_title):
    """Split a "<track> by <artist>" title into (track, artist).

    The split happens at the LAST " by " so that track names containing
    " by " stay intact. Returns (None, None) when the separator is missing.
    """
    normalized = full_title.replace('\xa0', ' ')
    track_name, separator, title_artist = normalized.rpartition(" by ")
    if not separator:
        return None, None
    return track_name, title_artist


# Collect rows in a plain list and build the DataFrame once at the end;
# appending to a DataFrame row by row gets slower with every row.
rap_lyrics_rows = []
for artist in artists:
    # First we check if the artist exists with only one song before fetching multiple songs
    probe = _search_genius_artist(artist, max_songs=1)
    # No result (or a result without a name) counts as "artist not found".
    if getattr(probe, "name", None) != artist:
        continue

    genius_artist = _search_genius_artist(artist, max_songs=10)
    for song in getattr(genius_artist, "songs", None) or []:
        track_name, title_artist = _split_full_title(song.full_title)
        # Even though allow_name_change is set to False, it can happen,
        # so only keep songs whose title names exactly this artist.
        if title_artist != artist:
            continue

        lyrics = song.lyrics
        try:
            lang = detect(lyrics)
        except Exception:
            # Language detection failed (e.g. empty or missing lyrics): treat as unknown.
            lang = None
        # Keep Finnish lyrics only.
        if lang != "fi":
            continue

        # Replace bracketed section markers with a line break, then strip punctuation.
        lyrics = re.sub(r"\[.*?\]", "\n", lyrics)
        clean_lyrics = re.sub(r'[^\w\s]', '', lyrics)
        clean_track_name = re.sub(r'[^\w\s]', '', track_name)
        rap_lyrics_rows.append([clean_track_name, lyrics, clean_lyrics])

        filename = f"data/rap_lyrics/{clean_track_name}.txt"
        try:
            with open(filename, 'w', encoding="utf-8") as f:
                f.write(lyrics)
        except OSError as error:
            # Per-song files are best effort: report the failure and keep going.
            print(f"Could not write {filename}: {error}")

rap_lyrics_df = pd.DataFrame(rap_lyrics_rows, columns=["name", "lyrics", "clean_lyrics"])
        

Searching for songs by Cheek...

Song 1: "Come Down"
Song 2: "Heart Don’t Stand a Chance"
Song 3: "Bubblin"
Song 4: "The Bird"
Song 5: "Am I Wrong"
Song 6: "The Season / Carry Me"
Song 7: "Tints"
Song 8: "Put Me Thru"
Song 9: "Trippy"
Song 10: "Silicon Valley"

Reached user-specified song limit (10).
Done. Found 10 songs.
Searching for songs by A36...

Song 1: "Samma gamla vanliga (Cledos, ibe & Averagekidluke Remix)"
Song 2: "Samma gamla vanliga"
Song 3: "Samma gamla vanliga (Branco & Kamelen Remix)"
Song 4: "BLOCK"
Song 5: "ALIEN"
Song 6: "Motorola"
Song 7: "Alé Alé"
Song 8: "Casa de papel"
Song 9: "Neighborhood Hero"
Song 10: "Tamaka"

Reached user-specified song limit (10).
Done. Found 10 songs.
Searching for songs by A36...

Song 1: "Samma gamla vanliga (Cledos, ibe & Averagekidluke Remix)"

Reached user-specified song limit (1).
Done. Found 1 songs.
Searching for songs by Anonymuz...

Song 1: "Urameshi"
Song 2: "The Fall of Earth"
Song 3: "No Threat"
Song 4: "Evangelion X"
Song 5

In [103]:
artist = "Petri Nygård"

# Create the per-song output folder up front so the files below are actually written.
os.makedirs("data/petri_lyrics", exist_ok=True)

genius_artist = genius.search_artist(
    artist,
    max_songs=1000,
    include_features=False,
    get_full_info=False,
    allow_name_change=False,
)
petri_songs = getattr(genius_artist, "songs", None)
if petri_songs is None:
    # Fail with a clear message instead of an AttributeError on a missing result.
    raise RuntimeError(f"Genius search returned no artist for {artist!r}")

# Collect rows in a plain list and build the DataFrame once at the end;
# appending to a DataFrame row by row gets slower with every row.
petri_lyrics_rows = []
for song in petri_songs:
    full_title = song.full_title.replace('\xa0', ' ')
    # Split "<track> by <artist>" at the LAST " by " so track names that
    # contain " by " stay intact; fall back to the whole title if it is missing.
    track_name, separator, _ = full_title.rpartition(" by ")
    if not separator:
        track_name = full_title

    # NOTE(review): in the frame displayed below, the text starts with a
    # "<title> Lyrics" header; it is kept as-is here — confirm whether it
    # should be stripped before training.
    # Drop bracketed section markers, then strip punctuation.
    lyrics = re.sub(r"\[.*?\]", "", song.lyrics)
    clean_lyrics = re.sub(r'[^\w\s]', '', lyrics)
    clean_track_name = re.sub(r'[^\w\s]', '', track_name)
    petri_lyrics_rows.append([clean_track_name, lyrics, clean_lyrics])

    filename = f"data/petri_lyrics/{clean_track_name}.txt"
    try:
        with open(filename, 'w', encoding="utf-8") as f:
            f.write(clean_lyrics)
    except OSError as error:
        # Per-song files are best effort: report the failure and keep going.
        print(f"Could not write {filename}: {error}")

petri_lyrics_df = pd.DataFrame(petri_lyrics_rows, columns=["name", "lyrics", "clean_lyrics"])


Searching for songs by Petri Nygård...

Song 1: "Selvä päivä"
Song 2: "Kotibileet"
Song 3: "Paska maailma"
Song 4: "Nössö"
Song 5: "Pillumagneetti"
Song 6: "Märkää"
Song 7: "Onko sulla pokkaa?"
Song 8: "Villi ja vitun vapaa"
Song 9: "Kippis kulaus"
Song 10: "Vitun suomirokki"
Song 11: "Kerran kesässä"
Song 12: "Ryöstö"
Song 13: "Näytä tissit"
Song 14: "Päästä(n) höyryy"
Song 15: "Pannaan Suomi Kuntoon"
Song 16: "Selvä päivä - feat. lord est"
Song 17: "Luxusta"
Song 18: "Sanon suoraan"
Song 19: "Otan kaljaa"
Song 20: "VITUTTAA"
Song 21: "#ihanaa"
Song 22: "Sarvet esiin"
Song 23: "Mee vittuun Petri!"
Song 24: "3 asiaa"
Song 25: "Kaikkee pitää olla!"
Song 26: "Jatkoille"
Song 27: "Haista vittu"
Song 28: "Olen vaatimaton"
Song 29: "Mua vituttaa"
Song 30: "Mä runkkaan"
Song 31: "Poliisi on kiva"
Song 32: "Neljä Vuodenaikaa"
Song 33: "Pommi"
Song 34: "Mitävittuuvaan"
Song 35: "Mee kyykkyyn"
Song 36: "Pidetään hauskaa"
Song 37: "Mä oon hevari"
Song 38: "Valmis mihin vaan"
Song 39: "Seopetriii

In [104]:
# Display the collected Petri Nygård lyrics frame for a quick visual check
petri_lyrics_df

Unnamed: 0,name,lyrics,clean_lyrics
0,Selvä päivä,"Selvä päivä Lyrics(Kauhee jano vieläkin, siis ...",Selvä päivä LyricsKauhee jano vieläkin siis sa...
1,Kotibileet,"Kotibileet Lyrics\nNonnii, tilipäivä, vedän ko...",Kotibileet Lyrics\nNonnii tilipäivä vedän koko...
2,Paska maailma,Paska maailma LyricsVittu mitä paskaa. Mee sin...,Paska maailma LyricsVittu mitä paskaa Mee sinä...
3,Nössö,Nössö Lyrics\n(Se on Petri!)\n\nKiva olla nöss...,Nössö Lyrics\nSe on Petri\n\nKiva olla nössön ...
4,Pillumagneetti,"Pillumagneetti LyricsNonii, se on Petrii-i. Ky...",Pillumagneetti LyricsNonii se on Petriii Kyllä...
...,...,...,...
86,Outo naapuri,Outo naapuri LyricsKaikkien tuntema lähiön sek...,Outo naapuri LyricsKaikkien tuntema lähiön sek...
87,Sunnuntaibuffet,"Sunnuntaibuffet LyricsSunnuntaibuffet, ei tarv...",Sunnuntaibuffet LyricsSunnuntaibuffet ei tarvi...
88,Mitä jos,Mitä jos? LyricsMitä jos?\nMitä jos Petri Nygå...,Mitä jos LyricsMitä jos\nMitä jos Petri Nygård...
89,Intro,Intro LyricsKen on se jolla niin levee hattu?\...,Intro LyricsKen on se jolla niin levee hattu\n...


In [119]:
# NOTE: every song of a corpus is concatenated into one text file, separated only
# by a blank line. This may affect what the model learns about song boundaries.
def _write_corpus(lyrics_df, path):
    """Write each `clean_lyrics` entry of `lyrics_df` to `path`, followed by a blank line."""
    # The context manager closes the file even if a write fails part-way.
    with open(path, "w", encoding="utf-8") as corpus_file:
        for clean_lyrics in lyrics_df["clean_lyrics"]:
            corpus_file.write(clean_lyrics)
            corpus_file.write("\n\n")


# Make sure the target folder exists before writing the combined files.
os.makedirs("data", exist_ok=True)
_write_corpus(rap_lyrics_df, "data/rap_lyrics.txt")
_write_corpus(petri_lyrics_df, "data/petri_lyrics.txt")

Now we have our data: roughly 2,000 Finnish rap lyrics from popular artists and roughly 100 rap lyrics from Petri Nygård. They have been saved to `data/rap_lyrics/` and `data/petri_lyrics/`, respectively. Each song's lyrics are saved in a separate text file named after the title of the song.

# Training our model

In [3]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from transformers import PreTrainedTokenizerFast, GPT2TokenizerFast

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Load the 'Finnish-NLP/gpt2-finnish' checkpoint (a Finnish GPT-2 model, judging by its name)
# together with the tokenizer published under the same identifier
model = GPT2LMHeadModel.from_pretrained('Finnish-NLP/gpt2-finnish')
tokenizer = GPT2Tokenizer.from_pretrained('Finnish-NLP/gpt2-finnish')

In [5]:
# Both corpora are tokenized with identical settings, so the shared options live in one place.
_shared_dataset_options = dict(
    tokenizer=tokenizer,
    block_size=1024,
    overwrite_cache=False,
)

# One dataset per combined lyrics file.
rap_lyrics_dataset = TextDataset(file_path="data/rap_lyrics.txt", **_shared_dataset_options)
petri_lyrics_dataset = TextDataset(file_path="data/petri_lyrics.txt", **_shared_dataset_options)



In [11]:
def load_dataset(file_path, tokenizer, block_size = 1024):
    """Build a `TextDataset` over a text file.

    Args:
        file_path: Path of the text file to read.
        tokenizer: Tokenizer handed to `TextDataset`.
        block_size: Block size handed to `TextDataset` (default 1024).

    Returns:
        The constructed `TextDataset`.
    """
    return TextDataset(
        file_path=file_path,
        tokenizer=tokenizer,
        block_size=block_size,
    )

def load_data_collator(tokenizer, mlm = False):
    """Create the language-modeling data collator for `tokenizer`.

    Args:
        tokenizer: Tokenizer handed to `DataCollatorForLanguageModeling`.
        mlm: Forwarded as the collator's `mlm` flag (default False).

    Returns:
        The constructed `DataCollatorForLanguageModeling`.
    """
    return DataCollatorForLanguageModeling(mlm=mlm, tokenizer=tokenizer)


def train(
    train_file_path,
    model_name,
    output_dir,
    overwrite_output_dir,
    per_device_train_batch_size,
    num_train_epochs,
):
    """Fine-tune a GPT-2 checkpoint on a text file and save the result.

    Args:
        train_file_path: Text file used to build the training dataset.
        model_name: Identifier or path passed to `from_pretrained` for both
            the tokenizer and the model.
        output_dir: Directory that receives the tokenizer, the model and the
            trainer output.
        overwrite_output_dir: Forwarded to `TrainingArguments`.
        per_device_train_batch_size: Forwarded to `TrainingArguments`.
        num_train_epochs: Forwarded to `TrainingArguments`.

    Returns:
        None.
    """
    # Tokenizer first: the dataset and the collator are both built from it.
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    lyrics_dataset = load_dataset(train_file_path, gpt2_tokenizer)
    collator = load_data_collator(gpt2_tokenizer)

    # Store the tokenizer next to the model so both can be loaded from output_dir.
    gpt2_tokenizer.save_pretrained(output_dir)

    # The starting checkpoint is written to output_dir before training begins.
    lm_model = GPT2LMHeadModel.from_pretrained(model_name)
    lm_model.save_pretrained(output_dir)

    run_config = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
    )

    lyrics_trainer = Trainer(
        model=lm_model,
        args=run_config,
        data_collator=collator,
        train_dataset=lyrics_dataset,
    )

    # Run the fine-tuning and save the trained model through the trainer.
    lyrics_trainer.train()
    lyrics_trainer.save_model()

In [14]:
# Sanity check: build a dataset from the Petri Nygård corpus via load_dataset and print the resulting object
test_dataset = load_dataset('data/petri_lyrics.txt', tokenizer)
print(test_dataset)

<transformers.data.datasets.language_modeling.TextDataset object at 0x0000024D0BDE1F30>




In [11]:
# Fine-tuning run on the combined Petri Nygård lyrics file
train_file_path = 'data/petri_lyrics.txt'
model_name = 'Finnish-NLP/gpt2-finnish'
# generate_text() later loads the model from this same 'model/' path
output_dir = 'model/'
overwrite_output_dir = True
per_device_train_batch_size = 8
num_train_epochs = 2.0
# NOTE(review): save_steps is defined here but never passed to train(), so it has no effect on this run
save_steps = 500
train(train_file_path=train_file_path,
      model_name=model_name,
      output_dir=output_dir,
      overwrite_output_dir=overwrite_output_dir,
      per_device_train_batch_size=per_device_train_batch_size,
      num_train_epochs=num_train_epochs)

***** Running training *****
  Num examples = 50
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 14
  Number of trainable parameters = 124439808
100%|██████████| 14/14 [13:49<00:00, 49.95s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 14/14 [13:49<00:00, 59.27s/it]
Saving model checkpoint to model/
Configuration saved in model/config.json


{'train_runtime': 829.6598, 'train_samples_per_second': 0.121, 'train_steps_per_second': 0.017, 'train_loss': 14.149287632533483, 'epoch': 2.0}


Configuration saved in model/generation_config.json
Model weights saved in model/pytorch_model.bin


In [4]:
def load_model(model_path):
    """Load a `GPT2LMHeadModel` from `model_path` and return it."""
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Load a `GPT2Tokenizer` from `tokenizer_path` and return it."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)


def generate_text(sequence, max_length, model_path="model/"):
    """Sample a continuation of `sequence` from a saved model and print it.

    Args:
        sequence: Prompt to continue; converted to a string before tokenizing.
        max_length: Forwarded to `model.generate` as `max_length`.
        model_path: Directory from which both the model and the tokenizer are
            loaded. Defaults to "model/", the location that used to be
            hard-coded here.

    Returns:
        None. The decoded text is printed.
    """
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)

    # Calling the tokenizer (instead of .encode) also yields the attention mask.
    encoded = tokenizer(str(sequence), return_tensors='pt')
    final_outputs = model.generate(
        encoded["input_ids"],
        # Pass the mask explicitly: the pad token is set to the EOS token below,
        # so padding cannot be told apart from real tokens by id alone.
        attention_mask=encoded["attention_mask"],
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

In [5]:
# Sample up to 200 tokens (max_length) continuing a Finnish prompt with the model saved in model/
generate_text("Tiputtaa märkää", 200)

Tiputtaa märkää lattiaa niin, että sitä särkee. En muista että milloin ois itketty. Mutta nyt on alkanu särkee myös alapäätä, mutta ei se kyllä paljon häiritse. Mutta kyl mulla silti se on ärsyttävä. Ja kyl se on ärsyttävää ku se alkaa kyl ottaa aivoon ku mä en oikee tiiä et miten se ees liittyy siihen. Sit ku mä oon vähän iso, ni mä vaa yritän kyl pitää sitä pystyssä! Eli mulla on tosi huono itseluottamus ku mä vaa yritän aina pitää sitä pystyssä! Ja kyl mä vaa tykkään ostaa ittestäni kaikkee ihanaa. Jos mä oon oikee shoppailee, ni mä vaa teen kaikkee tyhmää. Ja mä vaa teen tosi paljo kaikkee kivaa. Ja sit mä vaa shoppailee vähän. Ja on kai vähä kivaa joskus vähä vähä vähä vähä vähä vähän vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä vähä
