In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from torch.utils.data import Dataset, DataLoader
import torch
import pandas as pd

In [6]:
# Define a custom dataset class
class LyricsDataset(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        genre = row['Genre1']
        year = row['Release_Year']
        artist = row['Artist']
        lyrics = row['Lyrics']

        # Format the input text
        input_text = f"Genre: {genre}, Release_Year: {year}, Artist: {artist}, Lyrics: {lyrics}"
        encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': encoding['input_ids'].flatten()  # For text generation, labels are the same as input_ids
        }

In [4]:
# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token # Use the EOS token as the PAD token
model = GPT2LMHeadModel.from_pretrained('gpt2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [9]:
df = pd.read_csv(r'/content/lyrics_genius2.csv')

In [10]:
# Prepare the dataset
dataset = LyricsDataset(df, tokenizer, max_length=128)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

In [11]:
# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)



In [12]:
# Move the model to the appropriate device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model.to(device)

Using device: cuda


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
for epoch in range(3):  # Nombre d'époques
    model.train()
    total_loss = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(dataloader)}")

    # Sauvegarder le modèle après chaque époque
    model.save_pretrained(f"./lyrics_generator_epoch_{epoch + 1}")
    tokenizer.save_pretrained(f"./lyrics_generator_epoch_{epoch + 1}")
    print(f"Model saved at epoch {epoch + 1}")

In [None]:

# Function to generate lyrics
def generate_lyrics(genre=None, year=None, artist=None, max_length=100):
    # Build the input prompt dynamically
    input_text = "Lyrics:"
    if genre:
        input_text = f"Genre: {genre}, " + input_text
    if year:
        input_text = f"Release_Year: {year}, " + input_text
    if artist:
        input_text = f"Artist: {artist}, " + input_text
    # Tokenize the input text
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
    # Generate lyrics
    output = model.generate(input_ids, max_length=max_length, num_return_sequences=1)
    # Decode and return the output
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example of generating lyrics
print(generate_lyrics(genre="Rock", year=1990, artist="Nirvana"))

In [18]:


# Charger le modèle fine-tuné
model = GPT2LMHeadModel.from_pretrained(r"C:\Users\carlf\Documents\GitHub\lyrics_generator\model\lyrics_generator_epoch_3")
tokenizer = GPT2Tokenizer.from_pretrained(r"C:\Users\carlf\Documents\GitHub\lyrics_generator\model\lyrics_generator_epoch_3")

def get_input_text(genre=None, artist=None, release_year=None) -> str:
    """
    Generates the input text for the model based on the provided information.

    Args:
    - genre (str, optional): The musical genre.
    - artist (str, optional): The name of the artist.
    - release_year (int or str, optional): The release year of the song.

    Returns:
    - str: Formatted input text for the model.
    """
    input_text = "Ecris une chanson parlant d'amour et de tristesse."
    if any([artist, genre, release_year]):  # Check if at least one argument is provided
        input_text += "Basé sur ces différentes informations: "
        if artist:
            input_text += f"Artiste: {artist}, "
        if genre:
            input_text += f"Genre: {genre}, "
        if release_year:
            input_text += f"Année de sortie: {release_year}, "
    return input_text.strip(", ")  # Remove trailing comma


def generate_lyrics(genre=None, artist=None, release_year=None, max_length=100):
    """
    Generates song lyrics based on the provided information.

    Args:
    - genre (str, optional): The musical genre.
    - artist (str, optional): The name of the artist.
    - release_year (int or str, optional): The release year of the song.
    - max_length (int, optional): The maximum length of the generated lyrics (default: 100 tokens).

    Returns:
    - str: Generated lyrics.
    """
    # Generate the input text
    input_text = get_input_text(genre=genre, artist=artist, release_year=release_year)
    # input_text = prompt = (
    # "Write a song in the style of Guizmo from 2018. "
    # "The song should talk about drugs, hoes, and violence."
    # )
    # Prepare the inputs for the model
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Configure the padding token if not already set
    tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

    # Generate text using the model
    output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=300,
        num_return_sequences=10,
        temperature=1.1,  # Controls creativity
        top_k=150,  # Limits the model's choices
        top_p=1.0,  # Nucleus sampling
        do_sample=True,  # Enables sampling
    )

    # Decode and return the generated text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=False)
    return generated_text


In [19]:
# Exemple de génération
genre = "french hip hop"
artist = "GUIZMO"
release_year = 2018
lyrics = generate_lyrics(artist=artist, release_year=release_year)
print(lyrics)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Ecris une chanson parlant d'amour et de tristesse.Basé sur ces différentes informations: Artiste: GUIZMO, Année de sortie: 2018, Artiste le P et R Skrt, mon histoire à mon histoire Pèse à mon histoire à l'argent j'enfume Mais tu veux qu'ces gens au bloc dans la merde Y a ce que vient voir fort à la consanguée han qui vous faver les grosses bleh Ne me doit pas pas changer connu prêt Faut ces portes et faire un joint pauvreté, remplace leurs paroles qui mécroment sur la cité Mais pas tout fait gratter des avaleurs, ma plume et détruit la fainte Jeunesse à mal souhaïse à nous ont des disques Que tu te délèves ou je mets où plus de quelques nuit, et je met pour une roupée C'est pour l'meilleur qu'entretenu, quoi on tenter à fond dans un rap à la télé À taïssage dans les corps qui nous éloignent en amour un jour toute façon Dans un


In [15]:
# Optional: Collapsible Section
html_code = f"""
<details>
    <summary style="font-size:16px; font-weight:bold; color:#0056b3;">Click to view details</summary>
    <p>{lyrics}</p>
</details>
"""
display(HTML(html_code))

In [6]:
get_input_text(genre="Rap", artist="Damso", release_year=2011)

'Generate lyrics. Based on the following information: Artist: Damso, Genre: Rap, Release_Year: 2011'