# Punjabi Song Generation

We are going to create a chatbot that generates Punjabi songs across different genres in the writing style of the late Sidhu Moose Wala.
<br>
### Step 1: Understanding the Basics of the concepts used in the project
Before we dive into the project, let's cover some fundamental concepts:
<ul>
<li>Language Models (LMs): These are AI models trained on large amounts of text data to understand and generate human-like text.
<li>Transformers: A type of neural network architecture that's particularly effective for natural language processing tasks.
<li>Fine-tuning: The process of further training a pre-trained model on a specific dataset to adapt it to a particular task.
</ul>

#### Import Libraries
Install the required libraries from the `requirements.txt` file (`pip install -r requirements.txt`).

In [1]:
# Basic Python Libraries
import random
import time

# Data
import numpy as np
import pandas as pd

# ML
import torch
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, Dataset
from peft import get_peft_model, LoraConfig, TaskType

# evaluation
from rouge_score import rouge_scorer

# app creation
from flask import Flask, request, jsonify

# selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options

  from .autonotebook import tqdm as notebook_tqdm


### Step 2: Selecting an Initial Model

Finding a suitable model is tricky because resources are limited: I am doing this project on an Apple M2 MacBook. The cells below load `gpt2` while a more suitable model is found.

In [2]:
import os

# Exported before the torch import below, in the same order as the original
# cell. NOTE(review): presumably lets operators without an MPS kernel fall
# back to the CPU - confirm against the PyTorch MPS notes.
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'

import torch

# Use the "mps" backend when PyTorch reports it as available, else the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

Using device: mps


In [None]:
# Load the base causal-LM checkpoint together with its matching tokenizer.

model_name = "gpt2" # have to find a suitable model because of limited resources
tokenizer = AutoTokenizer.from_pretrained(model_name)

# GPT-2 ships without a padding token, so any padded tokenizer call (the
# dataset preparation pads to max_length) and the language-modeling data
# collator would fail. Reusing EOS as the pad token is the usual workaround
# for causal LMs; tokenizers that already define a pad token are untouched.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

### Step 3: Data Collection and Preparation

We create a Selenium bot to scrape data from a public lyrics website.
<br>
Let's start by scraping the links of the songs and then we can focus on collecting the lyrics

In [None]:
# Collect the title and URL of every song listed on the artist page and
# store them in a CSV file that the lyrics-scraping cell reads back.

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument("--disable-popup-blocking")

# Initialize the WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# The target website
url = "https://www.azlyrics.com/s/sidhumoosewala.html"

# try/finally guarantees the headless Chrome process is shut down even when
# a wait times out or a locator no longer matches the page.
try:
    driver.get(url)

    # Wait for the "sort by song" control to be clickable
    sort_by_song_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, "/html/body/div[2]/div[2]/div[2]/a[2]"))
    )
    # Scroll to the element
    driver.execute_script("arguments[0].scrollIntoView();", sort_by_song_button)

    # Only the click itself can be intercepted, so only the click is guarded.
    try:
        sort_by_song_button.click()
    except ElementClickInterceptedException:
        # Handle overlay or pop-up: close it, then retry the click once
        close_button = driver.find_element(By.XPATH, "/html/body/div[5]/tonefuse-ad/div[2]/div[1]")
        close_button.click()
        sort_by_song_button.click()

    # Wait for the page to re-load after sorting
    time.sleep(2)

    # Find all song links
    song_links = driver.find_elements(By.CSS_SELECTOR, "div.listalbum-item a")

    # Read text and href while the browser session is still alive
    songs_data = [
        {"title": link.text, "url": link.get_attribute("href")}
        for link in song_links
    ]
finally:
    # Close the browser
    driver.quit()

# Create a DataFrame and save to CSV. The explicit columns keep a header row
# even when nothing was found, and the file name is the one the
# lyrics-scraping cell opens with pd.read_csv.
df = pd.DataFrame(songs_data, columns=["title", "url"])
df.to_csv("sidhu_moose_wala_song_links.csv", index=False)

print(f"Collected {len(songs_data)} song links.")

In [None]:
import logging

# Visit every song URL collected earlier, extract the lyrics text and store
# one record per song in a CSV file.

# Setup logging
logging.basicConfig(level=logging.INFO)

# Read the CSV file with song links first, so a missing or broken file fails
# before a browser process has been started.
df = pd.read_csv("sidhu_moose_wala_song_links.csv")

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument("--disable-popup-blocking")

# Initialize the WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
driver.set_page_load_timeout(30)  # Increased timeout to 30 seconds

# List to store scraped data
songs_data = []

# Number of tries per song before it is skipped
max_attempts = 3

# try/finally guarantees the browser is closed even if an unexpected
# WebDriver error aborts the loop.
try:
    for song_title, song_url in zip(df['title'], df['url']):
        logging.info("Scraping: %s", song_title)

        for attempt in range(1, max_attempts + 1):
            try:
                driver.get(song_url)

                # Wait for the lyrics to load
                lyrics_div = WebDriverWait(driver, 30).until(
                    EC.presence_of_element_located((By.XPATH, "/html/body/div[2]/div[2]/div[2]/div[5]"))
                )
            except (TimeoutException, NoSuchElementException) as e:
                logging.error("Error scraping %s on attempt %d: %s", song_title, attempt, e)
                if attempt < max_attempts:
                    time.sleep(5)  # Wait for 5 seconds before retrying
                continue

            # Extract lyrics
            lyrics = lyrics_div.text.strip()

            if lyrics:
                songs_data.append({
                    "title": song_title,
                    "artist": "Sidhu Moose Wala",
                    "lyrics": lyrics,
                    "genre": "Punjabi",
                    "url": song_url
                })
                logging.info("Successfully scraped lyrics for %s", song_title)
            else:
                logging.warning("Could not find lyrics for %s", song_title)
            break  # The page loaded; retrying would not change the result
        else:
            # Every attempt raised: record the skip instead of dropping the
            # song silently.
            logging.error("Giving up on %s after %d attempts", song_title, max_attempts)

        # Random delay between 2 to 5 seconds
        time.sleep(random.uniform(2, 5))
finally:
    # Close the browser
    driver.quit()

# Create a DataFrame and save to CSV. Explicit columns keep the schema stable
# even when no song could be scraped.
df_lyrics = pd.DataFrame(songs_data, columns=["title", "artist", "lyrics", "genre", "url"])
df_lyrics.to_csv("sidhu_moose_wala_songs_with_lyrics.csv", index=False)

logging.info("Scraped lyrics for %d songs.", len(songs_data))


In [None]:
# Display the scraped lyrics table as the cell output.
df_lyrics

Here are the next steps we can take:
<ol>

<li>Data Preprocessing
<li>Tokenization
<li>Model Selection and Fine-tuning
<li>Implementing Retrieval-Augmented Generation (RAG)
<li>Building the Generation Pipeline
</ol>

In [None]:
import re

# Compiled once because clean_lyrics is applied to every row of the dataset.
_SQUARE_TAG_RE = re.compile(r'\[.*?\]')
_PARENTHESES_RE = re.compile(r'\(.*?\)')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_lyrics(text):
    """Return *text* stripped of annotations and normalised to one line.

    Steps, in order:
      1. remove ``[tags]`` (section markers / annotations),
      2. remove ``(parenthesised)`` content such as ad-libs,
      3. drop non-printable characters that are not whitespace,
      4. collapse every run of whitespace (newlines included) to one space
         and trim both ends.

    Args:
        text: Raw lyrics. Anything that is not a ``str`` (for example the
            NaN pandas uses for a missing cell) is treated as missing.

    Returns:
        The cleaned lyrics as a single-line string; ``''`` for missing input.
    """
    if not isinstance(text, str):
        return ''

    text = _SQUARE_TAG_RE.sub('', text)
    text = _PARENTHESES_RE.sub('', text)

    # Filter non-printables BEFORE collapsing whitespace: removing e.g. a
    # zero-width space afterwards would leave doubled or leading spaces
    # behind. Whitespace is kept here (str.isprintable() is False for
    # newlines) so that line breaks still separate words in the next step.
    text = ''.join(char for char in text if char.isprintable() or char.isspace())

    return _WHITESPACE_RE.sub(' ', text).strip()

# Run clean_lyrics over every raw lyric and keep the result in a new column
# next to the original text.
raw_lyrics = df_lyrics['lyrics']
df_lyrics['cleaned_lyrics'] = raw_lyrics.map(clean_lyrics)

In [None]:
# Show the raw (uncleaned) lyrics stored under index label 0.
df_lyrics.lyrics[0]

In [None]:
# Work on an independent (deep) copy so later columns are not added to
# df_lyrics itself, then preview the first rows as the cell output.
df1 = df_lyrics.copy(deep=True)
df1.head()

In [None]:
# Quick size summary: number of songs and mean cleaned-lyrics length.
song_count = len(df1)
mean_char_length = df1['cleaned_lyrics'].str.len().mean()

print(f"Total songs: {song_count}")
print(f"Average lyrics length: {mean_char_length:.2f} characters")

Let's proceed with tokenization

In [None]:
def tokenize_lyrics(text):
    """Encode one cleaned lyric string into token ids.

    Uses the notebook-level ``tokenizer``; sequences are truncated to at most
    512 tokens.
    """
    token_ids = tokenizer.encode(text, truncation=True, max_length=512)
    return token_ids

# apply tokenization
df1['tokenized_lyrics'] = df1['cleaned_lyrics'].map(tokenize_lyrics)

# Per-song token counts, used for the average below
token_counts = df1['tokenized_lyrics'].map(len)
print(f"Average token length: {token_counts.mean():.2f}")

print("\nSample of tokenized lyrics:")
first_token_ids = df1['tokenized_lyrics'].iloc[0]
print(first_token_ids)

In [None]:
# Save the data
# CSV copy of the full table, written without the index column.
# NOTE(review): the token-id lists presumably end up as plain text in the CSV;
# use the pickle below to get them back as lists - confirm.
df1.to_csv("cleaned_sidhu_moose_wala_songs.csv", index=False)
# Pickle copy; this is the file the fine-tuning section reloads with pd.read_pickle.
df1.to_pickle("tokenized_sidhu_moose_wala_songs.pkl")

### Step 4: Fine-Tuning model

As I am doing this project on an Apple M2 chip, it is better to use PEFT rather than full fine-tuning.

Using Parameter-Efficient Fine-Tuning (PEFT) techniques like LoRA (Low-Rank Adaptation) is indeed a better approach for our task. It's more efficient in terms of computational resources and storage, and it can often lead to better results, especially with smaller datasets.

In [None]:
# load the dataset from the pickle file (if needed, otherwise use df1)
# NOTE: unpickling can execute arbitrary code; only load a pickle file that
# this notebook wrote itself, never one from an untrusted source.
df2 = pd.read_pickle('tokenized_sidhu_moose_wala_songs.pkl')
# Preview the first rows as the cell output.
df2.head()

Let's put the dataset into the proper format for training.

In [None]:
# Keep only the two lyric columns and wrap them in a datasets.Dataset.
df2 = df2[['cleaned_lyrics', 'tokenized_lyrics']]
dataset = Dataset.from_pandas(df2)


def prepare_train_features(examples):
    """Tokenize a batch of cleaned lyrics for causal language modeling.

    Every example is truncated/padded to exactly 512 tokens, and ``labels``
    is set to a copy of ``input_ids``.
    """
    encoded = tokenizer(
        examples['cleaned_lyrics'],
        truncation=True,
        padding='max_length',
        max_length=512,
    )
    encoded['labels'] = encoded['input_ids'].copy()
    return encoded


# Tokenize in batches and drop the source columns so that only the
# tokenizer outputs and the labels remain.
dataset = dataset.map(
    prepare_train_features,
    batched=True,
    remove_columns=['cleaned_lyrics', 'tokenized_lyrics'],
)
dataset

In [None]:
# define the lora config
# target_modules is deliberately left unset: 'query'/'value' are BERT-style
# layer names that do not exist in the GPT-2 checkpoint loaded above, so PEFT
# would fail with "target modules not found". Without it, PEFT picks its
# built-in default for the model architecture (e.g. 'c_attn' for GPT-2,
# 'query'/'value' for BERT), which keeps this cell valid if model_name changes.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,  # LoRA rank; 8 is the library default, spelled out for clarity
    lora_alpha=32,
    lora_dropout=0.1,
)

# wrap the model with LoRA
model = get_peft_model(model, peft_config=peft_config)

# set up data collator (mlm=False selects the causal-LM objective)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

By using a data collator, we simplify our data preparation process and ensure that our model receives properly formatted input during training, which is crucial for effective learning and generation of song lyrics.

In [None]:
# Report the PyTorch build and whether the MPS backend is usable.
environment_report = (
    ("PyTorch version", torch.__version__),
    ("MPS available", torch.backends.mps.is_available()),
    ("MPS built", torch.backends.mps.is_built()),
)
for label, value in environment_report:
    print(f"{label}: {value}")

In [None]:
# Set up training arguments
# i've used chatgpt to set up initial training params based on apple M2 chip

# The arguments below enable step-based evaluation and load_best_model_at_end,
# both of which need an evaluation set; without one the Trainer raises.
# Hold out 10% of the songs (fixed seed for reproducibility) for that purpose.
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=4,  # Reduced from 8 to 4 to avoid potential memory issues
    gradient_accumulation_steps=8,  # Increased from 4 to 8 to maintain effective batch size
    save_steps=50,  # Reduced from 500 to 50 because of smaller dataset
    save_total_limit=2,
    learning_rate=1e-4,  # Slightly increased from 5e-5 for potentially faster learning
    warmup_steps=10,  # Reduced from 100 due to smaller dataset
    logging_steps=10,  # Reduced from 50 for more frequent updates
    evaluation_strategy="steps",
    eval_steps=50,  # Reduced from 500 to match save_steps
    load_best_model_at_end=True,
    optim="adamw_torch",
    fp16=False,  # MPS doesn't support fp16
    bf16=False,  # MPS doesn't support bf16
    report_to="none",
    # NOTE(review): in recent transformers releases no_cuda appears to be a
    # deprecated alias of use_cpu, which would force CPU training and bypass
    # MPS - confirm for the installed version and drop it if MPS is intended.
    no_cuda=True,  # This ensures CUDA is not used
    weight_decay=0.01,  # Added weight decay for regularization
    max_grad_norm=1.0,  # Gradient clipping to prevent exploding gradients
    lr_scheduler_type="cosine",  # Cosine learning rate scheduler for potentially better convergence
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Start training
trainer.train()

# Save the LoRA adapter
model.save_pretrained("./sidhu_lyrics_lora_adapter")

#### create a generation pipeline

In [None]:
from peft import PeftConfig
from transformers import pipeline

# Directory written by model.save_pretrained(...) after training
adapter_dir = "sidhu_lyrics_lora_adapter"

song_model = AutoModelForCausalLM.from_pretrained(adapter_dir)

# The tokenizer has to be the one the adapter's base model was trained with.
# A tokenizer from a different checkpoint produces token ids that mean
# something else (or do not exist) in this model's vocabulary, which yields
# meaningless output. The base model name is read from the adapter config so
# the two can never drift apart.
base_model_name = PeftConfig.from_pretrained(adapter_dir).base_model_name_or_path
song_tokenizer = AutoTokenizer.from_pretrained(base_model_name)

generator = pipeline("text-generation", model=song_model, tokenizer=song_tokenizer, device=device)

def generate_lyrics(prompt):
    """Generate lyrics that continue *prompt* using the fine-tuned model.

    Args:
        prompt: Text the model should continue (a title, a first line, or a
            one-shot example followed by a new title).

    Returns:
        The generated text of the single returned sequence.
    """
    generated = generator(
        prompt,
        max_length=1000,
        num_return_sequences=1,
        # Sampling settings: nucleus/top-k sampling at a moderate temperature
        temperature=0.7,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        # Discourage the model from looping on the same phrases
        no_repeat_ngram_size=3,
        repetition_penalty=1.2
    )
    return generated[0]['generated_text']

In [None]:
# Test with a sample prompt
# Zero-shot check: only a short phrase is given as the prompt.
prompt = "sardari di ae gal"
generated_lyrics = generate_lyrics(prompt)

print(generated_lyrics)

This output does not make any sense.

Let's try prompt engineering with one-shot inference.

In [None]:
# Example usage with one-shot prompting
# The prompt holds one complete "Title: ... / Lyrics: ..." example followed by
# a new title with an empty "Lyrics:" field for the model to continue.
example_prompt = """
Title: 8 Cylinder
Lyrics:Engine 8 cylinder da
Dharti patda phire mutiyare

Sidhu Moose Wala!

68 model Shelby ni
Tere layi billo ajj chamkati
Hydraulic pava de ni
Infinity di bass rakhati
Ho kala rang hai sapp warga (haye haye)
Kala rang hai sapp warga
Ho kive dekh maare lishkare

Engine 8 cylinder da
Dharti patda phire mutiyare
Haan karde gabru nu
Tere magar gehdiyan maare

Zor la leya duniya ne
Haye jatt rokeya kade na rukda
Moosa pind support kare
Te meri pitth te Brampton pugda
Baaki puch layi lokan to
Baaki puch layi lokan to
Tu Sidhu Moose Wale baare

Engine 8 cylinder da
Dharti patda phire mutiyare
Haan karde gabru nu

Title: sardari di ae gal
Lyrics:"""

# Generate a continuation of the one-shot prompt and print it.
generated_lyrics = generate_lyrics(example_prompt)
print(generated_lyrics)