In [None]:
# before Running the notebook you will need to install the following 
# Remove the '#' from the code 


# pip install numpy==1.24
# pip install beautifulsoup4==4.11.1
# pip install selenium==4.9.0
# pip install requests==2.28.1
# pip install webdriver-manager==3.8.5
# pip install torch==1.13.0

In [3]:
import requests
import numpy as np
from bs4 import BeautifulSoup

import typing as t
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.microsoft import EdgeChromiumDriverManager

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

### Crawler

- Initiate Crawler that Take all song from daddy yankee

#### Crawler

- Scrapper class that crawls and extract all information from the website

In [14]:
class Crawler:

    """ Music Crawler
    
    #### Arguments:
        - url : main url
        - save_at : path where information will be saved"""

    def __init__(self, url: str) -> None:
        self.url = url
        self.options = self._config_options()
        self.all_songs = []
        self.lyrics_string = ""

    def _init_webdriver(self) -> webdriver:
        """ Initiate WebDriver Edge"""
        driver = webdriver.Edge(
            service=Service(EdgeChromiumDriverManager().install()),
            options=self.options,
        )
        return driver
    
    def _config_options(self) -> dict:
        """ Set up driver options"""
        self.options = Options()
        self.options.add_experimental_option("detach", True)
        return self.options
    

    def _extract_all_songs(self, url : str) -> t.Union[str, bool]:
        # Inititate browser
        brw = self._init_webdriver()
        brw.get(url)

        # Take all songs 
        list_of_songs = brw.find_elements(By.XPATH, "//ul[@class='songList-table-content js-song-list']")
        for song in list_of_songs:
            given_song = song.find_elements(By.TAG_NAME, "li")
            for link in given_song:
                self.all_songs.append(link.get_attribute("data-shareurl"))

        # Close Browser
        brw.close()
        
        # Check we have links
        if self.all_songs:
            return self.all_songs
        return False
    
    def _init_beautifulsoup (self, list_of_song: t.List[str]) -> str:
        """ Use Beautiful Soup to Get Lyrics"""

        # Loop Thorugh Each song
        for song_ in list_of_song:
            # Iniate BSoup
            web = requests.get(song_)
            if web.status_code == 200:
                bs = BeautifulSoup(web.content, "html.parser")

                # Extract Song
                for lyric in bs.find_all("div", attrs={'class':'lyric-original'}):
                    for line in lyric.find_all("p"):
                        for tag in line.contents:
                            if tag.text == "":
                                continue
                            else:
                                self.lyrics_string += f"{tag.text.lower().strip()} " 
                    self.lyrics_string += "\n" 

        return self.lyrics_string

    def StartCrawling(self):
        """ Start Crawling"""

        # List of Songs
        list_of_song = self._extract_all_songs(self.url)
        if list_of_song:
            self.lyrics_string = self._init_beautifulsoup(list_of_song)
        else:
            raise ValueError ("Error at Crawling all Lyrics")


        # Save files
        file_save_at = "daddy_yankee_song.txt"
        with open (file_save_at, "w", encoding="utf-8") as FiletoSave:
            FiletoSave.write(self.lyrics_string)

        print("-------- Done ---------")


In [15]:
url = "https://www.letras.com/daddy-yankee/"
test = Crawler(url).StartCrawling()

-------- Done ---------


### Tokenization 

- There are two types of tokenization 
1. letter by letter : which aims to predit the following word for example : hello how are you | space | r |  ?
2. Word by word: hello how -> are you | you doing ?

but of course the example are related to the song 

Remove Inncesary Tokens to speed up the training process

####

In [40]:
# Append specially tokens into list 
EOS, UNK, PAD = "<EOS>","<UNK>","<PAD>"

# All Spanish Character
all_letters_in_spanish = """
áéèíóúüabcdefghijklmnñopqrstuvwxyz0123456789 -,;.[¡¿!?]:“'’’’/\|_@#$%ˆ&*˜‘+-=()[]{}"'ÁÉÍÓÚABCDEFGHIJKLMNÑOPQRSTUVWXYZ
""" 

# Define Chars and tranforme them into list
chars = list(sorted(set(all_letters_in_spanish)))
for spc_token in [EOS, UNK, PAD]:
    # Speciall Character for Sequences Creation
    if spc_token == "<PAD>":
        chars.insert(0, spc_token)
    chars.append(spc_token)

# Creating Unique idx for each character
encoder = {k: v for v, k in enumerate(chars)}
decoder =  np.array(chars)

# char_index
def char_idx(c : str):
    if c in chars:
        return encoder[c]
    return encoder[UNK]

- Settings : all Settings are define in here

####

In [9]:
STRING_ENCODING = "utf-8"
MAX_LEN = 75  # Adjust to the maxium number of word per music
PATH = "./daddy_yankee_song.txt" # Path to the file containing the data 

- Processing letter by letter

####

In [10]:
Letterbyletter = list()
with open(PATH, "r", encoding=STRING_ENCODING) as File2Process:
    songs_ = File2Process.read().split("\n")
    # Looping Line By Line
    for line in songs_:
        single_line = line.strip()

        # Convert and pass into Decoder
        char_transformed = [char_idx(c) for c in single_line[:-1]]
        if len(char_transformed) >= MAX_LEN:
            char_transformed = char_transformed[0:MAX_LEN-1]
            char_transformed.append(encoder[EOS])
        else:
            char_transformed.append(encoder[EOS])
            remain = MAX_LEN - len(char_transformed)
            if remain > 0:
                char_transformed.extend([encoder[PAD]] * remain)
        Letterbyletter.append(char_transformed)

print("**** Data file loaded ****")

**** Data file loaded ****


In [11]:
letter_string = ""
print(Letterbyletter[0][:5])
for i in Letterbyletter[0]:
    letter_string += f"{decoder[i]} "

print(letter_string)

[65, 77, 75, 71, 76]
c o m i n g   f r o m   t h e   s h a d o w   o f   t h e   i s l a n d !   w e   a r e   w i n c h e s t e r   y a n k e e   a n d   n a s   e s c <EOS> 


- Processing Word by Word

####

In [12]:
def word_by_word_processing (lyric:t.List[str]):
    """ Tokenize word by word"""

    # Concate all songs
    all_songs = " ".join(song.strip() for song in lyric)

    # Cleaning Corpus
    corpus = set(all_songs.split(" "))

    # creating decoder 
    word_decoder = {v:num for num, v in enumerate(list(corpus) + [EOS, UNK, PAD], 0)}

    return word_decoder

def word_to_idx(word:str, tokenizer:dict):
    """ map words"""
    if tokenizer.get(word, UNK):
        return tokenizer[word]
    return tokenizer[UNK]

    

In [13]:
WordProcess = list()
with open(PATH, "r", encoding=STRING_ENCODING) as File2Process:
    # Creating Corpus
    songs_ = File2Process.read().split("\n")

    # Create decoder
    WordEncoder = word_by_word_processing(songs_)
    WordDecoder = {v:k for k, v in WordEncoder.items()}

    # Looping Line By Line
    for line in songs_:
        single_line = line.strip()

        # Convert and pass into Decoder
        char_transformed = [word_to_idx(c,WordEncoder) for c in single_line.split(" ")[:-1]]
        if len(char_transformed) >= MAX_LEN:
            char_transformed = char_transformed[0:MAX_LEN-1]
            char_transformed.append(WordEncoder[EOS])
        else:
            char_transformed.append(WordEncoder[EOS])
            remain = MAX_LEN - len(char_transformed)
            if remain > 0:
                char_transformed.extend([WordEncoder[PAD]] * remain)
        WordProcess.append(char_transformed)


In [14]:
print(WordProcess[0])
example = ""
for i in WordProcess[0]:
    example += f"{WordDecoder[i]} "

print(example)

[6285, 3390, 5095, 10289, 10000, 5095, 10580, 15298, 7850, 8150, 5302, 14966, 15585, 10127, 7332, 2722, 15731, 198, 770, 2690, 16761, 18633, 18782, 9621, 18858, 9204, 12268, 1474, 1418, 13782, 14418, 2005, 7968, 18633, 13782, 9076, 3163, 9204, 16243, 19869, 9246, 8150, 18984, 962, 5481, 905, 2811, 8259, 9506, 9202, 2313, 5907, 16145, 12899, 9204, 1476, 1474, 6226, 20521, 14367, 9626, 18569, 13990, 14367, 2324, 1474, 14367, 8739, 4545, 9506, 8208, 4966, 340, 11424, 20671]
coming from the shadow of the island! we are winchester yankee and nas escobar! pablo, what's up, pa'? treinta-treinta, 70mm metras es letal, violenta alimenta el mental de toda mi gente completa fundamenta es mi letra, representa el instrumental rápida lenta, winchester inventa en los noventa líricas respuestas para preguntas que no contestan como el misterio de cuando suenen la trompeta 666, será la marca de la bestia directo para tu frente o a <EOS> 


### Last Transformation - Numpy into `Torch Tensors` 



- Classes and Methods for the Architecture - Letter by Letter

####

In [15]:
# Converting to numpy
LetterNumpyData = torch.tensor(Letterbyletter)

# Training and Testing
np_data_in = LetterNumpyData[:, :-1]
np_data_out = LetterNumpyData[:, 1:]

# Creating TensorDataset
Letterdataset = TensorDataset(np_data_in, np_data_out)

# Creating DataLoader
BATCH_SIZE = 32
Letterdataloader = DataLoader(Letterdataset, batch_size=BATCH_SIZE, shuffle=True)


- Classes and Methods for the Architecture - word by word

####

In [16]:
# Converting to numpy
WordNumpyData = torch.tensor(WordProcess)

# Training and Testing
np_data_in = WordNumpyData[:, :-1]
np_data_out = WordNumpyData[:, 1:]

# Creating TensorDataset
Worddataset = TensorDataset(np_data_in, np_data_out)

# Creating DataLoader
BATCH_SIZE = 32
Worddataloader = DataLoader(Worddataset, batch_size=BATCH_SIZE, shuffle=True,drop_last=True)

### Module Creation 

- Classes and Methods for the Architecture

In [29]:
# Model definition
class MyGenerator(nn.Module):
    def __init__(self,
                 vocab_size : int,
                 embedding_dim : int,
                 hidden_size : int):
        """ 
        ### Arguments:
            - vocab_size: total number of vocabulary
            - embedding_dim: embedding dimension for word transformation
            - hidden_size: number of hidden layers
        """
        super(MyGenerator, self).__init__()

        # Set and Initiate Layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.gru1 = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.gru2 = nn.LSTM(hidden_size, 560, batch_first=True)
        self.fc = nn.Linear(560, vocab_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """ Training Model
        ####
            Arguments:
                - x: Input
        #### 
            Returns:
                - prediction : torch.tensor
        """

        # Fitting embeddings
        embedding_ = self.embedding(x)
        before_layer_1 = self.dropout(embedding_)

        # First Gru layer
        gru_1_output, _ = self.gru1(before_layer_1)
        before_layer_2 = self.dropout(gru_1_output)

        # Second Gru layer
        gru_2_output, _ = self.gru2(before_layer_2)
        before_final_dense = self.dropout(gru_2_output)

        # Final Prediction
        prediction = self.fc(before_final_dense)

        return prediction


def generate_text(model, decoder , start_string, temperature=0.95, num_generate=5):
    model.eval()

    # Convert string to numbers
    input_eval = torch.tensor([encoder[s] for s in start_string], dtype=torch.long).unsqueeze(0)

    # Text generation
    text_generated = []

    with torch.no_grad():
        for _ in range(num_generate):
            predictions = model(input_eval)
            predictions = F.softmax(predictions / temperature, dim=-1)
            predicted_id = torch.multinomial(predictions[:, -1, :], 1).item()

            input_eval = torch.tensor([[predicted_id]], dtype=torch.long)
            text_generated.append(decoder[predicted_id])

    return start_string + ' '.join(text_generated)

- Settings for Model and Training

####

In [18]:
### Settings for Letter by letter
VOC_SIZE = len(chars) # all unique characters

### Setting for word by word 
VOC_SIZE_WORD = len(WordEncoder.keys())

### General Setting
EMB_DIM = 32 # Emb dimension
HIDDEN_SIZE = 1024 # Gru Unit in internal layer
EPOCHS = 20 # Number of Training Epochs
LR = 0.001  # Learning Rate

_______

### Training

 


- Training : letter by letter

##### 

In [19]:
# Initiate Model
modelletter = MyGenerator(
    vocab_size=VOC_SIZE,
    embedding_dim= EMB_DIM,
    hidden_size=HIDDEN_SIZE
)

# Loss and Optimization 
criteria = nn.CrossEntropyLoss()
optimizer = optim.Adam(modelletter.parameters(), lr=LR)

# Training 
for epoch in range(EPOCHS):
    # taking input, output from transformed data
    for inputs, targets in Letterdataloader:
        optimizer.zero_grad() # Zero gradients
        outputs = modelletter(inputs.long())

        # define loss
        loss = criteria(
            outputs.transpose(1, 2),
            targets.long())
        
        # back-propagination
        loss.backward()

        # Optimization
        optimizer.step()

    print(f'Epoch [{epoch+1}/{EPOCHS}], Loss: {loss.item()}')

print("***** End Training *****")

Epoch [1/20], Loss: 3.295713186264038
Epoch [2/20], Loss: 3.105656385421753
Epoch [3/20], Loss: 3.086557626724243
Epoch [4/20], Loss: 2.960405111312866
Epoch [5/20], Loss: 3.0316011905670166
Epoch [6/20], Loss: 2.8633739948272705
Epoch [7/20], Loss: 2.8617310523986816
Epoch [8/20], Loss: 2.9382972717285156
Epoch [9/20], Loss: 2.4017539024353027
Epoch [10/20], Loss: 2.606152296066284
Epoch [11/20], Loss: 2.546105146408081
Epoch [12/20], Loss: 2.608675718307495
Epoch [13/20], Loss: 2.5434482097625732
Epoch [14/20], Loss: 2.6174988746643066
Epoch [15/20], Loss: 2.4513144493103027
Epoch [16/20], Loss: 1.8473807573318481
Epoch [17/20], Loss: 2.334038734436035
Epoch [18/20], Loss: 2.1861321926116943
Epoch [19/20], Loss: 2.086322784423828
Epoch [20/20], Loss: 2.0540640354156494
***** End Training *****


- Training : word by word

##### 

In [20]:
# Initiate Model
modelword = MyGenerator(
    vocab_size=VOC_SIZE_WORD,
    embedding_dim= EMB_DIM,
    hidden_size=HIDDEN_SIZE
)

# Loss and Optimization 
criteria = nn.CrossEntropyLoss()
optimizer = optim.Adam(modelword.parameters(), lr=LR)

# Training 
for epoch in range(EPOCHS):
    # taking input, output from transformed data
    for inputs_, targets_ in Worddataloader:
        optimizer.zero_grad() # Zero gradients
        outputs_ = modelword(inputs_.long())

        # define loss
        loss = criteria(
            outputs_.transpose(1, 2),
            targets_.long())
        
        # back-propagination
        loss.backward()

        # Optimization
        optimizer.step()

    print(f'Epoch [{epoch+1}/{EPOCHS}], Loss: {loss.item()}')

print("***** End Training *****")

Epoch [1/20], Loss: 8.0880765914917
Epoch [2/20], Loss: 7.641803741455078
Epoch [3/20], Loss: 7.34825325012207
Epoch [4/20], Loss: 7.369215488433838
Epoch [5/20], Loss: 7.200517654418945
Epoch [6/20], Loss: 7.217112064361572
Epoch [7/20], Loss: 7.193727016448975
Epoch [8/20], Loss: 7.315526008605957
Epoch [9/20], Loss: 7.240156650543213
Epoch [10/20], Loss: 7.083902835845947
Epoch [11/20], Loss: 7.112813472747803
Epoch [12/20], Loss: 7.34207010269165
Epoch [13/20], Loss: 7.223349094390869
Epoch [14/20], Loss: 7.211241245269775
Epoch [15/20], Loss: 7.1130757331848145
Epoch [16/20], Loss: 6.990081787109375
Epoch [17/20], Loss: 7.140476703643799
Epoch [18/20], Loss: 7.065708637237549
Epoch [19/20], Loss: 7.0598955154418945
Epoch [20/20], Loss: 7.0329999923706055
***** End Training *****


-----------------------
### Comparing

- letter by letter prediction 

####

In [34]:
# Example usage
start_sequence = "Dale Mas gasolina "
generated_text = generate_text(modelletter, decoder, start_sequence, temperature=0.7, num_generate=100)
print(f"Trained on Letter by Letter: \n {generated_text}")

Trained on Letter by Letter: 
 Dale Mas gasolina e s o h e n o o h i n u n o h i e n o t u h e n o h a r ú u o h ú i s u e l l o h e s i s i h e l e s o o h o h a o c o h e l o h e l e l e l e s e   ¡ s i a m u a a h e u n o h o h e l e n o h o h é


- Word by Word prediction 

####

In [44]:
start_sequence = "a ella le gusta"
generated_text = generate_text(modelword, WordDecoder, start_sequence, temperature=0.5, num_generate=35)
print(f"Trained on Letter by Letter: \n {generated_text}")

Trained on Letter by Letter: 
 a ella le gustaque aunque tu discoteca c'mon miami hielo, bailalo... (what?!) si te ziggy, menes menor están vip... yankee ti-tírale) madre sabes ey, le los el cuestión me se te me oh, yankee, hey! siente hey! daddy
