# LSTM for lyric classification Poopy vs Bodo

In [1]:
import pandas as pd
import numpy as np

In [None]:
dataset = pd.read_csv('./bodo_poopy.csv')

## Data preprocessing

In [5]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,Label,text
0,0,bodo,Sasa-miandry azy eo ianao\r\nAngamba izy efa l...
1,1,bodo,"Afaka ny Tahotro,\nVoafidy ankehitriny\nTsy mb..."
2,2,bodo,Ny fitiavantsika roa\r\nKanto tokoa\r\nNofinof...
3,3,bodo,Ambarambarao amin'ny dada\nAmbarambarao amin'n...
4,4,bodo,Dia kotsan-dranomaso indray\r\nNy tavako andra...


In [6]:
dataset.shape

(156, 3)

In [7]:
print("The labels inside are: ", dataset['Label'].unique())

The labels inside are:  ['bodo' 'poopy']


In [8]:
# looking for empty or null row
print(dataset.isnull().sum())

Unnamed: 0    0
Label         0
text          0
dtype: int64


In [9]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  156 non-null    int64 
 1   Label       156 non-null    object
 2   text        156 non-null    object
dtypes: int64(1), object(2)
memory usage: 3.8+ KB


The data does not show any empty or irrelevant element

## Tokenization

In [10]:
!pip install torchtext sentencepiece


Collecting torchtext
  Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.9 kB)
Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchtext
Successfully installed torchtext-0.18.0


Because we are using malagasy-based dataset, for sure, there are very few of pre-trained word embeddings, this is why we are to use the method of "sentencePiece"

In [11]:
import sentencepiece as spm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
#from torchtext.data.utils import get_tokenizer
#from torchtext.vocab import FastText
from torch.nn.utils.rnn import pad_sequence
import re
from tqdm import tqdm

In [24]:
# splitting the dataset into train and test and only using the train dataset to even build the vocabulary
X = dataset['text'].tolist()
y = dataset['Label'].tolist()
X_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Simple text cleaning function
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

# Apply the cleaning function to the 'text' column in your dataframe
dataset['cleaned_text'] = dataset['text'].apply(clean_text)

# Show the cleaned text
print(dataset[['text', 'cleaned_text']].head())

                                                text  \
0  Sasa-miandry azy eo ianao\r\nAngamba izy efa l...   
1  Afaka ny Tahotro,\nVoafidy ankehitriny\nTsy mb...   
2  Ny fitiavantsika roa\r\nKanto tokoa\r\nNofinof...   
3  Ambarambarao amin'ny dada\nAmbarambarao amin'n...   
4  Dia kotsan-dranomaso indray\r\nNy tavako andra...   

                                        cleaned_text  
0  sasamiandry azy eo ianao\r\nangamba izy efa la...  
1  afaka ny tahotro\nvoafidy ankehitriny\ntsy mba...  
2  ny fitiavantsika roa\r\nkanto tokoa\r\nnofinof...  
3  ambarambarao aminny dada\nambarambarao aminny ...  
4  dia kotsandranomaso indray\r\nny tavako andrao...  


In [13]:
# Save the cleaned text to a temporary file
with open('corpus.txt', 'w', encoding='utf-8') as f:
    for text in dataset['cleaned_text']:
        f.write(text + '\n')


In [16]:
# Train the SentencePiece model
spm.SentencePieceTrainer.train(input='corpus.txt', model_prefix='mymodel', vocab_size=1500, character_coverage=0.9995)

# The two files created: mymodel.model and mymodel.vocab


We are to use this model for tokenization of the training and the test data with LSTM

In [20]:
# Load the trained SentencePiece model
sp = spm.SentencePieceProcessor(model_file='mymodel.model')

# Tokenize the cleaned text from the DataFrame
dataset['tokenized_text'] = dataset['cleaned_text'].apply(lambda x: sp.encode_as_ids(x))

# Display the tokenized text
print(dataset[['cleaned_text', 'tokenized_text']].head())


                                        cleaned_text  \
0  sasamiandry azy eo ianao\r\nangamba izy efa la...   
1  afaka ny tahotro\nvoafidy ankehitriny\ntsy mba...   
2  ny fitiavantsika roa\r\nkanto tokoa\r\nnofinof...   
3  ambarambarao aminny dada\nambarambarao aminny ...   
4  dia kotsandranomaso indray\r\nny tavako andrao...   

                                      tokenized_text  
0  [1324, 1404, 6, 302, 3, 78, 14, 209, 173, 67, ...  
1  [714, 3, 4, 1217, 8, 858, 663, 6, 868, 3, 5, 2...  
2  [3, 4, 1292, 169, 1192, 106, 10, 1066, 330, 28...  
3  [498, 1060, 8, 92, 4, 266, 498, 1060, 8, 92, 4...  
4  [19, 3, 20, 99, 103, 294, 1397, 98, 3, 4, 978,...  


Here, we are to padd for the sequences to have thhe same length

In [22]:

# Pad the sequences to make sure they all have the same length
tokenized_data = [torch.tensor(seq) for seq in dataset['tokenized_text']]
padded_data = pad_sequence(tokenized_data, batch_first=True, padding_value=0)  # Padding value can be 0

# Print padded sequences
print(padded_data)


tensor([[1324, 1404,    6,  ...,    0,    0,    0],
        [ 714,    3,    4,  ...,    0,    0,    0],
        [   3,    4, 1292,  ...,    0,    0,    0],
        ...,
        [ 857,  470,  493,  ...,    0,    0,    0],
        [ 282,  219,  738,  ...,    0,    0,    0],
        [  24,  561,   15,  ...,    0,    0,    0]])


In [26]:
#label one-hot encodeer
from sklearn.preprocessing import LabelEncoder

# Initialize the encoder
label_encoder = LabelEncoder()

# Encode the labels
encoded_labels = label_encoder.fit_transform(dataset['Label'])

# Convert to a PyTorch tensor
labels = torch.tensor(encoded_labels)

# Print the shape and unique values
print(f"Encoded labels: {labels}, Shape: {labels.shape}")


Encoded labels: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), Shape: torch.Size([156])


In [27]:
# Split the data
X_train, X_val, y_train, y_val = train_test_split(
    padded_data,
    labels,
    test_size=0.2,
    random_state=42,

)

In [38]:
X_train.size()

torch.Size([124, 419])

In [39]:
X_val.size()

torch.Size([32, 419])

In [28]:
class LyricsDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]


In [30]:
# Create datasets
train_dataset = LyricsDataset(X_train, y_train)
val_dataset = LyricsDataset(X_val, y_val)


In [31]:
# Define batch size
batch_size = 32

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [32]:
def calculate_accuracy(outputs, labels):
    # Get predicted class by finding the max logit
    _, predicted = torch.max(outputs, dim=1)
    correct = (predicted == labels).sum().item()
    accuracy = correct / labels.size(0)
    return accuracy

In [45]:
# Example LSTM model
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(input_size, 128)  # Input vocab size
        self.lstm = nn.LSTM(128, hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x)  # Embedding layer
        _, (hidden, _) = self.lstm(x)  # Use the hidden state from LSTM
        out = self.fc(hidden[-1])     # Pass through a fully connected layer
        return out

# Hyperparameters
input_size = 1500  # Vocabulary size from SentencePiece
hidden_size = 128
output_size = len(set(labels.tolist()))  # Number of unique labels (classes)
batch_size = 32
learning_rate = 0.001
num_epochs = 10
num_layers = 3

# Initialize model, loss function, and optimizer
model = LSTMClassifier(input_size, hidden_size, output_size, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    train_accuracy = 0.0

    # Training

    for batch_features, batch_labels in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_features)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        # Track training loss and accuracy
        train_loss += loss.item()
        train_accuracy += calculate_accuracy(outputs, batch_labels)

    # Average loss and accuracy for training
    train_loss /= len(train_loader)
    train_accuracy /= len(train_loader)

    # Validation
    model.eval()
    val_loss = 0.0
    val_accuracy = 0.0
    with torch.no_grad():
        for val_features, val_labels in val_loader:
            val_outputs = model(val_features)
            val_loss += criterion(val_outputs, val_labels).item()
            val_accuracy += calculate_accuracy(val_outputs, val_labels)

    # Average loss and accuracy for validation
    val_loss /= len(val_loader)
    val_accuracy /= len(val_loader)

    # Print metrics for this epoch
    print(f"Epoch {epoch+1}/{num_epochs}, "
          f"Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f}, "
          f"Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}")


Epoch 1/10, Train Loss: 0.6883, Train Acc: 0.5391, Val Loss: 0.6731, Val Acc: 0.6250
Epoch 2/10, Train Loss: 0.6850, Train Acc: 0.5792, Val Loss: 0.6577, Val Acc: 0.6250


KeyboardInterrupt: 