### Using LSTM

In [36]:
import nltk
import numpy as np
import random
import torch
from torch.utils.data import DataLoader
from torch.nn import BCEWithLogitsLoss, Embedding, LSTM, Linear, Module
from torch.optim import Adam

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer


# Download and load the name datasets
nltk.download('names')
names_corpus = nltk.corpus.names
male_names = [name for name in names_corpus.words('male.txt')]
female_names = [name for name in names_corpus.words('female.txt')]

# Create labels for names
male_labels = [0] * len(male_names)  # 0 for male
female_labels = [1] * len(female_names)  # 1 for female

# Combine male and female names and labels
all_names = male_names + female_names
all_labels = male_labels + female_labels

# Shuffle the combined data
combined = list(zip(all_names, all_labels))
random.shuffle(combined)
all_names, all_labels = zip(*combined)

# Split data into training and testing sets
train_names, test_names, train_labels, test_labels = train_test_split(all_names, all_labels, test_size=0.2)

# Define tokenizer and preprocess data
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(all_names)
name_sequences = tokenizer.texts_to_sequences(all_names)
padded_names = pad_sequences(name_sequences, maxlen=10)

# Convert labels to numpy arrays and tensors
all_labels = np.array(all_labels)
data_tensor = torch.tensor(padded_names, dtype=torch.long)
labels_tensor = torch.tensor(all_labels, dtype=torch.float)

# Split the data into training and validation sets
train_data, val_data, train_labels, val_labels = train_test_split(data_tensor, labels_tensor, test_size=0.2, random_state=42)

# DataLoader setup
train_loader = DataLoader(list(zip(train_data, train_labels)), batch_size=32, shuffle=True)
val_loader = DataLoader(list(zip(val_data, val_labels)), batch_size=32, shuffle=False)

# Define the NameClassifier neural network model
class NameClassifier(Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(NameClassifier, self).__init__()
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.lstm = LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = Linear(hidden_dim, 1)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        out = self.fc(lstm_out[:, -1, :])
        return out

# Initialize the model and optimizer
vocab_size = len(tokenizer.word_index) + 1  # Including the padding token
model = NameClassifier(vocab_size=vocab_size, embedding_dim=100, hidden_dim=128)
optimizer = Adam(model.parameters())

# Train and validate the model
for epoch in range(10):
    model.train()
    total_loss = 0
    correct_train_preds = 0
    total_train_preds = 0

    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = BCEWithLogitsLoss()(outputs.squeeze(), targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Calculate training accuracy
        predictions = torch.round(torch.sigmoid(outputs)).squeeze()
        correct_train_preds += (predictions == targets).sum().item()
        total_train_preds += targets.size(0)

    train_accuracy = correct_train_preds / total_train_preds

    # Evaluate the model on the validation set
    model.eval()
    val_loss = 0
    correct_val_preds = 0
    total_val_preds = 0

    for inputs, targets in val_loader:
        with torch.no_grad():
            outputs = model(inputs)
            loss = BCEWithLogitsLoss()(outputs.squeeze(), targets)
            val_loss += loss.item()
            predictions = torch.round(torch.sigmoid(outputs)).squeeze()
            correct_val_preds += (predictions == targets).sum().item()
            total_val_preds += targets.size(0)

    val_accuracy = correct_val_preds / total_val_preds

    print(f"Epoch: {epoch + 1}, Training Loss: {total_loss / len(train_loader)}, "
          f"Training Accuracy: {train_accuracy}, Validation Loss: {val_loss / len(val_loader)}, "
          f"Validation Accuracy: {val_accuracy}")


[nltk_data] Downloading package names to /home/azureuser/nltk_data...
[nltk_data]   Package names is already up-to-date!


Epoch: 1, Training Loss: 0.45739048210220723, Training Accuracy: 0.7723052714398112, Validation Loss: 0.4202038687467575, Validation Accuracy: 0.8005034612964128
Epoch: 2, Training Loss: 0.3938017393176879, Training Accuracy: 0.8124311565696302, Validation Loss: 0.4055830296874046, Validation Accuracy: 0.8030207677784771
Epoch: 3, Training Loss: 0.3592879942164349, Training Accuracy: 0.83147128245476, Validation Loss: 0.3765432271361351, Validation Accuracy: 0.8244178728760226
Epoch: 4, Training Loss: 0.33306649110125536, Training Accuracy: 0.844846577498033, Validation Loss: 0.37545796543359755, Validation Accuracy: 0.8250471994965387
Epoch: 5, Training Loss: 0.30687093087027423, Training Accuracy: 0.8599527930763179, Validation Loss: 0.3751748245954514, Validation Accuracy: 0.8231592196349906
Epoch: 6, Training Loss: 0.2788508547909895, Training Accuracy: 0.8719118804091267, Validation Loss: 0.3644944906234741, Validation Accuracy: 0.8313404657016992
Epoch: 7, Training Loss: 0.251572

### Roberta Transformers

### Importing Libraries and Preparing Dataset
In this section, we import necessary libraries and download the `names` dataset from the NLTK library. The function `load_and_shuffle_data` is defined to load, label, and shuffle names into male and female categories. This data is then split into training and testing sets.


In [16]:
import nltk
import torch
import random
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Download and prepare the dataset of names
nltk.download('names')
from nltk.corpus import names

def load_and_shuffle_data():
    """ Load male and female names from NLTK, label them, and shuffle the dataset. """
    male_names = [(name, 0) for name in names.words('male.txt')]  # Label 0 for male names
    female_names = [(name, 1) for name in names.words('female.txt')]  # Label 1 for female names
    combined_data = male_names + female_names
    random.shuffle(combined_data)
    names_list, labels_list = zip(*combined_data)
    return train_test_split(names_list, labels_list, test_size=0.2)


[nltk_data] Downloading package names to /home/azureuser/nltk_data...
[nltk_data]   Package names is already up-to-date!


### Tokenization
Here, we tokenize the names using the `RobertaTokenizerFast` from the Hugging Face library. This step converts text data into a format suitable for model training by handling tasks such as padding and truncation.


In [17]:
def encode_names(names_train, names_test):
    """ Tokenize the training and testing name data. """
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
    train_encodings = tokenizer(list(names_train), truncation=True, padding=True)
    test_encodings = tokenizer(list(names_test), truncation=True, padding=True)
    return train_encodings, test_encodings


### Custom Dataset Class
We define a custom PyTorch dataset class `NamesDataset` to handle the tokenized data. This class will facilitate the loading of data during the training and evaluation phases, making it compatible with PyTorch's data handling and batching methodologies.


In [18]:
class NamesDataset(torch.utils.data.Dataset):
    """ Custom dataset for handling the names tokenized data. """
    def __init__(self, tokenized_data, labels):
        self.tokenized_data = tokenized_data
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.tokenized_data.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


#####  Model Training and Evaluation

In [21]:
def calculate_metrics(prediction):
    """ Calculate metrics such as accuracy, precision, recall, and F1 score for evaluation. """
    true_labels = prediction.label_ids
    predicted_labels = prediction.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, average='binary')
    accuracy = accuracy_score(true_labels, predicted_labels)
    return {'accuracy': accuracy, 'f1': f1, 'precision': precision, 'recall': recall}


### 

In [22]:
def train_model():
    """ Load data, prepare datasets, train and evaluate the RoBERTa model for sequence classification. """
    names_train, names_test, labels_train, labels_test = load_and_shuffle_data()
    names_train, _, labels_train, _ = train_test_split(names_train, labels_train, test_size=0.5)
    names_test, _, labels_test, _ = train_test_split(names_test, labels_test, test_size=0.5)

    train_encodings, test_encodings = encode_names(names_train, names_test)
    training_dataset = NamesDataset(train_encodings, labels_train)
    testing_dataset = NamesDataset(test_encodings, labels_test)
    
    roberta_classifier = RobertaForSequenceClassification.from_pretrained('roberta-base')
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        warmup_steps=200,
        weight_decay=0.01,
        logging_dir='./logs',
        evaluation_strategy="epoch"
    )

    trainer = Trainer(
        model=roberta_classifier,
        args=training_args,
        train_dataset=training_dataset,
        eval_dataset=testing_dataset,
        compute_metrics=calculate_metrics
    )
    trainer.train()
    return trainer


In [23]:

trainer = train_model()

# Evaluate the model
eval_results = trainer.evaluate()

for key, value in eval_results.items():
    print(f"{key}: {value}")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.390235,0.846348,0.878,0.876248,0.87976
2,No log,0.430029,0.835013,0.860192,0.920091,0.807615
3,No log,0.375347,0.861461,0.890656,0.883629,0.897796


Attempted to log scalar metric eval_loss:
0.39023515582084656
Attempted to log scalar metric eval_accuracy:
0.8463476070528967
Attempted to log scalar metric eval_f1:
0.878
Attempted to log scalar metric eval_precision:
0.8762475049900199
Attempted to log scalar metric eval_recall:
0.8797595190380761
Attempted to log scalar metric eval_runtime:
0.1385
Attempted to log scalar metric eval_samples_per_second:
5734.561
Attempted to log scalar metric eval_steps_per_second:
93.891
Attempted to log scalar metric epoch:
1.0
Attempted to log scalar metric eval_loss:
0.43002888560295105
Attempted to log scalar metric eval_accuracy:
0.8350125944584383
Attempted to log scalar metric eval_f1:
0.8601921024546425
Attempted to log scalar metric eval_precision:
0.9200913242009132
Attempted to log scalar metric eval_recall:
0.8076152304609219
Attempted to log scalar metric eval_runtime:
0.1376
Attempted to log scalar metric eval_samples_per_second:
5770.951
Attempted to log scalar metric eval_steps_per_

Attempted to log scalar metric eval_loss:
0.37534669041633606
Attempted to log scalar metric eval_accuracy:
0.8614609571788413
Attempted to log scalar metric eval_f1:
0.8906560636182903
Attempted to log scalar metric eval_precision:
0.883629191321499
Attempted to log scalar metric eval_recall:
0.8977955911823647
Attempted to log scalar metric eval_runtime:
0.1579
Attempted to log scalar metric eval_samples_per_second:
5027.001
Attempted to log scalar metric eval_steps_per_second:
82.306
Attempted to log scalar metric epoch:
3.0
eval_loss: 0.37534669041633606
eval_accuracy: 0.8614609571788413
eval_f1: 0.8906560636182903
eval_precision: 0.883629191321499
eval_recall: 0.8977955911823647
eval_runtime: 0.1579
eval_samples_per_second: 5027.001
eval_steps_per_second: 82.306
epoch: 3.0
