In [3]:
'''
Supervised Learning model that identifies words that start with a vowel.
That's it.
'''
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

# Read the full word list; it is divided into train/test portions below.
# The file is comma separated: a word plus "1" (starts with a vowel) or
# "0" (does not), e.g. "apple,1". The columns are accessed by the names
# 'text' and 'label'.

training_file = "/content/drive/MyDrive/Colab Notebooks/ClassifyVowels/starts_with_vowel_all.txt"
# dtype=str together with keep_default_na=False keeps every cell as the
# literal string found in the file.
df = pd.read_csv(training_file, dtype=str, keep_default_na=False)
df_texts, df_labels = (df[column].values for column in ('text', 'label'))

print("Input text size: ", len(df_texts))

# Turn the string labels into integer class ids
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df_labels)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
texts_train, texts_test, labels_train, labels_test = train_test_split(
    df_texts,
    encoded_labels,
    random_state=42,
    test_size=0.2,
)

# Printable ASCII runs from 32 (space) through 126 ('~') inclusive, so the
# range must stop at 127; each character maps to its code point minus 32.
ascii_vocab = {chr(i): i - 32 for i in range(32, 127)}


def get_vocab_idx(word: str) -> int:
    """Return the vocabulary index of the first character of ``word``.

    :param word: the text to encode; only its first character is inspected.
    :returns: the ``ascii_vocab`` index of ``word[0]``, or 0 when ``word``
        is empty or its first character is not printable ASCII. Index 0 is
        also the index of the space character, so those cases share a slot.
    """
    if not word:
        return 0
    return ascii_vocab.get(word[0], 0)

# Define a custom dataset
class FirstCharacterTextDataset(Dataset):
    """Dataset that yields (first-character id, label) tensor pairs.

    Each item encodes only the first character of its text, using the
    module-level ``get_vocab_idx`` helper. The ``tokenizer`` argument is
    stored on the instance but is not consulted when building items.
    """

    def __init__(self, texts, labels, tokenizer):
        self.texts, self.labels, self.tokenizer = texts, labels, tokenizer

    def __len__(self):
        """Number of samples, i.e. the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for sample ``idx`` as long tensors.

        ``token_ids`` has exactly one element: the id of the first character.
        """
        first_char_id = get_vocab_idx(self.texts[idx])
        features = torch.tensor([first_char_id], dtype=torch.long)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return features, target

# Tokenizer: created and handed to the datasets below. The datasets encode
# only the first character of each word, so no torchtext vocabulary is built.
tokenizer = get_tokenizer('basic_english')


def _identity_collate(samples):
    # Hand the list of (token_ids, label) samples through untouched; the
    # samples are merged into tensors later by generate_batch.
    return samples


# Wrap the train/test splits in datasets
train_dataset = FirstCharacterTextDataset(texts_train, labels_train, tokenizer)
test_dataset = FirstCharacterTextDataset(texts_test, labels_test, tokenizer)

# Batches of 32; only the training data is shuffled
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=_identity_collate,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=_identity_collate,
)

# Define the model
class TextClassificationModel(nn.Module):
    """EmbeddingBag followed by a single linear layer producing class scores.

    :param vocab_size: number of rows in the embedding table.
    :param embed_dim: width of each embedding vector.
    :param num_class: number of output classes.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Re-draw both weight matrices from U(-0.5, 0.5); zero the bias."""
        bound = 0.5
        # Embedding first, then the linear layer: keeps the random draws in
        # the same order on every construction.
        for weight in (self.embedding.weight, self.fc.weight):
            nn.init.uniform_(weight, -bound, bound)
        nn.init.zeros_(self.fc.bias)

    def forward(self, text, offsets):
        """Pool the embeddings of each bag and map them to class scores.

        :param text: token ids, passed to the EmbeddingBag together with
            ``offsets``.
        :param offsets: start position of each bag within ``text``.
        :returns: the output of the linear layer for each bag.
        """
        pooled = self.embedding(text, offsets)
        scores = self.fc(pooled)
        return scores

# Hyper-parameters: the embedding width is fixed, the other two sizes are
# derived from the character vocabulary and the fitted label encoder.
embed_dim = 64
vocab_size = len(ascii_vocab)
num_class = len(label_encoder.classes_)

# Build the model, then the SGD optimizer over its parameters and the
# cross-entropy loss used for training.
model = TextClassificationModel(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    num_class=num_class,
)
optimizer = optim.SGD(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

def generate_batch(batch):
    '''
    Merge a list of samples into the flat tensors an EmbeddingBag expects.

    :param batch: sequence of ``(token_ids, label)`` pairs, where
        ``token_ids`` is a 1-D tensor and ``label`` is a scalar.
    :returns: the text, offsets, and labels for that batch. ``text`` is all
        token ids concatenated, ``offsets[i]`` is the index in ``text`` where
        sample ``i`` starts, and ``label`` holds one entry per sample.
    '''
    label = torch.tensor([entry[1] for entry in batch])
    text = [entry[0] for entry in batch]
    # Each sample starts where the previous ones end: prepend 0, drop the
    # last length, and take the running sum of the rest.
    lengths = [len(entry) for entry in text]
    offsets = torch.tensor([0] + lengths[:-1]).cumsum(dim=0)
    text = torch.cat(text)

    return text, offsets, label

# Training loop: runs 10 passes over train_loader, updating the model with
# the optimizer after every batch and printing the mean batch loss per epoch.
for epoch in range(10):  # Number of epochs
    model.train()
    total_loss = 0  # sum of per-batch losses for this epoch
    for batch in train_loader:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()
        # The loader yields raw sample lists (identity collate_fn), so the
        # batch tensors are assembled here.
        text, offsets, cls = generate_batch(batch)
        output = model(text, offsets)
        loss = criterion(output, cls)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the average loss per batch, with a 1-based epoch number
    print(f'Epoch: {epoch+1}, Loss: {total_loss/len(train_loader):.4f}')

def evaluate_model(model, test_loader):
    """Run ``model`` over ``test_loader`` and collect labels and predictions.

    :param model: module called as ``model(text, offsets)``; it is switched
        to eval mode first.
    :param test_loader: iterable of sample lists accepted by
        ``generate_batch``.
    :returns: ``(all_labels, all_preds)`` as two flat Python lists, in
        loader order.
    """
    model.eval()
    gold, predicted = [], []
    with torch.no_grad():
        for samples in test_loader:
            tokens, bag_offsets, targets = generate_batch(samples)
            scores = model(tokens, bag_offsets)
            # The predicted class is the index of the highest score per row
            predicted += scores.argmax(dim=1).tolist()
            gold += targets.tolist()

    print("evaluate model, labels: ", gold[:10], " preds ", predicted[:10])
    return gold, predicted

# Score the trained model on the held-out split.
# Note: labels_test is rebound here to the label list returned by
# evaluate_model.
labels_test, preds_test = evaluate_model(model, test_loader)
accuracy = accuracy_score(labels_test, preds_test)
print(f'Test Accuracy: {accuracy:.4f}')
report = classification_report(
    labels_test,
    preds_test,
    target_names=label_encoder.classes_,
)
print(report)

# Function to predict on new text data
def predict(model, text, tokenizer):
    """Classify a single piece of text and return its original label.

    :param model: module called as ``model(token_ids, offsets)``; it is
        switched to eval mode first.
    :param text: the text to classify; it is encoded with ``get_vocab_idx``.
    :param tokenizer: accepted for interface compatibility; not used here.
    :returns: the label produced by mapping the predicted class index back
        through ``label_encoder``.
    """
    model.eval()
    first_char_id = get_vocab_idx(text)
    token_ids = torch.tensor([first_char_id], dtype=torch.long)
    # One bag that starts at position 0
    offsets = torch.tensor([0])
    with torch.no_grad():
        scores = model(token_ids, offsets)
    class_index = scores.argmax(dim=1).item()
    return label_encoder.inverse_transform([class_index])[0]

# Try the prediction function on a handful of hand-picked words, mixing
# upper- and lower-case first letters.
sample_texts = [
    "apple", "pear", "cinnamon", "every", "underhill", "is", "the",
    "star", "a", "A", "of", "From", "Quantum", "Marginal", "Await",
    "awash", "serene", "mash", "young", "Untoward", "untoward",
]
for sample_text in sample_texts:
    predicted_label = predict(model, sample_text, tokenizer)
    print(f'Text: {sample_text}  Label: {predicted_label}')


Input text size:  20234
Epoch: 1, Loss: 0.4262
Epoch: 2, Loss: 0.1727
Epoch: 3, Loss: 0.1027
Epoch: 4, Loss: 0.0703
Epoch: 5, Loss: 0.0521
Epoch: 6, Loss: 0.0406
Epoch: 7, Loss: 0.0328
Epoch: 8, Loss: 0.0272
Epoch: 9, Loss: 0.0231
Epoch: 10, Loss: 0.0200
evaluate model, labels:  [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]  preds  [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]
Test Accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3074
           1       1.00      1.00      1.00       973

    accuracy                           1.00      4047
   macro avg       1.00      1.00      1.00      4047
weighted avg       1.00      1.00      1.00      4047

Text: apple  Label: 1
Text: pear  Label: 0
Text: cinnamon  Label: 0
Text: every  Label: 1
Text: underhill  Label: 1
Text: is  Label: 1
Text: the  Label: 0
Text: star  Label: 0
Text: a  Label: 1
Text: A  Label: 1
Text: of  Label: 1
Text: From  Label: 0
Text: Quantum  Label: 0
Text: Marginal  Label: 0
