In [34]:
import re
import torch
import torch.nn as nn

# Seed PyTorch's global RNG before the model is built so the random initial weights
# of the nn.Linear layers (and therefore the training result) are reproducible.
torch.manual_seed(42) # reproducible

<torch._C.Generator at 0x10953c690>

In [27]:
# we're going to build a two-layer neural network that classifies documents into 3 categories
docs = [
    "Movies are fun for everyone.",
    "Watching movies is great fun.",
    "Enjoy a great movie today.",
    "Research is interesting and important.",
    "Learning math is very important.",
    "Science discovery is interesting.",
    "Rock is great to listen to.",
    "Listen to music for fun.",
    "Music is fun for everyone.",
    "Listen to folk music!"
]

# one 1-based class id per document, in the same order as `docs`:
# 1 = Cinema, 2 = Music, 3 = Science
labels_raw = [1, 1, 1, 3, 3, 3, 2, 2, 2, 2]
assert len(labels_raw) == len(docs), "every document needs exactly one label"
num_classes = len(set(labels_raw))

num_classes

3

In [16]:
# split a text into lowercase word tokens
def tokenize(text):
    """Return the runs of word characters in `text`, lowercased, in reading order.

    Punctuation and whitespace act as separators and are dropped.
    An empty string yields an empty list.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"\w+", lowered)]

# sanity-check the tokenizer on the first two documents: lowercased, punctuation removed
assert tokenize(docs[0]) == ['movies', 'are', 'fun', 'for', 'everyone']
assert tokenize(docs[1]) == ['watching', 'movies', 'is', 'great', 'fun']

tokenize(docs[0])

['movies', 'are', 'fun', 'for', 'everyone']

In [18]:
def get_vocabulary(texts):
    """Map every distinct token found in `texts` to an integer index.

    Tokens are collected with `tokenize`, de-duplicated, and numbered
    0..N-1 in sorted order, so the mapping does not depend on the order
    in which the texts are given.
    """
    unique_tokens = set()
    for text in texts:
        unique_tokens.update(tokenize(text))
    ordered = sorted(unique_tokens)
    return dict(zip(ordered, range(len(ordered))))

# build the word -> index lookup once from the training docs; later cells reuse it
# to size the feature vectors and the model's input layer
vocabulary = get_vocabulary(docs)
vocabulary

{'a': 0,
 'and': 1,
 'are': 2,
 'discovery': 3,
 'enjoy': 4,
 'everyone': 5,
 'folk': 6,
 'for': 7,
 'fun': 8,
 'great': 9,
 'important': 10,
 'interesting': 11,
 'is': 12,
 'learning': 13,
 'listen': 14,
 'math': 15,
 'movie': 16,
 'movies': 17,
 'music': 18,
 'research': 19,
 'rock': 20,
 'science': 21,
 'to': 22,
 'today': 23,
 'very': 24,
 'watching': 25}

In [23]:
# feature extraction: turn a document into a binary bag-of-words vector
def doc_to_bow(doc, vocabulary):
    """Encode `doc` as a list of 0/1 flags, one slot per vocabulary entry.

    A slot is 1 when the corresponding word occurs in the document at
    least once (repeats are not counted) and 0 otherwise. Tokens that are
    not in `vocabulary` are ignored.
    """
    seen = set(tokenize(doc))
    bow = [0] * len(vocabulary)  # start with every word marked absent
    for position in (vocabulary[word] for word in seen if word in vocabulary):
        bow[position] = 1
    return bow

# docs[0] should set exactly the slots for 'are', 'everyone', 'for', 'fun' and 'movies'
assert doc_to_bow(docs[0], vocabulary) == [
    0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
]

In [42]:
# prepare inputs
def generator_input_vector(docs, vocabulary):
    """Stack the bag-of-words encoding of every document into one float tensor.

    Args:
        docs: iterable of document strings.
        vocabulary: word -> index mapping, as produced by `get_vocabulary`.

    Returns:
        A float32 tensor of shape (number of docs, len(vocabulary)); row i is
        `doc_to_bow(docs[i], vocabulary)`.
    """
    rows = [doc_to_bow(doc, vocabulary) for doc in docs]
    matrix = torch.tensor(rows, dtype=torch.float32)
    # With no documents torch.tensor([]) is 1-D with shape (0,), which the model's
    # first Linear layer cannot consume. Pinning the shape keeps the result 2-D;
    # for non-empty input this reshape is a no-op.
    return matrix.reshape(len(rows), len(vocabulary))

# feature matrix: one bag-of-words row per training document
vector = generator_input_vector(docs, vocabulary)
# targets: shift the 1-based raw labels down to zero-based class indices
labels = torch.tensor([raw - 1 for raw in labels_raw], dtype=torch.long)

vector.shape

torch.Size([10, 26])

In [37]:
class SimpleClassifier(nn.Module):
    """Two-layer feed-forward classifier: Linear -> ReLU -> Linear.

    Args:
        input_dim: size of each input feature vector.
        hidden_dim: width of the hidden layer.
        output_dim: number of scores produced per input (one per class).

    `forward` returns the raw output of the second linear layer; no softmax
    is applied inside the model.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Layers are created in the order they are applied in forward().
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# one input per vocabulary word, a 50-unit hidden layer, one output score per class
model = SimpleClassifier(input_dim=len(vocabulary), hidden_dim=50, output_dim=num_classes)

In [41]:
# train the model
# CrossEntropyLoss takes the model's raw scores plus zero-based class indices
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

model.train()  # make the training mode explicit
for step in range(3000):
    # PyTorch accumulates gradients across backward() calls, so clear the ones
    # left over from the previous step before computing new ones
    optimizer.zero_grad()
    loss = criterion(model(vector), labels)  # full batch: every document, every step
    loss.backward()
    optimizer.step()

In [44]:
# test the model
new_docs = [
    "Listening to rock music is fun.",
    "I love science very much."
]

# indexed by zero-based class index, i.e. raw label - 1
class_names = ["Cinema", "Music", "Science"]
assert len(class_names) == num_classes, "need exactly one name per class"

# words that are not in the training vocabulary are simply ignored by the encoding
new_doc_vectors = generator_input_vector(new_docs, vocabulary)

model.eval()  # inference mode
with torch.no_grad():  # no gradients needed for prediction
    outputs = model(new_doc_vectors)
    predicted_idx = torch.argmax(outputs, dim=1)  # zero-based, indexes class_names
    predicted_ids = predicted_idx + 1  # 1-based, same convention as labels_raw

for new_doc, class_idx in zip(new_docs, predicted_idx.tolist()):
    print(f'{new_doc}: {class_names[class_idx]}')

Listening to rock music is fun.: Music
I love science very much.: Science
