In [48]:
import re, torch, torch.nn as nn

In [49]:
# Toy training corpus: three short English sentences, one string per document.
docs = [
    "Movies are fun for everyone",
    "Watching movies is great fun",
    "I like to watch movies with my friends",
]

In [50]:
# One 1-based class id per training document (3 labels for 3 documents).
labels = [1, 1, 2]
# Size the classifier output from the largest class id, not from the number
# of distinct ids. The ids are 1-based and get shifted to 0-based targets
# before training, so the largest target index is max(labels) - 1; counting
# distinct ids would leave the model one output short whenever an id is
# skipped (e.g. labels == [1, 3]).
num_classes = max(labels, default=0)

In [51]:
def tokenize(text):
    """Return the lowercase word tokens of ``text`` in order of appearance.

    The text is lowercased first; a token is then any maximal run of regex
    word characters. Everything else (spaces, punctuation) only separates
    tokens and is dropped.
    """
    lowered = text.lower()
    return [match.group() for match in re.finditer(r"\w+", lowered)]


tokenize("I like to watch movies with my friends")

['i', 'like', 'to', 'watch', 'movies', 'with', 'my', 'friends']

In [52]:
def get_vocabulary(texts):
    """Build a word -> index mapping from an iterable of texts.

    Every text is tokenized, the distinct tokens are collected, and each one
    is assigned its position in the alphabetically sorted token list. The
    unsorted token set is printed before the mapping is returned.
    """
    unique_tokens = set()
    for text in texts:
        unique_tokens.update(tokenize(text))
    print(unique_tokens)
    ordered = sorted(unique_tokens)
    return dict(zip(ordered, range(len(ordered))))


vocabulary = get_vocabulary(docs)
vocabulary

{'with', 'movies', 'to', 'is', 'are', 'my', 'watch', 'i', 'great', 'friends', 'for', 'watching', 'like', 'fun', 'everyone'}


{'are': 0,
 'everyone': 1,
 'for': 2,
 'friends': 3,
 'fun': 4,
 'great': 5,
 'i': 6,
 'is': 7,
 'like': 8,
 'movies': 9,
 'my': 10,
 'to': 11,
 'watch': 12,
 'watching': 13,
 'with': 14}

In [53]:
def doc_to_bow(doc, vocabulary):
    """Convert a document into a binary bag-of-words feature vector.

    Returns a list with ``len(vocabulary)`` entries. Position
    ``vocabulary[word]`` is 1 when ``word`` occurs at least once in ``doc``
    and 0 otherwise; tokens that are not in ``vocabulary`` are ignored.
    """
    seen = set(tokenize(doc))
    bow = [0] * len(vocabulary)
    for word, idx in vocabulary.items():
        if word in seen:
            bow[idx] = 1
    return bow

In [54]:
# Feature matrix: one binary bag-of-words row per training document.
vectors = torch.tensor(
    [doc_to_bow(text, vocabulary) for text in docs],
    dtype=torch.float32,
)
# Shift the 1-based class ids down by one so the targets are 0-indexed.
# NOTE: `labels` is rebound from its own previous value, so running this
# cell a second time without re-running the labels cell shifts the ids again.
labels = torch.tensor(labels, dtype=torch.long).sub(1)

In [55]:
# Layer sizes: one input feature per vocabulary word, one output per class.
input_dim = len(vocabulary)
hidden_dim = 100
output_dim = num_classes

# Two-layer perceptron that emits raw class scores (logits).
# There is deliberately no Softmax layer at the end: nn.CrossEntropyLoss
# (used for training) applies log-softmax itself, so a Softmax here would
# normalise the scores twice. Taking argmax over the logits selects the same
# class as taking it over the probabilities.
model = nn.Sequential(
    nn.Linear(input_dim, hidden_dim),
    nn.ReLU(),
    nn.Linear(hidden_dim, output_dim),
)

In [56]:
class SimpleClassifier(nn.Module):
    """Feed-forward classifier: Linear -> ReLU -> Linear.

    ``forward`` returns the output of the second linear layer unchanged, i.e.
    one raw score per class with no softmax applied.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Create the layers: ``input_dim`` -> ``hidden_dim`` -> ``output_dim``."""
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map a batch of feature vectors to a batch of class scores."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Seed the global RNG right before the layers are created so the random
# weight initialisation, and with it the full-batch training run and the
# predictions, come out the same on every Restart & Run All.
torch.manual_seed(42)
# Rebinds `model`, replacing the nn.Sequential network defined earlier.
model = SimpleClassifier(input_dim, hidden_dim, output_dim)

In [57]:
# Full-batch training: every step feeds all training vectors through the
# model at once and takes one plain SGD step on the cross-entropy loss.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01)
for step in range(1000):
    # Gradients accumulate across backward() calls, so clear them first.
    optimizer.zero_grad()
    loss = criterion(input=model(vectors), target=labels)
    loss.backward()
    optimizer.step()

In [58]:
# Unseen documents to classify. Words missing from `vocabulary` are ignored
# by doc_to_bow, so documents that differ only in such words get identical
# feature vectors.
new_docs = ["I like to watch good movies", "I like to watch bad movies"]
# Display names, looked up by 0-based class index.
class_names = ["Cinema", "Music", "Science"]
new_vectors = torch.tensor(
    [doc_to_bow(text, vocabulary) for text in new_docs],
    dtype=torch.float32,
)
# Inference only: no gradients are needed.
with torch.no_grad():
    predictions = model(new_vectors)
    # Highest-scoring class per document, reported as a 1-based class id.
    predicted_ids = predictions.argmax(dim=1) + 1

for i, new_doc in enumerate(new_docs):
    # predicted_ids is 1-based; shift back by one to index class_names.
    print(
        f"Document: {new_doc}",
        f"Predicted class: {class_names[predicted_ids[i].item() - 1]}",
        "-" * 100,
        sep="\n",
    )


Document: I like to watch good movies
Predicted class: Music
----------------------------------------------------------------------------------------------------
Document: I like to watch bad movies
Predicted class: Music
----------------------------------------------------------------------------------------------------
