In [2]:
import re, torch, torch.nn as nn

In [3]:
docs = [
    # Cinema class (label 0)
    "Movies are fun for everyone",
    "Watching movies is great fun",
    "I like to watch movies with my friends",
    "The cinema experience is amazing",
    "Film festivals showcase great movies",
    "Action movies are exciting to watch",
    "Documentary films tell real stories",
    "Movie theaters provide great entertainment",
    
    # Music class (label 1)
    "I love listening to music",
    "Jazz music is relaxing and smooth",
    "Rock concerts are energetic events",
    "Classical music has beautiful melodies",
    "Playing guitar is my favorite hobby",
    "Music festivals bring people together",
    "Pop songs are catchy and fun",
    "The orchestra performed beautifully",
    
    # Science class (label 2)
    "Physics explains how the universe works",
    "Chemistry experiments are fascinating",
    "Biology studies living organisms",
    "Mathematics is the language of science",
    "Scientists conduct research experiments",
    "The laboratory has advanced equipment",
    "Scientific discoveries change the world",
    "Astronomy explores distant planets",
]

In [4]:
# Labels: 0 = Cinema, 1 = Music, 2 = Science
labels = [0, 0, 0, 0, 0, 0, 0, 0,  # 8 Cinema documents
          1, 1, 1, 1, 1, 1, 1, 1,  # 8 Music documents
          2, 2, 2, 2, 2, 2, 2, 2]  # 8 Science documents
num_classes = len(set(labels))
print(f"Number of classes: {num_classes}")
print(f"Number of documents: {len(docs)}")
print(f"Class distribution: Cinema={labels.count(0)}, Music={labels.count(1)}, Science={labels.count(2)}")

Number of classes: 3
Number of documents: 24
Class distribution: Cinema=8, Music=8, Science=8


In [5]:
def tokenize(text):
    return re.findall(r"\w+", text.lower()) #find all words in text

tokenize("I like to watch movies with my friends")

['i', 'like', 'to', 'watch', 'movies', 'with', 'my', 'friends']

In [6]:
def get_vocabulary(texts):
    tokens = {token for text in texts for token in tokenize(text)}
    print(tokens)
    return {word: idx for idx, word in enumerate(sorted(tokens))}
vocabulary = get_vocabulary(docs)
vocabulary

{'of', 'orchestra', 'universe', 'guitar', 'listening', 'beautiful', 'entertainment', 'film', 'biology', 'mathematics', 'hobby', 'everyone', 'theaters', 'distant', 'favorite', 'classical', 'events', 'rock', 'the', 'smooth', 'are', 'advanced', 'with', 'and', 'movie', 'provide', 'laboratory', 'i', 'great', 'world', 'conduct', 'action', 'playing', 'energetic', 'scientists', 'explores', 'concerts', 'fascinating', 'amazing', 'beautifully', 'songs', 'works', 'organisms', 'watching', 'for', 'together', 'how', 'discoveries', 'real', 'jazz', 'music', 'planets', 'tell', 'living', 'bring', 'festivals', 'fun', 'movies', 'showcase', 'studies', 'scientific', 'melodies', 'astronomy', 'experience', 'documentary', 'stories', 'has', 'exciting', 'equipment', 'my', 'watch', 'performed', 'change', 'people', 'experiments', 'cinema', 'explains', 'relaxing', 'science', 'friends', 'physics', 'love', 'to', 'pop', 'language', 'like', 'research', 'catchy', 'is', 'chemistry', 'films'}


{'action': 0,
 'advanced': 1,
 'amazing': 2,
 'and': 3,
 'are': 4,
 'astronomy': 5,
 'beautiful': 6,
 'beautifully': 7,
 'biology': 8,
 'bring': 9,
 'catchy': 10,
 'change': 11,
 'chemistry': 12,
 'cinema': 13,
 'classical': 14,
 'concerts': 15,
 'conduct': 16,
 'discoveries': 17,
 'distant': 18,
 'documentary': 19,
 'energetic': 20,
 'entertainment': 21,
 'equipment': 22,
 'events': 23,
 'everyone': 24,
 'exciting': 25,
 'experience': 26,
 'experiments': 27,
 'explains': 28,
 'explores': 29,
 'fascinating': 30,
 'favorite': 31,
 'festivals': 32,
 'film': 33,
 'films': 34,
 'for': 35,
 'friends': 36,
 'fun': 37,
 'great': 38,
 'guitar': 39,
 'has': 40,
 'hobby': 41,
 'how': 42,
 'i': 43,
 'is': 44,
 'jazz': 45,
 'laboratory': 46,
 'language': 47,
 'like': 48,
 'listening': 49,
 'living': 50,
 'love': 51,
 'mathematics': 52,
 'melodies': 53,
 'movie': 54,
 'movies': 55,
 'music': 56,
 'my': 57,
 'of': 58,
 'orchestra': 59,
 'organisms': 60,
 'people': 61,
 'performed': 62,
 'physics': 6

In [7]:
# Convert document in to feature vector
def doc_to_bow(doc, vocabulary):
    tokens = set(tokenize(doc))
    bow = [0] * len(vocabulary)
    for token in tokens:
        if token in vocabulary:
            bow[vocabulary[token]] = 1
    return bow
doc_to_bow("I like to watch movies with my friends", vocabulary)

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0]

In [8]:
vectors = torch.tensor([doc_to_bow(doc, vocabulary) for doc in docs], dtype=torch.float32)
print(f"Labels: {labels}")
labels_tensor = torch.tensor(labels, dtype=torch.long)  # Already 0-indexed
print(f"Vector shape: {vectors.shape}")
print(f"Labels tensor: {labels_tensor}")
print(f"Number of unique labels: {len(torch.unique(labels_tensor))}")

Labels: [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2]
Vector shape: torch.Size([24, 91])
Labels tensor: tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2])
Number of unique labels: 3


In [9]:
input_dim = len(vocabulary)
hidden_dim = 100
output_dim = num_classes

model = nn.Sequential(
    nn.Linear(input_dim, hidden_dim),
    nn.ReLU(),
    nn.Linear(hidden_dim, output_dim),
    nn.Softmax(dim=1)
)

In [10]:
class SimpleClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

model = SimpleClassifier(input_dim, hidden_dim, output_dim)

In [11]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Training loop
epochs = 100
for epoch in range(epochs):
    model.train()
    outputs = model(vectors)
    loss = criterion(outputs, labels_tensor)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 20 == 0:
        with torch.inference_mode():
            predictions = torch.argmax(outputs, dim=1)
            accuracy = (predictions == labels_tensor).float().mean()
            print(f"Step {epoch + 1}, Loss: {loss.item():.4f}, Accuracy: {accuracy.item():.4f}")

print(f"\nFinal training loss: {loss.item():.4f}")
with torch.inference_mode():
    final_predictions = torch.argmax(model(vectors), dim=1)
    final_accuracy = (final_predictions == labels_tensor).float().mean()
    print(f"Final training accuracy: {final_accuracy.item():.4f}")

Step 20, Loss: 1.0915, Accuracy: 0.4583
Step 40, Loss: 1.0814, Accuracy: 0.5000
Step 60, Loss: 1.0712, Accuracy: 0.5833
Step 80, Loss: 1.0611, Accuracy: 0.6667
Step 100, Loss: 1.0508, Accuracy: 0.7083

Final training loss: 1.0508
Final training accuracy: 0.7500


In [13]:
docs = ["I like to watch good movies"]
example_vector = torch.tensor([doc_to_bow(doc, vocabulary) for doc in docs], dtype=torch.float32)
example_output = model(example_vector)
print(example_output)
example_prediction = torch.argmax(example_output, dim=1)
print(f"Example prediction: {example_prediction}")


tensor([[ 0.0053, -0.0430, -0.0194]], grad_fn=<AddmmBackward0>)
Example prediction: tensor([0])
