In [2]:
!pip install nltk



In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize
import nltk

In [40]:
document = """ As we descend from the sunlit surface into the abyss, the environment undergoes radical transformations. The ocean is layered like a subterranean cake, with each stratum hosting its own unique ecosystem.

The Epipelagic Zone (Sunlight Zone): Stretching from the surface down to about 200 meters, this is the zone we are most familiar with. It is flooded with sunlight, allowing photosynthesis to occur. Here, phytoplankton form the base of the global marine food web, supporting everything from tiny krill to massive blue whales.

The Mesopelagic Zone (Twilight Zone): Extending from 200 to 1,000 meters, this zone receives only a faint, ethereal glow of sunlight, enough to see by, but not enough to sustain photosynthesis. Animals here rely heavily on marine snow, a continuous shower of organic detritus falling from the productive waters above. This zone is famous for its vertical migrators; billions of creatures travel to the surface at night to feed and retreat to the safety of the twilight zone during the day, making it the largest mass migration on Earth.

The Bathypelagic Zone (Midnight Zone): From 1,000 to 4,000 meters, we enter a world devoid of solar light. The only illumination here comes from bioluminescence, light produced by the animals themselves. The pressure is immense, exceeding 5,800 pounds per square inch. Food is incredibly scarce.

The Abyssopelagic Zone (The Abyss): Stretching from 4,000 to 6,000 meters, the abyssal zone covers the vast, flat plains of the ocean floor. The water is near freezing, and the pressure is bone-crushing. Yet, life persists in the form of deep-sea cucumbers, brittle stars, and scavenging amphipods.

The Hadalpelagic Zone (The Trenches): The deepest parts of the ocean, extending from 6,000 meters to the very bottom of the Mariana Trench at nearly 11,000 meters. Named after Hades, the Greek god of the underworld, this zone is confined to deep tectonic trenches.

The sea, once it casts its spell, holds one in its net of wonder forever. Jacques Yves Cousteau

Bioluminescence: The Living Light

In the pitch-black void of the bathypelagic and abyssopelagic zones, vision would seem to be a useless sense. Yet, many deep-sea creatures possess highly developed, incredibly sensitive eyes. This is because the deep ocean is not entirely dark; it is illuminated by the flashes, pulses, and steady glows of bioluminescence.

Bioluminescence is the biochemical emission of light by living organisms. In the deep sea, it is estimated that up to ninety percent of the animals possess this remarkable ability. It serves a multitude of vital functions in an environment where finding a mate, securing a meal, or avoiding a predator is overwhelmingly difficult.

Some creatures use bioluminescence as a lure. The iconic deep-sea anglerfish dangles a glowing appendage, an esca filled with bioluminescent bacteria, in front of its massive, toothy maw to attract unwary prey. Others use it for defense. The vampire squid, when threatened, ejects a cloud of glowing, sticky mucus to confuse predators and facilitate an escape. Certain species of shrimp can vomit light to blind their attackers temporarily. Furthermore, many deep-sea fish possess photophores (light-producing organs) on their bellies. By matching the faint downwelling light from the surface, they camouflage their silhouettes from predators lurking below, a sophisticated technique known as counter-illumination.
"""


In [41]:
# Fetch the NLTK 'punkt' and 'punkt_tab' resources before word_tokenize is
# called in the cells below. The return value of the last call is what the
# cell displays.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [42]:
# Lowercase the whole document, then split it into word and punctuation tokens
tokens = word_tokenize(document.lower())

In [44]:
# Build the vocabulary: index 0 is reserved for '<unk>', and every other
# distinct token receives the next free index in order of first appearance.
vocab = {'<unk>': 0}

for word in Counter(tokens):
  # setdefault leaves an existing entry untouched and otherwise assigns
  # the current vocabulary size as the new index
  vocab.setdefault(word, len(vocab))

vocab

{'<unk>': 0,
 'as': 1,
 'we': 2,
 'descend': 3,
 'from': 4,
 'the': 5,
 'sunlit': 6,
 'surface': 7,
 'into': 8,
 'abyss': 9,
 ',': 10,
 'environment': 11,
 'undergoes': 12,
 'radical': 13,
 'transformations': 14,
 '.': 15,
 'ocean': 16,
 'is': 17,
 'layered': 18,
 'like': 19,
 'a': 20,
 'subterranean': 21,
 'cake': 22,
 'with': 23,
 'each': 24,
 'stratum': 25,
 'hosting': 26,
 'its': 27,
 'own': 28,
 'unique': 29,
 'ecosystem': 30,
 'epipelagic': 31,
 'zone': 32,
 '(': 33,
 'sunlight': 34,
 ')': 35,
 ':': 36,
 'stretching': 37,
 'down': 38,
 'to': 39,
 'about': 40,
 '200': 41,
 'meters': 42,
 'this': 43,
 'are': 44,
 'most': 45,
 'familiar': 46,
 'it': 47,
 'flooded': 48,
 'allowing': 49,
 'photosynthesis': 50,
 'occur': 51,
 'here': 52,
 'phytoplankton': 53,
 'form': 54,
 'base': 55,
 'of': 56,
 'global': 57,
 'marine': 58,
 'food': 59,
 'web': 60,
 'supporting': 61,
 'everything': 62,
 'tiny': 63,
 'krill': 64,
 'massive': 65,
 'blue': 66,
 'whales': 67,
 'mesopelagic': 68,
 'twiligh

In [45]:
len(vocab)

300

In [48]:
# Split the document on newlines; each resulting line (blank lines included)
# is treated as one "sentence" by the cells below.
input_sentences = document.split('\n')

In [49]:
def text_to_indices(sentence, vocab):
  """Translate a sequence of tokens into their vocabulary indices.

  Args:
    sentence: iterable of tokens.
    vocab: mapping from token to integer index; tokens that are missing
      from it are mapped to the index stored under '<unk>'.

  Returns:
    A list with one index per token, in the original order.
  """
  return [
    vocab[token] if token in vocab else vocab['<unk>']
    for token in sentence
  ]


In [50]:
# Tokenize each lowercased line and convert its tokens to vocabulary indices
input_numerical_sentences = [
  text_to_indices(word_tokenize(sentence.lower()), vocab)
  for sentence in input_sentences
]


In [51]:
len(input_numerical_sentences)

22

In [52]:
# For every encoded line, emit each prefix of length >= 2
# (lines with fewer than two tokens contribute nothing).
training_sequence = [
  sentence[:end]
  for sentence in input_numerical_sentences
  for end in range(2, len(sentence) + 1)
]

In [53]:
len(training_sequence)

618

In [54]:
training_sequence[:5]

[[1, 2], [1, 2, 3], [1, 2, 3, 4], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5, 6]]

In [55]:
# Length of every training prefix; the cell displays the longest one
len_list = [len(sequence) for sequence in training_sequence]

max(len_list)

120

In [56]:
training_sequence[0]

[1, 2]

In [57]:
# Left-pad every prefix with 0 so that all sequences share the longest length.
# NOTE: 0 is also the '<unk>' index in vocab, so padding and unknown tokens
# look identical to the model.
max_len = max(len_list, default=0)  # computed once instead of once per sequence

padded_training_sequence = [
  [0] * (max_len - len(sequence)) + sequence
  for sequence in training_sequence
]

In [58]:
len(padded_training_sequence[10])

120

In [59]:
# Convert the list of equal-length padded sequences into one integer tensor.
# This rebinds the name from a Python list to a torch tensor.
padded_training_sequence = torch.tensor(padded_training_sequence, dtype=torch.long)

In [60]:
padded_training_sequence

tensor([[  0,   0,   0,  ...,   0,   1,   2],
        [  0,   0,   0,  ...,   1,   2,   3],
        [  0,   0,   0,  ...,   2,   3,   4],
        ...,
        [  0,   0, 242,  ..., 297, 298,   1],
        [  0, 242, 102,  ..., 298,   1, 299],
        [242, 102, 243,  ...,   1, 299,  15]])

In [61]:
# Inputs are every column but the last; the target is the last column
X, y = padded_training_sequence[:, :-1], padded_training_sequence[:, -1]

In [62]:
X

tensor([[  0,   0,   0,  ...,   0,   0,   1],
        [  0,   0,   0,  ...,   0,   1,   2],
        [  0,   0,   0,  ...,   1,   2,   3],
        ...,
        [  0,   0, 242,  ..., 296, 297, 298],
        [  0, 242, 102,  ..., 297, 298,   1],
        [242, 102, 243,  ..., 298,   1, 299]])

In [63]:
y

tensor([  2,   3,   4,   5,   6,   7,   8,   5,   9,  10,   5,  11,  12,  13,
         14,  15,   5,  16,  17,  18,  19,  20,  21,  22,  10,  23,  24,  25,
         26,  27,  28,  29,  30,  15,  31,  32,  33,  34,  32,  35,  36,  37,
          4,   5,   7,  38,  39,  40,  41,  42,  10,  43,  17,   5,  32,   2,
         44,  45,  46,  23,  15,  47,  17,  48,  23,  34,  10,  49,  50,  39,
         51,  15,  52,  10,  53,  54,   5,  55,  56,   5,  57,  58,  59,  60,
         10,  61,  62,   4,  63,  64,  39,  65,  66,  67,  15,  68,  32,  33,
         69,  32,  35,  36,  70,   4,  41,  39,  71,  42,  10,  43,  32,  72,
         73,  20,  74,  10,  75,  76,  56,  34,  10,  77,  39,  78,  79,  10,
         80,  81,  77,  39,  82,  50,  15,  83,  52,  84,  85,  86,  58,  87,
         10,  20,  88,  89,  56,  90,  91,  92,   4,   5,  93,  94,  95,  15,
         43,  32,  17,  96,  97,  27,  98,  99, 100, 101,  56, 102, 103,  39,
          5,   7, 104, 105,  39, 106, 107, 108,  39,   5, 109,  

In [64]:
class CustomDataset(Dataset):
  """Dataset that pairs each row of `X` with the matching entry of `y`."""

  def __init__(self, X, y):
    # Keep references to the inputs and targets exactly as given
    self.X, self.y = X, y

  def __len__(self):
    # The number of samples is the size of X's first dimension
    num_samples = self.X.shape[0]
    return num_samples

  def __getitem__(self, idx):
    # Return the (input, target) pair stored at position idx
    features = self.X[idx]
    target = self.y[idx]
    return features, target

In [65]:
# Wrap the input tensor X and target tensor y in the dataset class
dataset = CustomDataset(X,y)

In [66]:
len(dataset)

618

In [67]:
# Training loader: batches of 32 samples, drawn in shuffled order
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [68]:
class LSTMModel(nn.Module):
  """Next-token predictor: embedding -> single-layer LSTM -> linear layer.

  Args:
    vocab_size: number of distinct token indices; used for both the number
      of embedding rows and the number of output logits.
    embedding_dim: size of each token embedding (default 100).
    hidden_dim: size of the LSTM hidden state (default 150).
  """

  def __init__(self, vocab_size, embedding_dim=100, hidden_dim=150):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
    self.fc = nn.Linear(hidden_dim, vocab_size)

  def forward(self, x):
    """Return logits of shape (batch, vocab_size) for an index tensor `x` of shape (batch, seq_len)."""
    embedded = self.embedding(x)
    _, (final_hidden_state, _) = self.lstm(embedded)
    # final_hidden_state is (num_layers, batch, hidden_dim). Select the last
    # layer by index instead of squeeze(0) so the layer dimension is removed
    # explicitly and the batch dimension is never touched.
    return self.fc(final_hidden_state[-1])

In [69]:
# Size the embedding table and the output layer to the vocabulary
model = LSTMModel(len(vocab))

In [70]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [71]:
# Move the model to the selected device; the cell displays the module repr
model.to(device)

LSTMModel(
  (embedding): Embedding(300, 100)
  (lstm): LSTM(100, 150, batch_first=True)
  (fc): Linear(in_features=150, out_features=300, bias=True)
)

In [72]:
# Training hyperparameters
epochs = 50
learning_rate = 0.001

# Cross-entropy loss over the vocabulary logits, optimized with Adam
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [73]:
# training loop: one full pass over `dataloader` per epoch

for epoch in range(epochs):
  total_loss = 0  # sum of the per-batch losses within this epoch

  for batch_x, batch_y in dataloader:
    # place the batch on the device selected for the model
    batch_x = batch_x.to(device)
    batch_y = batch_y.to(device)

    # clear gradients accumulated by the previous step
    optimizer.zero_grad()

    # forward pass and loss against the target token indices
    output = model(batch_x)
    loss = criterion(output, batch_y)

    # backward pass followed by the parameter update
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

  print(f"Epoch: {epoch + 1}, Loss: {total_loss:.4f}")

Epoch: 1, Loss: 113.5282
Epoch: 2, Loss: 105.8587
Epoch: 3, Loss: 97.2871
Epoch: 4, Loss: 90.2559
Epoch: 5, Loss: 83.9738
Epoch: 6, Loss: 77.9954
Epoch: 7, Loss: 71.6400
Epoch: 8, Loss: 64.3951
Epoch: 9, Loss: 58.8312
Epoch: 10, Loss: 52.0456
Epoch: 11, Loss: 47.0552
Epoch: 12, Loss: 41.9740
Epoch: 13, Loss: 36.3755
Epoch: 14, Loss: 32.5346
Epoch: 15, Loss: 28.0110
Epoch: 16, Loss: 24.4513
Epoch: 17, Loss: 21.3131
Epoch: 18, Loss: 19.0059
Epoch: 19, Loss: 16.7344
Epoch: 20, Loss: 14.4166
Epoch: 21, Loss: 12.7407
Epoch: 22, Loss: 11.1720
Epoch: 23, Loss: 10.0528
Epoch: 24, Loss: 9.1421
Epoch: 25, Loss: 8.0766
Epoch: 26, Loss: 7.3309
Epoch: 27, Loss: 6.6140
Epoch: 28, Loss: 6.0949
Epoch: 29, Loss: 5.6143
Epoch: 30, Loss: 5.2002
Epoch: 31, Loss: 4.7441
Epoch: 32, Loss: 4.4365
Epoch: 33, Loss: 4.1377
Epoch: 34, Loss: 3.8792
Epoch: 35, Loss: 3.6525
Epoch: 36, Loss: 3.4983
Epoch: 37, Loss: 3.2166
Epoch: 38, Loss: 3.1637
Epoch: 39, Loss: 2.8367
Epoch: 40, Loss: 2.8231
Epoch: 41, Loss: 2.6314


In [74]:
# prediction

def prediction(model, vocab, text, seq_len=None):
  """Append the model's most likely next token to `text`.

  Args:
    model: trained module mapping a (1, seq_len) index tensor to
      (1, vocab_size) logits.
    vocab: token -> index mapping that was used to encode the training data.
    text: prompt string.
    seq_len: length the encoded prompt is left-padded (or truncated) to.
      Defaults to the width of the training inputs `X`, so inference sees
      the same sequence length as training instead of a hard-coded value.

  Returns:
    `text`, a space, and the predicted token.

  Raises:
    ValueError: if `seq_len` is smaller than 1.
  """
  if seq_len is None:
    seq_len = X.shape[1]
  if seq_len < 1:
    raise ValueError("seq_len must be a positive integer")

  # tokenize
  tokenized_text = word_tokenize(text.lower())

  # text -> numerical indices, keeping only the most recent seq_len tokens
  numerical_text = text_to_indices(tokenized_text, vocab)[-seq_len:]

  # left-pad with 0, the same padding value used for the training sequences
  padding = [0] * (seq_len - len(numerical_text))

  # build the batch on the device the model lives on; feeding a CPU tensor
  # to a model that was moved to CUDA raises a RuntimeError
  first_param = next(model.parameters(), None)
  model_device = first_param.device if first_param is not None else torch.device("cpu")
  padded_text = torch.tensor(
    padding + numerical_text, dtype=torch.long, device=model_device
  ).unsqueeze(0)

  # send to model (inference only, so no gradient tracking)
  with torch.no_grad():
    output = model(padded_text)

  # predicted index
  _, index = torch.max(output, dim=1)

  # index -> token via an explicit reverse mapping, so the lookup does not
  # depend on the dict's insertion order matching the index values
  index_to_token = {idx: token for token, idx in vocab.items()}

  # merge with text
  return text + " " + index_to_token[index.item()]



In [75]:
prediction(model, vocab, "The course follows a monthly")

'The course follows a monthly zone'

In [76]:
import time

num_tokens = 10
input_text = "hi how are"

# Repeatedly feed the growing text back into the model, printing it after
# each newly appended token and pausing briefly between steps.
for _ in range(num_tokens):
  input_text = prediction(model, vocab, input_text)
  print(input_text)
  time.sleep(0.5)


hi how are we
hi how are we descend
hi how are we descend from
hi how are we descend from the
hi how are we descend from the sunlit
hi how are we descend from the sunlit surface
hi how are we descend from the sunlit surface into
hi how are we descend from the sunlit surface into the
hi how are we descend from the sunlit surface into the abyss
hi how are we descend from the sunlit surface into the abyss ,


In [77]:
# Second loader over the same dataset: batches of 32 in fixed (unshuffled) order
dataloader1 = DataLoader(dataset, batch_size=32, shuffle=False)

In [78]:
# Function to calculate accuracy
def calculate_accuracy(model, dataloader, device):
    """Return the percentage of samples whose arg-max prediction equals the label.

    Args:
        model: module mapping a batch of inputs to per-class scores of shape
            (batch, num_classes). It is switched to evaluation mode.
        dataloader: iterable yielding (batch_x, batch_y) tensor pairs.
        device: device the batches are moved to before the forward pass.

    Returns:
        Accuracy as a float in [0, 100]; 0.0 when `dataloader` yields no samples.
    """
    model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0

    with torch.no_grad():  # No need to compute gradients
        # Iterate over the loader that was passed in; the previous version
        # ignored this parameter and read the global `dataloader1` instead.
        for batch_x, batch_y in dataloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            # Get model predictions
            outputs = model(batch_x)

            # Get the predicted word indices
            _, predicted = torch.max(outputs, dim=1)

            # Compare with actual labels
            correct += (predicted == batch_y).sum().item()
            total += batch_y.size(0)

    # Guard against an empty loader instead of dividing by zero
    if total == 0:
        return 0.0

    accuracy = correct / total * 100
    return accuracy

# Compute accuracy over the dataset using the unshuffled loader `dataloader1`
# rather than the shuffled training loader.
accuracy = calculate_accuracy(model, dataloader1, device)
print(f"Model Accuracy: {accuracy:.2f}%")


Model Accuracy: 99.03%
