In [3]:
import os
import json
import random
import numpy as np
import config
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import torch
import torch.nn as nn
import torch.optim as optim
import joblib
import matplotlib.pyplot as plt
import nltk
from nltk.translate.bleu_score import sentence_bleu

config.training_label = "/kaggle/input/data-k"
config.max_length = 15
config.epochs = 50
config.batch_size = 64
config.num_decoder_tokens = 6000
# config.num_decoder_tokens = 1500
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
class VideoDataset():
    def __init__(self):
        self.x_data = {}
        self.encoder = EncoderModel().to(device)  
        self.decoder = DecoderModel().to(device)  
        self.lr = config.learning_rate
        self.epochs = config.epochs
        self.save_model_path = config.save_model_path
        self.patience = 3  
        self.min_delta = 0.001
    def preprocessing(self):
  
        TRAIN_LABEL_PATH = os.path.join(config.training_label, '/kaggle/input/data-k/data (2).json')
        with open(TRAIN_LABEL_PATH) as data_file:
            y_data = json.load(data_file)
        
        train_list = []
        vocab_list = []

        for i,y in enumerate(y_data):
            for caption in y_data[f"{i}"]['caption']:
                caption = "<bos> " + caption + " <eos>"
                # if len(caption.split()) <= 15:
                train_list.append([caption, y_data[f"{i}"]['id']])

        random.shuffle(train_list)
        training_list = train_list[int(len(train_list) * config.validation_split):]
        validation_list = train_list[:int(len(train_list) * config.validation_split)]

        for train in training_list:
            vocab_list.append(train[0])
        self.tokenizer = Tokenizer(num_words=config.num_decoder_tokens)
        self.tokenizer.fit_on_texts(vocab_list)

        TRAIN_FEATURE_DIR = os.path.join(config.train_feet, 'feat')
        for filename in os.listdir(TRAIN_FEATURE_DIR):
            f = np.load(os.path.join(TRAIN_FEATURE_DIR, filename), allow_pickle=True)
            self.x_data[filename[:-4]] = f

        return training_list,self.x_data,validation_list

    def load_dataset(self, training_list):

        encoder_input_data = []
        decoder_input_data = []
        decoder_target_data = []
        videoId = []
        videoSeq = []

        for idx, cap in enumerate(training_list): 
            caption = cap[0]
            videoId.append(cap[1])
            videoSeq.append(caption)
        
        train_sequences = self.tokenizer.texts_to_sequences(videoSeq)
        train_sequences = pad_sequences(train_sequences, padding='post', truncating='post', maxlen=config.max_length)
        train_sequences = np.array(train_sequences)
#         print(train_sequences)
        file_size = len(train_sequences)
        n = 0
    
        for idx in range(0, file_size):
            n += 1
            
            encoder_input_data.append(self.x_data[videoId[idx]])
            y = to_categorical(train_sequences[idx], config.num_decoder_tokens)
            decoder_input_data.append(y[:-1])
            decoder_target_data.append(y[1:])

            if n == config.batch_size:
                encoder_input_n = np.array(encoder_input_data)
                decoder_input_n = np.array(decoder_input_data)
                decoder_target_n = np.array(decoder_target_data)

                # Convert data to PyTorch tensors
                encoder_input = torch.tensor(encoder_input_n, dtype=torch.float32).to(device)
                decoder_input = torch.tensor(decoder_input_n, dtype=torch.float32).to(device)
                decoder_target = torch.tensor(decoder_target_n, dtype=torch.float32).to(device)

                encoder_input_data = []
                decoder_input_data = []
                decoder_target_data = []
                n = 0

                yield ([encoder_input, decoder_input], decoder_target)


    def train(self):

        training_list, x_data,validation_list = self.preprocessing()

        optimizer = optim.Adam(list(self.encoder.parameters()) + list(self.decoder.parameters()), lr=self.lr)
        criterion = nn.CrossEntropyLoss()

        training_losses = []
        validation_losses = []
        bleu_scores = []
        best_val_loss = float('inf')
        patience_counter = 0
        run_epoch=0

        for epoch in range(self.epochs):
            self.encoder.train()
            self.decoder.train()
            train_loader = self.load_dataset(training_list)
            total_train_loss = 0
            train_steps = 0
            run_epoch = run_epoch+1

            for [encoder_input, decoder_input], decoder_target in train_loader:
                decoder_target = torch.argmax(decoder_target, dim=-1)

                encoder_state = self.encoder(encoder_input)
                encoder_outputs, _ = self.encoder.encoder(encoder_input)  # Get encoder outputs for attention
                decoder_output, attention_weights = self.decoder(decoder_input, encoder_state, encoder_outputs)

      
                loss = criterion(decoder_output.view(-1, config.num_decoder_tokens), decoder_target.view(-1))
                total_train_loss += loss.item()

                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

                train_steps += 1 
                if train_steps % 500 == 0:
                    print(f"Epoch {epoch + 1}, Step {train_steps}: Training Loss = {total_train_loss / train_steps}")

            average_train_loss = total_train_loss / train_steps 
            
            training_losses.append(average_train_loss)

            self.encoder.eval()
            self.decoder.eval()
            total_val_loss = 0
            val_steps = 0
            total_sequences = 0
            total_bleu = 0 
            with torch.no_grad():  
                valid_loader = self.load_dataset(validation_list)
                for [encoder_input, decoder_input], decoder_target in valid_loader:
                    decoder_target = torch.argmax(decoder_target, dim=-1)

                    encoder_state = self.encoder(encoder_input)
                    encoder_outputs, _ = self.encoder.encoder(encoder_input)  # Get encoder outputs for attention
                    decoder_output, attention_weights = self.decoder(decoder_input, encoder_state, encoder_outputs)

                    val_loss = criterion(decoder_output.view(-1, config.num_decoder_tokens), decoder_target.view(-1))
                    total_val_loss += val_loss.item()
                    predicted_sequences = torch.argmax(decoder_output, dim=-1)
                    reference_sequences = decoder_target
                    for i in range(len(predicted_sequences)):
                        total_sequences += 1
                        predicted_caption = [self.tokenizer.index_word.get(idx.item(), '<unk>') for idx in predicted_sequences[i] if idx > 2]
                        reference_caption = [[self.tokenizer.index_word.get(idx.item(), '<unk>') for idx in reference_sequences[i] if idx > 2]]

                        
                        bleu_score = sentence_bleu(reference_caption, predicted_caption)
                        total_bleu += bleu_score

                    val_steps += 1  

                average_bleu = total_bleu / total_sequences
                bleu_scores.append(average_bleu)
                average_val_loss = total_val_loss / val_steps 
                validation_losses.append(average_val_loss)# Calculate average validation loss
                print(f'Epoch {epoch + 1}/{self.epochs}, Training Loss: {average_train_loss}, Validation Loss: {average_val_loss}, BLEU Score: {average_bleu}')
                if average_val_loss < best_val_loss - self.min_delta:
                    best_val_loss = average_val_loss
                    patience_counter = 0  # Reset patience counter
                    print(f"Validation loss improved to {best_val_loss}. Saving model.")
                    torch.save(self.encoder.state_dict(), os.path.join(self.save_model_path, 'encoder_model_lstm.pth'))
                    torch.save(self.decoder.state_dict(), os.path.join(self.save_model_path, 'decoder_model_lstm.pth'))
                    with open(os.path.join(self.save_model_path, 'tokenizer_lstm_' + str(config.num_decoder_tokens)), 'wb') as file:
                        joblib.dump(self.tokenizer, file)
                else:
                    patience_counter += 1
                    print(f"No improvement for {patience_counter} epochs.")
                    if patience_counter >= self.patience:
                        print(f"Early stopping triggered after {epoch + 1} epochs.")
                        break

        plt.figure(figsize=(10, 5))
        plt.plot(range(1, run_epoch+1), training_losses, label='Training Loss')
        plt.plot(range(1, run_epoch+1), validation_losses, label='Validation Loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.title('Training and Validation Loss per Epoch')
        plt.legend()
        plt.show()
        plt.plot(range(1, run_epoch+1), bleu_scores, label='bleu_scores')
        plt.show()    
class EncoderModel(nn.Module):
    def __init__(self):
        super(EncoderModel, self).__init__()
        self.encoder = nn.LSTM(input_size=config.num_encoder_tokens, hidden_size=config.latent_dim, batch_first=True)
        
    def forward(self, encoder_inputs):
        encoder_outputs, (state_h, state_c) = self.encoder(encoder_inputs)
        return (state_h, state_c)
        
class Attention(nn.Module):
    def __init__(self, latent_dim):
        super(Attention, self).__init__()
        self.attn = nn.Linear(latent_dim * 2, latent_dim)
        self.v = nn.Linear(latent_dim, 1, bias=False)

    def forward(self, decoder_hidden, encoder_outputs):
        # Expand decoder hidden state to match encoder outputs' time dimension
        decoder_hidden_exp = decoder_hidden.unsqueeze(1).repeat(1, encoder_outputs.size(1), 1)
        # Concatenate hidden state with encoder outputs
        energy = torch.tanh(self.attn(torch.cat((decoder_hidden_exp, encoder_outputs), dim=2)))
        # Compute attention scores
        attention_scores = self.v(energy).squeeze(2)
        attention_weights = torch.softmax(attention_scores, dim=1)
        # Compute context vector as weighted sum of encoder outputs
        context = torch.sum(attention_weights.unsqueeze(2) * encoder_outputs, dim=1)
        return context, attention_weights


class DecoderModel(nn.Module):
    def __init__(self):
        super(DecoderModel, self).__init__()
        self.decoder = nn.LSTM(input_size=config.num_decoder_tokens, hidden_size=config.latent_dim, batch_first=True)
        self.attention = Attention(config.latent_dim)
        self.concat = nn.Linear(config.latent_dim * 2, config.latent_dim)
        self.decoder_dense = nn.Linear(config.latent_dim, config.num_decoder_tokens)
        
    def forward(self, decoder_inputs, encoder_states, encoder_outputs):
        decoder_outputs, (hidden_state, cell_state) = self.decoder(decoder_inputs, encoder_states)
        all_outputs = []
        attention_weights_list = []

        for t in range(decoder_outputs.size(1)):
            # Select the current decoder hidden state
            decoder_hidden = hidden_state[-1]  # Take the last layer's hidden state if multi-layered
            # Apply attention mechanism
            context, attention_weights = self.attention(decoder_hidden, encoder_outputs)
            attention_weights_list.append(attention_weights)

            # Concatenate context vector with current decoder hidden state
            combined = torch.cat((decoder_outputs[:, t, :], context), dim=1)
            combined = torch.tanh(self.concat(combined))
            # Predict next token
            output = self.decoder_dense(combined)
            all_outputs.append(output.unsqueeze(1))

        # Concatenate all time steps
        final_outputs = torch.cat(all_outputs, dim=1)
        return final_outputs, attention_weights_list



cuda


In [None]:
dataset = VideoDataset()
dataset.train()

Epoch 1, Step 500: Training Loss = 3.4440316853523254
Epoch 1, Step 1000: Training Loss = 3.0967313017845153
Epoch 1, Step 1500: Training Loss = 2.9095302421251934


Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


Epoch 1/50, Training Loss: 2.8458278884777446, Validation Loss: 2.383697696591987, BLEU Score: 0.5995071963722498
Validation loss improved to 2.383697696591987. Saving model.
Epoch 2, Step 500: Training Loss = 2.358223240852356
Epoch 2, Step 1000: Training Loss = 2.300657787680626
Epoch 2, Step 1500: Training Loss = 2.2557233063379925
Epoch 2/50, Training Loss: 2.238456957739902, Validation Loss: 2.154239591223295, BLEU Score: 0.603957579938259
Validation loss improved to 2.154239591223295. Saving model.
Epoch 3, Step 500: Training Loss = 2.094635508775711
Epoch 3, Step 1000: Training Loss = 2.061631569504738
Epoch 3, Step 1500: Training Loss = 2.0335416527589163
Epoch 3/50, Training Loss: 2.0223116723788266, Validation Loss: 2.0623339105824954, BLEU Score: 0.5943438098553814
Validation loss improved to 2.0623339105824954. Saving model.
Epoch 4, Step 500: Training Loss = 1.9359647831916809
Epoch 4, Step 1000: Training Loss = 1.9096849132776261
Epoch 4, Step 1500: Training Loss = 1.8878

In [3]:
# import os
# import json
# import random
# import numpy as np
# import config
# from tensorflow.keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences
# from keras.utils import to_categorical
# import torch
# import torch.nn as nn
# import torch.optim as optim
# import joblib
# import matplotlib.pyplot as plt
# import nltk
# from nltk.translate.bleu_score import sentence_bleu

# config.training_label = "/kaggle/input/data-k"
# config.max_length = 15
# config.epochs = 50
# config.batch_size = 64
# config.num_decoder_tokens = 6000
# # config.num_decoder_tokens = 1500
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print(device)
# class VideoDataset():
#     def __init__(self):
#         self.x_data = {}
#         self.encoder = EncoderModel().to(device)  
#         self.decoder = TransformerDecoder().to(device)   
#         self.lr = config.learning_rate
#         self.epochs = config.epochs
#         self.save_model_path = config.save_model_path
#         self.patience = 3  
#         self.min_delta = 0.001
#     def preprocessing(self):
  
#         TRAIN_LABEL_PATH = os.path.join(config.training_label, '/kaggle/input/data-k/data (2).json')
#         with open(TRAIN_LABEL_PATH) as data_file:
#             y_data = json.load(data_file)
        
#         train_list = []
#         vocab_list = []

#         for i,y in enumerate(y_data):
#             for caption in y_data[f"{i}"]['caption']:
#                 caption = "<bos> " + caption + " <eos>"
#                 # if len(caption.split()) <= 15:
#                 train_list.append([caption, y_data[f"{i}"]['id']])

#         random.shuffle(train_list)
#         training_list = train_list[int(len(train_list) * config.validation_split):]
#         validation_list = train_list[:int(len(train_list) * config.validation_split)]

#         for train in training_list:
#             vocab_list.append(train[0])
#         self.tokenizer = Tokenizer(num_words=config.num_decoder_tokens)
#         self.tokenizer.fit_on_texts(vocab_list)

#         TRAIN_FEATURE_DIR = os.path.join(config.train_feet, 'feat')
#         for filename in os.listdir(TRAIN_FEATURE_DIR):
#             f = np.load(os.path.join(TRAIN_FEATURE_DIR, filename), allow_pickle=True)
#             self.x_data[filename[:-4]] = f

#         return training_list,self.x_data,validation_list

#     def load_dataset(self, training_list):

#         encoder_input_data = []
#         decoder_input_data = []
#         decoder_target_data = []
#         videoId = []
#         videoSeq = []

#         for idx, cap in enumerate(training_list): 
#             caption = cap[0]
#             videoId.append(cap[1])
#             videoSeq.append(caption)
        
#         train_sequences = self.tokenizer.texts_to_sequences(videoSeq)
#         train_sequences = pad_sequences(train_sequences, padding='post', truncating='post', maxlen=config.max_length)
#         train_sequences = np.array(train_sequences)
# #         print(train_sequences)
#         file_size = len(train_sequences)
#         n = 0
    
#         for idx in range(0, file_size):
#             n += 1
            
#             encoder_input_data.append(self.x_data[videoId[idx]])
#             y = to_categorical(train_sequences[idx], config.num_decoder_tokens)
#             decoder_input_data.append(y[:-1])
#             decoder_target_data.append(y[1:])

#             if n == config.batch_size:
#                 encoder_input_n = np.array(encoder_input_data)
#                 decoder_input_n = np.array(decoder_input_data)
#                 decoder_target_n = np.array(decoder_target_data)

#                 # Convert data to PyTorch tensors
#                 encoder_input = torch.tensor(encoder_input_n, dtype=torch.float32).to(device)
#                 decoder_input = torch.tensor(decoder_input_n, dtype=torch.float32).to(device)
#                 decoder_target = torch.tensor(decoder_target_n, dtype=torch.float32).to(device)

#                 encoder_input_data = []
#                 decoder_input_data = []
#                 decoder_target_data = []
#                 n = 0

#                 yield ([encoder_input, decoder_input], decoder_target)


#     def train(self):

#         training_list, x_data,validation_list = self.preprocessing()

#         optimizer = optim.Adam(list(self.encoder.parameters()) + list(self.decoder.parameters()), lr=self.lr)
#         criterion = nn.CrossEntropyLoss()

#         training_losses = []
#         validation_losses = []
#         bleu_scores = []
#         best_val_loss = float('inf')
#         patience_counter = 0
#         run_epoch=0

#         for epoch in range(self.epochs):
#             self.encoder.train()
#             self.decoder.train()
#             train_loader = self.load_dataset(training_list)
#             total_train_loss = 0
#             train_steps = 0
#             run_epoch = run_epoch+1

#             for [encoder_input, decoder_input], decoder_target in train_loader:
#                 decoder_target = torch.argmax(decoder_target, dim=-1)

#                 encoder_state = self.encoder(encoder_input)
#                 decoder_output = self.decoder(decoder_input, encoder_state)

      
#                 loss = criterion(decoder_output.view(-1, config.num_decoder_tokens), decoder_target.view(-1))
#                 total_train_loss += loss.item()

#                 loss.backward()
#                 optimizer.step()
#                 optimizer.zero_grad()

#                 train_steps += 1 
#                 if train_steps % 500 == 0:
#                     print(f"Epoch {epoch + 1}, Step {train_steps}: Training Loss = {total_train_loss / train_steps}")

#             average_train_loss = total_train_loss / train_steps 
            
#             training_losses.append(average_train_loss)

#             self.encoder.eval()
#             self.decoder.eval()
#             total_val_loss = 0
#             val_steps = 0
#             total_sequences = 0
#             total_bleu = 0 
#             with torch.no_grad():  
#                 valid_loader = self.load_dataset(validation_list)
#                 for [encoder_input, decoder_input], decoder_target in valid_loader:
#                     decoder_target = torch.argmax(decoder_target, dim=-1)

#                     encoder_state = self.encoder(encoder_input)
#                     decoder_output = self.decoder(decoder_input, encoder_state)
#                     val_loss = criterion(decoder_output.view(-1, config.num_decoder_tokens), decoder_target.view(-1))
#                     total_val_loss += val_loss.item()
#                     predicted_sequences = torch.argmax(decoder_output, dim=-1)
#                     reference_sequences = decoder_target
#                     for i in range(len(predicted_sequences)):
#                         total_sequences += 1
#                         predicted_caption = [self.tokenizer.index_word.get(idx.item(), '<unk>') for idx in predicted_sequences[i] if idx > 2]
#                         reference_caption = [[self.tokenizer.index_word.get(idx.item(), '<unk>') for idx in reference_sequences[i] if idx > 2]]

                        
#                         bleu_score = sentence_bleu(reference_caption, predicted_caption)
#                         total_bleu += bleu_score

#                     val_steps += 1  

#                 average_bleu = total_bleu / total_sequences
#                 bleu_scores.append(average_bleu)
#                 average_val_loss = total_val_loss / val_steps 
#                 validation_losses.append(average_val_loss)# Calculate average validation loss
#                 print(f'Epoch {epoch + 1}/{self.epochs}, Training Loss: {average_train_loss}, Validation Loss: {average_val_loss}, BLEU Score: {average_bleu}')
#                 if average_val_loss < best_val_loss - self.min_delta:
#                     best_val_loss = average_val_loss
#                     patience_counter = 0  # Reset patience counter
#                     print(f"Validation loss improved to {best_val_loss}. Saving model.")
#                     torch.save(self.encoder.state_dict(), os.path.join(self.save_model_path, 'encoder_model_lstm.pth'))
#                     torch.save(self.decoder.state_dict(), os.path.join(self.save_model_path, 'decoder_model_lstm.pth'))
#                     with open(os.path.join(self.save_model_path, 'tokenizer_lstm_' + str(config.num_decoder_tokens)), 'wb') as file:
#                         joblib.dump(self.tokenizer, file)
#                 else:
#                     patience_counter += 1
#                     print(f"No improvement for {patience_counter} epochs.")
#                     if patience_counter >= self.patience:
#                         print(f"Early stopping triggered after {epoch + 1} epochs.")
#                         break

#         plt.figure(figsize=(10, 5))
#         plt.plot(range(1, run_epoch+1), training_losses, label='Training Loss')
#         plt.plot(range(1, run_epoch+1), validation_losses, label='Validation Loss')
#         plt.xlabel('Epochs')
#         plt.ylabel('Loss')
#         plt.title('Training and Validation Loss per Epoch')
#         plt.legend()
#         plt.show()
#         plt.plot(range(1, run_epoch+1), bleu_scores, label='bleu_scores')
#         plt.show()    

# class EncoderModel(nn.Module):
#     def __init__(self):
#         super(EncoderModel, self).__init__()
#         self.encoder = nn.LSTM(input_size=config.num_encoder_tokens, hidden_size=config.latent_dim, batch_first=True)
        
#     def forward(self, encoder_inputs):
#         encoder_outputs, (state_h, state_c) = self.encoder(encoder_inputs)
#         return (state_h, state_c)


# class TransformerDecoder(nn.Module):
#     def __init__(self):
#         super(TransformerDecoder, self).__init__()
#         self.embedding = nn.Embedding(config.num_decoder_tokens, config.latent_dim)
#         self.positional_encoding = nn.Parameter(torch.zeros(1, config.max_length, config.latent_dim))
#         self.transformer_decoder = nn.Transformer(d_model=config.latent_dim, nhead=8, num_encoder_layers=6, num_decoder_layers=6)
#         self.fc_out = nn.Linear(config.latent_dim, config.num_decoder_tokens)

#     def forward(self, decoder_inputs, encoder_states):
#         embedded_inputs = self.embedding(decoder_inputs) + self.positional_encoding[:, :decoder_inputs.size(1), :]
#         transformer_output = self.transformer_decoder(embedded_inputs, encoder_states[0].unsqueeze(0))
#         output = self.fc_out(transformer_output)
#         return output


In [4]:
# import os
# import json
# import random
# import numpy as np
# import config
# from tensorflow.keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences
# from keras.utils import to_categorical
# import torch
# import torch.nn as nn
# import torch.optim as optim
# import joblib
# import matplotlib.pyplot as plt
# import nltk
# from nltk.translate.bleu_score import sentence_bleu

# config.training_label = "/kaggle/input/data-k"
# config.max_length = 15
# config.epochs = 50
# config.batch_size = 64
# config.num_decoder_tokens = 6000
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print(device)

# class VideoDataset():
#     def __init__(self):
#         self.x_data = {}
#         self.encoder = EncoderModel().to(device)
#         self.decoder = TransformerDecoder().to(device)  
#         self.lr = config.learning_rate
#         self.epochs = config.epochs
#         self.save_model_path = config.save_model_path
#         self.patience = 3  
#         self.min_delta = 0.001

#     def preprocessing(self):
#         TRAIN_LABEL_PATH = os.path.join(config.training_label, '/kaggle/input/data-k/data (2).json')
#         with open(TRAIN_LABEL_PATH) as data_file:
#             y_data = json.load(data_file)
        
#         train_list = []
#         vocab_list = []

#         for i, y in enumerate(y_data):
#             for caption in y_data[f"{i}"]['caption']:
#                 caption = "<bos> " + caption + " <eos>"
#                 train_list.append([caption, y_data[f"{i}"]['id']])

#         random.shuffle(train_list)
#         training_list = train_list[int(len(train_list) * config.validation_split):]
#         validation_list = train_list[:int(len(train_list) * config.validation_split)]

#         for train in training_list:
#             vocab_list.append(train[0])
#         self.tokenizer = Tokenizer(num_words=config.num_decoder_tokens)
#         self.tokenizer.fit_on_texts(vocab_list)

#         TRAIN_FEATURE_DIR = os.path.join(config.train_feet, 'feat')
#         for filename in os.listdir(TRAIN_FEATURE_DIR):
#             f = np.load(os.path.join(TRAIN_FEATURE_DIR, filename), allow_pickle=True)
#             self.x_data[filename[:-4]] = f

#         return training_list, self.x_data, validation_list

#     def load_dataset(self, training_list):
#         encoder_input_data = []
#         decoder_input_data = []
#         decoder_target_data = []
#         videoId = []
#         videoSeq = []

#         for idx, cap in enumerate(training_list):
#             caption = cap[0]
#             videoId.append(cap[1])
#             videoSeq.append(caption)
        
#         train_sequences = self.tokenizer.texts_to_sequences(videoSeq)
#         train_sequences = pad_sequences(train_sequences, padding='post', truncating='post', maxlen=config.max_length)
#         train_sequences = np.array(train_sequences)

#         file_size = len(train_sequences)
#         n = 0
    
#         for idx in range(0, file_size):
#             n += 1
            
#             encoder_input_data.append(self.x_data[videoId[idx]])
#             y = to_categorical(train_sequences[idx], config.num_decoder_tokens)
#             decoder_input_data.append(y[:-1])
#             decoder_target_data.append(y[1:])

#             if n == config.batch_size:
#                 encoder_input_n = np.array(encoder_input_data)
#                 decoder_input_n = np.array(decoder_input_data)
#                 decoder_target_n = np.array(decoder_target_data)

#                 encoder_input = torch.tensor(encoder_input_n, dtype=torch.float32).to(device)
#                 decoder_input = torch.tensor(decoder_input_n, dtype=torch.float32).to(device)
#                 decoder_target = torch.tensor(decoder_target_n, dtype=torch.float32).to(device)

#                 encoder_input_data = []
#                 decoder_input_data = []
#                 decoder_target_data = []
#                 n = 0

#                 yield ([encoder_input, decoder_input], decoder_target)

#     def train(self):
#         training_list, x_data, validation_list = self.preprocessing()

#         optimizer = optim.Adam(list(self.encoder.parameters()) + list(self.decoder.parameters()), lr=self.lr)
#         criterion = nn.CrossEntropyLoss()

#         training_losses = []
#         validation_losses = []
#         bleu_scores = []
#         best_val_loss = float('inf')
#         patience_counter = 0
#         run_epoch = 0

#         for epoch in range(self.epochs):
#             self.encoder.train()
#             self.decoder.train()
#             train_loader = self.load_dataset(training_list)
#             total_train_loss = 0
#             train_steps = 0
#             run_epoch += 1

#             for [encoder_input, decoder_input], decoder_target in train_loader:
#                 decoder_target = torch.argmax(decoder_target, dim=-1)
                
#                 encoder_state = self.encoder(encoder_input)
#                 decoder_output = self.decoder(decoder_input, encoder_state)
#                 print(decoder_output.shape)
#                 print(decoder_target.shape)
#                 loss = criterion(decoder_output.view(-1, config.num_decoder_tokens), decoder_target.view(-1))
#                 total_train_loss += loss.item()

#                 loss.backward()
#                 optimizer.step()
#                 optimizer.zero_grad()

#                 train_steps += 1 
#                 if train_steps % 500 == 0:
#                     print(f"Epoch {epoch + 1}, Step {train_steps}: Training Loss = {total_train_loss / train_steps}")

#             average_train_loss = total_train_loss / train_steps
#             training_losses.append(average_train_loss)

#             # Validation loop...
#             # (similar to training loop but without `loss.backward()`)

#         # Plotting loss and BLEU scores
#         plt.figure(figsize=(10, 5))
#         plt.plot(range(1, run_epoch+1), training_losses, label='Training Loss')
#         plt.xlabel('Epochs')
#         plt.ylabel('Loss')
#         plt.title('Training and Validation Loss per Epoch')
#         plt.legend()
#         plt.show()

# class EncoderModel(nn.Module):
#     def __init__(self):
#         super(EncoderModel, self).__init__()
#         self.encoder = nn.LSTM(input_size=config.num_encoder_tokens, hidden_size=config.latent_dim, batch_first=True)
        
#     def forward(self, encoder_inputs):
#         encoder_outputs, (state_h, state_c) = self.encoder(encoder_inputs)
#         return (state_h, state_c)

# class TransformerDecoder(nn.Module):
#     def __init__(self):
#         super(TransformerDecoder, self).__init__()
#         self.embedding = nn.Embedding(config.num_decoder_tokens, 512)  # Embedding layer with latent_dim
#         self.positional_encoding = nn.Parameter(torch.zeros(64, config.max_length-1, config.latent_dim))  # Adjusting to latent_dim size
#         self.transformer_decoder = nn.Transformer(d_model=config.latent_dim, nhead=8, num_encoder_layers=6, num_decoder_layers=6)
#         self.fc_out = nn.Linear(config.latent_dim, config.num_decoder_tokens)

#     def forward(self, decoder_inputs, encoder_states):
#         # Convert decoder inputs to Long type for embedding lookup
#         decoder_inputs = torch.argmax(decoder_inputs, dim=-1)
#         embedded_inputs = self.embedding(decoder_inputs.long())
#         print(embedded_inputs.shape)
#         # Adjust positional encoding to match the input sequence length
#         position_encoding = self.positional_encoding
#         print(position_encoding.shape)
#         # Adding embeddings and positional encodings
#         embedded_inputs = embedded_inputs + position_encoding
#         print(embedded_inputs.shape)
#         # Permute encoder_states to (seq_len, batch, latent_dim) for nn.Transformer
#         encoder_states = encoder_states[0]
#         print(encoder_states.shape)
#         # Forward pass through Transformerecoder
#         transformer_output = self.transformer_decoder(embedded_inputs.permute(1, 0, 2), encoder_states)
        
#         # Permute transforme dr_output back to (batch, seq_len, latent_dim) and apply final linear layer
#         output = self.fc_out(transformer_output.permute(1, 0, 2))
#         return output



cuda


In [None]:
# dataset = VideoDataset()
# # dataset.train()

Epoch 1, Step 500: Training Loss = 3.4462828130722047
Epoch 1, Step 1000: Training Loss = 3.1145551958084106


In [None]:
# import os
# import json
# import random
# import numpy as np
# import config
# from tensorflow.keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences
# from keras.utils import to_categorical
# import torch
# import torch.nn as nn
# import torch.optim as optim
# import joblib
# import matplotlib.pyplot as plt
# import nltk
# from nltk.translate.bleu_score import sentence_bleu



# config.training_label = "/kaggle/input/data-k"
# config.max_length = 15
# config.epochs = 50
# config.batch_size = 64
# config.num_decoder_tokens = 6000
# # config.num_decoder_tokens = 1500
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print(device)
# class VideoDataset():
#     def __init__(self):
#         self.x_data = {}
#         self.encoder = EncoderModel().to(device)  
#         self.decoder = DecoderModel().to(device)  
#         self.lr = config.learning_rate
#         self.epochs = config.epochs
#         self.save_model_path = config.save_model_path
#         self.patience = 3  
#         self.min_delta = 0.001
#     def preprocessing(self):
#         TRAIN_LABEL_PATH = os.path.join(config.training_label, '/kaggle/input/data-k/data (2).json')
#         with open(TRAIN_LABEL_PATH) as data_file:
#             y_data = json.load(data_file)
        
#         train_list = []
#         vocab_list = []

#         for i,y in enumerate(y_data):
#             for caption in y_data[f"{i}"]['caption']:
#                 caption = "<bos> " + caption + " <eos>"
#                 # if len(caption.split()):
#                 train_list.append([caption, y_data[f"{i}"]['id']])

#         random.shuffle(train_list)
#         training_list = train_list[int(len(train_list) * config.validation_split):]
#         validation_list = train_list[:int(len(train_list) * config.validation_split)]

#         for train in training_list:
#             vocab_list.append(train[0])
#         self.tokenizer = Tokenizer(num_words=config.num_decoder_tokens)
#         self.tokenizer.fit_on_texts(vocab_list)

#         TRAIN_FEATURE_DIR = os.path.join(config.train_feet, 'feat')
#         for filename in os.listdir(TRAIN_FEATURE_DIR):
#             f = np.load(os.path.join(TRAIN_FEATURE_DIR, filename), allow_pickle=True)
#             self.x_data[filename[:-4]] = f

#         return training_list,self.x_data,validation_list

#     def load_dataset(self, training_list):

#         encoder_input_data = []
#         decoder_input_data = []
#         decoder_target_data = []
#         videoId = []
#         videoSeq = []
    
#         for idx, cap in enumerate(training_list): 
#             caption = cap[0]
#             videoId.append(cap[1])
#             videoSeq.append(caption)
        
#         train_sequences = self.tokenizer.texts_to_sequences(videoSeq)
#         train_sequences = pad_sequences(train_sequences, padding='post', truncating='post', maxlen=config.max_length)
#         train_sequences = np.array(train_sequences)
        
#         file_size = len(train_sequences)
#         n = 0
    
#         for idx in range(0, file_size):
#             n += 1
            
#             encoder_input_data.append(self.x_data[videoId[idx]])
#             y = to_categorical(train_sequences[idx], config.num_decoder_tokens)
#             decoder_input_data.append(y[:-1])
#             decoder_target_data.append(y[1:])

#             if n == config.batch_size:
#                 encoder_input_n = np.array(encoder_input_data)
#                 decoder_input_n = np.array(decoder_input_data)
#                 decoder_target_n = np.array(decoder_target_data)

#                 # Convert data to PyTorch tensors
#                 encoder_input = torch.tensor(encoder_input_n, dtype=torch.float32).to(device)
#                 decoder_input = torch.tensor(decoder_input_n, dtype=torch.float32).to(device)
#                 decoder_target = torch.tensor(decoder_target_n, dtype=torch.float32).to(device)

#                 encoder_input_data = []
#                 decoder_input_data = []
#                 decoder_target_data = []
#                 n = 0

#                 yield ([encoder_input, decoder_input], decoder_target)


#     def train(self):

#         training_list, x_data,validation_list = self.preprocessing()

#         optimizer = optim.Adam(list(self.encoder.parameters()) + list(self.decoder.parameters()), lr=self.lr)
#         criterion = nn.CrossEntropyLoss()

#         training_losses = []
#         validation_losses = []
#         bleu_scores = []
#         best_val_loss = float('inf')
#         patience_counter = 0
#         run_epoch=0

#         for epoch in range(self.epochs):
#             # Training loop
#             self.encoder.train()
#             self.decoder.train()
#             train_loader = self.load_dataset(training_list)
#             total_train_loss = 0
#             train_steps = 0
#             run_epoch = run_epoch+1

#             for [encoder_input, decoder_input], decoder_target in train_loader:

#                 decoder_target = torch.argmax(decoder_target, dim=-1) 

#                 encoder_state = self.encoder(encoder_input)
#                 decoder_output = self.decoder(decoder_input, encoder_state)

#                 loss = criterion(decoder_output.view(-1, config.num_decoder_tokens), decoder_target.view(-1))
#                 total_train_loss += loss.item()
#                 loss.backward()
#                 optimizer.step()
#                 optimizer.zero_grad()

#                 train_steps += 1 
#                 if train_steps % 500 == 0:
#                     print(f"Epoch {epoch + 1}, Step {train_steps}: Training Loss = {total_train_loss / train_steps}")

#             average_train_loss = total_train_loss / train_steps 
#             training_losses.append(average_train_loss)

#             self.encoder.eval()
#             self.decoder.eval()
#             total_val_loss = 0
#             val_steps = 0
#             total_sequences = 0
#             total_bleu = 0 
#             with torch.no_grad(): 
#                 valid_loader = self.load_dataset(validation_list)
#                 for [encoder_input, decoder_input], decoder_target in valid_loader:
#                     decoder_target = torch.argmax(decoder_target, dim=-1)
#                     encoder_state = self.encoder(encoder_input)
#                     decoder_output = self.decoder(decoder_input, encoder_state)
#                     val_loss = criterion(decoder_output.view(-1, config.num_decoder_tokens), decoder_target.view(-1))
#                     total_val_loss += val_loss.item()
#                     predicted_sequences = torch.argmax(decoder_output, dim=-1)
#                     reference_sequences = decoder_target
#                     for i in range(len(predicted_sequences)):
#                         total_sequences += 1
#                         predicted_caption = [self.tokenizer.index_word.get(idx.item(), '<unk>') for idx in predicted_sequences[i] if idx > 2]
#                         reference_caption = [[self.tokenizer.index_word.get(idx.item(), '<unk>') for idx in reference_sequences[i] if idx > 2]]

                
#                         bleu_score = sentence_bleu(reference_caption, predicted_caption)
#                         total_bleu += bleu_score

#                     val_steps += 1  
                    
#                 average_bleu = total_bleu / total_sequences
#                 bleu_scores.append(average_bleu)
#                 average_val_loss = total_val_loss / val_steps 
#                 validation_losses.append(average_val_loss)# Calculate average validation loss
#                 print(f'Epoch {epoch + 1}/{self.epochs}, Training Loss: {average_train_loss}, Validation Loss: {average_val_loss}, BLEU Score: {average_bleu}')
#                 if average_val_loss < best_val_loss - self.min_delta:
#                     best_val_loss = average_val_loss
#                     patience_counter = 0  # Reset patience counter
#                     print(f"Validation loss improved to {best_val_loss}. Saving model.")
#                     torch.save(self.encoder.state_dict(), os.path.join(self.save_model_path, 'encoder_model_gru.pth'))
#                     torch.save(self.decoder.state_dict(), os.path.join(self.save_model_path, 'decoder_model_gru.pth'))
#                     with open(os.path.join(self.save_model_path, 'tokenizer_gru_' + str(config.num_decoder_tokens)), 'wb') as file:
#                         joblib.dump(self.tokenizer, file)
#                 else:
#                     patience_counter += 1
#                     print(f"No improvement for {patience_counter} epochs.")
#                     if patience_counter >= self.patience:
#                         print(f"Early stopping triggered after {epoch + 1} epochs.")
#                         break
                        
#         plt.figure(figsize=(10, 5))
#         plt.plot(range(1, run_epoch+1), training_losses, label='Training Loss')
#         plt.plot(range(1, run_epoch+1), validation_losses, label='Validation Loss')
#         plt.xlabel('Epochs')
#         plt.ylabel('Loss')
#         plt.title('Training and Validation Loss per Epoch')
#         plt.legend()
#         plt.show()
#         plt.plot(range(1, run_epoch+1), bleu_scores, label='bleu_scores')
#         plt.show()    

# class EncoderModel(nn.Module):
#     def __init__(self):
#         super(EncoderModel, self).__init__()

#         self.encoder = nn.GRU(input_size=config.num_encoder_tokens, hidden_size=config.latent_dim, batch_first=True)
        
#     def forward(self, encoder_inputs):
        
#         encoder_outputs, state_h = self.encoder(encoder_inputs)
#         return state_h  



# class Attention(nn.Module):
#     def __init__(self, latent_dim):
#         super(Attention, self).__init__()
#         self.attention = nn.Linear(latent_dim * 2, latent_dim)
#         self.v = nn.Parameter(torch.rand(latent_dim))

#     def forward(self, decoder_hidden, encoder_outputs):
#         """
#         decoder_hidden: (batch_size, latent_dim)
#         encoder_outputs: (batch_size, seq_len, latent_dim)
#         """
#         seq_len = encoder_outputs.size(1)
#         decoder_hidden = decoder_hidden.unsqueeze(1).repeat(1, seq_len, 1)  # (batch_size, seq_len, latent_dim)
#         combined = torch.cat((decoder_hidden, encoder_outputs), dim=2)      # (batch_size, seq_len, latent_dim * 2)
#         energy = torch.tanh(self.attention(combined))                       # (batch_size, seq_len, latent_dim)
#         energy = energy @ self.v                                           # (batch_size, seq_len)
#         attention_weights = F.softmax(energy, dim=1)                       # (batch_size, seq_len)
#         context_vector = attention_weights.unsqueeze(1) @ encoder_outputs  # (batch_size, 1, latent_dim)
#         return context_vector.squeeze(1), attention_weights                # (batch_size, latent_dim), (batch_size, seq_len)

# class DecoderModel(nn.Module):
#     def __init__(self):
#         super(DecoderModel, self).__init__()

#         self.decoder = nn.GRU(input_size=config.num_decoder_tokens + config.latent_dim, 
#                               hidden_size=config.latent_dim, 
#                               batch_first=True)
#         self.attention = Attention(config.latent_dim)
#         self.decoder_dense = nn.Linear(config.latent_dim, config.num_decoder_tokens)

#     def forward(self, decoder_inputs, encoder_outputs, decoder_hidden):
#         """
#         decoder_inputs: (batch_size, seq_len, num_decoder_tokens)
#         encoder_outputs: (batch_size, encoder_seq_len, latent_dim)
#         decoder_hidden: (1, batch_size, latent_dim)
#         """
#         seq_len = decoder_inputs.size(1)
#         all_outputs = []

#         for t in range(seq_len):
#             decoder_input_t = decoder_inputs[:, t, :].unsqueeze(1)  # (batch_size, 1, num_decoder_tokens)
#             context_vector, attention_weights = self.attention(decoder_hidden[-1], encoder_outputs)  # (batch_size, latent_dim)
#             context_vector = context_vector.unsqueeze(1)           # (batch_size, 1, latent_dim)
#             decoder_input_combined = torch.cat((decoder_input_t, context_vector), dim=2)  # (batch_size, 1, input_size + latent_dim)
#             decoder_output, decoder_hidden = self.decoder(decoder_input_combined, decoder_hidden)  # (batch_size, 1, latent_dim), (1, batch_size, latent_dim)
#             all_outputs.append(decoder_output)
        
#         decoder_outputs = torch.cat(all_outputs, dim=1)  # (batch_size, seq_len, latent_dim)
#         final_outputs = self.decoder_dense(decoder_outputs)  # (batch_size, seq_len, num_decoder_tokens)
#         return final_outputs, attention_weights





In [None]:
#  