In [2]:
import torch
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import distutils.version
from torch.utils.tensorboard import SummaryWriter
from utils import save_checkpoint, load_checkpoint, print_examples
from get_loader import get_loader
from model import CNNtoRNN
from nltk.translate.bleu_score import corpus_bleu
import os
import glob

In [3]:

train_loader, train_dataset = get_loader(
    root_folder="/SSD/ne6101157/pac4_mini/train",
    transform=None,
    num_workers=2,
)
test_loader, test_dataset = get_loader(
    root_folder="/SSD/ne6101157/pac4_mini/test",
    transform=None,
    batch_size=1,
    num_workers=2,
)


root: /SSD/ne6101157/pac4_mini/train
Number of videos: 512
Number of rules: 512
root: /SSD/ne6101157/pac4_mini/test
Number of videos: 162
Number of rules: 162


In [4]:

torch.backends.cudnn.benchmark = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
load_model = False
save_model = True
train_CNN = False

# Hyperparameters
embed_size = 256
hidden_size = 256
vocab_size = len(train_dataset.vocab)
num_layers = 3
learning_rate = 3e-4
num_epochs = 30

resume_epoch = 0
save_dir_root = os.path.join("/SSD/ne6101157/image_captioning/modelpath")
if resume_epoch != 0:
    runs = sorted(glob.glob(os.path.join(save_dir_root, 'run', 'run_*')))
    run_id = int(runs[-1].split('_')[-1]) if runs else 0
else:
    runs = sorted(glob.glob(os.path.join(save_dir_root, 'run', 'run_*')))
    run_id = int(runs[-1].split('_')[-1]) + 1 if runs else 0
save_dir = os.path.join(save_dir_root, 'run', 'run_' + str(run_id))
load_dir = os.path.join(save_dir_root, 'run', 'run_' + str(run_id-1))

Using device: cuda


In [5]:

# for tensorboard
writer = SummaryWriter("runs/pac4")
step = 0

classifier = "C3D" # 'R3D' or 'R2Plus1D' or 'C3D' or 'PacR3D'
# initialize model, loss etc
model = CNNtoRNN(embed_size, hidden_size, vocab_size, num_layers, classifier).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=train_dataset.vocab.stoi["<PAD>"])
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

if classifier == "R3D":
    pretrained_R3D_weights = torch.load('/SSD/ne6101157/image_captioning/modelpath/run/run_10/models/R3D-pac4_epoch-19.pth.tar')
    model.encoderCNN.R3D.load_state_dict(pretrained_R3D_weights['state_dict'])
    for param in model.encoderCNN.R3D.parameters():
        param.requires_grad = False
elif classifier == "R2Plus1D":
    pretrained_R2Plus1D_weights = torch.load('/SSD/ne6101157/image_captioning/modelpath/run/run_10/models/R2Plus1D-pac4_epoch-19.pth.tar')
    model.encoderCNN.R2Plus1D.load_state_dict(pretrained_R2Plus1D_weights['state_dict'])
    for param in model.encoderCNN.R2Plus1D.parameters():
        param.requires_grad = False
elif classifier == "C3D":
    pretrained_C3D_weights = torch.load('/SSD/ne6101157/image_captioning/modelpath/run/run_10/models/C3D-pac4_epoch-19.pth.tar')
    model.encoderCNN.C3D.load_state_dict(pretrained_C3D_weights['state_dict'])
    for param in model.encoderCNN.C3D.parameters():
        param.requires_grad = False
elif classifier == "PacR3D":
    pretrained_PacR3D_weights = torch.load('/SSD/ne6101157/image_captioning/modelpath/run/run_10/models/C3D-pac4_epoch-9.pth.tar')
    model.encoderCNN.PacR3D.load_state_dict(pretrained_PacR3D_weights['state_dict'])
    for param in model.encoderCNN.PacR3D.parameters():
        param.requires_grad = False

conv1.weight
torch.Size([64, 1, 3, 3, 3])
conv1.bias
torch.Size([64])
bn1.weight
torch.Size([64])
bn1.bias
torch.Size([64])
bn1.running_mean
torch.Size([64])
bn1.running_var
torch.Size([64])
bn1.num_batches_tracked
torch.Size([])
conv2.weight
torch.Size([128, 64, 3, 3, 3])
conv2.bias
torch.Size([128])
bn2.weight
torch.Size([128])
bn2.bias
torch.Size([128])
bn2.running_mean
torch.Size([128])
bn2.running_var
torch.Size([128])
bn2.num_batches_tracked
torch.Size([])
conv3a.weight
torch.Size([256, 128, 3, 3, 3])
conv3a.bias
torch.Size([256])
conv3b.weight
torch.Size([256, 256, 3, 3, 3])
conv3b.bias
torch.Size([256])
bn3.weight
torch.Size([256])
bn3.bias
torch.Size([256])
bn3.running_mean
torch.Size([256])
bn3.running_var
torch.Size([256])
bn3.num_batches_tracked
torch.Size([])
conv4a.weight
torch.Size([512, 256, 3, 3, 3])
conv4a.bias
torch.Size([512])
conv4b.weight
torch.Size([512, 512, 3, 3, 3])
conv4b.bias
torch.Size([512])
bn4.weight
torch.Size([512])
bn4.bias
torch.Size([512])
bn4.runni

In [6]:
file_name = os.path.join(load_dir, 'models', '_epoch-' + str(29) + '.pth.tar')
print("Load model from {}\n".format(os.path.join(load_dir, 'models', 'epoch-' + str(29) + '.pth.tar')))
            
if load_model:
    step = load_checkpoint(torch.load(file_name), model, optimizer)


Load model from /SSD/ne6101157/image_captioning/modelpath/run/run_9/models/epoch-29.pth.tar



In [6]:

print('Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0))
model.train()

for epoch in range(num_epochs):
    #Uncomment the line below to see a couple of test cases
    # print_examples(model, device, dataset)
    print(epoch, "/", num_epochs)
    if save_model:
        
        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "step": step,
        }
        if epoch % 10 == (9):
            file_name = os.path.join(save_dir, 'models', '_' + classifier +'_epoch-' + str(epoch) + '.pth.tar')
            print("Save model at {}\n".format(os.path.join(save_dir, 'models', 'epoch-' + str(epoch) + '.pth.tar')))
            save_checkpoint(checkpoint, filename=file_name)

    for idx, (imgs, captions) in tqdm(
        enumerate(train_loader), total=len(train_loader), leave=False
    ):
        imgs = imgs.to(device)
        captions = captions.to(device)
        
        outputs = model(imgs, captions[:-1])
        loss = criterion(
            outputs.reshape(-1, outputs.shape[2]), captions.reshape(-1)
        )

        writer.add_scalar("Training loss", loss.item(), global_step=step)
        step += 1

        optimizer.zero_grad()
        loss.backward(loss)
        optimizer.step()


Total params: 37.92M
0 / 30


  0%|          | 0/128 [00:00<?, ?it/s]

Sequential(
  (0): Conv3d(1, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2), padding=0, dilation=1, ceil_mode=False)
  (3): Conv3d(64, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (4): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (5): MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=0, dilation=1, ceil_mode=False)
  (6): Conv3d(128, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (7): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (8): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (9): MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=0, dilation=1, ceil_mode=False)
  (10): Conv3d(256, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (11): Conv3d

                                       

RuntimeError: Expected 4-dimensional input for 4-dimensional weight [256, 32768, 1, 1], but got 5-dimensional input of size [4, 512, 32, 3, 3] instead

In [None]:

model.eval()
# validate with test data
actual, predicted = list(), list()

for idx, (imgs, captions) in tqdm(
        enumerate(test_loader), total=len(test_loader), leave=False
    ):
    imgs = imgs.to(device)
    # predict the caption for image
    y_pred = model.caption_video(imgs, test_dataset.vocab)
    # split into words
    actual_captions = [test_dataset.vocab.itos[int(caption)] for caption in captions]
    # append to the list
    actual.append(actual_captions)
    predicted.append(y_pred)


In [None]:
from rouge import Rouge

# 創建ROUGE評估器
rouge_evaluator = Rouge()
candidates = []
references = []
for i, txt in enumerate(predicted):
    candidates.append(' '.join(txt))
for i, txt in enumerate(actual):
    references.append(' '.join(txt))
# 計算ROUGE分數
rouge_scores = rouge_evaluator.get_scores(candidates, references)

# Initialize variables for accumulating scores
total_rouge_1_score = 0
total_rouge_2_score = 0
total_rouge_l_score = 0
total_rouge_1_p = 0
total_rouge_2_p = 0
total_rouge_l_p = 0
total_rouge_1_r = 0
total_rouge_2_r = 0
total_rouge_l_r = 0
# Accumulate scores for all candidate-reference pairs
for i in range(len(candidates)):
    rouge_1_score = rouge_scores[i]['rouge-1']['f']
    rouge_1_p = rouge_scores[i]['rouge-1']['p']
    rouge_1_r = rouge_scores[i]['rouge-1']['r']
    rouge_2_score = rouge_scores[i]['rouge-2']['f']
    rouge_2_p = rouge_scores[i]['rouge-2']['p']
    rouge_2_r = rouge_scores[i]['rouge-2']['r']
    rouge_l_score = rouge_scores[i]['rouge-l']['f']
    rouge_l_p = rouge_scores[i]['rouge-l']['p']
    rouge_l_r = rouge_scores[i]['rouge-l']['r']
    total_rouge_1_score += rouge_1_score
    total_rouge_2_score += rouge_2_score
    total_rouge_l_score += rouge_l_score
    total_rouge_1_p += rouge_1_p
    total_rouge_2_p += rouge_2_p
    total_rouge_l_p += rouge_l_p
    total_rouge_1_r += rouge_1_r
    total_rouge_2_r += rouge_2_r
    total_rouge_l_r += rouge_l_r

# Compute average scores
avg_rouge_1_score = total_rouge_1_score / len(candidates)
avg_rouge_2_score = total_rouge_2_score / len(candidates)
avg_rouge_l_score = total_rouge_l_score / len(candidates)
avg_rouge_1_p = total_rouge_1_p / len(candidates)
avg_rouge_2_p = total_rouge_2_p / len(candidates)
avg_rouge_l_p = total_rouge_l_p / len(candidates)
avg_rouge_1_r = total_rouge_1_r / len(candidates)
avg_rouge_2_r = total_rouge_2_r / len(candidates)
avg_rouge_l_r = total_rouge_l_r / len(candidates)
# Print average scores
print("Average ROUGE-1 score:", avg_rouge_1_score)
print("Average ROUGE-2 score:", avg_rouge_2_score)
print("Average ROUGE-L score:", avg_rouge_l_score)
# Print average scores
print("Average ROUGE-1 precision:", avg_rouge_1_p)
print("Average ROUGE-2 precision:", avg_rouge_2_p)
print("Average ROUGE-L precision:", avg_rouge_l_p)
# Print average scores
print("Average ROUGE-1 recall:", avg_rouge_1_r)
print("Average ROUGE-2 recall:", avg_rouge_2_r)
print("Average ROUGE-L recall:", avg_rouge_l_r)

In [None]:
from nltk.translate.bleu_score import corpus_bleu

# Calculate BLEU-1 score
bleu_1_score = corpus_bleu(actual, predicted, weights=(1, 0, 0, 0))

# Print BLEU-1 score
print("BLEU-1 score:", bleu_1_score)