In [12]:
import argparse
import gc

import numpy as np
import torch

# from models.models import BaseCMNModel

# Hand cached, currently unused CUDA memory back to the driver before the
# model is built in the following cells.
torch.cuda.empty_cache()

# Run a garbage-collection pass; as the last expression of the cell, its
# return value (the number of unreachable objects found) is what gets shown.
gc.collect()

0

In [13]:
def _str2bool(value):
    """Convert a command-line token to a real boolean.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is ``True``
    because every non-empty string is truthy. This converter understands the
    usual spellings instead. The empty string maps to ``False`` so that the
    only value that used to produce ``False`` still does.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got {!r}'.format(value))


def _parse_betas(value):
    """Convert ``"0.9,0.98"`` (optionally wrapped in parentheses) to a tuple of two floats.

    ``type=tuple`` would split the raw string into single characters, which is
    never a usable value for Adam's ``betas``.
    """
    if isinstance(value, (tuple, list)):
        parts = list(value)
    else:
        parts = [p for p in str(value).strip().strip('()[]').split(',') if p.strip()]
    try:
        betas = tuple(float(p) for p in parts)
    except (TypeError, ValueError):
        raise argparse.ArgumentTypeError('expected two comma-separated floats, got {!r}'.format(value))
    if len(betas) != 2:
        raise argparse.ArgumentTypeError('expected exactly two values, got {!r}'.format(value))
    return betas


def parse_agrs():
    """Build the argument parser and return the parsed settings.

    ``parse_known_args`` is used so that unrelated entries in ``sys.argv``
    (for example the arguments a notebook kernel is started with) are ignored
    instead of aborting the parse.

    Returns:
        argparse.Namespace: one attribute per option declared below.
    """
    parser = argparse.ArgumentParser()

    # Data input settings
    parser.add_argument('--image_dir', type=str, default='/autodl-fs/data/iu_xray/images',
                        help='the path to the directory containing the data.')
    parser.add_argument('--ann_path', type=str, default='./data/annotation.json',
                        help='the path to the annotation file.')

    parser.add_argument("--isSingle", action="store_true")
    parser.add_argument("--AM", type=str, default='CC', choices=['VFAM', 'SA', "CC"], help="do you use Image alignment?")
    # parser.add_argument("--useIDAM", action="store_false", help="do you use IDAM?")

    # VTFCM -- store_false: the option is True by default, passing the flag turns it off
    parser.add_argument("--useVTFCM", action="store_false", help="pass this flag to disable VTFCM (enabled by default).")

    # KG -- store_false: the option is True by default, passing the flag turns it off
    parser.add_argument("--useKG", action="store_false", help="pass this flag to disable KG (enabled by default).")

    # Data loader settings
    parser.add_argument('--dataset_name', type=str, default='iu_xray', choices=['iu_xray', 'mimic_cxr'],
                        help='the dataset to be used.')
    parser.add_argument('--max_seq_length', type=int, default=60, help='the maximum sequence length of the reports.')
    parser.add_argument('--threshold', type=int, default=3, help='the cut off frequency for the words.')
    parser.add_argument('--num_workers', type=int, default=2, help='the number of workers for dataloader.')
    parser.add_argument('--batch_size', type=int, default=2, help='the number of samples for a batch')

    # Model settings (for visual extractor)
    parser.add_argument('--visual_extractor', type=str, default='resnet101', help='the visual extractor to be used.')
    parser.add_argument('--visual_extractor_pretrained', type=_str2bool, default=True, help='whether to load the pretrained visual extractor')

    # Model settings (for Transformer)
    parser.add_argument('--d_model', type=int, default=512, help='the dimension of Transformer.')
    parser.add_argument('--d_ff', type=int, default=512, help='the dimension of FFN.')
    parser.add_argument('--d_vf', type=int, default=2048, help='the dimension of the patch features.')
    parser.add_argument('--num_heads', type=int, default=8, help='the number of heads in Transformer.')
    parser.add_argument('--num_layers', type=int, default=3, help='the number of layers of Transformer.')
    parser.add_argument('--dropout', type=float, default=0.1, help='the dropout rate of Transformer.')
    parser.add_argument('--logit_layers', type=int, default=1, help='the number of the logit layer.')
    parser.add_argument('--bos_idx', type=int, default=0, help='the index of <bos>.')
    parser.add_argument('--eos_idx', type=int, default=0, help='the index of <eos>.')
    parser.add_argument('--pad_idx', type=int, default=0, help='the index of <pad>.')
    parser.add_argument('--use_bn', type=int, default=0, help='whether to use batch normalization.')
    parser.add_argument('--drop_prob_lm', type=float, default=0.5, help='the dropout rate of the output layer.')

    # for Cross-modal Memory
    parser.add_argument('--topk', type=int, default=32, help='the number of k.')
    parser.add_argument('--cmm_size', type=int, default=2048, help='the numebr of cmm size.')
    parser.add_argument('--cmm_dim', type=int, default=512, help='the dimension of cmm dimension.')

    # Sample related
    parser.add_argument('--sample_method', type=str, default='beam_search', help='the sample methods to sample a report.')
    parser.add_argument('--beam_size', type=int, default=3, help='the beam size when beam searching.')
    parser.add_argument('--temperature', type=float, default=1.0, help='the temperature when sampling.')
    parser.add_argument('--sample_n', type=int, default=1, help='the sample number per image.')
    parser.add_argument('--group_size', type=int, default=1, help='the group size.')
    parser.add_argument('--output_logsoftmax', type=int, default=1, help='whether to output the probabilities.')
    parser.add_argument('--decoding_constraint', type=int, default=0, help='whether decoding constraint.')
    parser.add_argument('--block_trigrams', type=int, default=1, help='whether to use block trigrams.')

    # Trainer settings
    parser.add_argument('--n_gpu', type=int, default=1, help='the number of gpus to be used.')
    parser.add_argument('--epochs', type=int, default=100, help='the number of training epochs.')
    parser.add_argument('--save_dir', type=str, default='results/iu_xray/', help='the patch to save the models.')
    parser.add_argument('--record_dir', type=str, default='records/', help='the patch to save the results of experiments.')
    parser.add_argument('--log_period', type=int, default=50, help='the logging interval (in batches).')
    parser.add_argument('--save_period', type=int, default=10, help='the saving period (in epochs).')
    parser.add_argument('--monitor_mode', type=str, default='max', choices=['min', 'max'], help='whether to max or min the metric.')
    parser.add_argument('--monitor_metric', type=str, default='BLEU_4', help='the metric to be monitored.')
    parser.add_argument('--early_stop', type=int, default=50, help='the patience of training.')

    # Optimization
    parser.add_argument('--optim', type=str, default='Adam', help='the type of the optimizer.')
    parser.add_argument('--lr_ve', type=float, default=1e-4, help='the learning rate for the visual extractor.')
    parser.add_argument('--lr_ed', type=float, default=5e-4, help='the learning rate for the remaining parameters.')
    parser.add_argument('--weight_decay', type=float, default=5e-5, help='the weight decay.')
    parser.add_argument('--adam_betas', type=_parse_betas, default=(0.9, 0.98),
                        help='the betas of Adam as two comma-separated floats, e.g. "0.9,0.98".')
    parser.add_argument('--adam_eps', type=float, default=1e-9, help='the epsilon of Adam.')
    parser.add_argument('--amsgrad', type=_str2bool, default=True, help='.')
    parser.add_argument('--noamopt_warmup', type=int, default=5000, help='.')
    parser.add_argument('--noamopt_factor', type=int, default=1, help='.')

    # Learning Rate Scheduler
    parser.add_argument('--lr_scheduler', type=str, default='StepLR', help='the type of the learning rate scheduler.')
    parser.add_argument('--step_size', type=int, default=10, help='the step size of the learning rate scheduler.')
    parser.add_argument('--gamma', type=float, default=0.8, help='the gamma of the learning rate scheduler.')

    # Others
    parser.add_argument('--seed', type=int, default=9233, help='.')
    parser.add_argument('--resume', type=str, help='whether to resume the training from existing checkpoints.')
    parser.add_argument('--load', type=str, default="results/KGENwoVKFF/model_best_5.pth", help='whether to load the pre-trained model.')

    # Unknown arguments are returned separately and deliberately discarded.
    args, _ = parser.parse_known_args()
    return args

In [14]:
from models.models import BaseCMNModel
from modules.dataloaders import R2DataLoader
from modules.loss import compute_loss
from modules.metrics import compute_scores
from modules.tokenizers import Tokenizer
from modules.tester import Tester



args = parse_agrs()

# Deserialize the checkpoint onto the CPU. Without map_location, tensors saved
# from a GPU are restored to that GPU and stay there for as long as `weight`
# is referenced, on top of the copy that ends up inside the model.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
weight = torch.load(args.load, map_location='cpu')
model_weight = weight.get('state_dict')
if model_weight is None:
    # Fail here with a clear message instead of later inside load_state_dict.
    raise KeyError("checkpoint {!r} has no 'state_dict' entry".format(args.load))

# fix random seeds
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(args.seed)
tokenizer = Tokenizer(args)

# create data loader
test_dataloader = R2DataLoader(args, tokenizer, split='test', shuffle=False)

# build model architecture
model = BaseCMNModel(args, tokenizer)



In [15]:
args

Namespace(AM='CC', adam_betas=(0.9, 0.98), adam_eps=1e-09, amsgrad=True, ann_path='./data/annotation.json', batch_size=2, beam_size=3, block_trigrams=1, bos_idx=0, cmm_dim=512, cmm_size=2048, d_ff=512, d_model=512, d_vf=2048, dataset_name='iu_xray', decoding_constraint=0, drop_prob_lm=0.5, dropout=0.1, early_stop=50, eos_idx=0, epochs=100, gamma=0.8, group_size=1, image_dir='/autodl-fs/data/iu_xray/images', isSingle=False, load='results/KGENwoVKFF/model_best_5.pth', log_period=50, logit_layers=1, lr_ed=0.0005, lr_scheduler='StepLR', lr_ve=0.0001, max_seq_length=60, monitor_metric='BLEU_4', monitor_mode='max', n_gpu=1, noamopt_factor=1, noamopt_warmup=5000, num_heads=8, num_layers=3, num_workers=2, optim='Adam', output_logsoftmax=1, pad_idx=0, record_dir='records/', resume=None, sample_method='beam_search', sample_n=1, save_dir='results/iu_xray/', save_period=10, seed=9233, step_size=10, temperature=1.0, threshold=3, topk=32, useKG=True, useVTFCM=True, use_bn=0, visual_extractor='resnet

In [16]:
# Copy the checkpoint parameters into the freshly built network.
model.load_state_dict(model_weight)
# Move the network to the GPU and switch it to evaluation mode: this notebook
# only generates reports, and in training mode dropout would stay active and
# make the generated text noisy and non-reproducible.
model = model.to('cuda').eval()

In [17]:
from PIL import Image
import os

In [18]:
from torchvision import transforms
# Preprocessing applied to every input image: resize to 224x224, convert to a
# tensor, then normalize each channel with the mean/std below (the values
# commonly used for ImageNet-pretrained backbones).
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=(0.485, 0.456, 0.406),
            std=(0.229, 0.224, 0.225),
        ),
    ]
)

def test(image1, image2, report_1, cls1, image3, image4, report_2, cls2):
    """Generate reports for two studies (two images each) as one batch of two.

    Args:
        image1, image2: image paths of the first study, relative to ``args.image_dir``.
        report_1: reference report text of the first study.
        cls1: value forwarded to the model's ``cls`` argument for the first study.
        image3, image4: image paths of the second study, relative to ``args.image_dir``.
        report_2: reference report text of the second study.
        cls2: value forwarded to the model's ``cls`` argument for the second study.

    Returns:
        tuple: ``(reports, ground_truths)`` as returned by
        ``model.tokenizer.decode_batch`` for the generated token ids and for the
        reference token ids (first token dropped), one entry per study.
    """
    def _load_image(relative_path):
        # The context manager closes the file handle; convert() returns an
        # independent, fully loaded copy, so it stays valid after the close.
        with Image.open(os.path.join(args.image_dir, relative_path)) as raw:
            rgb = raw.convert('RGB')
        return transform(rgb) if transform is not None else rgb

    # Stack the two views of each study, then the two studies into one batch.
    study_1 = torch.stack((_load_image(image1), _load_image(image2)), 0)
    study_2 = torch.stack((_load_image(image3), _load_image(image4)), 0)
    images = torch.stack((study_1, study_2), 0)

    # Tokenize and truncate with the configured limit rather than a literal 60.
    ids1 = tokenizer(report_1)[:args.max_seq_length]
    ids2 = tokenizer(report_2)[:args.max_seq_length]
    longest = max(len(ids1), len(ids2))
    # Zero-filled so the shorter report is right-padded with 0.
    target_batch = np.zeros((2, longest), dtype=int)
    target_batch[0, :len(ids1)] = ids1
    target_batch[1, :len(ids2)] = ids2
    ids = torch.LongTensor(target_batch)

    # Run on whatever device the model lives on instead of assuming "cuda".
    device = next(model.parameters()).device
    # Inference only: turn off dropout and do not build an autograd graph.
    model.eval()
    with torch.no_grad():
        output = model(images.to(device),
                       cls=torch.Tensor([cls1, cls2]).to(device),
                       targets=ids.to(device),
                       mode='sample')
    reports = model.tokenizer.decode_batch(output[0].cpu().numpy())
    # Drop the first token of each reference sequence before decoding.
    ground_truths = model.tokenizer.decode_batch(ids[:, 1:].cpu().numpy())
    return reports, ground_truths

In [19]:
import json
import pandas as pd
annotationFile = args.ann_path
with open(annotationFile, 'r') as f:
    testDict = json.load(f)["test"]

# test() always consumes two studies at once, so walk the split in pairs.
# With an odd number of samples the last study is paired with itself and only
# its first result is kept; indexing testDict[i + 1] would raise IndexError.
inputs = []
keep_counts = []
for i in range(0, len(testDict), 2):
    first = testDict[i]
    has_partner = i + 1 < len(testDict)
    second = testDict[i + 1] if has_partner else first
    inputs.append(first["image_path"] + [first["report"]] + [first["cls"]]
                  + second["image_path"] + [second["report"]] + [second["cls"]])
    keep_counts.append(2 if has_partner else 1)

reports, ground_truths = [], []
for ins, keep in zip(inputs, keep_counts):
    report, ground_truth = test(*ins)
    reports += list(report)[:keep]
    ground_truths += list(ground_truth)[:keep]

# One row per study: generated report ("res") next to its reference ("gts").
pd.DataFrame({"res": reports, "gts": ground_truths}).to_csv("results/KGENwoVKFF/res.csv", index=False, sep="|")