In [None]:
### install dependencies

In [123]:
# ! pip install ruamel_yaml
# ! pip install pandas
# ! pip install easydict
# ! pip install torch
# ! pip install torchvision
# ! pip install spacy




In [124]:
PROJ_DIR='/home/anurag/Med_VQA/'
CNF_PATH='/home/anurag/Med_VQA/default.yaml'

In [125]:
import os
import pandas as pd
from typing import Union, Optional, Any, List
from pathlib import Path
from easydict import EasyDict as edict
from ruamel.yaml import YAML, yaml_object
import pickle
import json
# from mvqag import PROJ_DIR

In [126]:
yaml = YAML(typ='safe', pure=True)
yaml.default_flow_style = False
yaml.indent(mapping=2, sequence=4, offset=2)

In [127]:
@yaml_object(yaml)
class JoinPath:
    """Custom tag `!join` loader class to join strings for yaml file."""

    yaml_tag = u'!joinpath'

    def __init__(self, joined_string):
        self.joined_string = joined_string

    @classmethod
    def to_yaml(cls, representer, node):
        return representer.represent_scalar(cls.yaml_tag,
                                            u'{.joined_string}'.format(node))

    @classmethod
    def from_yaml(cls, constructor, node):
        seq = constructor.construct_sequence(node)
        fullpath = Path(os.path.join(*[str(i) for i in seq])).resolve()
        return str(fullpath)

In [128]:
@yaml_object(yaml)
class ProjDirSetter:
    """Custom tag `!projdir` loader class for yaml file."""

    yaml_tag = u'!projdir'

    def __init__(self, path):
        self.path = path

    @classmethod
    def to_yaml(cls, representer, node):
        return representer.represent_scalar(cls.yaml_tag,
                                            u'{.path}'.format(node))

    @classmethod
    def from_yaml(cls, constructor, node):
        return str(PROJ_DIR)

In [129]:
def load_yaml(path: Union[str, Path], pure: bool = False) -> dict:
    """config.yaml file loader.
    This function converts the config.yaml file to `dict` object.
    Args:
        path: .yaml configuration filepath
        pure: If True, just load the .yaml without converting to EasyDict
            and exclude extra info.
    Returns:
        `dict` object containing configuration parameters.
    Example:
        .. code-block:: python
            config = load_yaml("../config.yaml")
            print(config["project_name"])
    """

    path = str(Path(path).absolute().resolve())
    # * Load config file
    with open(path) as file:
        config = yaml.load(file)

    if pure == False:  # Add extra features
        # Convert dict to easydict
        config = edict(config)
    return config

In [130]:
# ----------------------------------------> .txt and .csv :
def load_qa_file(
    qa_filepath: Union[Path, str], columns: Optional[List[str]] = None
) -> pd.DataFrame:
    """Load questions answers `.txt` and `.csv` files.

    Args:
        qa_filepath: full path to the file
        columns: if file is `.txt`, then enter the columns names.

    Returns:
        pandas dataframe.
    """

    qa_filepath = Path(qa_filepath)
    # print('qa_filepath',qa_filepath)
    if qa_filepath.suffix == ".txt":  # For raw data files
        if columns == None:
            raise AttributeError(
                "columns=<list of names of colunms> is required for .txt files."
            )
        df = pd.read_table(qa_filepath, delimiter="|", header=None)
        assert df.shape[1] == len(
            columns
        ), f"[Error @ `load_qa_file`] Number of columns in dataframe are {df.shape[1]} but given columns list has {len(columns)} names."
        df.columns = columns

    elif qa_filepath.suffix == ".csv":  # if file is .csv
        df = pd.read_csv(qa_filepath)
    else:
        raise ValueError(
            "[Error @ `load_qa_file`] Problem loading .txt qa_filepath")
    return df

In [131]:
# ----------------------------------------> JSON :
def load_json(path: Union[str, Path], pure: bool = False) -> edict:
    """Load .json file from given path

    Args:
        path: Path/to/the/file.json
        pure: If True, return the loaded .json content as it is.
            If False, convert it to EasyDict format
    """
    import json
    with open(path) as f:
        _data = json.load(f)
        if pure:
            return _data
        return edict(_data)


def save_json(object: Any, path: Union[str, Path]) -> None:
    """Save .json file to given path"""
    import json
    path = Path(path).resolve()
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(str(path), 'w') as f:
        json.dump(object, f)

In [132]:
CNF = load_yaml(CNF_PATH)

In [133]:
CNF

{'seed': 1234,
 'n_workers': 4,
 'project_name': 'Med_VQA',
 'clef_cols': {'default': ['ID', 'Q', 'A'],
  'default18': ['S.No', 'ID', 'Q', 'A'],
  'test19': ['ID', 'Task', 'Q', 'A'],
  'test20': ['ID', 'Q'],
  'test20A': ['ID', 'Q', 'A'],
  'test21': ['ID', 'Q']},
 'data': {'augment': True,
  'aug_type': 'manual',
  'normalize': True,
  'mean': [0.485, 0.456, 0.406],
  'std': [0.229, 0.224, 0.225],
  'QG': False,
  'n_classes': 330},
 'model': {'qnet_name': 'GRU',
  'vnet_name': 'VGG16',
  'vdp': 0.2,
  'qdp': 0.2,
  'inp_size': 224,
  'max_len': 12,
  'emb_dim': 128,
  'use_SAN': False},
 'loss': {'fn': 'CrossEntropy', 'smoothing': 0.0, 'wts': 'None'},
 'optm': {'name': 'SGD',
  'lr': 0.001,
  'wd': 0.0005,
  'mom': 0.9,
  'nesterov': False,
  'betas': [0.9, 0.99],
  'amsgrad': True},
 'train': {'bs': 32,
  'n_epochs': 60,
  'swa': False,
  'vqa_mixup': 0.0,
  'max_train_iters': -1,
  'max_val_iters': -1},
 'test': {'bs': 1},
 'task_keywords': {'abnormality': ['normal',
   'abnormal',

In [134]:
import torch
import torchvision.transforms as T
import math
from typing import Union, Dict, List, Any, Optional, Callable
from pandas import DataFrame
from PIL import Image
import re
import pandas as pd
from typing import Set, List, Dict
from collections import defaultdict
from collections import OrderedDict
from pathlib import Path
from spacy.lang.en import English
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
nlp = English()

In [135]:
# ============================= FUNCTIONS ==============================
def get_device():
    """Get torch device instance and available GPU IDs"""
    from torch.cuda import device_count
    from torch import device
    print('device_count',device_count())
    cuda_ids = [0] if device_count() == 1 else list(range(device_count()))
    
    return device(f"cuda:{cuda_ids[0]}"), cuda_ids

In [136]:
torch.manual_seed(CNF.seed)
torch.cuda.manual_seed(CNF.seed)
torch.cuda.manual_seed_all(CNF.seed)

In [137]:
class Tokenizer:
    def __init__(self, vocab: Set[str], max_len: int) -> None:
        self.pad = '<pad>'
        self.bos = '<bos>'
        self.eos = '<eos>'
        self.unk = '<unk>'
        self.special_tokens = [self.pad, self.bos, self.eos, self.unk]
        self.vocab = self.special_tokens + sorted(list(vocab))
        self.vocab = {w: i for i, w in enumerate(self.vocab)}
        self.vocab = defaultdict(lambda: self.vocab[self.unk], self.vocab)
        self.idx_to_word = {v: k for k, v in self.vocab.items()}
        self.max_len = max_len
        self.vocab_dim = len(self.vocab)

    def encode(self, sent: str) -> Dict[str, list]:
        sent = sent.lower()
        tokens = [
            self.vocab[str(word)] for word in nlp.tokenizer(sent)
        ]
        tokens = tokens[:self.max_len - 2]
        # Add sentence begining and end
        tokens = [self.vocab[self.bos]] + tokens + [self.vocab[self.eos]]
        total_tokens = len(tokens)
        if total_tokens != self.max_len:
            rem = self.max_len - total_tokens
            tokens += [self.vocab[self.pad]]*rem
        return {
            'input_ids': tokens
        }

    def decode(self, tokens: List[int]) -> str:
        tokens = [self.idx_to_word[t] for t in tokens]
        decoded = [
            token
            for token in tokens
            if (token != self.vocab.pad) and
               (token != self.vocab.bos) and
               (token != self.vocab.eos)
        ]
        return ' '.join(decoded)

    @classmethod
    def from_list(cls, unique_ques_list: List[str], max_len: int):
        vocab = set([
            str(word) for sent in unique_ques_list
            for word in nlp.tokenizer(sent.lower())
        ])
        return cls(vocab, max_len=max_len)

In [138]:
class DataLoaders:
    def __init__(self,
                 trainloader: Union[DataLoader, None] = None,
                 valloader: Union[DataLoader, None] = None,
                 testloader: Union[DataLoader, None] = None,
                 n_workers: int = 4) -> None:
        if trainloader:
            self.train_bs = trainloader.batch_size
            self.trainset = trainloader.dataset
            self.trainloader = trainloader
        if valloader:
            self.val_bs = valloader.batch_size
            self.valset = valloader.dataset
            self.valloader = valloader
        if testloader:
            self.test_bs = testloader.batch_size
            self.testset = testloader.dataset
            self.testloader = testloader
        self.n_workers = n_workers

    @classmethod
    def from_dataset(cls,
                     trainset: Optional[Dataset] = None,
                     train_bs: Optional[int] = None,
                     valset: Optional[Dataset] = None,
                     val_bs: Optional[int] = None,
                     testset: Optional[Dataset] = None,
                     test_bs: Optional[int] = None,
                     n_workers: Optional[int] = 4,
                     collate_fn: Optional[Callable] = None
                     ) -> Dict[str, DataLoader]:
        """Create dataloaders from dataset classes

        Args:
            #TODO
        """
        if trainset == valset == testset == None:
            raise ValueError("At least one set is required")
        trainloader, valloader, testloader = None, None, None
        trainset, train_bs = trainset, train_bs
        valset, val_bs = valset, val_bs
        testset, test_bs = testset, test_bs
        if trainset is not None:
            trainloader = DataLoader(
                dataset=trainset,
                batch_size=train_bs,
                shuffle=True,
                num_workers=n_workers,
                collate_fn=collate_fn
            )
        if valset is not None:
            valloader = DataLoader(
                dataset=valset,
                batch_size=val_bs,
                shuffle=False,
                num_workers=n_workers,
                collate_fn=collate_fn
            )
        if testset is not None:
            testloader = DataLoader(
                dataset=testset,
                batch_size=test_bs,
                shuffle=False,
                num_workers=n_workers,
                collate_fn=collate_fn
            )
        return cls(
            trainloader=trainloader,
            valloader=valloader,
            testloader=testloader,
            n_workers=n_workers
        )

    def collate_fn(self):
        return None

    @property
    def n_train_batches(self) -> int:
        if self.trainset is not None:
            return math.ceil(len(self.trainset) / self.train_bs)

    @property
    def n_val_batches(self) -> Union[None, int]:
        if self.valset is not None:
            return math.ceil(len(self.valset) / self.val_bs)

    @property
    def n_test_batches(self) -> Union[None, int]:
        if self.testset is not None:
            return math.ceil(len(self.testset) / self.test_bs)

    @classmethod
    def from_dicts(cls,
                   train_dict: Dict[str, Any],
                   val_dict: Optional[Dict[str, Any]] = None,
                   test_dict: Optional[Dict[str, Any]] = None):
        valset, val_bs = val_dict['ds'], val_dict['bs']
        testset, test_bs = test_dict['ds'], test_dict['bs']
        return cls(trainset=train_dict['ds'],  # DataSet
                   train_bs=train_dict['bs'],  # BatchSize
                   valset=valset,
                   val_bs=val_bs,
                   testset=testset,
                   test_bs=test_bs
                   )

In [139]:
# ============================= CLASSES ==============================
class VQADataset(Dataset):
    def __init__(self,
                 df: DataFrame,
                 img_tfms: T.Compose,
                 classes: Union[None, List[str]],
                 tokenizer: Union[None, Callable] = None) -> None:
        """PyTorch Dataset abstract class wrapper

        Args:
            #TODO
        """
        super().__init__()

        self.df = df
        self.img_tfms = img_tfms
        self.classes = classes
        if self.classes is not None:
            self.n_classes = len(self.classes)
        self.tokenizer = tokenizer

    def __len__(self) -> int:
        return self.df.shape[0]

    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
        row = self.df.iloc[idx]

        # Visual
        img = Image.open(row.PATH).convert('RGB')
        if self.img_tfms is not None:
            img = self.img_tfms(img)

        # Question
        if self.tokenizer is None:
            question = None
        else:
            question = self.tokenizer.encode(row.Q)
            question = {
                k: torch.tensor(v).unsqueeze(0) for k, v in question.items()
            }

        # Answer
        target = torch.tensor(self.ctol(row.A))

        return {
            'inputs': {'V': img, 'Q': question},
            'target': target,
            'info': row
        }

    def ctol(self, c) -> int:
        return self.classes.index(c)

    def ltoc(self, l) -> str:
        return self.classes[l]

    @property
    def cls_wts(self) -> torch.Tensor:
        """Class balancing weights for balanced training"""
        n_samples = [None]*self.n_classes
        for cls, count in self.df.A.value_counts().to_dict().items():
            if self.classes is not None:
                label = self.ctol[cls]
            else: 
                label = cls
            n_samples[label] = count
        return sum(n_samples) / torch.tensor(n_samples)  # Class weights

In [140]:
def vqa_collate_fn(batch):
    collated = {
        'inputs': {
            'V': [],  # Image
            'Q': {'input_ids': []},  # Question
        },
        'target': []  # Answers
    }
    # Images
    for sample in batch:
        collated['inputs']['V'].append(sample['inputs']['V'].unsqueeze(0))
    collated['inputs']['V'] = torch.cat(collated['inputs']['V'])
    # Questions
    if batch[0]['inputs']['Q'] is None:
        collated['inputs']['Q'] = None
    else:
        for sample in batch:
            for k, v in sample['inputs']['Q'].items():
                collated['inputs']['Q'][k].append(v)
        collated['inputs']['Q'] = {
            k: torch.cat(v) for k, v in collated['inputs']['Q'].items()
        }

    # Answers
    for sample in batch:
        collated['target'].append(sample['target'].unsqueeze(0))
    collated['target'] = torch.cat(collated['target'])
    # Info
    collated['info'] = [sample['info'] for sample in batch]
    return collated

In [141]:
def match_task(question: str, keywords: list):
    """Match question words with the keywords.
    Return True and the matched word if at least one word is matched.
    """
    for word in question.split(" "):
        for kw in keywords:
            if word == kw:
                return word
    return ""

In [142]:
class PrepareCLEF2020DataWithQ:
    def __init__(self, CNF) -> None:
        self.n_classes = CNF.data.n_classes
        self.QG = CNF.data.QG

        # TRAINING DATA
        # This is the main dataset for training
        train20_df = self._make_dataframe(
            columns=CNF.clef_cols.default,
            qa_path=CNF.paths.clef_20_train_qa,
            imgs_path=CNF.paths.clef_20_train_imgs,
            is_main_df=True
        )
        
        print('train20_df',train20_df.shape)
        # Get categorical abnormality classes
        self.classes = train20_df.A.unique().tolist()
        print('len classes',len(self.classes))
        # if self.n_classes == 330:
        #     if 'no' in self.classes:
        #         self.classes.remove('no')
        #     if 'yes' in self.classes:
        #         self.classes.remove('yes')
        self.classes = sorted(self.classes)
        print('classes',self.classes)
        print('len',len(self.classes))
        # assert len(self.classes) == self.n_classes
        # Remove yes/no classes from train20_df
        # train20_df = pd.DataFrame([
        #     row for row in train20_df.itertuples() if row.A in self.classes
        # ]).drop("Index", axis=1)

        
        # Filter abnormality data from other ImageCLEF datasets
        # print(type(train20_df))
        # display(train20_df)
        train19_df = self._make_dataframe(
            columns=CNF.clef_cols.default,
            qa_path=CNF.paths.clef_19_train_qa,
            imgs_path=CNF.paths.clef_19_train_imgs
        )
        val19_df = self._make_dataframe(
            columns=CNF.clef_cols.default,
            qa_path=CNF.paths.clef_19_val_qa,
            imgs_path=CNF.paths.clef_19_val_imgs
        )
        print('train19_df',train19_df.shape)
        print('val19_df',val19_df.shape)
        test19_df = self._make_dataframe(
            columns=CNF.clef_cols.test19,
            qa_path=CNF.paths.clef_19_test_qa,
            imgs_path=CNF.paths.clef_19_test_imgs
        )
        print('test19_df',test19_df.shape)
        # display(test19_df)
        # print('test19_df',test19_df)
        
        test19_df = test19_df.drop('Task', axis=1)
        training_dfs = [train19_df, val19_df, test19_df, train20_df]
        self.train_df = pd.concat(
            training_dfs, ignore_index=True
        ).reset_index(drop=True)

        print('self.train_df',self.train_df.shape)
        answer_freq = self._count_answer_freq(self.train_df)
        print('answer_freq',answer_freq)
        self.classes = self._get_most_frequent_classes(answer_freq, 1)
        self.train_df['labels'] = self.train_df['A'].apply(lambda x : self.classes[x] if x in self.classes else self.classes['UNKNOWN'])

        self.val_df = self._make_dataframe(
            columns=CNF.clef_cols.default,
            qa_path=CNF.paths.clef_20_val_qa,
            imgs_path=CNF.paths.clef_20_val_imgs
        )



        self.val_df['labels'] = self.val_df['A'].apply(lambda x : self.classes[x] if x in self.classes else self.classes['UNKNOWN'])
        print('self.val_df',self.val_df.shape)

        self.test_df = self._make_dataframe(
            columns=CNF.clef_cols.default,
            qa_path=CNF.paths.clef_21_val_qa,
            imgs_path=CNF.paths.clef_21_val_imgs
        )

        print('self.test_df',self.test_df.shape)

        # display(self.train_df)
        # Augmentation
        self.train_tfms = T.Compose([
            T.Resize(size=(CNF.model.inp_size + 8, CNF.model.inp_size + 8)),
            # T.AutoAugment(),
            T.RandomCrop(size=(CNF.model.inp_size, CNF.model.inp_size)),
            T.RandomHorizontalFlip(),
            T.ToTensor(),
            T.Normalize(
                mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)
            )
        ])
        self.val_tfms = T.Compose([
            T.Resize(size=(CNF.model.inp_size, CNF.model.inp_size)),
            T.ToTensor(),
            T.Normalize(
                mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)
            )
        ])
        # Generate new questions for training dataset
        if CNF.data.QG:
            print(f"Before QG: # training samples = {self.train_df.shape[0]}")
            self.train_df = self._generate_questions(self.train_df,
                                                     CNF.task_keywords)
            print(f"After QG: # training samples = {self.train_df.shape[0]}")
        self.tokenizer = Tokenizer.from_list(
            self.train_df.Q.unique().tolist(),
            max_len=CNF.model.max_len
        )
        # Make dataset classes
        self.trainset = VQADataset(
            self.train_df, self.train_tfms, self.classes, self.tokenizer
        )
        self.valset = VQADataset(
            self.val_df, self.val_tfms, self.classes, self.tokenizer
        )
        # Make dataloaders
        self.dls = DataLoaders.from_dataset(
            trainset=self.trainset,
            train_bs=CNF.train.bs,
            valset=self.valset,
            val_bs=1,
            collate_fn=vqa_collate_fn
        )

    def _make_dataframe(self,
                        columns,
                        qa_path,
                        imgs_path,
                        is_main_df=False):
        df = load_qa_file(qa_filepath=qa_path, columns=columns)
        df['PATH'] = df.ID.apply(lambda x: f"{imgs_path}/{x}.jpg")
        if not is_main_df:
            df = pd.DataFrame([
                row for row in df.itertuples() if row.A in self.classes
            ]).drop("Index", axis=1)
        return df
    
    def _count_answer_freq(self,train_df: pd.DataFrame):
        '''
        count the frequence of each unique answer on the dataset
        '''
        all_answers = train_df['A'].values
        answer_freq_dict = defaultdict(int)
        for answer in all_answers:
            answer_freq_dict[answer] += 1
        answer_freq_dict_sort = dict(sorted(answer_freq_dict.items(), key=lambda x: x[1], reverse=True))

        return answer_freq_dict_sort
    
    def _get_most_frequent_classes(self,answer_freq_dict, threshold=1):
        print('threshold',threshold)
        final_classes = defaultdict(int)
        index = 0

        for answer,ans_freq in answer_freq_dict.items():

            if ans_freq >= threshold:
                final_classes[answer]= index         
            else:
                final_classes['UNKNOWN'] = index
                break
            index += 1  
        final_classes['UNKNOWN'] = index
        with open('answer_classes_2020_subset19.json', 'w') as fp:
            json.dump(final_classes, fp)
        return final_classes

    # def _generate_questions(
    #     self,
    #     train_df: pd.DataFrame,
    #     task_keywords: Dict[str, List[str]]
    # ) -> pd.DataFrame:
    #     new_train_df = generate_new_questions_dataframe(
    #         train_df, task_keywords
    #     )
    #     new_train_df = new_train_df[new_train_df.Task == 'abnormality']
    #     new_train_df = new_train_df[new_train_df.SubTask == 'categorical']
    #     return new_train_df

    def check(self):
        print(f"Data augmentation:\n\t{self.train_tfms}", end='')
        print(f"\n\t{self.val_tfms}")
        print(f"# training samples = {self.train_df.shape}")
        print(f"# validation samples = {self.val_df.shape}")
        train_pickle_name='train20_subset19_df'
        self.train_df.to_pickle('/home/anurag/Med_VQA/train_dataset_pickle/'+train_pickle_name+'.pkl')
        self.train_df.to_csv('train20_subset19.csv')
        valid_pickle_name = 'val20_df'
        self.val_df.to_pickle('/home/anurag/Med_VQA/valid_dataset_pickle/'+valid_pickle_name+'.pkl')
        self.val_df.to_csv('val20.csv')
        test_pickle_name = 'test_val21_df'
        self.test_df.to_pickle('/home/anurag/Med_VQA/test_dataset_pickle/'+test_pickle_name+'.pkl')
        self.test_df.to_csv('test_val21.csv')
        print('saved')
        if self.n_classes == 330:
            assert self.train_df.shape == (4963, 5)
            assert self.val_df.shape == (472, 5)
        elif self.n_classes == 332:
            if self.QG:
                assert self.train_df.shape == (39704, 6)
            else:
                assert self.train_df.shape == (6583, 4)
            assert self.val_df.shape == (500, 4)
        else:
            _errmsg = f"check not added for classes = {self.n_classes}"
            raise NotImplementedError(_errmsg)
        assert self.train_df.A.nunique() <= self.n_classes
        assert self.val_df.A.nunique() <= self.n_classes
        assert len(self.trainset) == self.train_df.shape[0]
        assert len(self.valset) == self.val_df.shape[0]

        batch = next(iter(self.dls.trainloader))
        print(f"data batch:")
        print(f"\tV = {batch['inputs']['V'].shape}")
        print(f"\tQ = {batch['inputs']['Q']['input_ids'].shape}")
        print(f"\tA = {batch['target'].shape}")
        print("data check: [green]PASSED[/green]")

In [143]:
dm = PrepareCLEF2020DataWithQ(CNF)

train20_df (4000, 4)
len classes 332
classes ['abdominal abscess', 'abdominal aortic aneurysm', 'abdominal compartment syndrome', 'aberrant right subclavian artery (arsa)', 'achalasia', 'achilles tendon rupture', 'acl injury', 'acute appendicitis', 'acute calculous cholecystitis', 'acute mesenteric ischemia', 'acute pediatric osteomyelitis', 'acute pyelonephritis', 'adenocarcinoma of the gastroesophageal junction', 'adenocarcinoma of the lung', 'adenomyomatosis', 'adenomyosis', 'adpkd -autosomal dominant polycystic kidney disease', 'adrenal adenoma', 'adrenal myelolipoma', "amyand's hernia", 'amyloidosis', 'aneurysm, cerebral', 'aneurysmal bone cyst', 'angiomyolipoma', 'ankylosing spondylitis', 'anterior mediastinal mass differential diagnosis', 'aortic coarctation', 'aortic dissection', 'aortic dissection, aortic tear', 'aortic nipple', 'aortic pseudoaneurysm', 'appendiceal rupture', 'appendicitis', 'aspergillosis and the air-crescent sign', 'atelectasis', 'atlantoaxial instability', 

In [144]:
dm.check()

Data augmentation:
	Compose(
    Resize(size=(232, 232), interpolation=bilinear, max_size=None, antialias=warn)
    RandomCrop(size=(224, 224), padding=None)
    RandomHorizontalFlip(p=0.5)
    ToTensor()
    Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
)
	Compose(
    Resize(size=(224, 224), interpolation=bilinear, max_size=None, antialias=warn)
    ToTensor()
    Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
)
# training samples = (6583, 5)
# validation samples = (500, 5)
saved


AssertionError: 

In [145]:
class PrepareCLEF2019DataWithQ:
    def __init__(self, CNF) -> None:
        self.n_classes = CNF.data.n_classes
        self.QG = CNF.data.QG

        # TRAINING DATA
        # This is the main dataset for training
        train19_df = self._make_dataframe(
            columns=CNF.clef_cols.default,
            qa_path=CNF.paths.clef_19_train_qa,
            imgs_path=CNF.paths.clef_19_train_imgs,
            is_main_df=True
        )
        
        print('train19_df',train19_df.shape)
        print('unique',len(train19_df.A.unique().tolist()))
        answer_freq = self._count_answer_freq(train19_df)
        print('answer_freq',answer_freq)
        self.classes_dict = self._get_most_frequent_classes(answer_freq, 5)
        # Get categorical abnormality classes
        # self.classes = train20_df.A.unique().tolist()
        self.classes = list(self.classes_dict.keys())
        print('len classes',len(self.classes))
        # if self.n_classes == 330:
        #     if 'no' in self.classes:
        #         self.classes.remove('no')
        #     if 'yes' in self.classes:
        #         self.classes.remove('yes')
        self.classes = sorted(self.classes)
        print('classes',self.classes)
        print('len',len(self.classes))
        # assert len(self.classes) == self.n_classes
        # Remove yes/no classes from train20_df
        # train20_df = pd.DataFrame([
        #     row for row in train20_df.itertuples() if row.A in self.classes
        # ]).drop("Index", axis=1)

        
        # Filter abnormality data from other ImageCLEF datasets
        # print(type(train20_df))
        # display(train20_df)
        train20_df = self._make_dataframe(
            columns=CNF.clef_cols.default,
            qa_path=CNF.paths.clef_20_train_qa,
            imgs_path=CNF.paths.clef_20_train_imgs
        )
        val20_df = self._make_dataframe(
            columns=CNF.clef_cols.default,
            qa_path=CNF.paths.clef_20_val_qa,
            imgs_path=CNF.paths.clef_20_val_imgs
        )
        print('train20_df',train20_df.shape)
        print('val20_df',val20_df.shape)

        # test19_df = self._make_dataframe(
        #     columns=CNF.clef_cols.test19,
        #     qa_path=CNF.paths.clef_19_test_qa,
        #     imgs_path=CNF.paths.clef_19_test_imgs
        # )
        # print(test19_df.shape)
        # display(test19_df)
        # print('test19_df',test19_df)
        
        # test19_df = test19_df.drop('Task', axis=1)
        test20_df = self._make_dataframe(
            columns=CNF.clef_cols.test20A,
            qa_path=CNF.paths.clef_20_test_qa_sysu,
            imgs_path=CNF.paths.clef_20_test_imgs
        )
        print('test20_df',test20_df.shape)

        val21_df = self._make_dataframe(
            columns=CNF.clef_cols.default,
            qa_path=CNF.paths.clef_21_val_qa,
            imgs_path=CNF.paths.clef_21_val_imgs
        )
        print('val21_df',val21_df.shape)
        training_dfs = [train19_df, val20_df, val21_df, test20_df, train20_df]
        self.train_df = pd.concat(
            training_dfs, ignore_index=True
        ).reset_index(drop=True)

        # answer_freq = self._count_answer_freq(self.train_df)
        # print('answer_freq',answer_freq)
        # self.classes = self._get_most_frequent_classes(answer_freq, 1)
        self.train_df['labels'] = self.train_df['A'].apply(lambda x : self.classes_dict[x] if x in self.classes_dict else self.classes_dict['UNKNOWN'])

        self.val_df = self._make_dataframe(
            columns=CNF.clef_cols.default,
            qa_path=CNF.paths.clef_19_val_qa,
            imgs_path=CNF.paths.clef_19_val_imgs,
            is_main_df=True
        )

        self.val_df['labels'] = self.val_df['A'].apply(lambda x : self.classes_dict[x] if x in self.classes_dict else self.classes_dict['UNKNOWN'])
        print('self.val_df',self.val_df.shape)

        self.test19_df = self._make_dataframe(
            columns=CNF.clef_cols.test19,
            qa_path=CNF.paths.clef_19_test_qa,
            imgs_path=CNF.paths.clef_19_test_imgs,
            is_main_df=True
        )
        print('test19_df',self.test19_df.shape)

        # Augmentation
        self.train_tfms = T.Compose([
            T.Resize(size=(CNF.model.inp_size + 8, CNF.model.inp_size + 8)),
            # T.AutoAugment(),
            T.RandomCrop(size=(CNF.model.inp_size, CNF.model.inp_size)),
            T.RandomHorizontalFlip(),
            T.ToTensor(),
            T.Normalize(
                mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)
            )
        ])
        self.val_tfms = T.Compose([
            T.Resize(size=(CNF.model.inp_size, CNF.model.inp_size)),
            T.ToTensor(),
            T.Normalize(
                mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)
            )
        ])
        # Generate new questions for training dataset
        if CNF.data.QG:
            print(f"Before QG: # training samples = {self.train_df.shape[0]}")
            self.train_df = self._generate_questions(self.train_df,
                                                     CNF.task_keywords)
            print(f"After QG: # training samples = {self.train_df.shape[0]}")
        self.tokenizer = Tokenizer.from_list(
            self.train_df.Q.unique().tolist(),
            max_len=CNF.model.max_len
        )
        # Make dataset classes
        self.trainset = VQADataset(
            self.train_df, self.train_tfms, self.classes, self.tokenizer
        )
        self.valset = VQADataset(
            self.val_df, self.val_tfms, self.classes, self.tokenizer
        )
        # Make dataloaders
        self.dls = DataLoaders.from_dataset(
            trainset=self.trainset,
            train_bs=CNF.train.bs,
            valset=self.valset,
            val_bs=1,
            collate_fn=vqa_collate_fn
        )

    def _make_dataframe(self,
                        columns,
                        qa_path,
                        imgs_path,
                        is_main_df=False):
        df = load_qa_file(qa_filepath=qa_path, columns=columns)
        df['PATH'] = df.ID.apply(lambda x: f"{imgs_path}/{x}.jpg")
        if not is_main_df:
            df = pd.DataFrame([
                row for row in df.itertuples() if row.A in self.classes
            ]).drop("Index", axis=1)
        return df
    
    def _count_answer_freq(self,train_df: pd.DataFrame):
        '''
        count the frequence of each unique answer on the dataset
        '''
        all_answers = train_df['A'].values
        answer_freq_dict = defaultdict(int)
        for answer in all_answers:
            answer_freq_dict[answer] += 1
        answer_freq_dict_sort = dict(sorted(answer_freq_dict.items(), key=lambda x: x[1], reverse=True))

        return answer_freq_dict_sort
    
    def _get_most_frequent_classes(self,answer_freq_dict, threshold=1):
        print('threshold',threshold)
        final_classes = defaultdict(int)
        index = 0

        for answer,ans_freq in answer_freq_dict.items():

            if ans_freq >= threshold:
                final_classes[answer]= index         
            else:
                final_classes['UNKNOWN'] = index
                break
            index += 1  
        final_classes['UNKNOWN'] = index
        with open('answer_classes.json', 'w') as fp:
            json.dump(final_classes, fp)
        return final_classes

    # def _generate_questions(
    #     self,
    #     train_df: pd.DataFrame,
    #     task_keywords: Dict[str, List[str]]
    # ) -> pd.DataFrame:
    #     new_train_df = generate_new_questions_dataframe(
    #         train_df, task_keywords
    #     )
    #     new_train_df = new_train_df[new_train_df.Task == 'abnormality']
    #     new_train_df = new_train_df[new_train_df.SubTask == 'categorical']
    #     return new_train_df

    def check(self):
        print(f"Data augmentation:\n\t{self.train_tfms}", end='')
        print(f"\n\t{self.val_tfms}")
        print(f"# training samples = {self.train_df.shape}")
        print(f"# validation samples = {self.val_df.shape}")
        train_pickle_name='train19_subset20_21_df'
        self.train_df.to_pickle('/home/anurag/Med_VQA/train_dataset_pickle/'+train_pickle_name+'.pkl')
        valid_pickle_name = 'val19_df'
        self.val_df.to_pickle('/home/anurag/Med_VQA/valid_dataset_pickle/'+valid_pickle_name+'.pkl')
        test_pickle_name = 'test19_df'
        self.test19_df.to_pickle('/home/anurag/Med_VQA/test_dataset_pickle/'+test_pickle_name+'.pkl')
        if self.n_classes == 330:
            assert self.train_df.shape == (4963, 5)
            assert self.val_df.shape == (472, 5)
        elif self.n_classes == 332:
            if self.QG:
                assert self.train_df.shape == (39704, 6)
            else:
                assert self.train_df.shape == (6583, 4)
            assert self.val_df.shape == (500, 4)
        else:
            _errmsg = f"check not added for classes = {self.n_classes}"
            raise NotImplementedError(_errmsg)
        assert self.train_df.A.nunique() <= self.n_classes
        assert self.val_df.A.nunique() <= self.n_classes
        assert len(self.trainset) == self.train_df.shape[0]
        assert len(self.valset) == self.val_df.shape[0]

        batch = next(iter(self.dls.trainloader))
        print(f"data batch:")
        print(f"\tV = {batch['inputs']['V'].shape}")
        print(f"\tQ = {batch['inputs']['Q']['input_ids'].shape}")
        print(f"\tA = {batch['target'].shape}")
        print("data check: [green]PASSED[/green]")

In [146]:
dm = PrepareCLEF2019DataWithQ(CNF)

train19_df (12792, 4)
unique 1552
answer_freq {'axial': 1558, 'skull and contents': 1216, 'no': 679, 'yes': 615, 'sagittal': 478, 'xr - plain film': 456, 'musculoskeletal': 436, 'coronal': 389, 'gastrointestinal': 352, 'lung, mediastinum, pleura': 250, 'spine and contents': 234, 'genitourinary': 214, 't2': 205, 'ap': 197, 'face, sinuses, and neck': 191, 'us - ultrasound': 183, 'lateral': 151, 't1': 131, 'vascular and lymphatic': 122, 'frontal': 120, 'heart and great vessels': 120, 'noncontrast': 100, 'pa': 92, 'contrast': 87, 'mr - flair': 78, 'an - angiogram': 78, 'transverse': 76, 'ct noncontrast': 71, 'breast': 65, 'flair': 58, 'oblique': 50, 'mr - t2 weighted': 47, 'cta - ct angiography': 45, 'ct with iv contrast': 45, 'longitudinal': 43, 'iv': 42, 'ct w/contrast (iv)': 38, 'mr - t1w - noncontrast': 32, 'mr - dwi diffusion weighted': 30, 'meningioma': 30, 'mr - t1w w/gadolinium': 29, 'glioblastoma multiforme': 28, 'mammograph': 24, 'mra - mr angiography/venography': 24, 'bas - bari

In [147]:
dm.check()

Data augmentation:
	Compose(
    Resize(size=(232, 232), interpolation=bilinear, max_size=None, antialias=warn)
    RandomCrop(size=(224, 224), padding=None)
    RandomHorizontalFlip(p=0.5)
    ToTensor()
    Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
)
	Compose(
    Resize(size=(224, 224), interpolation=bilinear, max_size=None, antialias=warn)
    ToTensor()
    Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
)
# training samples = (14216, 5)
# validation samples = (2000, 5)


AssertionError: 