In [1]:
import json
import os
import numpy as np
import pandas as pd
import torch

import pandas as pd
from sklearn.model_selection import train_test_split
from itertools import product
import time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import silhouette_score, roc_auc_score, balanced_accuracy_score, f1_score, accuracy_score, recall_score, precision_score
from sklearn.model_selection import train_test_split

from IPython.display import clear_output


def generate_param_configurations(configurations):
    """Expand a parameter grid into every concrete parameter configuration.

    Args:
        configurations: Mapping from parameter name to an iterable of the
            candidate values for that parameter.

    Returns:
        A list of dicts, one per element of the Cartesian product of the
        candidate values, in ``itertools.product`` order (the last parameter
        varies fastest). An empty mapping yields ``[{}]`` (the single empty
        configuration); a parameter with no candidate values yields ``[]``.
    """
    # Split keys and values explicitly: unpacking ``zip(*items)`` into two
    # names fails with ValueError when the mapping is empty.
    keys = list(configurations.keys())
    values_list = [configurations[key] for key in keys]

    return [dict(zip(keys, combination)) for combination in product(*values_list)]


In [2]:
import cv2
import sys
import argparse
import numpy as np
import torch
import torch.nn.functional as F
import os
import json

from src.utils import create_cfg, get_catalogs, register_dataset, extract_features
from src.arguments import get_parser
from src.dataloader import get_dataloader

from detectron2.engine import DefaultPredictor
from detectron2.modeling import build_model
from detectron2.structures.boxes import Boxes

In [3]:
# Point the dataset reference folder at ./oral_datas/data/ (see parse_args below).

# Project-level parser from src.arguments; presumably it already declares
# --dataset-folder, since that flag is passed below — TODO confirm.
parser = get_parser()

# Extra options added for this notebook. args.model is later used as the
# checkpoint path (cfg.MODEL.WEIGHTS); args.output is not read by the cells
# visible here.
parser.add_argument("--model", type=str, required=True)
parser.add_argument("--output", type=str, required=True)


# An explicit argument list is passed instead of relying on the process
# command line (assumes get_parser() returns a standard argparse parser).
args = parser.parse_args(args=['--dataset-folder', './oral_datas/data/', 
                               '--model', './models/model.pth',
                               '--output', './assets/oral2_feats'])


In [4]:
# Settings read by the loader helpers defined in this notebook: with "none",
# get_mapper's train mapper applies only the 800x800 resize, and
# build_detection_loader selects its sampler from args.sampler.
args.data_augmentation = "none"
args.sampler = "TrainingSampler"

# Project helpers from src.utils; presumably they register the datasets and
# build the detectron2 config from args — TODO confirm against src/utils.
register_dataset(args)
cfg = create_cfg(args)

In [5]:
# Load the checkpoint given by --model.
cfg.MODEL.WEIGHTS = args.model
# NOTE(review): 3 matches the three lesion categories printed at the end of
# this notebook (neoplastic, aphthous, traumatic) — confirm.
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 3
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5
# Run the model on the CPU.
cfg.MODEL.DEVICE='cpu'

# Only the predictor's underlying model is used later (by extract_dataset).
predictor = DefaultPredictor(cfg)
model = predictor.model

The checkpoint state_dict contains keys that are not used by the model:
  pixel_mean
  pixel_std


In [6]:
def get_mapper(args, which):
    """Return the function that turns one dataset record into a model input.

    All mappers share the same pipeline: deep-copy the record, read the image
    in BGR, apply a list of transforms, store the image as a float32 tensor
    under ``"image"`` and the transformed non-crowd annotations as
    ``"instances"``. They differ only in the transforms applied.

    Args:
        args: Namespace read by the train mapper every time it is called:
            ``data_augmentation`` ("full", "crop-flip" or "none") and,
            depending on that mode, ``random_brightness``,
            ``random_contrast`` and ``random_crop``. The other mappers do
            not read it.
        which: "train", "test", "val" or "coco-eval". "val" uses the test
            mapper.

    Returns:
        A function taking a dataset dict and returning the mapped copy.

    Raises:
        ValueError: If ``which`` is not one of the supported values (raised
            here), or if ``args.data_augmentation`` is unknown (raised when
            the train mapper is called on a record).
    """

    def _resize_only():
        # Transform list used by every non-training mapper.
        return [T.Resize((800, 800))]

    def _train_augmentations():
        # args is read at call time, so a change to args.data_augmentation
        # made after get_mapper() returned is still honoured.
        if args.data_augmentation == "full":
            return [
                T.RandomFlip(),
                T.RandomBrightness(1-args.random_brightness, 1+args.random_brightness),
                T.RandomContrast(1-args.random_contrast, 1+args.random_contrast),
                T.RandomCrop("relative_range", [args.random_crop, 1]),
                T.Resize((800, 800)),
            ]
        if args.data_augmentation == "crop-flip":
            return [
                T.RandomFlip(),
                T.RandomCrop("relative_range", [args.random_crop, 1]),
                T.Resize((800, 800)),
            ]
        if args.data_augmentation == "none":
            return _resize_only()
        raise ValueError("Unknown data augmentation: %s " % args.data_augmentation)

    def _map_record(dataset_dict, make_augmentations):
        # Shared pipeline; ``make_augmentations`` is called only after the
        # image has been read, matching the original order of operations.
        dataset_dict = copy.deepcopy(dataset_dict)
        image = utils.read_image(dataset_dict["file_name"], format="BGR")

        image, transforms = T.apply_transform_gens(make_augmentations(), image)

        # Move the last axis first (channels-first, assuming read_image
        # returns an H x W x C array — TODO confirm).
        dataset_dict["image"] = torch.as_tensor(image.transpose(2, 0, 1).astype("float32"))

        # Crowd annotations are dropped; the rest follow the image transforms.
        annos = [
            utils.transform_instance_annotations(obj, transforms, image.shape[:2])
            for obj in dataset_dict.pop("annotations")
            if obj.get("iscrowd", 0) == 0
        ]
        instances = utils.annotations_to_instances(annos, image.shape[:2])
        dataset_dict["instances"] = utils.filter_empty_instances(instances)

        return dataset_dict

    def train_mapper(dataset_dict):
        return _map_record(dataset_dict, _train_augmentations)

    def coco_eval_mapper(dataset_dict):
        return _map_record(dataset_dict, _resize_only)

    def test_mapper(dataset_dict):
        return _map_record(dataset_dict, _resize_only)

    mappers = {
        "train": train_mapper,
        "test": test_mapper,
        "val": test_mapper,
        "coco-eval": coco_eval_mapper,
    }
    if not isinstance(which, str) or which not in mappers:
        raise ValueError("%s is not one of train/test/val/coco-eval" % (which, ))
    return mappers[which]

In [7]:
# Display the dataset names registered in the config for the train and test splits.
cfg.DATASETS

CfgNode({'TRAIN': ('train_dataset',), 'PROPOSAL_FILES_TRAIN': (), 'PRECOMPUTED_PROPOSAL_TOPK_TRAIN': 2000, 'TEST': ('test_dataset',), 'PROPOSAL_FILES_TEST': (), 'PRECOMPUTED_PROPOSAL_TOPK_TEST': 1000})

In [8]:
import torch
import copy

from detectron2.data.build import get_detection_dataset_dicts, build_batch_data_loader, build_detection_test_loader
from detectron2.data.common import DatasetFromList, MapDataset
from detectron2.data import transforms as T
from detectron2.data import detection_utils as utils
from detectron2.data import DatasetMapper
from detectron2.data.samplers import TrainingSampler
from detectron2.data.samplers import RepeatFactorTrainingSampler

In [9]:
def build_detection_loader(cfg, mapper, which, args):
    """Build a batched detectron2 data loader for one split.

    Args:
        cfg: detectron2 config; the dataset names, the empty-annotation
            filter, the keypoint settings, the batch size, the aspect-ratio
            grouping flag and the worker count are read from it.
        mapper: Function applied to every dataset record (see ``get_mapper``).
        which: "train", "coco-eval", "test" or "val". Only "train" reads
            ``cfg.DATASETS.TRAIN``; every other value reads
            ``cfg.DATASETS.TEST``.
        args: Namespace providing ``sampler`` ("TrainingSampler" or
            "RepeatFactorTrainingSampler") and, for the latter,
            ``repeat_factor_th``.

    Returns:
        Whatever ``build_batch_data_loader`` returns for the mapped dataset
        and the selected sampler.

    Raises:
        AssertionError: If ``which`` is not a supported split name.
        Exception: If ``args.sampler`` names an unknown sampler.
    """
    assert which in ["train", "coco-eval", "test", "val"]

    if which == "train":
        dataset_names = cfg.DATASETS.TRAIN
    else:
        dataset_names = cfg.DATASETS.TEST
    drop_unannotated = cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS
    if cfg.MODEL.KEYPOINT_ON:
        min_keypoints = cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
    else:
        min_keypoints = 0

    records = get_detection_dataset_dicts(
        dataset_names,
        filter_empty=drop_unannotated,
        min_keypoints=min_keypoints,
        proposal_files=None,
    )
    record_list = DatasetFromList(records, copy=False)

    sampler_name = args.sampler
    if sampler_name == "TrainingSampler":
        print("[SAMPLER] Selected TrainingSampler")
        mapped_dataset = MapDataset(record_list, mapper)
        sampler = TrainingSampler(len(mapped_dataset))
    elif sampler_name == "RepeatFactorTrainingSampler":
        print("[SAMPLER] Selected RepeatFactorTrainingSampler")
        # Repeat factors are computed from the raw records, before the mapper
        # (and therefore any data augmentation) is applied.
        repeat_factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency(
            records, args.repeat_factor_th
        )
        sampler = RepeatFactorTrainingSampler(repeat_factors)
        mapped_dataset = MapDataset(record_list, mapper)
    else:
        raise Exception("Unknown Sampler %s" % sampler_name)

    return build_batch_data_loader(
        mapped_dataset,
        sampler,
        cfg.SOLVER.IMS_PER_BATCH,
        aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
    )



In [10]:
# Mapper for the test split: resize to 800x800 only, no random augmentation.
mapper = get_mapper(args, 'test')

In [11]:
# Smoke test: build the test-split loader and display the returned object.
(build_detection_loader(cfg, mapper, 'test', args))

[SAMPLER] Selected TrainingSampler


<detectron2.data.common.AspectRatioGroupedDataset at 0x7fde34281870>

In [12]:
def extract_dataset(dataset_name, dataset_file, database):
    """Extract ground-truth ROI features for one split into ``database``.

    Images are pulled from the loader returned by ``get_dataloader`` until
    as many distinct file names have been seen as there are entries in the
    ``"images"`` list of ``dataset_file``. Each image not already present in
    ``database`` gets one entry per ground-truth box.

    Relies on the notebook globals ``cfg``, ``args`` and ``model``.

    Args:
        dataset_name: Split name forwarded to ``get_dataloader``.
        dataset_file: Path to the split's annotation JSON (must contain an
            ``"images"`` list).
        database: Dict updated in place. Keys are image base names; values
            are lists of ``{"roi", "features", "type"}`` dicts.

    Returns:
        None.
    """
    # Context manager so the annotation file is closed after counting.
    with open(dataset_file) as annotation_file:
        dataset_size = len(json.load(annotation_file)["images"])
    if dataset_size == 0:
        # Nothing to extract; also avoids dividing by zero in the progress line.
        return

    data_loader = get_dataloader(cfg, args, dataset_name)

    # Distinct images seen in this run. Counting these, rather than only the
    # newly extracted ones, lets the loop terminate when ``database`` already
    # holds some of the images (e.g. when the cell is re-run).
    # NOTE(review): if the loader drops images (for instance ones without
    # annotations) the count from the JSON may never be reached — confirm
    # against get_dataloader.
    seen_files = set()
    for batch in data_loader:
        for elem in batch:
            file_name = elem["file_name"].split("/")[-1]
            if file_name not in seen_files:
                seen_files.add(file_name)
                if file_name not in database:
                    instances = elem["instances"]
                    features = extract_features(model, elem["image"], instances.gt_boxes.to(model.device))
                    # Stored only after extraction succeeds, so a failure does
                    # not leave an empty entry that later runs would skip.
                    database[file_name] = [
                        dict(
                            roi=roi.tolist(),
                            features=features_vector.tolist(),
                            type=gt_class.item()
                        )
                        for roi, features_vector, gt_class in zip(instances.gt_boxes, features, instances.gt_classes)
                    ]
                    print("Extracted: %s  progress: %.3f%%" % (elem["file_name"], 100*len(seen_files)/dataset_size))
            if len(seen_files) == dataset_size:
                return

In [13]:
# Per-split feature stores, keyed by image file name and filled in place by
# extract_dataset.
train_database, test_database = {}, {}

In [14]:
# Run the feature extraction once per split; each split fills its own database.
for dataset_name, dataset_file, database in (
    ("train", "./oral_datas/data/train.json", train_database),
    ("test", "./oral_datas/data/test.json", test_database),  # analogous for validation
):
    print("Extrating dataset: %s" % dataset_name)
    extract_dataset(dataset_name, dataset_file, database)

# Remove the per-image progress lines from the cell output.
clear_output()


In [15]:
# Persist the extracted features next to the dataset annotations. The context
# managers close (and therefore flush) each file as soon as it is written;
# passing a bare open(...) to json.dump left the handle open.
with open(os.path.join('./oral_datas/data', "oral2_train_features.json"), "w") as features_file:
    json.dump(train_database, features_file)
with open(os.path.join('./oral_datas/data', "oral2_test_features.json"), "w") as features_file:  # analogous for validation
    json.dump(test_database, features_file)

In [12]:
# Load the train and test annotation files. The context managers close each
# file even if json.load raises; the names train_file / test_file stay bound
# (to closed files), as they were with the explicit close() calls.
with open('datasets/lesions/train.json') as train_file:
    train_data = json.load(train_file)


with open('datasets/lesions/test.json') as test_file:
    test_data = json.load(test_file)

In [13]:
# Inspect the loaded training annotations (the whole dict is displayed, so the output is long).
train_data

{'images': [{'id': 1732,
   'dataset_id': 2,
   'category_ids': [],
   'path': '/datasets/oral1/8cc77a9f9081530e6d9b49ec472d934c.jpg',
   'width': 2816,
   'height': 2112,
   'file_name': '8cc77a9f9081530e6d9b49ec472d934c.jpg',
   'annotated': False,
   'annotating': [],
   'num_annotations': 0,
   'metadata': {},
   'deleted': False,
   'milliseconds': 0,
   'events': [],
   'regenerate_thumbnail': False},
  {'id': 1651,
   'dataset_id': 2,
   'category_ids': [],
   'path': '/datasets/oral1/d4777a6154b22adf50bcde578aa856e9.jpg',
   'width': 2816,
   'height': 2112,
   'file_name': 'd4777a6154b22adf50bcde578aa856e9.jpg',
   'annotated': False,
   'annotating': [],
   'num_annotations': 0,
   'metadata': {},
   'deleted': False,
   'milliseconds': 0,
   'events': [],
   'regenerate_thumbnail': False},
  {'id': 2406,
   'dataset_id': 2,
   'category_ids': [],
   'path': '/datasets/oral1/DSCN3343 (1).jpg',
   'width': 2816,
   'height': 2112,
   'file_name': 'DSCN3343 (1).jpg',
   'annota

In [20]:
# Total number of annotations across the train and test splits.
len(train_data['annotations']) + len(test_data['annotations'])

499

In [16]:
def print_category_counts(dataset):
    """Print and return the number of annotations per category of one split.

    Args:
        dataset: Annotation dict with an ``"annotations"`` list (each item
            has a ``"category_id"``) and a ``"categories"`` list (each item
            has ``"id"`` and ``"name"``).

    Returns:
        Dict mapping category id to its annotation count. Categories without
        any annotation are printed with a count of 0 and are absent from the
        returned dict.
    """
    category_count = dict()
    for annotation in dataset["annotations"]:
        category_id = annotation["category_id"]
        category_count[category_id] = category_count.get(category_id, 0) + 1

    for category in dataset["categories"]:
        # .get() so a category with no annotations prints 0 instead of
        # raising KeyError.
        print("-%s: %d" % (category["name"], category_count.get(category["id"], 0)))

    return category_count


# Train split first, then test. After the loop `dataset` and `category_count`
# refer to the test split, as they did when the code was duplicated per split.
for dataset in (train_data, test_data):
    category_count = print_category_counts(dataset)
    

-neoplastic: 125
-aphthous: 143
-traumatic: 144
-neoplastic: 26
-aphthous: 31
-traumatic: 30
