    # dataset
我比较喜欢不同数据集加载和总数据集写进不同文件里。在这个文件夹里面我应该会尝试一些常见的数据集和其加载的形式。这里暂时以UperNet为基础讲解数据集的加载方式

## unifiedparsing
在这里数据加载主要分为三个大模块: dataset \ joint_dataset \ adeseg etc。方便插入不同的数据集。但是大方法一致

### joint_dataset
collections.OrderedDict([items]): return an instance of a dict subclass that has methods specialized for rearranging dictionary order

### 包含的文件加载方式
- csv
- mat
- json
- txt

In [1]:
import json

import torch

### trainDataset
列出traindataset 的書寫格式和常用模塊手法


In [None]:
class ConcatDataset(Dataset):
    """
    Dataset to concatenate multiple datasets.
    Purpose: useful to assemble different existing datasets, possibly
    large-scale datasets as the concatenation operation is done in an
    on-the-fly manner.

    Arguments:
        datasets (iterable): List of datasets to be concatenated
    """

    @staticmethod
    def cumsum(sequence):
        r, s = [], 0
        for e in sequence:
            l = len(e)
            r.append(l + s)
            s += l
        return r

    def __init__(self, datasets):
        super(ConcatDataset, self).__init__()
        assert len(datasets) > 0, 'datasets should not be an empty iterable'
        self.datasets = list(datasets)
        self.cumulative_sizes = self.cumsum(self.datasets)

    def __len__(self):
        return self.cumulative_sizes[-1]

    def __getitem__(self, idx):
        dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
        if dataset_idx == 0:
            sample_idx = idx
        else:
            sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
        return self.datasets[dataset_idx][sample_idx]

    @property
    def cummulative_sizes(self):
        warnings.warn("cummulative_sizes attribute is renamed to "
                      "cumulative_sizes", DeprecationWarning, stacklevel=2)
        return self.cumulative_sizes

class Dataset(object):
    def __getitem__(self, index):
        raise NotImplementedError
    
    def __len__(self):
        raise NotImplementedError
    
    def __add__(self, other):
        return ConcatDataset([self, other])

### mat文件的读取和使用方法
mat文件会包含一个前置包括matlab版本，平台，创建时间以及版本等变量。正文在mat['index']里面

In [1]:
import os
import numpy
from scipy.io import loadmat

In [2]:
mat = loadmat('C:/data/datasets/ADE20K/ADE20K_2021_17_01/index_ade20k.mat')

In [4]:
type(mat)

dict

In [5]:
print(mat)

{'__header__': b'MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Mon Jan 18 17:52:43 2021', '__version__': '1.0', '__globals__': [], 'index': array([[(array([[array(['ADE_train_00000001.jpg'], dtype='<U22'),
                array(['ADE_train_00000002.jpg'], dtype='<U22'),
                array(['ADE_train_00000003.jpg'], dtype='<U22'), ...,
                array(['ADE_frame_00000314.jpg'], dtype='<U22'),
                array(['ADE_frame_00000315.jpg'], dtype='<U22'),
                array(['ADE_frame_00000316.jpg'], dtype='<U22')]], dtype=object), array([[array(['ADE20K_2021_17_01/images/ADE/training/transportation/airport_terminal'],
                      dtype='<U69')                                                             ,
                array(['ADE20K_2021_17_01/images/ADE/training/transportation/airport_terminal'],
                      dtype='<U69')                                                             ,
                array(['ADE20K_2021_17_01/images/ADE/trainin

In [5]:
index = mat['index']

In [7]:
print(index)

[[(array([[array(['ADE_train_00000001.jpg'], dtype='<U22'),
          array(['ADE_train_00000002.jpg'], dtype='<U22'),
          array(['ADE_train_00000003.jpg'], dtype='<U22'), ...,
          array(['ADE_frame_00000314.jpg'], dtype='<U22'),
          array(['ADE_frame_00000315.jpg'], dtype='<U22'),
          array(['ADE_frame_00000316.jpg'], dtype='<U22')]], dtype=object), array([[array(['ADE20K_2021_17_01/images/ADE/training/transportation/airport_terminal'],
                dtype='<U69')                                                             ,
          array(['ADE20K_2021_17_01/images/ADE/training/transportation/airport_terminal'],
                dtype='<U69')                                                             ,
          array(['ADE20K_2021_17_01/images/ADE/training/cultural/art_gallery'],
                dtype='<U58')                                                  ,
          ...,
          array(['ADE20K_2021_17_01/images/ADE/training/industrial/wind_farm'],
   

In [6]:
index.dtype

dtype([('filename', 'O'), ('folder', 'O'), ('typeset', 'O'), ('objectIsPart', 'O'), ('objectPresence', 'O'), ('objectcounts', 'O'), ('objectnames', 'O'), ('proportionClassIsPart', 'O'), ('scene', 'O'), ('wordnet_found', 'O'), ('wordnet_level1', 'O'), ('wordnet_synset', 'O'), ('wordnet_hypernym', 'O'), ('wordnet_gloss', 'O'), ('wordnet_synonyms', 'O'), ('wordnet_frequency', 'O')])

In [9]:
index.size

1

In [10]:
index.shape

(1, 1)

In [11]:
index[0,0]['filename']

array([[array(['ADE_train_00000001.jpg'], dtype='<U22'),
        array(['ADE_train_00000002.jpg'], dtype='<U22'),
        array(['ADE_train_00000003.jpg'], dtype='<U22'), ...,
        array(['ADE_frame_00000314.jpg'], dtype='<U22'),
        array(['ADE_frame_00000315.jpg'], dtype='<U22'),
        array(['ADE_frame_00000316.jpg'], dtype='<U22')]], dtype=object)

In [12]:
index['filename'][0,0]

array([[array(['ADE_train_00000001.jpg'], dtype='<U22'),
        array(['ADE_train_00000002.jpg'], dtype='<U22'),
        array(['ADE_train_00000003.jpg'], dtype='<U22'), ...,
        array(['ADE_frame_00000314.jpg'], dtype='<U22'),
        array(['ADE_frame_00000315.jpg'], dtype='<U22'),
        array(['ADE_frame_00000316.jpg'], dtype='<U22')]], dtype=object)

In [17]:
from collections import namedtuple
Ade20kIndex = namedtuple('Ade20kIndex', index.dtype.names)
print(Ade20kIndex)

<class '__main__.Ade20kIndex'>


In [10]:
print(index.dtype.names)

('filename', 'folder', 'typeset', 'objectIsPart', 'objectPresence', 'objectcounts', 'objectnames', 'proportionClassIsPart', 'scene', 'wordnet_found', 'wordnet_level1', 'wordnet_synset', 'wordnet_hypernym', 'wordnet_gloss', 'wordnet_synonyms', 'wordnet_frequency')


In [12]:
print(Ade20kIndex.folder)

_tuplegetter(1, 'Alias for field number 1')


In [13]:
print(Ade20kIndex[1])

__main__.Ade20kIndex[1]


In [18]:
index_ = Ade20kIndex(
            **{name: index[name][()] for name in index.dtype.names})

In [17]:
print(index_.filename[1])
print(index_.folder[1])

IndexError: index 1 is out of bounds for axis 0 with size 1

### txt文件的读取

In [13]:
with open("C:/github/unifiedparsing/meta_file/ade20k/scene_categories.txt", 'r') as f:
    lines = f.readlines()

In [14]:
print(lines)

['ADE_train_00000001 airport_terminal\n', 'ADE_train_00000002 airport_terminal\n', 'ADE_train_00000003 art_gallery\n', 'ADE_train_00000004 badlands\n', 'ADE_train_00000005 ball_pit\n', 'ADE_train_00000006 bathroom\n', 'ADE_train_00000007 bathroom\n', 'ADE_train_00000008 bathroom\n', 'ADE_train_00000009 bathroom\n', 'ADE_train_00000010 bathroom\n', 'ADE_train_00000011 bathroom\n', 'ADE_train_00000012 bathroom\n', 'ADE_train_00000013 bathroom\n', 'ADE_train_00000014 bathroom\n', 'ADE_train_00000015 bathroom\n', 'ADE_train_00000016 bathroom\n', 'ADE_train_00000017 bathroom\n', 'ADE_train_00000018 bathroom\n', 'ADE_train_00000019 bathroom\n', 'ADE_train_00000020 bathroom\n', 'ADE_train_00000021 bathroom\n', 'ADE_train_00000022 bathroom\n', 'ADE_train_00000023 bathroom\n', 'ADE_train_00000024 bathroom\n', 'ADE_train_00000025 bathroom\n', 'ADE_train_00000026 bathroom\n', 'ADE_train_00000027 bathroom\n', 'ADE_train_00000028 bathroom\n', 'ADE_train_00000029 bathroom\n', 'ADE_train_00000030 bat

In [15]:
type(lines)

list

In [16]:
len(lines)

22210

In [19]:
index_scene = []
for i, l in enumerate(lines):
    l = l.split(" ")
    filename, scene_label = l[0], l[1].replace('\n', ' ')
    if scene_label =='misc':
        scene_label = '-'
    assert filename +'.jpg' == index_.filename[i]
    index_scene.append(scene_label)
print(index_scene)

  assert filename +'.jpg' == index_.filename[i]


AssertionError: 

### json文件的读取及使用

In [2]:
with open('C:/github/takehome_challenge/dataset/08. song.json') as json_file:
    song = json.load(json_file)
print(song)

[{'id': 'GOQMMKSQQH', 'user_id': 122, 'user_state': 'Louisiana', 'user_sign_up_date': '2015-05-16', 'song_played': 'Hey Jude', 'time_played': '2015-06-11 21:51:35'}, {'id': 'HWKKBQKNWI', 'user_id': 3, 'user_state': 'Ohio', 'user_sign_up_date': '2015-05-01', 'song_played': 'We Can Work It Out', 'time_played': '2015-06-06 16:49:19'}, {'id': 'DKQSXVNJDH', 'user_id': 35, 'user_state': 'New Jersey', 'user_sign_up_date': '2015-05-04', 'song_played': 'Back In the U.S.S.R.', 'time_played': '2015-06-14 02:11:29'}, {'id': 'HLHRIDQTUW', 'user_id': 126, 'user_state': 'Illinois', 'user_sign_up_date': '2015-05-16', 'song_played': 'P.s. I Love You', 'time_played': '2015-06-08 12:26:10'}, {'id': 'SUKJCSBCYW', 'user_id': 6, 'user_state': 'New Jersey', 'user_sign_up_date': '2015-05-01', 'song_played': "Sgt. Pepper's Lonely Hearts Club Band", 'time_played': '2015-06-28 14:57:00'}, {'id': 'XYDGPXHKLI', 'user_id': 147, 'user_state': 'Texas', 'user_sign_up_date': '2015-05-18', 'song_played': 'Sgt. Pepper In

### GraphOOD-GNNSafe
这个GNN网络的dataset是基于torch里面封装好的dataset的，只需要执行调用即可

TypeError: list indices must be integers or slices, not str

### UperNet
UperNet是做场景分割的一个非常有用的参考，可以读一下了解到文字都是怎么输入进去的
可以从输出来进行理解，输出字典包括img\seg_object\valid_object\seg_part\valid_part\scene_label\seg_material\valid_material\source_idx
sorce_idx: 每个输出样本的唯一标识
seg_material: 模型判断输出的物品材质
valid_material: 标签物品材质 这个对于单纯用ade_seg来说用处不大

transformers.compose: Composes several transforms together. -> container
transforms.Normalize: Normalize a tensor image with mean and standard deviation. x = (x-mean)/std

In [None]:
from torchvision import transforms
import numpy as np

In [1]:
class TrainDataset():
    def __init__(self, records, source_idx, args, max_sample=-1, batch_per_gpu=1):
        self.imgSize = args.imgSize
        self.imgMaxSize = args.imgMaxSize
        self.random_flip = args.ramdom_flip
        # 最大化下采样率避免在卷积和池化层中舍入
        self.padding_constanct = args.padding_constant
        # 下采样率和gpu设置
        self.segm_downsampling_rate = args.segm_downsampling_rate
        self.batch_per_gpu = batch_per_gpu

        # 按照图片格式分为两类: 1. h>w, 2. h<=w
        self.batch_record_list = [[], []]

        # 当使用多张gpu进行训练时重构数据集代码
        self.cur_idx = 0

        # mean and std
        self.img_transform = transforms.Compose([
            transforms.Normalize(mean=[102.9801, 115.9465, 122.7717], std=[1., 1., 1.])
        ])

        self.list_sample = records
        self.source_idx = source_idx

        self.if_shuffled = False
        # 当max_sample>0时 num_sample = list_sample = max_sample
        # max_sample=-1时 num_sample = list_sample = len(records)
        if max_sample > 0:
            self.list_sample = self.list_sample[0:max_sample]
        self.num_sample = len(self.list_sample)
        assert self.num_sample > 0
        print('# samples: {}'.format(self.num_sample))

    def _get_sub_batch(self):
        while True:
            this_sample = self.list_sample[self.cur_idx]
        # 按照 1. h>w 2. h<=w 分为两类
            if this_sample['height'] > this_sample['width']:
                self.batch_record_list[0].append(this_sample)
            else:
                self.batch_record_list[1].append(this_sample)

            self.cur_idx += 1
            if self.cur_idx >= self.num_sample:
                self.cur_idx = 0
                np.random.shuffle(self.list_sample)

            if len(self.batch_record_list[0]) == self.batch_per_gpu:
                batch_records = self.batch_record_list[0]
                self.batch_record_list[0] = []
                break
            elif len(self.batch_record_list[1]) == self.batch_per_gpu:
                batch_records = self.batch_record_list[1]
                self.batch_record_list[1] = []
                break
        return batch_records

    def __getitem__(self, index):
        if not self.if_shuffled:
            np.random.shuffle(self.list_sample)
            self.if_shuffled = True

        # get sub-batch 为什么要使用这个
        batch_records = self._get_sub_batch()

        # resize
        if isinstance(self.imgSize, list):
            this_short_size = np.random.choice(self.imgSize)
        else:
            this_short_size = self.imgSize

        # calculate the batch's height and width
        batch_resized_size = np.zeros((self.batch_per_gpu, 2), np.int32)
        for i in range(self.batch_per_gpu):
            img_height, img_width = batch_records[i]['height'], batch_records[i]['width']
            this_scale = min(this_short_size / min(img_height, img_width), self.imgSize / max(img_height, img_width))
            img_resized_height, img_resized_width = img_height * this_scale, img_width * this_scale
            batch_resized_size[i, :] = img_resized_height, img_resized_width
        batch_resized_height = np.max(batch_resized_size[:, 0])
        batch_resized_width = np.max(batch_resized_size[:, 1])



SyntaxError: 'break' outside loop (Temp/ipykernel_30584/1908040878.py, line 51)

In [None]:
class ValDataset():
    def __init__(self, records, args, max_sample=-1, start_idx=-1 ,end_idx=-1):
        self.imgSize = args.imgSize
        self.imgMaxSize = args.imgMaxSize
        self.padding_constant = args.padding_constant

        # mean initializes
        self.img_transform = transforms.Compose([
            transforms.Normalize(mean=[102.9801, 115.9465, 122.7717], std=[1., 1., 1.])
        ])
        self.list_sample = records

        if max_sample > 0:
            self.list_sample = self.list_sample[0 : max_sample]

        if start_idx >= 0 and end_idx >=0:
            self.list_sample = self.list_sample[start_idx:end_idx]

        self.num_sample = len(self.list_sample)
        assert self.num_sample > 0
        print('# samples: {}'.format(self.num_sample))

    def __getitem__(self, index):
        data =

In [1]:
# string.rstrip()
string = "geekssss"
print(string.rstrip('s'))
print(string.rstrip())

geek
geekssss


In [None]:
from scipy.misc import imread, imresize

In [None]:
class TestDataset():
    def __init__(self, args, odgt, max_sample=-1):
        self.imgSize = args.imgSize
        self.imgMaxSize = args.imgMaxSize
        self.padding_constant = args.padding_cosntant
        self.segm_downsampling_rate = args.segm_downsampling_rate

        self.img_transform = transforms.Compose([
            transforms.Normalize(mean=[102.9801, 115.9465, 122.7717], std=[1., 1., 1.])
        ])

        # 判断输入数据类型，list用一种输入方法，str用另外一种输入方法
        if isinstance(odgt, list):
            self.list_sample = odgt
        elif isinstance(odgt, str):
            # 打开并删出尾随空格
            self.list_sample = [json.loads(x.rstrip()) for x in open(odgt, 'r')]

        if max_sample > 0:
            self.list_sample = self.list_sample[0:max_sample]
        self.num_sample = len(self.list_sample)
        assert self.num_sample > 0
        print('# samples: {}'.format(self.num_sample))

    def __getitem__(self, index):
        this_record = self.list_sample[index]
        image_path = this_record['fpath_img']
        img = imread(image_path, mode='RGB')
        img = img[:, :, ::-1]