# Stanford Cars 数据集预处理

数据集树形结构
```
./datasets/
    stanford_cars/
        cars_train.tgz
        cars_train/*.jpg
        cars_test.tgz
        cars_test/*.jpg

        car_devkit.tgz
        devkit/
            cars_meta.mat
            cars_test_annos.mat
            cars_train_annos.mat

        cars_annos.mat
        cars_test_annos_withlabels.mat

        train/<类别名>/*.jpg    （由第 2 步按类别整理生成）
        test/<类别名>/*.jpg     （由第 2 步按类别整理生成）
```

In [None]:
import os
from os.path import join

# Local layout: all dataset files live under <DATASET_ROOT>/<DATASET_NAME>/.
DATASET_ROOT = './datasets/'
DATASET_NAME = 'stanford_cars'
DATASET_PATH = join(DATASET_ROOT, DATASET_NAME)

# Base URL that the archives and annotation files are downloaded from.
DOWNLOAD_URL_PREFIX = "http://ai.stanford.edu/~jkrause/car196"

## 1. 数据集下载与解压

In [None]:
from torchvision.datasets.utils import download_url, list_dir, list_files
import tarfile

# Stand-alone annotation files (not archived): fetch each one that is missing.
for fname in ["cars_annos.mat", "cars_test_annos_withlabels.mat"]:
    fpath = join(DATASET_PATH, fname)
    if not os.path.exists(fpath):
        url = DOWNLOAD_URL_PREFIX + "/" + fname
        # The trailing None is passed where the original passed it
        # (presumably the md5 argument, i.e. no checksum check -- confirm).
        download_url(url, DATASET_PATH, fname, None)

# Archive name -> directory it unpacks to inside DATASET_PATH. The directory
# names are the ones the later cells read from (cars_train/, cars_test/,
# devkit/).
archive_to_dir = {
    'cars_train.tgz': 'cars_train',
    'cars_test.tgz': 'cars_test',
    'car_devkit.tgz': 'devkit',
}

# True if at least one archive had to be downloaded during this run.
download_flag = False
for tar_filename, extracted_dir in archive_to_dir.items():
    tar_file_path = join(DATASET_PATH, tar_filename)
    just_downloaded = False
    if not os.path.exists(tar_file_path):
        url = DOWNLOAD_URL_PREFIX + '/' + tar_filename
        download_url(url, DATASET_PATH, tar_filename, None)
        just_downloaded = True
        download_flag = True

    # Extract inside the loop so that EVERY archive is unpacked. Doing it once
    # after the loop only ever saw the last archive (car_devkit.tgz). Also
    # extract when the archive is present but its directory is missing, e.g.
    # after an interrupted earlier run.
    if just_downloaded or not os.path.isdir(join(DATASET_PATH, extracted_dir)):
        print('Extracting downloaded file: ' + tar_file_path)
        # NOTE(review): extractall() trusts member paths in the archive; on
        # Python versions that support it, consider passing filter='data'.
        with tarfile.open(tar_file_path, 'r') as tar_file:
            tar_file.extractall(DATASET_PATH)


## 2. 训练集 与 测试集 分割

In [None]:
import scipy.io
import shutil
import os
from os.path import join

# Load the class-name table from the devkit metadata file.
data = scipy.io.loadmat(join(DATASET_PATH, 'devkit/cars_meta.mat'))
class_names = data['class_names']

# label_names is indexed directly by the class id read from the annotations.
# Index 0 is a placeholder (class ids presumably start at 1 -- confirm).
label_names = ['',]
for i in range(class_names.shape[1]):
    cname = str(class_names[0, i][0]).replace(' ', '_')
    # A '/' inside a class name would be treated as a path separator by
    # join()/makedirs() and produce nested directories; keep one flat folder
    # per class instead.
    cname = cname.replace('/', '-')
    label_names.append(cname)

# Which annotation file describes the images of each stage.
stage2file = {
    "train" : "devkit/cars_train_annos.mat",
    "test" : "cars_test_annos_withlabels.mat"
}

# Copy every image into <DATASET_PATH>/<stage>/<class name>/<file name>.
for stage, annos_file in stage2file.items():
    annos_path = join(DATASET_PATH, annos_file)
    data = scipy.io.loadmat(annos_path)["annotations"]
    for i in range(data.shape[1]):
        record = data[0, i]
        # Field 4 holds the class id as a single-element array; .item()
        # extracts the scalar without relying on int(ndarray), which NumPy
        # has deprecated for arrays with ndim > 0.
        class_id = int(record[4].item())
        fname = record[5][0]
        folder_path = join(DATASET_PATH, stage, label_names[class_id])
        os.makedirs(folder_path, exist_ok=True)
        src_path = join(DATASET_PATH, "cars_"+stage, fname)
        dst_path = join(folder_path, fname)
        shutil.copy(src_path, dst_path)
    print("%s : %s files" %(stage, data.shape[1]))

