# ImageNet fetch

Clone the [ImageNet-1k](https://huggingface.co/datasets/imagenet-1k) dataset repository from Hugging Face.
Cloning requires a Hugging Face access token and Git LFS.

```bash
huggingface-cli login
cd ~/Downloads
git clone https://huggingface.co/datasets/imagenet-1k
```

In [1]:
import os

# Location of the cloned dataset repository (see the `git clone` step above).
# expanduser("~") is used instead of os.environ['HOME'] so the lookup does not
# raise KeyError when the HOME variable is not set.
git_dir = os.path.join(os.path.expanduser("~"), "Downloads", "imagenet-1k")
# Destination directory for the unpacked images, relative to the working dir.
out_dir = "data/imagenet-1k"

In [2]:
from pathlib import Path
import shutil
import glob

# Promote the string settings from the previous cell to Path objects.
git_dir, out_dir = Path(git_dir), Path(out_dir)
splits = ["test", "train", "val"]

# Create the output root first (including missing parents), then one
# sub-directory per split directly underneath it.
out_dir.mkdir(parents=True, exist_ok=True)
for split in splits:
    (out_dir / split).mkdir(exist_ok=True)

In [3]:
def get_files(split: str):
    """Return the sorted archive paths found for *split* under ``git_dir/data``.

    Two glob patterns are tried: a single ``<split>_images.tar.gz`` archive
    and shards named ``<split>_images_<digit>.tar.gz`` (one digit only).
    An empty list is returned when nothing matches.
    """
    patterns = (
        f"{git_dir}/data/{split}_images.tar.gz",
        f"{git_dir}/data/{split}_images_[0-9].tar.gz",
    )
    return sorted(path for pattern in patterns for path in glob.glob(pattern))

In [4]:
def transform_name(name: str):
    """Split an archive member name into ``(file_name, label)``.

    The text after the last ``.`` is treated as the extension, and the text
    after the last ``_`` of the remaining base is treated as the label. The
    returned file name is the base without that trailing ``_<label>`` part,
    followed by ``.`` and the extension.

    Example: ``"a_b_c.JPEG"`` -> ``("a_b.JPEG", "c")``.
    """
    base, _, ext = name.rpartition(".")
    stem, _, label = base.rpartition("_")
    return f"{stem}.{ext}", label

In [5]:
# Copy classes.py from the cloned repository into the output directory.
shutil.copy(git_dir / "classes.py", out_dir / "classes.py")

PosixPath('data/imagenet-1k/classes.py')

In [6]:
import tarfile

from tqdm import tqdm

# Unpack the "train" and "val" archives into out_dir/<split>/<label>/<file>.
# The label and the output file name both come from transform_name().
for split in ["train", "val"]:
    for filename in get_files(split):
        print(f"Reading `{filename}`")
        with tarfile.open(filename, "r:gz") as tar:
            # getmembers() returns a list, so tqdm can display a total.
            for member in tqdm(tar.getmembers()):
                extract = tar.extractfile(member)
                # extractfile() returns None for members that are neither
                # regular files nor links (e.g. directories); skip those
                # instead of failing on `with None`.
                if extract is None:
                    continue
                name, label = transform_name(member.name)
                label_dir = out_dir.joinpath(split, label)
                label_dir.mkdir(exist_ok=True)
                with extract, label_dir.joinpath(name).open(mode="wb") as output:
                    # Stream in chunks rather than loading the whole image
                    # into memory before writing it out.
                    shutil.copyfileobj(extract, output)


Reading `/Users/marakim/Downloads/imagenet-1k/data/train_images_0.tar.gz`


100%|██████████| 256234/256234 [01:49<00:00, 2339.82it/s]


Reading `/Users/marakim/Downloads/imagenet-1k/data/train_images_1.tar.gz`


100%|██████████| 256234/256234 [01:59<00:00, 2136.70it/s]


Reading `/Users/marakim/Downloads/imagenet-1k/data/train_images_2.tar.gz`


100%|██████████| 256234/256234 [02:03<00:00, 2072.27it/s]


Reading `/Users/marakim/Downloads/imagenet-1k/data/train_images_3.tar.gz`


100%|██████████| 256234/256234 [02:05<00:00, 2046.46it/s]


Reading `/Users/marakim/Downloads/imagenet-1k/data/train_images_4.tar.gz`


100%|██████████| 256231/256231 [02:27<00:00, 1742.73it/s]


Reading `/Users/marakim/Downloads/imagenet-1k/data/val_images.tar.gz`


100%|██████████| 50000/50000 [00:21<00:00, 2304.01it/s]
