In [5]:
! pip install lmdb 
! pip install pyarrow
! pip install ffcv

Collecting lmdb
  Downloading lmdb-1.3.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (305 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m305.9/305.9 KB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lmdb
Successfully installed lmdb-1.3.0
Collecting pyarrow
  Downloading pyarrow-7.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m68.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: pyarrow
Successfully installed pyarrow-7.0.0


In [1]:
import torch
from torchvision import datasets, models, transforms
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
import os
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, Dataset
import multiprocessing
import pyarrow as pa
import lmdb
import numpy as np
# Worker-process count (one per CPU core) read as a global by `folder2lmdb`.
num_workers = multiprocessing.cpu_count()

In [2]:
# Dataset root; the ImageFolder cell below appends 'train' and 'val' to it.
input_path = "temp/insect_25/"

In [4]:
# Use the first CUDA GPU when torch reports one, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Bare expression so the selected device is displayed as the cell output.
device

device(type='cuda', index=0)

In [5]:
# Pretrained ResNet-50 used as a frozen feature extractor with a new head.
model = models.resnet50(pretrained=True)
model = model.to(device)

# Freeze every weight loaded above; only the head created below is trainable.
for param in model.parameters():
    param.requires_grad = False

# New classifier: 2048 input features -> 128 hidden units -> 25 outputs.
classifier_head = nn.Sequential(
    nn.Linear(2048, 128),
    nn.ReLU(inplace=True),
    nn.Linear(128, 25),
)
model.fc = classifier_head.to(device)

In [3]:
# Per-channel normalisation shared by both transform pipelines.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

# Training adds random shear/scale and horizontal flips; validation only
# resizes, converts to a tensor and normalises.
data_transforms = {
    'train': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomAffine(0, shear=10, scale=(0.8, 1.2)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ]),
    'validation': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        normalize,
    ]),
}

# Split name -> ImageFolder over the matching sub-folder of `input_path`
# (the validation split lives in the 'val' folder).
image_datasets = {
    split: datasets.ImageFolder(input_path + folder, data_transforms[split])
    for split, folder in (('train', 'train'), ('validation', 'val'))
}

# One loader per split; only the training split is shuffled.
dataloaders = {
    split: torch.utils.data.DataLoader(
        image_datasets[split],
        batch_size=256,
        shuffle=(split == 'train'),
        num_workers=multiprocessing.cpu_count(),
    )
    for split in ('train', 'validation')
}

In [6]:
# Loss applied to the model outputs in the training loops below.
criterion = nn.CrossEntropyLoss()
# Adam with default settings, given only the parameters of `model.fc`.
optimizer = optim.Adam(model.fc.parameters())

In [7]:
# Number of epochs run by the ImageFolder training loop below.
num_epochs = 5

In [8]:
import time

### Create the LMDB datasets

In [9]:
import multiprocessing
import pyarrow as pa
import lmdb
# Re-binds `num_workers` to the CPU core count; `folder2lmdb` reads it as a global.
num_workers = multiprocessing.cpu_count()

In [10]:
def raw_reader(path):
    """Return the undecoded bytes of the file at ``path``."""
    with open(path, mode='rb') as handle:
        return handle.read()


def dumps_pyarrow(obj):
    """
    Serialize ``obj`` with pyarrow.

    Returns:
        The buffer obtained from ``pa.serialize(obj).to_buffer()``; it is
        read back elsewhere in this notebook with ``pa.deserialize``.
    """
    serialized = pa.serialize(obj)
    return serialized.to_buffer()

def folder2lmdb(path, outpath, write_frequency=5000):
    """Pack an ``ImageFolder`` directory tree into one LMDB database.

    Each sample is stored as ``dumps_pyarrow((raw_file_bytes, label))`` under
    its ASCII-encoded running index. Two bookkeeping records are written last:
    ``__keys__`` (the list of sample keys) and ``__len__`` (their count).

    Args:
        path: Root of the image tree handed to ``ImageFolder``; ``~`` is expanded.
        outpath: Destination of the database; ``~`` is expanded. It is opened
            with ``subdir=True`` only if it already exists as a directory.
        write_frequency: Whenever the sample index is a multiple of this value
            a progress line is printed and the write transaction is committed.

    The LMDB environment is always closed, even if reading or writing fails.
    """
    directory = os.path.expanduser(path)
    print("Loading dataset from %s" % directory)
    # raw_reader keeps the encoded file bytes, so no image is decoded here.
    dataset = ImageFolder(directory, loader=raw_reader)
    # Default batch size of 1 with an identity collate: every item produced by
    # the loader is a one-element list holding a (bytes, label) pair.
    data_loader = DataLoader(dataset, collate_fn=lambda x: x,num_workers=num_workers)
    lmdb_path = os.path.expanduser(outpath)
    isdir = os.path.isdir(lmdb_path)

    print("Generate LMDB to %s" % lmdb_path)
    db = lmdb.open(lmdb_path, subdir=isdir,
                   map_size=1099511627776 * 2, readonly=False,
                   meminit=False, map_async=True)

    # Defaults keep the bookkeeping below well-defined (zero keys) instead of
    # raising NameError if the loader yields nothing.
    idx, label = -1, None
    try:
        total = len(data_loader)
        txn = db.begin(write=True)
        for idx, data in enumerate(data_loader):
            image, label = data[0]
            txn.put(u'{}'.format(idx).encode('ascii'), dumps_pyarrow((image, label)))
            if idx % write_frequency == 0:
                print("[%d/%d]" % (idx, total))
                # Commit periodically so a single transaction does not grow
                # to cover the whole dataset.
                txn.commit()
                txn = db.begin(write=True)
        print("LABEL:", label)
        # finish iterating through dataset
        txn.commit()

        keys = [u'{}'.format(k).encode('ascii') for k in range(idx + 1)]
        with db.begin(write=True) as txn:
            txn.put(b'__keys__', dumps_pyarrow(keys))
            txn.put(b'__len__', dumps_pyarrow(len(keys)))

        print("Flushing database ...")
        db.sync()
    finally:
        # Closing the environment also invalidates any transaction left open
        # by an exception above, so the handle is never leaked.
        db.close()

In [11]:
class ImageFolderLMDB(Dataset):
    """Map-style dataset over an LMDB database written by ``folder2lmdb``.

    Every record is a pyarrow-serialised ``(encoded_image_bytes, label)`` pair;
    the ``__len__`` and ``__keys__`` records hold the sample count and keys.

    Args:
        db_path: Path of the LMDB database (directory or single file).
        transform: Optional callable applied to the decoded RGB ``PIL.Image``
            as ``transform(img)`` - the torchvision convention used by the
            ``data_transforms`` pipelines in this notebook.
        target_transform: Optional callable applied to the label.
    """

    def __init__(self, db_path, transform=None, target_transform=None):
        self.db_path = db_path
        self.transform = transform
        self.target_transform = target_transform

        # The environment used for sample reads is opened lazily (see
        # __getitem__). Here a short-lived handle reads only the metadata.
        self.env = None
        env = self._open_env()
        try:
            with env.begin(write=False) as txn:
                self.length = pa.deserialize(txn.get(b'__len__'))
                self.keys = pa.deserialize(txn.get(b'__keys__'))
        finally:
            env.close()

    def _open_env(self):
        """Open a read-only, lock-free LMDB environment on ``self.db_path``."""
        return lmdb.open(self.db_path, subdir=os.path.isdir(self.db_path),
                         readonly=True, lock=False,
                         readahead=False, meminit=False)

    def __getitem__(self, index):
        """Return ``(image, target)`` for ``index`` with the transforms applied."""
        # Function-scope imports: the notebook's import cell provides neither
        # name, and the previous code referenced `six` / `Image` unimported.
        from io import BytesIO
        from PIL import Image

        # Open on first access in the process doing the reads. An LMDB
        # environment must not be carried across fork(), so each DataLoader
        # worker ends up with its own handle as long as the dataset is not
        # indexed in the parent process before the workers start.
        if self.env is None:
            self.env = self._open_env()

        with self.env.begin(write=False) as txn:
            byteflow = txn.get(self.keys[index])
        unpacked = pa.deserialize(byteflow)

        # load image: decode the stored file bytes into an RGB PIL image
        img = Image.open(BytesIO(unpacked[0])).convert('RGB')
        # load label
        target = unpacked[1]

        if self.transform is not None:
            # Positional call on the PIL image; torchvision's Compose does not
            # accept the keyword/dict style call used previously.
            img = self.transform(img)

        if self.target_transform is not None:
            target = self.target_transform(target)
        return img, target

    def __len__(self):
        return self.length

    def __repr__(self):
        return self.__class__.__name__ + ' (' + self.db_path + ')'

In [12]:
# Source image folders and the LMDB databases generated from them.
train_dir, train_db_path = "temp/insect_25/train", "temp/insect_25/train-lmdb"
val_dir, val_db_path = "temp/insect_25/val", "temp/insect_25/val-lmdb"

In [14]:
%%time
# Convert both image folders to LMDB. The recorded run below took about
# 3 min 46 s of wall time, so avoid re-running this cell needlessly.
folder2lmdb(train_dir, train_db_path)
folder2lmdb(val_dir, val_db_path)

Loading dataset from temp/insect_25/train
Generate LMDB to temp/insect_25/train-lmdb


  if __name__ == '__main__':


[0/6349]


  if __name__ == '__main__':


[5000/6349]
LABEL: 24
Flushing database ...
Loading dataset from temp/insect_25/val
Generate LMDB to temp/insect_25/val-lmdb


  from ipykernel import kernelapp as app


[0/714]
LABEL: 24
Flushing database ...
CPU times: user 3.6 s, sys: 2.38 s, total: 5.97 s
Wall time: 3min 46s


In [13]:
# Training split read from the LMDB, with the training transform pipeline.
train_dataset = ImageFolderLMDB(train_db_path, data_transforms['train'])

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [14]:
# Validation split read from the LMDB, with the validation transform pipeline.
val_dataset = ImageFolderLMDB(val_db_path, data_transforms['validation'])

In [15]:
# Loader over the LMDB training split: batches of 256 in stored order
# (shuffle=False), one worker process per CPU core.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=256,
    shuffle=False,
    num_workers=multiprocessing.cpu_count(),
)

In [16]:
# Loader over the LMDB validation split. The call was left unterminated (a
# SyntaxError); it is completed with the same settings as `train_loader`.
val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=256,
    shuffle=False,
    num_workers=multiprocessing.cpu_count(),
)


In [17]:
# Train the head and evaluate once per epoch on the ImageFolder-backed
# `dataloaders`, printing the wall time of every epoch.
for epoch in range(num_epochs):
    print('Epoch {}/{}'.format(epoch+1, num_epochs))
    start = time.time()

    for phase in ['train', 'validation']:
        is_training = phase == 'train'
        if is_training:
            model.train()
        else:
            model.eval()

        running_loss = 0.0
        running_corrects = 0

        for inputs, labels in dataloaders[phase]:
            inputs = inputs.to(device)
            labels = labels.to(device)

            # Record an autograd graph only while training; the validation
            # pass needs no gradients and so avoids building one.
            with torch.set_grad_enabled(is_training):
                outputs = model(inputs)
                loss = criterion(outputs, labels)

            if is_training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            _, preds = torch.max(outputs, 1)
            # Weight the batch-mean loss by batch size so the epoch figure is
            # a per-sample average even when the last batch is smaller.
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels)

        # Per-phase metrics; computed but not printed, matching the recorded
        # output of this cell (only the timing lines are shown).
        epoch_loss = running_loss / len(image_datasets[phase])
        epoch_acc = running_corrects.double() / len(image_datasets[phase])
    print("Time taken in 1 epoch", time.time()-start, "seconds.")

Epoch 1/5




Time taken in 1 epoch 47.67885661125183 seconds.
Epoch 2/5




Time taken in 1 epoch 48.91055727005005 seconds.
Epoch 3/5




Time taken in 1 epoch 51.61034798622131 seconds.
Epoch 4/5




Time taken in 1 epoch 53.74762797355652 seconds.
Epoch 5/5




Time taken in 1 epoch 55.84217572212219 seconds.


#### Using FFCV

In [9]:
from ffcv.pipeline.operation import Operation
from ffcv.loader import Loader, OrderOption
from ffcv.transforms import ToTensor, ToDevice, Squeeze, NormalizeImage, \
    RandomHorizontalFlip, ToTorchImage
from ffcv.fields.rgb_image import CenterCropRGBImageDecoder, \
    RandomResizedCropRGBImageDecoder,ResizedCropRGBImageDecoder
from ffcv.fields.basics import IntDecoder

In [10]:
# Re-binds `normalize` and `data_transforms` for this section. Compared with
# the earlier definition, the training pipeline has no RandomAffine step.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

data_transforms = dict(
    train=transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ]),
    validation=transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        normalize,
    ]),
)

In [11]:
# Per-channel mean / std multiplied by 255; passed to ffcv's NormalizeImage
# in the pipeline cell below. The unscaled values match transforms.Normalize.
IMAGENET_MEAN = np.array([0.485, 0.456, 0.406]) * 255
IMAGENET_STD = np.array([0.229, 0.224, 0.225]) * 255
# Crop ratio handed to CenterCropRGBImageDecoder in the pipeline cell below.
DEFAULT_CROP_RATIO = 224/256

### Create the FFCV dataset files

In [12]:
from ffcv.writer import DatasetWriter
from ffcv.fields import RGBImageField, IntField

# Worker count (one per CPU core) read by `save_beton_file` and passed to the
# ffcv loaders below.
import multiprocessing
num_workers = multiprocessing.cpu_count()

In [13]:
def save_beton_file(write_path,my_dataset):
    """Write ``my_dataset`` to ``write_path`` using an ffcv ``DatasetWriter``.

    Two fields are declared per sample: ``image`` (an ``RGBImageField`` with
    ``max_resolution=256`` and ``jpeg_quality=100``) and ``label`` (an
    ``IntField``). The writer is given the notebook-level ``num_workers``.

    Args:
        write_path: Destination path of the dataset file.
        my_dataset: Indexed dataset passed to ``from_indexed_dataset``.
    """
    # One entry per data field; the options trade file size against
    # throughput at train time.
    field_spec = {
        'image': RGBImageField(max_resolution=256, jpeg_quality=100),
        'label': IntField(),
    }
    beton_writer = DatasetWriter(write_path, field_spec, num_workers=num_workers)
    beton_writer.from_indexed_dataset(my_dataset)

In [12]:
# Paths of the ffcv dataset files written by `save_beton_file`.
train_loader_path = "temp/insect_25/train.beton"
val_loader_path = "temp/insect_25/val.beton"
# Batch size used by both ffcv loaders.
batch_size=256
# Bare expression so the value is displayed as the cell output.
batch_size

256

In [13]:
# Plain ImageFolder datasets (no transform passed) used as the source for the
# ffcv files. NOTE: this re-binds `train_dataset` / `val_dataset`, which
# earlier cells bound to the ImageFolderLMDB instances.
train_directory = "temp/insect_25/train"
train_dataset= ImageFolder(train_directory)
val_directory = "temp/insect_25/val"
val_dataset= ImageFolder(val_directory)

In [17]:
# Write both splits to ffcv dataset files. In the recorded run the 714
# validation items took about 19 s and the 6349 training items about 1 min 32 s.
save_beton_file(val_loader_path,val_dataset)
save_beton_file(train_loader_path,train_dataset)

100%|██████████| 714/714 [00:19<00:00, 37.51it/s]
100%|██████████| 6349/6349 [01:32<00:00, 68.80it/s] 


In [14]:
# Re-select the compute device for the ffcv section: first CUDA GPU if
# torch reports one, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # this changes in ddp


256

In [19]:
size = (224, 224)

# NOTE(review): every pipeline gets its own operation instances. ffcv loaders
# appear to attach per-dataset state to the operations they are given, so
# sharing one decoder / label pipeline between the train and validation
# loaders is avoided - confirm against the ffcv Loader documentation.
decoder = CenterCropRGBImageDecoder(size, ratio=DEFAULT_CROP_RATIO)
val_decoder = CenterCropRGBImageDecoder(size, ratio=DEFAULT_CROP_RATIO)

# Data decoding and augmentation (the random flip is applied to training only).
train_image_pipeline = [
    decoder,
    RandomHorizontalFlip(),
    ToTensor(),
    ToDevice(device, non_blocking=True),
    ToTorchImage(),
    NormalizeImage(IMAGENET_MEAN, IMAGENET_STD, np.float32),
]
val_image_pipeline = [
    val_decoder,
    ToTensor(),
    ToDevice(device, non_blocking=True),
    ToTorchImage(),
    NormalizeImage(IMAGENET_MEAN, IMAGENET_STD, np.float32),
]

# Labels follow `device` like the images do; the previous hard-coded
# ToDevice(0) targeted GPU 0 even when `device` is the CPU.
label_pipeline = [IntDecoder(), ToTensor(), ToDevice(device)]
val_label_pipeline = [IntDecoder(), ToTensor(), ToDevice(device)]

# Replaces PyTorch data loader (`torch.utils.data.Dataloader`)
train_loader = Loader(
    train_loader_path,
    batch_size=batch_size,
    num_workers=num_workers,
    order=OrderOption.RANDOM,
    pipelines={
        'image': train_image_pipeline,
        'label': label_pipeline,
    },
)
# Validation reads sequentially and keeps the final partial batch. With
# ffcv's default drop_last=True, 714 samples at batch size 256 would lose the
# last 202 samples from every evaluation.
val_loader = Loader(
    val_loader_path,
    batch_size=batch_size,
    num_workers=num_workers,
    order=OrderOption.SEQUENTIAL,
    drop_last=False,
    pipelines={
        'image': val_image_pipeline,
        'label': val_label_pipeline,
    },
)

In [35]:
# Number of epochs run by the ffcv training loop below.
epochs = 5
import time
# Disabled file logging; `base_save_folder` is not defined in this notebook.
# batchwiselogs= open(base_save_folder+"/500-insectsb4-batchwiselogs.txt","w")
# epoch_timings =  open(base_save_folder+"/500-insectsb4-epochwiselogs.txt","w")
# Best validation accuracy seen so far (updated by the loop below).
best_accuracy=0
# Not read anywhere in the visible cells.
epoch_txts=[]
# Best training accuracy seen so far (updated by the loop below).
best_epoch_accuracy =0.0


In [None]:
# Train and validate on the ffcv loaders, printing the wall time per epoch.
for epoch in range(epochs):
    logs = {}
    print("epoch:",epoch)
    epoch_loss = 0
    epoch_accuracy = 0
    batch_number = 0
    num_train_batches = len(train_loader)
    model.train()
    start = time.time()
    for batch_number, (data, label) in enumerate(train_loader, start=1):
        data = data.to(device,dtype=torch.float)
        # The loop previously squeezed the labels before the loss, i.e. they
        # arrive with an extra dimension. reshape(-1) flattens them to (B,)
        # and, unlike squeeze(), keeps a batch of one sample 1-D.
        label_ = label.to(device,dtype=torch.int64).reshape(-1)

        output = model(data)
        loss = criterion(output, label_)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Compare against the flattened labels: (B,) == (B, 1) would broadcast
        # to a (B, B) matrix and produce a meaningless accuracy.
        acc = (output.argmax(dim=1) == label_).float().mean()
        epoch_accuracy += acc / num_train_batches
        # detach() so the running sum does not keep every batch's autograd
        # graph alive for the whole epoch.
        epoch_loss += loss.detach() / num_train_batches

    # float() works both for the accumulated tensors and for the initial 0
    # when the loader yields no batches.
    logs['training_loss'] = float(epoch_loss)
    logs['training_accuracy'] = float(epoch_accuracy)
    if logs['training_accuracy'] > best_epoch_accuracy:
        best_epoch_accuracy = logs['training_accuracy']

    epoch_val_accuracy = 0
    epoch_val_loss = 0
    num_val_batches = len(val_loader)
    model.eval()
    with torch.no_grad():
        for data, label in val_loader:
            data = data.to(device)
            label_ = label.to(device,dtype=torch.int64).reshape(-1)

            val_output = model(data)
            val_loss = criterion(val_output, label_)

            acc = (val_output.argmax(dim=1) == label_).float().mean()
            epoch_val_accuracy += acc / num_val_batches
            epoch_val_loss += val_loss / num_val_batches
    print("Time_taken for 1 epoch:",time.time()-start)
    logs['validation_loss'] = float(epoch_val_loss)
    logs['validation_accuracy'] = float(epoch_val_accuracy)

    # Compare only the finished epoch's accuracy with the best so far; the
    # earlier in-loop check compared partial running sums.
    if logs['validation_accuracy'] > best_accuracy:
        best_accuracy = logs['validation_accuracy']
        # Directory for checkpoints; saving the model itself is not enabled.
        os.makedirs("artifacts", exist_ok=True)
    


epoch: 0
Time_taken for 1 epoch: 21.419193267822266
epoch: 1
Time_taken for 1 epoch: 21.823665380477905
epoch: 2
