In [1]:
import torch 
import torchvision
import os
import torch.nn as nn
from torchvision import transforms, datasets
import math
import time
from tqdm import tqdm
import shutil
import matplotlib.pyplot as plt


In [2]:
!pip install ofa

Collecting ofa
  Downloading ofa-0.1.0.post202307202001-py3-none-any.whl.metadata (1.4 kB)
Downloading ofa-0.1.0.post202307202001-py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.6/107.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ofa
Successfully installed ofa-0.1.0.post202307202001


In [3]:
cuda_available = torch.cuda.is_available()
if cuda_available:
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    print('Using GPU.')
else:
    print('Using CPU.')

Using GPU.


In [4]:
batch_size=32

#I will use a susbset of imagenetval of 10k images 
if cuda_available:
    # path to the ImageNet dataset
    # link --> https://www.kaggle.com/datasets/titericz/imagenet1k-val
    
    imagenet_data_path = '/kaggle/input/imagenet1k-subset-100k-train-and-10k-val/imagenet_subval'

    # if 'imagenet_data_path' is empty, download a subset of ImageNet containing 2000 images (~250M) for test
    if not os.path.isdir(imagenet_data_path):
        print('%s is empty. Download a subset of ImageNet for test.' % imagenet_data_path)

    print('The ImageNet dataset files are ready.')
else:
    print('Since GPU is not found in the environment, we skip all scripts related to ImageNet evaluation.')
    
    
  
if cuda_available:
    # The following function build the data transforms for test
    def build_val_transform(size):
        return transforms.Compose([
            transforms.Resize(int(math.ceil(size / 0.875))),
            transforms.CenterCrop(size),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]
            ),
        ])
    
    val_data = datasets.ImageFolder(
            root=os.path.join(imagenet_data_path),
            transform=build_val_transform(224)
        )
    

    val_loader = torch.utils.data.DataLoader(
        val_data,
        batch_size=batch_size,  
        shuffle = True,
        num_workers=4,  
        pin_memory=True,
        drop_last=False,
    )
    print('The ImageNet dataloader is ready. Size : {}'.format(len(val_loader)*batch_size))
else:
    data_loader = None
    print('Since GPU is not found in the environment, we skip all scripts related to ImageNet evaluation.')

The ImageNet dataset files are ready.
The ImageNet dataloader is ready. Size : 10016


In [5]:

train_path = '/kaggle/input/imagenet1k-subset-100k-train-and-10k-val/imagenet_subtrain'

train_data = datasets.ImageFolder(
            root= train_path,
            transform=build_val_transform(224)
        )

train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=batch_size, 
        shuffle = True,
        num_workers=4,  
        pin_memory=True,
        drop_last=False,
    )

print('The ImageNet train set is ready. Size : {}'.format(len(train_loader)*batch_size))

The ImageNet train set is ready. Size : 100000


In [6]:
dataloaders = {}
dataloaders['train'] = train_loader
dataloaders['val'] = val_loader

dataset_sizes = {'train': len(train_loader)*32,
                'val': len(val_loader)*32}
print(dataset_sizes)

{'train': 100000, 'val': 10016}


In [7]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=5):
    since = time.time()

    #storing epoch data
    epoch_data = {
        'epoch': [],
        'train': {'loss': [], 'top1_acc': [], 'top5_acc': []},
        'val': {'loss': [], 'top1_acc': [], 'top5_acc': []}
    }
    
    # Create a temporary directory
    tempdir = '/kaggle/working/temp'
    os.makedirs(tempdir, exist_ok=True)
    best_model_params_path = os.path.join(tempdir, 'best_model_params.pt')

    torch.save(model.state_dict(), best_model_params_path)
    best_acc = 0.0

    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1}/{num_epochs}')
        print('-' * 10)
        epoch_data['epoch'].append(epoch+1)
        
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()
            running_loss = 0.0
            top1_corrects = 0
            top5_corrects = 0
            
            for inputs, labels in tqdm(dataloaders[phase], leave=False):
                inputs = inputs.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                running_loss += loss.item() * inputs.size(0)
                
                # Calculate top-1 accuracy
                top1_corrects += torch.sum(preds == labels.data)
                
                # Calculate top-5 accuracy
                _, top5_preds = torch.topk(outputs, 5, dim=1)
                top5_corrects += torch.sum(top5_preds == labels.view(-1, 1))

            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_top1_acc = top1_corrects.double() / dataset_sizes[phase]
            epoch_top5_acc = top5_corrects.double() / dataset_sizes[phase]
            
            epoch_data[phase]['loss'].append(epoch_loss)
            epoch_data[phase]['top1_acc'].append(epoch_top1_acc)
            epoch_data[phase]['top5_acc'].append(epoch_top5_acc)

            print(f'{phase} Loss: {epoch_loss:.4f} Top-1 Acc: {epoch_top1_acc:.4f} Top-5 Acc: {epoch_top5_acc:.4f}')

            if phase == 'val' and epoch_top1_acc > best_acc:
                best_acc = epoch_top1_acc
                best_top5 = epoch_top5_acc
                torch.save(model.state_dict(), best_model_params_path)

        print()

    time_elapsed = time.time() - since
    print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best val Top-1 Acc {best_acc:4f} /n Best val Top-5: {best_top5:4f}')

    model.load_state_dict(torch.load(best_model_params_path))

    # Clean up the temporary directory
    shutil.rmtree(tempdir)

    return model, epoch_data

In [8]:
# def train_model(model, criterion, optimizer, scheduler, num_epochs=5):
#     since = time.time()

#     #storing epoch data
#     epoch_data =     {
#         'epoch': [],
#         'train': {'loss': [], 'acc': []},
#         'val': {'loss': [], 'acc': [] }
#     }
    
#     # Create a temporary directory in Kaggle's temp directory
#     tempdir = '/kaggle/working/temp'
#     os.makedirs(tempdir, exist_ok=True)
#     best_model_params_path = os.path.join(tempdir, 'best_model_params.pt')

#     torch.save(model.state_dict(), best_model_params_path)
#     best_acc = 0.0

#     for epoch in range(num_epochs):
#         print(f'Epoch {epoch+1}/{num_epochs}')
#         print('-' * 10)
#         epoch_data['epoch'].append(epoch+1)
        
#         for phase in ['train', 'val']:
#             if phase == 'train':
#                 model.train()
#             else:
#                 model.eval()
#             running_loss = 0.0
#             running_corrects = 0

            

#             for inputs, labels in tqdm(dataloaders[phase], leave=False):
#                 inputs = inputs.to(device)
#                 labels = labels.to(device)

#                 optimizer.zero_grad()

#                 with torch.set_grad_enabled(phase == 'train'):
#                     outputs = model(inputs)
#                     _, preds = torch.max(outputs, 1)
#                     loss = criterion(outputs, labels)

#                     if phase == 'train':
#                         loss.backward()
#                         optimizer.step()

#                 running_loss += loss.item() * inputs.size(0)
#                 running_corrects += torch.sum(preds == labels.data)

#             if phase == 'train':
#                 scheduler.step()

#             epoch_loss = running_loss / dataset_sizes[phase]
#             epoch_acc = running_corrects.double() / dataset_sizes[phase]
#             epoch_data[phase]['loss'].append(epoch_loss)
#             epoch_data[phase]['acc'].append(epoch_acc)

#             print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

#             if phase == 'val' and epoch_acc > best_acc:
#                 best_acc = epoch_acc
#                 torch.save(model, best_model_params_path)

#         print()

#     time_elapsed = time.time() - since
#     print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
#     print(f'Best val Acc: {best_acc:4f}')

#     torch.load(best_model_params_path)

#     # Clean up the temporary directory
#     shutil.rmtree(tempdir)

#     return model, epoch_data

In [9]:
device = 'cuda' if cuda_available else 'cpu'


In [10]:
all_models = []
name = []
path = '/kaggle/input/searched-models-nas-fpga/models'
for file in os.listdir(path):
    m_path = os.path.join(path, file)
    name.append(file)
    all_models.append(torch.load(m_path))
    print('Loaded')



Loaded
Loaded
Loaded
Loaded
Loaded
Loaded


In [11]:
print('Retraining all models ')
data = {}

for i, model in enumerate(all_models):
    
    print(f'Training {name[i]}\n')
    
    torch.cuda.empty_cache()
    
    
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()

    # Observe that all parameters are being optimized
    optimizer_ft = torch.optim.SGD(model.parameters(), lr=0.00001, momentum=0.9)

    # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer_ft, step_size=2, gamma=0.5)
    
    model, epoch_data = train_model(model, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=2)
    
    torch.save(model, name[i])
    data[name[i]] = epoch_data
    print('*****************************************************')

Retraining all models 
Training moder_search_13.pth

Epoch 1/2
----------


                                                   

train Loss: 0.5530 Top-1 Acc: 0.8859 Top-5 Acc: 0.9813


                                                 

val Loss: 0.9671 Top-1 Acc: 0.7651 Top-5 Acc: 0.9342

Epoch 2/2
----------


                                                   

train Loss: 0.5077 Top-1 Acc: 0.8862 Top-5 Acc: 0.9810


                                                 

val Loss: 0.9443 Top-1 Acc: 0.7656 Top-5 Acc: 0.9330

Training complete in 13m 42s
Best val Top-1 Acc 0.765575 /n Best val Top-5: 0.933007
*****************************************************
Training moder_search_11.pth

Epoch 1/2
----------


                                                   

train Loss: 0.6099 Top-1 Acc: 0.8737 Top-5 Acc: 0.9775


                                                 

val Loss: 0.9903 Top-1 Acc: 0.7567 Top-5 Acc: 0.9274

Epoch 2/2
----------


                                                   

train Loss: 0.5694 Top-1 Acc: 0.8728 Top-5 Acc: 0.9780


                                                 

val Loss: 0.9765 Top-1 Acc: 0.7580 Top-5 Acc: 0.9264

Training complete in 12m 9s
Best val Top-1 Acc 0.757987 /n Best val Top-5: 0.926418
*****************************************************
Training moder_search_20.pth

Epoch 1/2
----------


                                                   

train Loss: 0.4863 Top-1 Acc: 0.8938 Top-5 Acc: 0.9845


                                                 

val Loss: 0.9155 Top-1 Acc: 0.7808 Top-5 Acc: 0.9397

Epoch 2/2
----------


                                                   

train Loss: 0.4462 Top-1 Acc: 0.8958 Top-5 Acc: 0.9846


                                                 

val Loss: 0.8862 Top-1 Acc: 0.7828 Top-5 Acc: 0.9401

Training complete in 14m 21s
Best val Top-1 Acc 0.782847 /n Best val Top-5: 0.940096
*****************************************************
Training moder_search_12.pth

Epoch 1/2
----------


                                                   

train Loss: 0.5644 Top-1 Acc: 0.8828 Top-5 Acc: 0.9801


                                                 

val Loss: 0.9777 Top-1 Acc: 0.7633 Top-5 Acc: 0.9311

Epoch 2/2
----------


                                                   

train Loss: 0.5176 Top-1 Acc: 0.8833 Top-5 Acc: 0.9801


                                                 

val Loss: 0.9557 Top-1 Acc: 0.7632 Top-5 Acc: 0.9317

Training complete in 11m 52s
Best val Top-1 Acc 0.763279 /n Best val Top-5: 0.931110
*****************************************************
Training moder_search_25.pth

Epoch 1/2
----------


                                                   

train Loss: 0.4762 Top-1 Acc: 0.8953 Top-5 Acc: 0.9850


                                                 

val Loss: 0.9287 Top-1 Acc: 0.7852 Top-5 Acc: 0.9416

Epoch 2/2
----------


                                                   

train Loss: 0.4402 Top-1 Acc: 0.8963 Top-5 Acc: 0.9853


                                                 

val Loss: 0.8969 Top-1 Acc: 0.7846 Top-5 Acc: 0.9427

Training complete in 16m 59s
Best val Top-1 Acc 0.785244 /n Best val Top-5: 0.941593
*****************************************************
Training moder_search_15.pth

Epoch 1/2
----------


                                                   

train Loss: 0.5077 Top-1 Acc: 0.8886 Top-5 Acc: 0.9827


                                                 

val Loss: 0.9442 Top-1 Acc: 0.7737 Top-5 Acc: 0.9379

Epoch 2/2
----------


                                                   

train Loss: 0.4634 Top-1 Acc: 0.8902 Top-5 Acc: 0.9833


                                                 

val Loss: 0.9183 Top-1 Acc: 0.7740 Top-5 Acc: 0.9372

Training complete in 13m 4s
Best val Top-1 Acc 0.773962 /n Best val Top-5: 0.937200
*****************************************************


In [12]:
print(data)

{'moder_search_13.pth': {'epoch': [1, 2], 'train': {'loss': [0.5529902351927757, 0.5076511818456649], 'top1_acc': [tensor(0.8859, device='cuda:0', dtype=torch.float64), tensor(0.8862, device='cuda:0', dtype=torch.float64)], 'top5_acc': [tensor(0.9813, device='cuda:0', dtype=torch.float64), tensor(0.9810, device='cuda:0', dtype=torch.float64)]}, 'val': {'loss': [0.9670505795996791, 0.9443170449223381], 'top1_acc': [tensor(0.7651, device='cuda:0', dtype=torch.float64), tensor(0.7656, device='cuda:0', dtype=torch.float64)], 'top5_acc': [tensor(0.9342, device='cuda:0', dtype=torch.float64), tensor(0.9330, device='cuda:0', dtype=torch.float64)]}}, 'moder_search_11.pth': {'epoch': [1, 2], 'train': {'loss': [0.6099257048749924, 0.5694193628692626], 'top1_acc': [tensor(0.8737, device='cuda:0', dtype=torch.float64), tensor(0.8728, device='cuda:0', dtype=torch.float64)], 'top5_acc': [tensor(0.9775, device='cuda:0', dtype=torch.float64), tensor(0.9780, device='cuda:0', dtype=torch.float64)]}, 'va

In [13]:
# def plot_model_train(epoch_data, title = ' '):
#     epochs = epoch_data['epoch']
#     train_loss = epoch_data['train']['loss']
#     val_loss = epoch_data['val']['loss']
#     train_acc = epoch_data['train']['top1_acc']
#     val_acc = epoch_data['val']['top1_acc']

#     # Plotting the training and validation loss
#     plt.figure(1)
#     plt.figure(figsize=(10,6))
#     plt.plot(epochs, train_loss, label='Training Loss', color='blue', linestyle='-', marker='o')
#     plt.plot(epochs, val_loss, label='Validation Loss', color='red', linestyle='--', marker='s')
#     plt.xlabel('Epoch', color='black')
#     plt.ylabel('Loss', color='black')
#     plt.title('Training and validation loss'+title, color='black')
#     plt.legend(loc='upper right', facecolor='white', framealpha=1)
#     plt.grid(color='gray', linestyle='--', linewidth=0.5)
#     plt.xticks(range(1,21))
#     plt.savefig('plot2.png')
#     # Plotting the training and validation accuracy
#     plt.figure(2)
#     plt.figure(figsize=(10,6))
#     plt.plot(epochs, train_acc, label='Training Accuracy', color='blue', linestyle='-', marker='^')
#     plt.plot(epochs, val_acc, label='Validation Accuracy', color='red', linestyle='--', marker='d')
#     plt.xlabel('Epoch', color='black')
#     plt.ylabel('Accuracy', color='black')
#     plt.title('Training and validation accuracy'+title, color='black')
#     plt.legend(loc='lower right', facecolor='white', framealpha=1)
#     plt.grid(color='gray', linestyle='--', linewidth=0.5)
#     plt.xticks(range(1,21))
    
#     n = random.randint(0,100)
#     plt.savefig(f'plot{n}.png')

#     plt.show()

In [14]:
# plot_model_train(epoch_data, title = 'Retraining on Imagenet dataset (weight initialization from OFA)')

In [15]:
# torch.save(model, 'model_retrained_search_11.pth')

In [16]:
# print(epoch_data)