In [None]:
import numpy as np
import torch
from torchvision import transforms, datasets, models
import torch.utils.data 
from torchvision.datasets import ImageFolder
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import time
import sys
import copy
import os 
import glob
from shutil import copyfile
from os import listdir
from PIL import Image
import random
from torch.optim import lr_scheduler

In [None]:
#computation of the sampler weights for each images based on the class
def make_weights_for_balanced_classes(images, nclasses):
    """Compute one sampling weight per image, inversely proportional to its class size.

    Args:
        images: sequence of items whose element at index 1 is the integer class
            index of the image (for example the ``imgs`` list of an ImageFolder).
        nclasses: total number of classes; every class index must be in
            ``[0, nclasses)``.

    Returns:
        A list of ``len(images)`` floats. An image of class ``c`` gets the weight
        ``nclasses / (N * count[c])`` where ``N`` is the total number of images
        and ``count[c]`` the number of images of class ``c``.
    """
    # Number of images per class.
    count = [0] * nclasses
    for item in images:
        count[item[1]] += 1

    N = float(sum(count))
    # A class without any image is never looked up below; give it weight 0
    # instead of dividing by zero.
    weight_per_class = [
        float(nclasses) / (N * float(class_count)) if class_count else 0.0
        for class_count in count
    ]
    return [weight_per_class[item[1]] for item in images]

In [None]:
# Split the original training folder into separate train / validation folders,
# mirroring its one-sub-directory-per-class layout.
source_root = "/home/Anthony/data/train"
directories = ["/home/Anthony/data/train_data", "/home/Anthony/data/val_data"]

for directory in directories:
    os.makedirs(directory, exist_ok=True)

# Fraction of each class that goes to the training split; the rest is validation.
train_fraction = 0.8

for class_name in os.listdir(source_root):
    class_source = os.path.join(source_root, class_name)
    # Only class sub-directories are split; stray files next to them are skipped
    # (listing them as a directory would raise).
    if not os.path.isdir(class_source):
        continue

    train_target, val_target = (os.path.join(directory, class_name) for directory in directories)
    os.makedirs(train_target, exist_ok=True)
    os.makedirs(val_target, exist_ok=True)

    images = os.listdir(class_source)
    n_train_images = int(len(images) * train_fraction)

    # The first n_train_images entries of the listing go to train, the remainder to val.
    for position, image_name in enumerate(images):
        target_dir = train_target if position < n_train_images else val_target
        copyfile(os.path.join(class_source, image_name), os.path.join(target_dir, image_name))

In [None]:
#load dataset
# Un-transformed view of the original training folder, used only for the class
# names and the total image count.
dataset = ImageFolder(root='/home/Anthony/data/train')

class_names = dataset.classes
print(class_names)
print('Dataset size = {:.0f}'.format(len(dataset)))

# Training pipeline: random rotation / flips for augmentation, then a 224x224
# centre crop and ImageNet mean/std normalisation.
# NOTE(review): unlike VAL_TRANSF there is no Resize before the crop, so training
# crops are taken at the images' native resolution - confirm this is intended.
TRAIN_TRANSF = transforms.Compose([
    transforms.RandomRotation(50),
    transforms.CenterCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])


# Validation / test pipeline: deterministic resize + centre crop, same normalisation.
VAL_TRANSF = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

train_set = ImageFolder(root='/home/Anthony/data/train_data', transform=TRAIN_TRANSF,)
val_set = ImageFolder(root='/home/Anthony/data/val_data', transform=VAL_TRANSF,)

# For unbalanced dataset we create a weighted sampler
# NOTE: the sampler is built but not handed to the training loader below, which
# shuffles uniformly. To use it, pass sampler=sampler and drop shuffle=True
# (DataLoader does not accept both).
weights = make_weights_for_balanced_classes(train_set.imgs, len(dataset.classes))
weights = torch.DoubleTensor(weights)
sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights))


train_size = len(train_set)
val_size = len(val_set)
print('Train size = {:.0f}, Val size = {:.0f}'.format(train_size, val_size))

batch_size = 64

train_data_loader = torch.utils.data.DataLoader(train_set, batch_size, shuffle=True, num_workers=4)
# Evaluation does not depend on sample order, so the validation loader is not shuffled.
val_data_loader = torch.utils.data.DataLoader(val_set, batch_size, shuffle=False, num_workers=4)


# Fall back to the CPU when CUDA is missing instead of selecting a device that
# would make every later .to(device) call fail.
if not torch.cuda.is_available():
    print("WARNING: CUDA is not available.")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
#show one image of the training set
i = 10
# Fetch the sample once: every train_set[i] access goes through TRAIN_TRANSF again,
# so repeated indexing would reload the image and draw new random augmentations.
sample_tensor, sample_label = train_set[i]

# Channels-first tensor -> channels-last array for matplotlib, then undo the
# mean/std normalisation applied by the transform.
img = sample_tensor.numpy().transpose((1, 2, 0))
mean = np.array([0.485, 0.456, 0.406])
std = np.array([0.229, 0.224, 0.225])
img = std * img + mean
img = np.clip(img, 0, 1)
plt.imshow(img)

print(sample_tensor.shape)
print(class_names[sample_label])

In [None]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    """Train ``model`` and return it with the best-validation-accuracy weights loaded.

    Every epoch runs a training pass over ``train_data_loader`` followed by an
    evaluation pass over ``val_data_loader``. Both loaders and ``device`` are
    module-level names created in the data-loading cell.

    Args:
        model: network to train. Batches are moved to ``device`` here, the model
            is not, so it must already be on ``device``.
        criterion: loss, called as ``criterion(outputs, labels)``.
        optimizer: optimizer updating the model parameters.
        scheduler: learning-rate scheduler, stepped once per epoch.
        num_epochs: number of train + validation epochs.

    Returns:
        ``model``, loaded with the weights of the epoch that reached the highest
        validation accuracy.

    Side effects:
        Prints per-epoch statistics and saves the whole model object to
        ``snakes_2.pth`` in the current working directory.
    """
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    # Each epoch has a training and validation phase, in this order.
    phases = (('train', train_data_loader), ('val', val_data_loader))

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch + 1, num_epochs))
        print('-' * 10)

        for phase, loader in phases:
            is_train = phase == 'train'
            if is_train:
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0
            samples_seen = 0

            for batch_idx, (inputs, labels) in enumerate(loader, start=1):
                if is_train and batch_idx % 100 == 0:
                    print('Batch {:.0f}/{:.0f}\r'.format(batch_idx, len(loader)),)
                inputs = inputs.to(device)
                labels = labels.to(device)

                if is_train:
                    # zero the parameter gradients
                    optimizer.zero_grad()

                # forward; track history only in train
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if is_train:
                        loss.backward()
                        optimizer.step()

                # statistics (loss is a per-batch mean, so weight it by batch size)
                n_batch = inputs.size(0)
                running_loss += loss.item() * n_batch
                running_corrects += torch.sum(preds == labels).item()
                samples_seen += n_batch

            if is_train:
                # Step the schedule after this epoch's optimizer updates
                # (required ordering since PyTorch 1.1).
                scheduler.step()

            # max(..., 1) keeps an empty loader from dividing by zero.
            denominator = max(samples_seen, 1)
            epoch_loss = running_loss / denominator
            epoch_acc = running_corrects / denominator

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                    phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    torch.save(model, 'snakes_2.pth')
    return model

In [None]:
# Pretrained model selection: ResNeXt-50 (32x4d) from torchvision.models,
# constructed with pretrained=True.
model = models.resnext50_32x4d(pretrained=True)
#print(model)

In [None]:
# Count the direct child modules of the model
count = len(list(model.children()))
print(count)

In [None]:
# Optional experiment: freeze the first half of the network
# (uncomment to stop gradient updates for the first six child modules).

#count = 0
#for child in model.children():
#    count+=1
#    if count < 7:
#        for param in child.parameters():
#            param.requires_grad = False


num_classes = 45

# Replace the pretrained 1000-way ImageNet head with a num_classes-way layer.
# Without this the network keeps 1000 outputs, while the confusion matrix and
# the submission file both assume num_classes outputs. This must happen before
# the optimizer is created so the new layer's parameters are optimized.
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, num_classes)

# Optional experiment: a deeper classification head.
# NOTE(review): the 25088 input size and the `classifier` attribute do not match
# the `fc` head used above - presumably left over from another architecture; confirm.

#model.classifier = nn.Sequential(nn.Linear(25088, 4096, bias=True),
#                                 nn.ReLU(),
#                                 nn.Dropout(0.5),
#                                 nn.Linear(4096, 256, bias=True),
#                                 nn.ReLU(),
#                                 nn.Dropout(0.5),
#                                 nn.Linear(256, num_classes))

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0015, weight_decay=1e-5)
# Multiply the learning rate by 0.1 every 7 scheduler steps.
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

model.to(device)

In [None]:
# Train for 5 epochs; train_model returns the model with its best validation weights loaded.
model = train_model(model, criterion, optimizer, exp_lr_scheduler, num_epochs=5)

In [None]:
# Species names, used as axis labels for the confusion matrix and as the
# submission header.
# NOTE(review): entry k is used as the label of class index k, but ImageFolder
# assigns indices from the sorted folder names - verify the two orders agree.
clss = ['pantherophis_spiloides', 'crotalus_pyrrhus','nerodia_rhombifer','pantherophis_alleghaniensis','thamnophis_sirtalis', 'natrix_natrix','crotalus_adamanteus','charina_bottae','pituophis_catenifer',
        'lampropeltis_triangulum','nerodia_erythrogaster','thamnophis_marcianus','thamnophis_proximus','lampropeltis_californiae','crotalus_ruber','rhinocheilus_lecontei','opheodrys_aestivus','thamnophis_ordinoides',
        'thamnophis_radix','masticophis_flagellum','pantherophis_vulpinus','hierophis_viridiflavus','heterodon_platirhinos','pantherophis_emoryi','regina_septemvittata','haldea_striatula','diadophis_punctatus',
        'nerodia_fasciata','storeria_occipitomaculata','crotalus_scutulatus','nerodia_sipedon','storeria_dekayi','crotalus_viridis','opheodrys_vernalis','boa_imperator','pantherophis_obsoletus','crotalus_horridus',
        'lichanura_trivirgata','agkistrodon_contortrix','thamnophis_elegans','agkistrodon_piscivorus','pantherophis_guttatus','crotalus_atrox','carphophis_amoenus','coluber_constrictor']


nb_classes = 45
model.eval()

# Confusion matrix over the validation set: row = true class, column = predicted class.
confusion_matrix = torch.zeros(nb_classes, nb_classes)
with torch.no_grad():
    for inputs, classes in val_data_loader:
        inputs = inputs.to(device)
        classes = classes.to(device)
        outputs = model(inputs)
        _, preds = torch.max(outputs, 1)
        true_indices = classes.view(-1).tolist()
        predicted_indices = preds.view(-1).tolist()
        for true_idx, predicted_idx in zip(true_indices, predicted_indices):
            confusion_matrix[true_idx, predicted_idx] += 1

print(confusion_matrix)

In [None]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

conf_mat = confusion_matrix.numpy()
# Scale every row (true class) by its own largest entry. keepdims=True keeps the
# maxima as a column vector; a flat vector would broadcast across columns and
# divide column j by the maximum of row j instead.
row_max = conf_mat.max(axis=1, keepdims=True)
# Rows without any sample stay all-zero instead of becoming NaN.
conf_mat_norm = np.divide(conf_mat, row_max, out=np.zeros_like(conf_mat), where=row_max > 0)
df_cm = pd.DataFrame(conf_mat_norm, index=list(clss), columns=list(clss))
plt.figure(figsize = (20,14))
sn.heatmap(df_cm)

In [None]:
#creation of submission file

# NOTE(review): the test images are read from the hard-coded ImageFolder root
# below; AICROWD_TEST_IMAGES_PATH is kept for reference but no longer used to
# look up file names - confirm which location should be authoritative.
AICROWD_TEST_IMAGES_PATH = os.getenv('AICROWD_TEST_IMAGES_PATH', '/home/Anthony/data/test/round1')
AICROWD_PREDICTIONS_OUTPUT_PATH = os.getenv('AICROWD_PREDICTIONS_OUTPUT_PATH', '/home/Anthony/data/submission_resnext50.csv')

test_data = ImageFolder(root='/home/Anthony/data/test', transform=VAL_TRANSF)
test_data_loader = torch.utils.data.DataLoader(test_data, 1, shuffle=False)
model.eval()
print('Total test images {:.0f}'.format(len(test_data)))

# Alternative header source: class names read from the mapping file.
#with open('/home/Anthony/data/class_idx_mapping.csv') as f:
#    classes = ['filename']
#    for line in f.readlines()[1:]:
#        class_name = line.split(",")[0]
#        classes.append(class_name)

# Header row: every data row starts with the file name, so the header needs a
# matching leading 'filename' column before the class names.
LINES = [','.join(['filename'] + clss)]

def softmax(x):
    """Compute softmax values for the scores in x (summing over axis 0)."""
    # Subtract the maximum before exponentiating for numerical stability.
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

# Inference only: no gradient tracking needed.
with torch.no_grad():
    for k, (inp, _) in enumerate(test_data_loader):
        inp = inp.to(device)
        # The loader is unshuffled with batch size 1, so sample k is test_data.imgs[k].
        # Taking the path from the dataset keeps each file name paired with its own
        # prediction; an independent glob listing has no guaranteed order.
        _file_path = test_data.imgs[k][0]
        out = model(inp)[0].cpu().numpy()
        probs = list(map(str, softmax(out)))
        LINES.append(",".join([os.path.basename(_file_path)] + probs))
        if (k % 1000 == 0):
            print('Total predictions so far {:.0f}'.format(k))

# The with-block closes the file even if the write fails.
with open(AICROWD_PREDICTIONS_OUTPUT_PATH, "w") as fp:
    fp.write("\n".join(LINES))

BASIC PIPELINE FOR ALL EXPERIMENTS

-Split the dataset into training and validation data

-Preprocess the images (normalize and crop), so they fit into the model we will be using

-Load a pretrained model and add a fully connected layer to it in order to fine-tune the model on our data

-Train our model for 5 epochs (or more) and check accuracy on the validation data


• First approach

The first approach was to set up all the code for dataset splitting, preprocessing, training and submission creation, and to build a very simple model (only two stacked convolutional layers with max pooling and dropout, with a final fully connected layer) to test if the code worked in the right way. 
As expected the simple model was not suitable for the given classification problem and obtained a max of 0.21 accuracy on the validation set.


• Transfer learning

As the simple CNN used at the beginning did not provide satisfying results at all, one of the pretrained models available in the PyTorch libraries was loaded in order to perform transfer learning. 

-The first model chosen was ResNet-18. Only the second half of the network was trained, using the SGD optimizer for 5 epochs, without augmentation of the training data. The best validation accuracy was 0.53, which is a good improvement over the previous model.

-A different optimizer was tested. The same previous network was trained with the Adam optimizer obtaining a 0.56 acc on val set.

-A second fully connected layer was added in order not to pass directly from a 512-dimensional feature vector to the 45 classes of the classifier, but that worsened the validation accuracy to 0.51, probably because the number of neurons was not chosen appropriately and the optimizer learning rate was not good enough. 
For the next models the second fc layer was not taken into consideration, also because its addition increases the training time.

-The following solution was to use a deeper architecture. The new chosen model was ResNet-50. Also in this case only the second half of the layers was trained, with Adam optimizer and for 5 epochs, without augmentation of the training data. The deeper model obtained an accuracy of 0.62 on the validation set.

-Without going even deeper (which would significantly increase the training time), for example with ResNet-101 or ResNet-152, the new solution was to load a more complex model: ResNeXt-50 (32x4d), which obtains better results on the ImageNet dataset compared to ResNet-50. 
In this step a data augmentation on the training set was performed (random rotations, horizontal and vertical flips). 
The model was trained in three different ways:
-only final fully connected layer trained, with all pretrained parameters frozen => 0.46 accuracy on val set
-second half of the network trained, with the other parameters frozen => 0.64 accuracy on val set
-entire network trained, no parameters frozen => 0.68 accuracy on val set

-The last solution was to add a sampler to the data loader, in order to account for the class imbalance. During training the sampler draws images belonging to a poorly represented class with high probability, while images belonging to a class containing a high number of elements are sampled with low probability. 
This solution worsened the results for the ResNeXt-50 with half of the parameters frozen, obtaining a max accuracy of 0.59 (the network converged slowly, so it was trained for 15 epochs). This was probably due to the presence of the same class imbalance in the validation set.

-The batch size used for all the experiments was 64. A couple of initial tests showed that a different size (for example 32 or 128) does not affect the results very much.
