In [None]:
#meta 2/10/2021 Kaggle Cassava Module 1-Model Mini K-fold(for rapid iteration)
#fast.ai 2020 study group Kaggle Competition
#src https://www.kaggle.com/c/cassava-leaf-disease-classification
#prev competition src https://www.kaggle.com/c/cassava-disease
#references https://arxiv.org/pdf/1908.02900.pdf

#input: exported data files from my-cassava-data-v2-mini10pct
#     df_train_kfold.pkl or df_train_mini_kfold.pkl
#output: model artifacts, submission.csv
#pwd: /kaggle/working
#based on version of my-kaggle-cassava-1-model.ipynb -> my-kaggle-cassava-1-model-mini_score0812.ipynb

#prev in my-kaggle-cassava-1-model-mini_score0812.ipynb
#2/7/2021 MODULE 1-MODEL MINI W/ INFERENCE + EXPORT, DELTA BS=64
#      $params: MINI = 1, DS_OVERSAMPLED = 0
#      in MINI mode, train mini ds (not oversampled) (exported from Module 0)
#      dataprep: splitter=ColSplitter(), item_tfms = None, batch_tfms = [ *aug_transforms(flip_vert=True, max_zoom=3.0), Normalize.from_stats(*imagenet_stats)])
#                bs=64
#      model: learn = cnn_learner(dls, resnet18, loss_func=LabelSmoothingCrossEntropy(), metrics=accuracy)
#      learn.fine_tune(15)
#      preds,_ = learn.get_preds(dl=test_dl), no TTA
#      Score: .812  Time: 10min + submission time, Rank: n/a

#history
#2/10/2021 POC ADD K-FOLD TO MODULE 1-MODEL MINI W/ INFERENCE + EXPORT, DELTA BS=32
#      $params: MINI = 1, DS_OVERSAMPLED = 0
#      in MINI mode, train mini ds (not oversampled) (exported from Module 0)
#      POC train k models and average prediction probabilities.
#      Verified submission works with N_EPOCHS_MINI=2 only (no need to care about the score 0.710)

#here 2/11/2021 ADD K-FOLD TO MODULE 1-MODEL MINI W/ INFERENCE + EXPORT, DELTA K-FOLD=5
#      $params: MINI = 1, DS_OVERSAMPLED = 0
#      in MINI mode, train mini ds (not oversampled) (exported from Module 0)
#      dataprep: splitter=ColSplitter(), item_tfms = None, batch_tfms = [ *aug_transforms(flip_vert=True, max_zoom=3.0), Normalize.from_stats(*imagenet_stats)])
#                bs=32
#      model: learn = cnn_learner(dls, resnet18, loss_func=LabelSmoothingCrossEntropy(), metrics=accuracy)
#      learn.fine_tune(15)
#      preds,_ = learn.get_preds(dl=test_dl), no TTA
#      Score: .825, .822  Time: 48min + submission time, Rank: n/a


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

import os
import time #to track performance time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#modeling
from fastai.vision.all import *


# Input data files are available in the read-only "../input/" directory
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#--constants and variables
#values tagged #$params are the knobs changed between experiment runs (see the history notes in the header cell)
#seed handed to set_seed() before training
SEED = 24 
#truthy: load df_train_mini_kfold.pkl and train for N_EPOCHS_MINI; falsy: load df_train_kfold.pkl and train for N_EPOCHS_FULL
MINI = 1 #$params
#NOTE(review): not read in the visible cells; presumably records which Module 0 export is in use -- confirm
DS_OVERSAMPLED = 0 #$params
#batch size passed to the DataBlock's dataloaders() call in get_data()
BATCH_SIZE = 32 #$params
#number of epochs given to learn.fine_tune() when MINI is falsy / truthy respectively
N_EPOCHS_FULL = 15 #$params
N_EPOCHS_MINI = 15 #$params
#NOTE(review): not referenced in the visible cells; fine_tune() is called without a freeze_epochs argument
N_EPOCHS_FREEZE = 3 #$params

#set paths
#competition data: train_images/, test_images/, sample_submission.csv
path = Path('../input/cassava-leaf-disease-classification')
#k-fold dataframes exported by Module 0
path_data = Path('../input/my-cassava-data-v2-mini10pct')
#directory the exported per-fold models (model_<i>.pkl) are loaded back from for inference
path_model = Path("/kaggle/working/")

#--functions
def get_x(row):
    "Path of the training image named in `row['image_id']`."
    train_dir = path/'train_images'
    return train_dir/row['image_id']
def get_y(row):
    "Class label stored in `row['label']`."
    label = row['label']
    return label


# Cassava Disease Classification
##### Objective: Classify pictures of cassava leaves into 1 of 4 disease categories or healthy.

Mini mode for rapid iteration


## 0. Load Data


### 0.1 Images  
21397 train images and 1 test image

In [None]:
#(path/'train_images').ls(), (path/'test_images').ls()

In [None]:
#load train/test images
#train_img = get_image_files(path/'train_images')
#test_img = get_image_files(path/'test_images')

#print("Number of files: ", (len(train_img), len(test_img)))

### 0.3 Train Dataset
Exported from Module 0-Data, with 5 k-folds.

In [None]:
#$param
#pick the k-fold dataframe exported by Module 0: the mini sample when MINI is set, the full train set otherwise
#NOTE: torch.load unpickles the file, so it must only be pointed at trusted exports
train_pkl = 'df_train_mini_kfold.pkl' if MINI else 'df_train_kfold.pkl'
df_train = torch.load(path_data/train_pkl)

#quick look at what was loaded
print('train data\n', df_train.shape)
display(df_train.head())

In [None]:
#get k-folds
#number of distinct fold ids in the 'k_fold' column; used as the loop bound when training and when predicting with k models
K_FOLDS = df_train['k_fold'].nunique()
K_FOLDS

In [None]:
#make into k datasets: 5 train sets and 5 validation sets
#each entry is a full copy of df_train plus an 'is_valid' column (1 = validation row, 0 = training row);
#get_data() splits on that column through ColSplitter()
df_train_kfolds = []

df_train_grp = df_train.groupby('k_fold')
for idx, grp in df_train_grp:
    print(grp.shape)
    print(grp.head())

    #flag validation rows straight from the fold id instead of by index membership:
    #index.isin(grp.index) would also match rows of other folds whenever index labels repeat
    #(e.g. an oversampled export concatenated without reset_index), leaking rows between train and validation
    df_train_kfold = df_train.copy()
    df_train_kfold['is_valid'] = (df_train_kfold['k_fold'] == idx).astype(int)

    #vis with valid index 
    #print(df_train_kfold['is_valid'].value_counts())

    df_train_kfolds.append(df_train_kfold)

#confirm k-folds
print(len(df_train_kfolds))

In [None]:
#sanity check: for every fold print the first rows and the train (0) / validation (1) row counts
for df_kfold in df_train_kfolds:
    print(df_kfold.head())
    print(df_kfold['is_valid'].value_counts().sort_index())

In [None]:
#bar chart of train (0) vs validation (1) row counts for the first fold; the trailing ';' suppresses the Axes repr
df_train_kfolds[0]['is_valid'].value_counts().sort_index().plot(kind='bar');


### 1.1 From Data to Dataloaders
with DataBlock

In [None]:
##dataloaders
#dblock = DataBlock(blocks = (ImageBlock, CategoryBlock),
#                 get_x = get_x,
#                 get_y = get_y,
#                 splitter=ColSplitter(), 
#                 item_tfms = None, 
#                 batch_tfms = [ *aug_transforms(flip_vert=True, max_zoom=3.0), Normalize.from_stats(*imagenet_stats)])
#                      

#dls = dblock.dataloaders(df_train, bs=BATCH_SIZE)

#len(dls.train_ds), len(dls.valid_ds)

In [None]:
#wrap dataloaders code into a function
def get_data(fold):
    "Build the fastai `DataLoaders` for k-fold number `fold` (an index into `df_train_kfolds`)."
    #image -> category; ColSplitter() splits train/validation from a dataframe column
    fold_block = DataBlock(
        blocks=(ImageBlock, CategoryBlock),
        get_x=get_x,
        get_y=get_y,
        splitter=ColSplitter(),
        item_tfms=None,
        #batch-level augmentation followed by ImageNet normalisation
        batch_tfms=[*aug_transforms(flip_vert=True, max_zoom=3.0),
                    Normalize.from_stats(*imagenet_stats)],
    )
    fold_df = df_train_kfolds[fold]
    return fold_block.dataloaders(fold_df, bs=BATCH_SIZE) #class fastai.data.core.DataLoaders

In [None]:
#preview
#build the loaders for fold 0 and show one batch from the training set
#NOTE: `dls` is a notebook-level name; the k-fold training loop reassigns it on every fold
dls = get_data(0)
dls.train.show_batch()

## 2. Model
Learn a model to classify a given image into these 4 disease categories or a 5th category indicating a healthy leaf, using the images in the training data

### 2.0 Pretrained model offline

In [None]:
#src Daniel
if not os.path.exists('/root/.cache/torch/hub/checkpoints/'):
        os.makedirs('/root/.cache/torch/hub/checkpoints/')
!cp '../input/resnet185c106cdepth/resnet18-5c106cde.pth' '/root/.cache/torch/hub/checkpoints/resnet18-5c106cde.pth'

### 2.1 Train K-Folds
Train one model per fold. In mini mode, each model is fine-tuned for `N_EPOCHS_MINI` epochs.

In [None]:
#seed the run with SEED; called once here, ahead of the k-fold training loop
set_seed(SEED)

#torch.cuda.empty_cache()
def free_gpu_cache():
    "Release cached, unused GPU memory; called before each fold's learner is created."
    import gc #local import: `gc` is not imported in the notebook's import cell
    #collect unreachable objects first so the tensors they still hold can be freed;
    #empty_cache() only returns cached blocks that no live tensor is using
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
##track time
#t0 = time.time()

##Ch7 pg251 LabelSmoothingCrossEntropy()
#learn = cnn_learner(dls, resnet18, loss_func=LabelSmoothingCrossEntropy(), metrics=accuracy)
#if MINI:
#    learn.fine_tune(N_EPOCHS_MINI) #$param
#else:
#    learn.fine_tune(N_EPOCHS_FULL) #$param

#print ("Training time (in min)", (time.time() - t0)/60)

In [None]:
#train k-folds models
#one resnet18 learner is fine-tuned per fold and exported as /kaggle/working/model_<i>.pkl for inference
#`dls` and `learn` keep the values of the last fold after the loop; later cells rely on that
n_epochs = N_EPOCHS_MINI if MINI else N_EPOCHS_FULL #$param

#start of the whole run; t0 inside the loop is reset on every fold and only measures that fold
t_start = time.time()

for i in range(K_FOLDS):

    print("fold", i)
    dls = get_data(i) # Data
    #len(dls.train),len(dls.valid)
    #dls.show(5)

    name = 'model_' + str(i) 

    #track time (this fold only)
    t0 = time.time()

    free_gpu_cache() 

    #setup learner
    learn = cnn_learner(dls, resnet18, loss_func=LabelSmoothingCrossEntropy(), metrics=accuracy)
    learn.fine_tune(n_epochs)

    print ("Training time (in min)", (time.time() - t0)/60)

    #export model
    learn.export('/kaggle/working/'+name+'.pkl') 

    #if no error CUDA out of memory
    learn.recorder.plot_loss()

#measured from t_start so it covers all folds, not just the last one
print ("Total training time (in min)", (time.time() - t_start)/60) 

In [None]:
##save the state of your model
#if MINI:
#    learn.save('my-mini-15epochs64bs')
#else:
#    learn.save('my-full-15epochs64bs') #$param
    
##export model
#learn.export()

In [None]:
#last learner
#learn.recorder.plot_loss()

### 2.2 Evaluate

In [None]:
#use exports for inference
#learn = load_learner($param path'/export.pkl')

In [None]:
#last learner
#confusion matrix for `learn`, i.e. the learner left over from the final fold of the training loop
interp = ClassificationInterpretation.from_learner(learn)
interp.plot_confusion_matrix()

In [None]:
#last learner
#list the class pairs that were confused with each other at least 10 times
interp.most_confused(min_val=10)

## 3. Predict with K-models

- Predict one image

In [None]:
#view test images - only one
test_img = get_image_files(path/'test_images')
#a bare expression in the middle of a cell is evaluated and discarded, so display() is needed to actually show the image
display(Image.open(test_img[0]))

#predict one image with one model, returns a tuple, get 0th item
#`learn` is the learner left over from the last fold of the training loop
y_hat = learn.predict(test_img[0])
y_hat[0]

In [None]:
#predict one image with k models, returns a tuple, get 0th item
#each exported fold model is loaded from path_model and makes its own prediction
y_hats = []
for i in range(K_FOLDS):
    learner = load_learner(path_model/f'model_{i}.pkl')
    #predict with the learner just loaded; using `learn` (the last trained fold) here
    #would return the same prediction k times
    y_hat = learner.predict(test_img[0])
    y_hats.append(y_hat)

y_hats

- Predict test ds images

In [None]:
#template: the submission starts as a copy of the competition's sample file
sample_submission = pd.read_csv(path/'sample_submission.csv')
submission = sample_submission.copy()

#create test images id: path (as str) of every image listed in the submission, in row order
test_img_id = []
for img_name in submission['image_id'].values:
    test_img_id.append(os.path.join(path/'test_images', img_name))

#images in test files
test_img_id

In [None]:
#predict test images with one model
#uses `dls` and `learn` as left by the last fold of the training loop
test_dl = dls.test_dl(test_img_id)
len(test_dl) #mid-cell expression: evaluated but not displayed

#predict, no TTA
preds,_ = learn.get_preds(dl=test_dl)
#preds

#submission
#label = index of the highest-scoring class for each image
#NOTE: the k-model cell further down assigns submission['label'] again, replacing these values
submission['label'] = preds.argmax(dim=-1).numpy()
submission.head()

In [None]:
#Predictions and DataStructs
#preds, preds.argmax(dim=-1), preds.argmax(dim=-1).numpy()

In [None]:
#predict test images with k models
#load_learner defaults to cpu=True; keep inference on the GPU when one is available
use_cpu = not torch.cuda.is_available()

fold_preds = []
for i in range(K_FOLDS):
    learner = load_learner(path_model/f'model_{i}.pkl', cpu=use_cpu)
    #build the test DataLoader from the loaded learner's own dls so data and model sit on the same device
    test_dl = learner.dls.test_dl(test_img_id)
    #predict, no TTA
    #use the learner just loaded; calling `learn` (the last trained fold) here made the
    #"ensemble" k copies of one model's predictions
    preds,_ = learner.get_preds(dl=test_dl)
    #preds
    fold_preds.append(preds)
#stack the per-fold predictions along a new leading dimension (one entry per fold)
preds = torch.stack(fold_preds)

#submission
#sum the predictions over the folds and take the top class (same argmax as averaging)
submission['label'] = preds.sum(axis=0).argmax(dim=-1).numpy() #preds.argmax(dim=-1).numpy()
submission.head()

#### Predictions and DataStructs
Ideally, select the mode of all k-fold predictions. However, the mode does not work when a tie-break is needed.  
Better: select the class with the highest mean probability across the folds.

In [None]:
#preds.sum(axis=0), preds.sum(axis=0).argmax(),preds.sum(axis=0).argmax(dim=-1).numpy()

## 4. Submit


In [None]:
#write the predictions to submission.csv in the current working directory, without the dataframe index column
submission.to_csv('submission.csv', index=False)


# Xtra

In [None]:
# try: solve CUDA out of memory error
#src https://www.kaggle.com/getting-started/140636
#!pip install GPUtil

#import torch
#from GPUtil import showUtilization as gpu_usage
#from numba import cuda

In [None]:
#def free_gpu_cache():
#    print("Initial GPU Usage")
#    gpu_usage()                             

#    torch.cuda.empty_cache()

    #cuda.select_device(0)
    #cuda.close()
    #cuda.select_device(0)

#    print("GPU Usage after emptying the cache")
#    gpu_usage()


In [None]:
'''#scipy.stats and mode example
a = np.array([[1, 3, 4, 2, 2, 7],
              [5, 2, 2, 1, 4, 1],
              [3, 3, 2, 2, 1, 1]])

m = stats.mode(a)
print(m)
m = stats.mode(a, axis=1)
print(m)
m = stats.mode(a, axis=None)
print(m)
'''

In [None]:
'''#originally, select Mode of all k-fold predictions
print(preds) #class 'torch.Tensor'
print(preds.argmax(dim=-1).numpy()) #class 'numpy.ndarray'
print(stats.mode(preds.argmax(dim=-1).numpy())[0]) #class 'numpy.ndarray'

#view all three
preds, preds.argmax(dim=-1).numpy(), stats.mode(preds.argmax(dim=-1).numpy())[0]
'''