In [None]:
import re, math, os, cv2, random
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from PIL import Image 

In [None]:
# List the entries of the competition input directory.
# NOTE(review): `myList` is not referenced again in the visible notebook.
myList = os.listdir('../input/cassava-leaf-disease-classification')

### Notebooks that were a help


- [cdeotte's How to Create TFrecords](https://www.kaggle.com/cdeotte/how-to-create-tfrecords)
- [dimitreoliveira's CLD Notebook](https://www.kaggle.com/dimitreoliveira/cassava-leaf-disease-stratified-tfrecords-256x256)
- [TF Docs](https://www.tensorflow.org/tutorials/load_data/tfrecord)

In [None]:
# Setting seeds for reproducibility
SEED = 3141

def seed_everything(seed):
    """Seed every random number generator this notebook imports.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy's
            global RNG and TensorFlow's global generator.
    """
    # Exported for child processes; the hash seed of the already-running
    # interpreter is fixed at startup and is not affected by this assignment.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

# Apply SEED to the libraries handled by seed_everything.
seed_everything(SEED)

In [None]:
#2020
# PATH_TO_IMG = '../input/cassava-leaf-disease-classification/train_images/'
# IMG_PATH = '../input/cassava-leaf-disease-classification/train_images'

# train_df = pd.read_csv('../input/cassava-leaf-disease-classification/train.csv')
# print("train_images: {}".format(train_df.shape[0]))
# train_df

In [None]:
# Merged 2019 + 2020 data
IMG_PATH = '../input/cassava-leaf-disease-merged/train'      # image directory, no trailing slash
PATH_TO_IMG = '../input/cassava-leaf-disease-merged/train/'  # same directory, ready for filename concatenation

# One row per training image.
train_df = pd.read_csv('../input/cassava-leaf-disease-merged/merged.csv')
print("train_images: {}".format(len(train_df)))

### Test: Removing Noisy Images

- After training a respectable model to 0.883 LB accuracy, I used this model to view each prediction made on the validation dataset. It might be beneficial to remove some outliers/really noisy images in order to make our model more predictive on unseen data.

- Another interesting solution could be to replace what the model thinks are wrong labels with the predicted label. I am testing that solution to see if it helps.

In [None]:
# ni_df = pd.read_csv('../input/cld-noisy-labels/submission.csv')
# ni_df = ni_df.sort_values('confidence') #sorting by prediction confidence

# #replacing wrong labels where the prediction confidence is over 0.85
# x = ni_df.query('label != pred_label and confidence > 0.85')
# x = x.reset_index()#getting incorrect preds with high confidence

# for pair in x[['index','pred_label']].values:
#     train_df.loc[pair[0],'label'] = pair[1]

# train_df = train_df.reset_index(drop=True)

Experimenting which parts of the data it would be best to remove.

- Test 1: 100 highest + 100 lowest prediction confidence
- Test 2: 200 highest prediction confidence
- Test 3: 200 lowest prediction confidence

### TFrecord Functions

In [None]:
def _bytes_feature(value):
    """Return a ``tf.train.Feature`` holding a bytes_list built from a string / byte."""
    eager_tensor_type = type(tf.constant(0))
    if isinstance(value, eager_tensor_type):
        # BytesList won't unpack a string from an EagerTensor, so take its numpy value.
        value = value.numpy()
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Return a ``tf.train.Feature`` holding a float_list built from a float / double."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Return a ``tf.train.Feature`` holding an int64_list built from a bool / enum / int / uint."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

The cell below is how we serialize each image, and will change based on the features that we want to encode in the tfrecord and their respective data_types.

In [None]:
def serialize_example(image, target, image_name):
    """Serialize one sample as a ``tf.train.Example`` byte string.

    Args:
        image: Value stored under the ``'image'`` key as a bytes feature.
        target: Value stored under the ``'target'`` key as an int64 feature.
        image_name: Value stored under the ``'image_name'`` key as a bytes feature.

    Returns:
        The result of ``SerializeToString()`` on the assembled example.
    """
    encoded_fields = dict(
        image=_bytes_feature(image),
        target=_int64_feature(target),
        image_name=_bytes_feature(image_name),
    )
    features = tf.train.Features(feature=encoded_fields)
    return tf.train.Example(features=features).SerializeToString()

### Parameters

Until I can figure out an efficient way to split the data within the TFRecords, I create a large number of files so that cross-validation can be performed during training.

In [None]:
N_FILES = 50  # split images into 50 files (also used as the number of stratified folds)
NEW_HEIGHT = NEW_WIDTH = 512  # size, in pixels, of the images written to the TFRecords
IMG_QUALITY = 100  # value passed with cv2.IMWRITE_JPEG_QUALITY when encoding

### Stratified K-Fold

Would like to try and see if oversampling minority labels will provide a boost to score.

In [None]:
# One stratified fold per output file: the held-out part of each split
# determines which TFRecord file a row is written to.
folds = StratifiedKFold(n_splits=N_FILES, shuffle=True, random_state=SEED)
train_df['file'] = -1
file_col = train_df.columns.get_loc('file')

# folds.split yields positional (train, held-out) index arrays for every fold.
for fold_n, (train_idx, val_idx) in enumerate(folds.split(train_df, train_df['label'])):
    print('File: %s has %s samples' % (fold_n+1, len(val_idx)))
    # Write through the frame itself with positional indexing. The chained form
    # `train_df['file'].loc[val_idx] = ...` assigns via an intermediate Series,
    # which raises SettingWithCopyWarning and is a no-op under Copy-on-Write.
    train_df.iloc[val_idx, file_col] = fold_n

In [None]:
# Save the dataframe, including the `file` column added above, without the index.
train_df.to_csv('train.csv',index=False)

### Writing to TFrecords

I made two functions that write to tfrecords here. 

1. Resizes all the images from 600x800 to 512x512 (smushes images)

2. Center-crops all the images to 512x512

The center-cropped images seem to do better in this case



#### Option 1: Resize

In [None]:
# Option 1: resize every image to NEW_WIDTH x NEW_HEIGHT and write one
# TFRecord file per value of the `file` column.
for tfrec_num in range(N_FILES):
    print('\nWriting TFRecord %i of %i...'%(tfrec_num, N_FILES))
    samples = train_df[train_df['file'] == tfrec_num]
    n_samples = len(samples)
    print(f'{n_samples} samples')
    # The number of samples is embedded in the file name.
    with tf.io.TFRecordWriter('Id_train%.2i-%i.tfrec'%(tfrec_num, n_samples)) as writer:
        for row in samples.itertuples():
            label = row.label
            image_name = row.image_id
            img_path = f'{PATH_TO_IMG}{image_name}'

            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread signals an unreadable / missing file by returning None.
                raise FileNotFoundError(f'Could not read image: {img_path}')
            # cv2.resize expects dsize as (width, height).
            img = cv2.resize(img, (NEW_WIDTH, NEW_HEIGHT))
            # tobytes() replaces the deprecated ndarray.tostring().
            img = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, IMG_QUALITY))[1].tobytes()

            example = serialize_example(img, label, str.encode(image_name))
            writer.write(example)
            

#### Option 2: Center Crop

I am using both PIL framework and openCV to manipulate and encode the images. 

Here is a discussion which I used to help transfer images between both packages. --> [PIL to CV Discussion](https://stackoverflow.com/questions/14134892/convert-image-from-pil-to-opencv-format)

I think version three had BGR imgs rather than RGB. 

NOTE: testing to see whether the BGR to RGB conversion had an effect on submission files. 

In [None]:
# for tfrec_num in range(N_FILES):
#     print('\nWriting TFRecord %i of %i...'%(tfrec_num, N_FILES))
#     samples = train_df[train_df['file'] == tfrec_num]
#     n_samples = len(samples)
#     print(f'{n_samples} samples')
#     with tf.io.TFRecordWriter('Id_train%.2i-%i.tfrec'%(tfrec_num, n_samples)) as writer:
#         for row in samples.itertuples():
#             label = row.label
#             image_name = row.image_id
#             img_path = f'{PATH_TO_IMG}{image_name}'
#             img = Image.open(img_path) #opening with PIL image to center-crop photo
            
#             #center-crop image
#             width, height = img.size   # Get dimensions
#             left = (width - NEW_WIDTH)/2
#             top = (height - NEW_HEIGHT)/2
#             right = (width + NEW_WIDTH)/2
#             bottom = (height + NEW_HEIGHT)/2
#             img = img.crop((left, top, right, bottom))
            
#             #converting to np.array
#             open_cv_image = np.array(img)
#             open_cv_image = open_cv_image[:, :, ::-1].copy() #when going from PIL to CV2 change RGB to BGR
            
#             #using cv2 package to encode image
#             img = cv2.imencode('.jpg', open_cv_image, (cv2.IMWRITE_JPEG_QUALITY, IMG_QUALITY))[1].tostring()
            
#             example = serialize_example(img, label, str.encode(image_name))
#             writer.write(example)

### Option 3 Center-Crop Merged Data

The images from the 2019 data are different shapes and sizes.

We have to take into account that we can't center-crop images that are smaller than 512x512; in these cases we resize the images instead.

In [None]:
# for tfrec_num in range(N_FILES):
#     print('\nWriting TFRecord %i of %i...'%(tfrec_num, N_FILES))
#     samples = train_df[train_df['file'] == tfrec_num]
#     n_samples = len(samples)
#     print(f'{n_samples} samples')
#     with tf.io.TFRecordWriter('Id_train%.2i-%i.tfrec'%(tfrec_num, n_samples)) as writer:
#         for row in samples.itertuples():
#             label = row.label
#             image_name = row.image_id
#             img_path = f'{PATH_TO_IMG}{image_name}'
#             img = Image.open(img_path) #opening with PIL image to center-crop photo
            
#             if img.size[0] >= 512 and img.size[1] >= 512:
#                 #center-crop image
#                 width, height = img.size   # Get dimensions
#                 left = (width - NEW_WIDTH)/2
#                 top = (height - NEW_HEIGHT)/2
#                 right = (width + NEW_WIDTH)/2
#                 bottom = (height + NEW_HEIGHT)/2
#                 img = img.crop((left, top, right, bottom))
               
#             else:
#                 img = img.resize((NEW_WIDTH, NEW_HEIGHT))
            
#             #converting to np.array
#             open_cv_image = np.array(img)
#             open_cv_image = open_cv_image[:, :, ::-1].copy() #when going from PIL to CV2 change RGB to BGR
            
#             #using cv2 package to encode image
#             img = cv2.imencode('.jpg', open_cv_image, (cv2.IMWRITE_JPEG_QUALITY, IMG_QUALITY))[1].tostring()
            
#             example = serialize_example(img, label, str.encode(image_name))
#             writer.write(example)