In [182]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import splitfolders
import os 
import torchvision 
from sklearn.model_selection import train_test_split 
from torch.utils.data import Dataset, DataLoader, random_split, SubsetRandomSampler, WeightedRandomSampler
from torchvision import datasets, transforms
import torch 

# Is the dataset balanced?


# DRY
It would be cleaner to build each class path from `root`, e.g. `daisy = Path(root) / 'daisy'`.

In [57]:
# Dataset root, relative to the notebook. Forward slashes are accepted by Python on
# Windows as well as Linux/macOS, unlike the previous doubled-backslash raw strings.
root = "../Task/flowers-recognition/flowers"

# Derive every class folder from `root` so the base path is written only once.
daisy = os.path.join(root, "daisy")
dandelion = os.path.join(root, "dandelion")
rose = os.path.join(root, "rose")
sunflower = os.path.join(root, "sunflower")
tulip = os.path.join(root, "tulip")

# Charts are more representative and more digestible than tables.

In [71]:
# Display label -> class folder (the folder paths come from the cell above).
class_dirs = {
    "Daisy": daisy,
    "Dandelion": dandelion,
    "Rose": rose,
    "Sunflower": sunflower,
    "Tulip": tulip,
}

# One directory listing per class; every entry in a class folder counts as one image.
class_counts = {label: len(os.listdir(path)) for label, path in class_dirs.items()}
total = sum(class_counts.values())

# Keep the individual names so any later cell that reads them still works.
daisy_count = class_counts["Daisy"]
dandelion_count = class_counts["Dandelion"]
rose_count = class_counts["Rose"]
sunflower_count = class_counts["Sunflower"]
tulip_count = class_counts["Tulip"]

for label, count in class_counts.items():
    # Avoid ZeroDivisionError when all class folders are empty.
    share = round(100 * (count / total), 1) if total else 0.0
    print(f"{label} count: ", count, ' or ', share, "%")
print('Total images: ', total)

Daisy count:  764  or  17.7 %
Dandelion count:  1052  or  24.4 %
Rose count:  784  or  18.2 %
Sunflower count:  733  or  17.0 %
Tulip count:  984  or  22.8 %
Total images:  4317


As we can see, the dataset is slightly imbalanced, but the class ratios are acceptable. The imbalance is small, so we don't need to 
apply oversampling/undersampling techniques. Later, during model training, I will show that we can obtain 
good results without changing anything in the dataset.

# What features should be involved in a train-test split decision? Should you use a stratification for that?

# If the dataset is small, why not use cross-validation?

Since the dataset is small and reasonably balanced, we will use all images for modeling.
We can do a train/test split with stratified sampling. The train/test ratio is 80/20.

Let's check the class ratios in the train/test sets.

First, let's create a PyTorch Dataset.

In [None]:
root_dir = "../Task/flowers-recognition/flowers"

# Preprocessing applied to each image: resize to 224x224, convert to a tensor,
# then normalise every channel with the given mean/std.
# NOTE(review): these mean/std values look like the usual ImageNet statistics — confirm.
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
]
flower_transform = transforms.Compose(preprocessing_steps)
flower_dataset = datasets.ImageFolder(root_dir, transform=flower_transform)

In [215]:
# Show which integer label the dataset assigned to each class name.
flower_dataset.class_to_idx

{'daisy': 0, 'dandelion': 1, 'rose': 2, 'sunflower': 3, 'tulip': 4}

Now define a dictionary that inverts the key-value pairs (index to class name).

In [198]:
# Reverse lookup: integer label -> class name (the inverse of class_to_idx).
idx2class = dict(
    zip(flower_dataset.class_to_idx.values(), flower_dataset.class_to_idx.keys())
)
idx2class

{0: 'daisy', 1: 'dandelion', 2: 'rose', 3: 'sunflower', 4: 'tulip'}

To understand the distribution of classes in the dataset, let's define the following function.

In [200]:
def get_class_distribution(dataset_obj):
    """Count how many samples of each class a dataset contains.

    Labels are read from ``dataset_obj.targets`` when that attribute exists, so no
    image has to be loaded or transformed just to count labels. Only when the
    attribute is missing does the function fall back to iterating the dataset and
    reading the second item of every sample.

    NOTE(review): ``targets`` is assumed to hold the same labels that iterating the
    dataset yields (i.e. no ``target_transform`` is in use) — confirm for other datasets.

    Args:
        dataset_obj: object exposing ``class_to_idx`` (class name -> integer label)
            and, optionally, ``targets`` (one integer label per sample).

    Returns:
        dict mapping every class name in ``class_to_idx`` to its sample count
        (0 for classes with no samples).

    Raises:
        KeyError: if a label is not one of the values of ``class_to_idx``.
    """
    # Build the reverse lookup from the dataset that was passed in, instead of
    # depending on a notebook-level global.
    index_to_class = {idx: name for name, idx in dataset_obj.class_to_idx.items()}
    count_dict = {name: 0 for name in dataset_obj.class_to_idx}

    labels = getattr(dataset_obj, "targets", None)
    if labels is None:
        # Slow path: each sample is fetched in full, only its label is used.
        labels = (sample[1] for sample in dataset_obj)

    for label in labels:
        count_dict[index_to_class[int(label)]] += 1

    return count_dict
# Count labels over the whole dataset, before any train/validation split.
print("Distribution of classes: \n", get_class_distribution(flower_dataset))

Distribution of classes: 
 {'daisy': 764, 'dandelion': 1052, 'rose': 784, 'sunflower': 733, 'tulip': 984}


As we saw in the previous step, the class distribution isn't perfectly balanced, but the imbalance is not critical.

We can use sklearn's train_test_split, which lets us split the dataset with stratification.

In [216]:
# Integer label of every sample, used as the stratification key.
targets = flower_dataset.targets

# Split sample positions (not the samples themselves) 80/20, with a fixed seed
# so the split is reproducible.
all_indices = np.arange(len(targets))
train_idx, valid_idx = train_test_split(
    all_indices,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=targets,
)


In [217]:
# One SubsetRandomSampler per split, each built from that split's index array.
train_sampler, val_sampler = (
    SubsetRandomSampler(split_indices) for split_indices in (train_idx, valid_idx)
)

In [218]:
# Both loaders read the same dataset one sample at a time; the sampler alone
# decides which samples each loader sees, so shuffle stays off.
shared_loader_args = dict(dataset=flower_dataset, shuffle=False, batch_size=1)
train_loader = DataLoader(sampler=train_sampler, **shared_loader_args)
val_loader = DataLoader(sampler=val_sampler, **shared_loader_args)

Similarly, we can define a function to inspect the class distribution in the DataLoaders.

In [None]:
def get_class_distribution_loaders(dataloader_obj, dataset_obj):
    """Count how many samples of each class a data loader yields.

    Works for any batch size: the labels of every batch are flattened and counted
    one by one (the previous ``.item()`` call only worked with ``batch_size=1``).

    Args:
        dataloader_obj: iterable yielding ``(inputs, labels)`` pairs; ``labels`` may
            be a tensor of any length or anything ``torch.as_tensor`` accepts.
        dataset_obj: object exposing ``class_to_idx`` (class name -> integer label).

    Returns:
        dict mapping every class name in ``class_to_idx`` to the number of samples
        seen (0 for classes the loader never produced).

    Raises:
        KeyError: if a label is not one of the values of ``class_to_idx``.
    """
    # Build the reverse lookup from the dataset that was passed in, instead of
    # depending on a notebook-level global.
    index_to_class = {idx: name for name, idx in dataset_obj.class_to_idx.items()}
    count_dict = {name: 0 for name in dataset_obj.class_to_idx}

    for _, batch_labels in dataloader_obj:
        # Flatten to a plain list of Python numbers so each label in the batch is counted.
        for label in torch.as_tensor(batch_labels).flatten().tolist():
            count_dict[index_to_class[label]] += 1

    return count_dict

In [209]:
# Per-class sample counts in the training split.
get_class_distribution_loaders(train_loader, flower_dataset)

{'daisy': 611, 'dandelion': 842, 'rose': 627, 'sunflower': 586, 'tulip': 787}

In [211]:
# Per-class sample counts in the validation split.
get_class_distribution_loaders(val_loader, flower_dataset)

{'daisy': 153, 'dandelion': 210, 'rose': 157, 'sunflower': 147, 'tulip': 197}

The class ratios in both the train and validation sets are the same as in the original dataset.