In [25]:
# Imports

# File handling
import os
from zipfile import ZipFile

# Visualization
import  matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Data and maths
import numpy as np
import pandas as pd
import random

# ML
import tensorflow as tf
from tensorflow import keras
from keras.utils import image_dataset_from_directory
from keras.preprocessing.image import ImageDataGenerator

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
#from keras.callbacks import CSVLogger # if problems with SAVING model

# Metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [60]:
def data_split(dataframes, val_split):
    
    train_set = pd.DataFrame()
    test_set = pd.DataFrame()
    val_set = pd.DataFrame()
    
    for df in dataframes:
        df = shuffle(df)
        df.reset_index(drop=True, inplace=True)
        val = pd.DataFrame(df[:round(val_split*len(df))-1])
        val_set = pd.concat([val_set, val], ignore_index=True) # add validation data to main validation set (val_set)
        df = df.drop(val.index) # drop validation data from df
        
        df.reset_index(drop=True, inplace=True)
        train, test = train_test_split(df, train_size=0.85) # 85/15 train/test split
        train_set = pd.concat([train_set, train], ignore_index=True)
        test_set = pd.concat([test_set, test], ignore_index=True)
        
    train_set = train_set.assign(SPLIT = lambda x: ('train'))
    test_set = test_set.assign(SPLIT = lambda x: ('test'))
    val_set = val_set.assign(SPLIT = lambda x: ('valid'))
    
    df = pd.concat([train_set, test_set, val_set], ignore_index=True) # combine all 3 sets back in 1 df
    df.reset_index(drop=True, inplace=True)
    return df

In [16]:
normal_df = pd.read_excel(r'data\Normal.metadata.xlsx').assign(PATH = lambda x: ('/data/Normal/' + x['FILE NAME']))
covid_df = pd.read_excel(r'data\COVID.metadata.xlsx').assign(PATH = lambda x: ('/data/COVID/' + x['FILE NAME']))
opacity_df = pd.read_excel(r'data\Lung_Opacity.metadata.xlsx').assign(PATH = lambda x: ('/data/Lung_Opacity/' + x['FILE NAME']))
pneumonia_df = pd.read_excel(r'data\Viral Pneumonia.metadata.xlsx').assign(PATH = lambda x: ('/data/Viral Pneumonia/' + x['FILE NAME']))

df = pd.concat([normal_df, covid_df, opacity_df, pneumonia_df])
df = shuffle(df)
df.reset_index(drop=True, inplace=True)

In [66]:
df = data_split([normal_df, covid_df, opacity_df, pneumonia_df], 0.09)[['FILE NAME', 'PATH', 'SPLIT']]
df

Unnamed: 0,FILE NAME,PATH,SPLIT
0,NORMAL-2571,/data/Normal/NORMAL-2571,train
1,NORMAL-2581,/data/Normal/NORMAL-2581,train
2,NORMAL-7077,/data/Normal/NORMAL-7077,train
3,NORMAL-1917,/data/Normal/NORMAL-1917,train
4,NORMAL-5488,/data/Normal/NORMAL-5488,train
...,...,...,...
21160,Viral Pneumonia-139,/data/Viral Pneumonia/Viral Pneumonia-139,valid
21161,Viral Pneumonia-583,/data/Viral Pneumonia/Viral Pneumonia-583,valid
21162,Viral Pneumonia-995,/data/Viral Pneumonia/Viral Pneumonia-995,valid
21163,Viral Pneumonia-313,/data/Viral Pneumonia/Viral Pneumonia-313,valid


In [67]:
df['SPLIT'].value_counts()

train    16374
test      2891
valid     1900
Name: SPLIT, dtype: int64

In [68]:
print('train: ', 16193/21165)
print('test: ', 2860/21165)
print('valid: ', 1900/21165)

train:  0.7650838648712497
test:  0.13512875029529883
valid:  0.08977084809827546


In [7]:
#put the paths in the df and use one of these



# DO NOT RUN         !WORK IN PROGRESS!

# create training, testing and validation sets from df with chosen 15 bird labels only
train_set = df.loc[df['data set'] == 'train']
train = pd.DataFrame()# empty df

test_set = df.loc[df['data set'] == 'test']
test = pd.DataFrame()# empty df

valid_set = df.loc[df['data set'] == 'valid']
valid = pd.DataFrame()# empty df


for bird in labels:
    train = pd.concat([train, train_set.loc[train_set['labels'] == bird]], ignore_index=True)
    test = pd.concat([test, test_set.loc[test_set['labels'] == bird]], ignore_index=True)
    valid = pd.concat([valid, valid_set.loc[valid_set['labels'] == bird]], ignore_index=True)

# setting up folder paths for filtered training, testing and validation sets

old_test = '/data/test/'
filtered_test = '/data/new_test/'# new filepath
allfiles = os.listdir(old) # list of folder names
for f in allfiles: # move folders
    src_path = os.path.join(old, f)
    dst_path = os.path.join(filtered, f)
    os.rename(src_path, dst_path)

test = filtered

source = '/data/valid/'
destination = '/data/new_valid/'# new filepath
allfiles = os.listdir(source) # list of folder names
for f in allfiles: # move folders
    src_path = os.path.join(source, f)
    dst_path = os.path.join(destination, f)
    os.rename(src_path, dst_path)

valid = destination