In [9]:
import glob
import os
import shutil
from getpass import getpass
from shutil import copy
from timeit import default_timer as timer

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from azure.storage.blob import BlobClient
from matplotlib.pyplot import imshow
from numpy import asarray, save, load
from PIL import Image
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import optimizers
from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from tqdm import tqdm

%matplotlib inline
pd.set_option('display.max_colwidth', 500)


# Mount ADLS

## Define Mount Function

In [10]:
def mountADLS(subscription_id: str = 'xxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxx',
              resource_group: str = 'Your Azure RG',
              workspace_name: str = 'Azure ML Workspace Name',
              dataset_name: str = 'Name of your dataset in Azure ML') -> str:
    """Mount a registered Azure ML dataset and return its local mount point.

    The defaults are placeholders; pass the real identifiers (or edit the
    defaults) before running.

    Parameters
    ----------
    subscription_id : Azure subscription that owns the workspace.
    resource_group : Resource group that contains the workspace.
    workspace_name : Name of the Azure ML workspace.
    dataset_name : Name under which the dataset is registered in the workspace.

    Returns
    -------
    The path of the folder where the dataset has been mounted.

    Notes
    -----
    The mount context is started but not returned, so the caller has no handle
    to stop the mount later. NOTE(review): confirm that leaving the mount open
    for the lifetime of the kernel is intended.
    """
    # azureml-core of version 1.0.72 or higher is required
    from azureml.core import Workspace, Dataset

    workspace = Workspace(subscription_id, resource_group, workspace_name)
    dataset = Dataset.get_by_name(workspace, name=dataset_name)

    # Create the mount context and start the mount
    mount_ctx = dataset.mount()
    mount_ctx.start()

    # Report where the dataset was mounted
    dataset_mount_folder = mount_ctx.mount_point
    print(dataset_mount_folder)

    # Show the top-level entries as a quick sanity check of the mount
    print(os.listdir(dataset_mount_folder))
    return dataset_mount_folder
    

## Mount ADLS to Compute Instance

In [11]:
# Mount the dataset; `mount` holds the local mount-point path returned by mountADLS().
mount = mountADLS()

/tmp/tmpbq9_o0tg
["Phil's Upload", "Robin's Upload", 'TrainTestSetsBalanced', 'TrainTestSetsBalancedGreyscale', 'balancedSets', 'balancedSetsGreyscale', 'processed']


## Select Root Folder

In [12]:
# Define the root folder in Azure Data Lake that we want to work with.
# The trailing slash is required: folderParser appends its glob pattern
# directly to this string.
root = mount+"/Phil's Upload/"

# Parse Folders/Files

## Define Parsing & DataFrame Functions 

In [13]:
# Function to parse the folders in the root subdirectories
def folderParser(root: str, exclude: tuple = ('archive 2',)) -> list:
    """List the entries two levels below each first-level folder of ``root``.

    Parameters
    ----------
    root : Directory prefix to scan. The glob pattern is appended directly,
        so it must end with a path separator.
    exclude : Substrings; a first-level folder is skipped when its globbed
        path (which includes ``root``) contains any of them.

    Returns
    -------
    A list of lists: one inner list per second-level entry, holding the paths
    found directly beneath it. An inner list is empty when that entry has no
    children. Every level is sorted so the result does not depend on the
    order in which the filesystem lists entries.
    """
    groupedPaths = []
    # Sort on the normalised path so the trailing separator returned by the
    # '*/' pattern does not influence ordering ('archive' before 'archive 1').
    for topFolder in sorted(glob.glob(root + '*/'), key=os.path.normpath):
        if any(token in topFolder for token in exclude):
            continue
        for subFolder in sorted(glob.glob(topFolder + '/*')):
            groupedPaths.append(sorted(glob.glob(subFolder + '/*')))
    return groupedPaths


# Function that creates a columnar dataframe from the folderParser function output
def dataframeCreation(folderlist: list) -> pd.DataFrame:
    """Sort folder paths into 'healthy', 'pneumonia' and 'covid' columns.

    Each path in the nested ``folderlist`` is assigned to the first marker it
    contains, tested in the order NORMAL, COVID, PNEUMONIA; paths containing
    none of them are dropped. Columns can have different lengths, so the frame
    is built row-wise and transposed, which leaves the shorter columns padded
    with missing values.
    """
    buckets = {'healthy': [], 'pneumonia': [], 'covid': []}
    # Order here is the test order, not the column order.
    markerToColumn = (('NORMAL', 'healthy'), ('COVID', 'covid'), ('PNEUMONIA', 'pneumonia'))

    for path in (entry for group in folderlist for entry in group):
        for marker, column in markerToColumn:
            if marker in path:
                buckets[column].append(path)
                break

    return pd.DataFrame.from_dict(buckets, orient='index').transpose()


## Parse Root Folder

In [14]:
# Create the nested list of class folders by scanning `root` with folderParser
folderlist = folderParser(root)

In [15]:
# Display the parsed folder groups for inspection
folderlist

[["/tmp/tmpbq9_o0tg/Phil's Upload/archive/test/NORMAL",
  "/tmp/tmpbq9_o0tg/Phil's Upload/archive/test/PNEUMONIA"],
 ["/tmp/tmpbq9_o0tg/Phil's Upload/archive/train/NORMAL",
  "/tmp/tmpbq9_o0tg/Phil's Upload/archive/train/PNEUMONIA"],
 ["/tmp/tmpbq9_o0tg/Phil's Upload/archive/val/NORMAL",
  "/tmp/tmpbq9_o0tg/Phil's Upload/archive/val/PNEUMONIA"],
 ["/tmp/tmpbq9_o0tg/Phil's Upload/archive 1/test/COVID19",
  "/tmp/tmpbq9_o0tg/Phil's Upload/archive 1/test/NORMAL",
  "/tmp/tmpbq9_o0tg/Phil's Upload/archive 1/test/PNEUMONIA"],
 ["/tmp/tmpbq9_o0tg/Phil's Upload/archive 1/train/COVID19",
  "/tmp/tmpbq9_o0tg/Phil's Upload/archive 1/train/NORMAL",
  "/tmp/tmpbq9_o0tg/Phil's Upload/archive 1/train/PNEUMONIA"]]

## Create Dataframe of Subfolders

In [16]:
# Pass the list from folderParser into the dataframe creation function;
# `df` gets one column per class (healthy / pneumonia / covid).
df = dataframeCreation(folderlist)

In [17]:
# Display the per-class folder table for inspection
df

Unnamed: 0,healthy,pneumonia,covid
0,/tmp/tmpbq9_o0tg/Phil's Upload/archive/test/NORMAL,/tmp/tmpbq9_o0tg/Phil's Upload/archive/test/PNEUMONIA,/tmp/tmpbq9_o0tg/Phil's Upload/archive 1/test/COVID19
1,/tmp/tmpbq9_o0tg/Phil's Upload/archive/train/NORMAL,/tmp/tmpbq9_o0tg/Phil's Upload/archive/train/PNEUMONIA,/tmp/tmpbq9_o0tg/Phil's Upload/archive 1/train/COVID19
2,/tmp/tmpbq9_o0tg/Phil's Upload/archive/val/NORMAL,/tmp/tmpbq9_o0tg/Phil's Upload/archive/val/PNEUMONIA,
3,/tmp/tmpbq9_o0tg/Phil's Upload/archive 1/test/NORMAL,/tmp/tmpbq9_o0tg/Phil's Upload/archive 1/test/PNEUMONIA,
4,/tmp/tmpbq9_o0tg/Phil's Upload/archive 1/train/NORMAL,/tmp/tmpbq9_o0tg/Phil's Upload/archive 1/train/PNEUMONIA,


# Image Preprocessing

## Resize, Convert, Normalize

In [11]:
# Size every image is resized to while loading.
IMAGE_SIZE = (300, 300)


def _load_normalized_images(folder: str) -> list:
    """Load every file in ``folder`` as an image scaled to the [0, 1] range.

    Each file is resized to IMAGE_SIZE, converted to an array and divided by
    255. Files are read in sorted order so repeated runs give the same order.
    """
    loaded = []
    for imagePath in sorted(glob.glob(folder + '/*')):
        img = load_img(imagePath, target_size=IMAGE_SIZE)
        loaded.append(np.divide(img_to_array(img), 255.))
    return loaded


healthyTestArray, healthyTestLabels = [], []
healthyTrainArray, healthyTrainLabels = [], []
pneumoniaTestArray, pneumoniaTestLabels = [], []
pneumoniaTrainArray, pneumoniaTrainLabels = [], []
covidTestArray, covidTestLabels = [], []
covidTrainArray, covidTrainLabels = [], []

# (path marker, integer label, (train images, train labels), (test images, test labels))
CLASS_TARGETS = (
    ('NORMAL', 0, (healthyTrainArray, healthyTrainLabels), (healthyTestArray, healthyTestLabels)),
    ('PNEUMONIA', 1, (pneumoniaTrainArray, pneumoniaTrainLabels), (pneumoniaTestArray, pneumoniaTestLabels)),
    ('COVID', 2, (covidTrainArray, covidTrainLabels), (covidTestArray, covidTestLabels)),
)

for name in df.columns:
    for folder in df[name]:
        # Shorter columns are padded with missing values; skipping anything
        # that is not a string covers both None and NaN padding.
        if not isinstance(folder, str):
            continue
        # Markers are tested independently (no early exit), so a path that
        # contains more than one marker is loaded once per matching class.
        for marker, label, trainTarget, testTarget in CLASS_TARGETS:
            if marker not in folder:
                continue
            # Any folder whose path lacks 'train' (e.g. 'test' and 'val')
            # is collected into the test lists.
            imagesOut, labelsOut = trainTarget if 'train' in folder else testTarget
            images = _load_normalized_images(folder)
            imagesOut.extend(images)
            labelsOut.extend([label] * len(images))

## Convert Lists to NumPy Arrays

In [13]:
# Naming: TEA/TEL = test images/labels, TRA/TRL = train images/labels.
healthyTEA, healthyTEL = np.asarray(healthyTestArray), np.asarray(healthyTestLabels)
healthyTRA, healthyTRL = np.asarray(healthyTrainArray), np.asarray(healthyTrainLabels)

pneumoniaTEA, pneumoniaTEL = np.asarray(pneumoniaTestArray), np.asarray(pneumoniaTestLabels)
pneumoniaTRA, pneumoniaTRL = np.asarray(pneumoniaTrainArray), np.asarray(pneumoniaTrainLabels)

covidTEA, covidTEL = np.asarray(covidTestArray), np.asarray(covidTestLabels)
covidTRA, covidTRL = np.asarray(covidTrainArray), np.asarray(covidTrainLabels)

## Save NumPy Arrays To Current Directory

In [15]:
# File stem -> array; np.save adds the '.npy' extension and writes into the
# current working directory.
arraysToSave = {
    'healthyTestArray': healthyTEA,
    'healthyTestLabels': healthyTEL,
    'healthyTrainArray': healthyTRA,
    'healthyTrainLabels': healthyTRL,
    'pneumoniaTestArray': pneumoniaTEA,
    'pneumoniaTestLabels': pneumoniaTEL,
    'pneumoniaTrainArray': pneumoniaTRA,
    'pneumoniaTrainLabels': pneumoniaTRL,
    'covidTestArray': covidTEA,
    'covidTestLabels': covidTEL,
    'covidTrainArray': covidTRA,
    'covidTrainLabels': covidTRL,
}

for fileStem, array in arraysToSave.items():
    np.save(fileStem, array)

# Upload Arrays to Azure Data Lake "processed" Folder

In [17]:
# Read the key without echoing it, so the secret does not end up in the
# saved notebook output.
storage_account_key = getpass("enter storage account key")

# Replace {storageaccountname} with the real storage account name before running.
storage_url = "https://{storageaccountname}.blob.core.windows.net/"

# Only files whose name ends in '.npy' in the current working directory are
# uploaded; sorted so the upload order is stable.
arraynames = sorted(f for f in os.listdir() if f.endswith('.npy'))

for a in arraynames:
    # The container is "images"; "processed/arrays/" is a virtual folder and
    # therefore belongs in the blob name rather than the container name.
    blob_client = BlobClient(
        storage_url,
        container_name="images",
        blob_name="processed/arrays/" + a,
        credential=storage_account_key,
    )
    with open(a, "rb") as data:
        # overwrite=True lets the cell be re-run after the arrays are regenerated.
        blob_client.upload_blob(data, overwrite=True)