# Data / Feature Engineering

In this notebook, we acheive the following feature engineering efforts:
    - use image processing library to parse the image into matrices
    - we reshape the matrice 
    - store the matrice into table 

In [1]:
from IPython.display import display, HTML, Image , Markdown
from snowflake.snowpark.session import Session
import snowflake.snowpark.types as T
import snowflake.snowpark.functions as F
import os ,configparser ,json ,logging

# Import the commonly defined utility scripts using
# dynamic path include
import sys
sys.path.append('../python/lutils')
import sflk_base as L

display(Markdown("### Initialization"))
logging.basicConfig(stream=sys.stdout, level=logging.ERROR)

# Source various helper functions
%run ./scripts/notebook_helpers.py

# Define the project home directory, this is used for locating the config.ini file
PROJECT_HOME_DIR = '../../'
config = L.get_config('../../')

sp_session = L.connect_to_snowflake(PROJECT_HOME_DIR)


if(sp_session == None):
    raise Exception(f'Unable to connect to snowflake. Validate connection information ')

sp_session.use_role(f'''{config['APP_DB']['role']}''')
sp_session.use_schema(f'''{config['APP_DB']['database']}.{config['APP_DB']['schema']}''')
sp_session.use_warehouse(f'''{config['APP_DB']['snow_opt_wh']}''')

df = sp_session.sql('select current_user() ,current_role() ,current_database() ,current_schema();').to_pandas()
display(df)

### Initialization

Unnamed: 0,CURRENT_USER(),CURRENT_ROLE(),CURRENT_DATABASE(),CURRENT_SCHEMA()
0,BECKY,PUBLIC,INDSOL_DICOM_DB,PUBLIC


In [2]:
import numpy as np
import cv2
# converts the image to array
def convert_image_to_nparray(p_image_fl_path ,p_img_size ,p_class_label):
    ex = ''
    status = False
    resized_arr = []
    try:
        img_arr = cv2.imread(p_image_fl_path, cv2.IMREAD_GRAYSCALE)
        resized_arr = cv2.resize(img_arr, (p_img_size, p_img_size)) # Reshaping images to preferred size
        status = True
    except Exception as e:
        ex = str(e)
    return (status ,ex ,resized_arr)

# iterate the data directory and for each image; we convert to matrice
# and then reshape the matrices
labels = ['PNEUMONIA', 'NORMAL']
img_size = 150
def images_to_pddf(p_data_dir):
    data = [] 
    for label in labels: 
        path = os.path.join(p_data_dir, label)
        class_num = labels.index(label)
        for img in os.listdir(path):
            img_flpath = os.path.join(path, img)
            status ,ex ,image_arr = convert_image_to_nparray(img_flpath ,img_size ,label)
            
            arr_shape = np.shape(image_arr)

            normalized_arr = np.array(image_arr) / 255
            
            resized_feature = normalized_arr.reshape(-1, img_size, img_size, 1)
            data.append( (img_flpath ,label ,class_num ,status ,ex 
                ,image_arr.flatten() ,arr_shape[0] ,arr_shape[1] 
                ,normalized_arr.flatten() 
                ,resized_feature.flatten()) )

    return data

In [3]:
def images_to_pddf(p_data_dir):
    data = [] 
    for label in labels: 
        path = os.path.join(p_data_dir, label)
        class_num = labels.index(label)
        for img in os.listdir(path):
            img_flpath = os.path.join(path, img)
            status ,ex ,image_arr = convert_image_to_nparray(img_flpath ,img_size ,label)
            
            arr_shape = np.shape(image_arr)

            normalized_arr = np.array(image_arr) / 255
            
            resized_feature = normalized_arr.reshape(-1, img_size, img_size, 1)
            data.append( (img_flpath ,label ,class_num ,status ,ex
                ,arr_shape[0] ,arr_shape[1] 
                ,image_arr.flatten()
                ,normalized_arr.flatten() 
                ,resized_feature.flatten()) )

    return data

In [5]:
import pandas as pd

list_1 = images_to_pddf('../../data/train')
list_2 = images_to_pddf('../../data/test')
list_3 = images_to_pddf('../../data/val')



images_parsed_list = list_1 + list_2 + list_3


In [6]:
# The parsed images are then stored in a table

images_parsed_pddf = pd.DataFrame(images_parsed_list
    , columns =['image_filepath', 'class_label','class_num','status','parsing_exception'
        ,'image_array' ,'image_array_shape_0' ,'image_array_shape_1' 
        ,'normalized_image_array' ,'resized_feature'])

images_parsed_pddf.columns = map(lambda x: str(x).upper(), images_parsed_pddf.columns)

tbl_schema = T.StructType([
    T.StructField('IMAGE_FILEPATH', T.StringType())
    ,T.StructField('CLASS_LABEL', T.StringType())
    ,T.StructField('CLASS_NUM', T.IntegerType())
    ,T.StructField('STATUS', T.BooleanType())
    ,T.StructField('PARSING_EXCEPTION', T.StringType())
    ,T.StructField('IMAGE_ARRAY', T.VariantType())
    ,T.StructField('IMAGE_ARRAY_SHAPE_0', T.IntegerType())
    ,T.StructField('IMAGE_ARRAY_SHAPE_1', T.IntegerType())
    ,T.StructField('NORMALIZED_IMAGE_ARRAY', T.VariantType())
    ,T.StructField('RESIZED_FEATURE', T.VariantType())
])

img_table = f'''{config['APP_DB']['database']}.public.images_parsed'''

df = sp_session.create_dataframe(images_parsed_pddf
    , schema=tbl_schema)
df.write.save_as_table(img_table, mode="overwrite" ,table_type='transient')

In [6]:
img_table = f'''{config['APP_DB']['database']}.public.images_parsed'''

df = sp_session.table(img_table).limit(5).to_pandas()
display(df)

Unnamed: 0,IMAGE_FILEPATH,CLASS_LABEL,CLASS_NUM,STATUS,PARSING_EXCEPTION,IMAGE_ARRAY,IMAGE_ARRAY_SHAPE_0,IMAGE_ARRAY_SHAPE_1,NORMALIZED_IMAGE_ARRAY,RESIZED_FEATURE
0,../../data/val/PNEUMONIA/person1609_virus_2791...,PNEUMONIA,0,True,,150,150,"[\n 0,\n 0,\n 0,\n 0,\n 0,\n 1,\n 2,\n ...","[\n 0.000000000000000e+00,\n 0.0000000000000...","[\n 0.000000000000000e+00,\n 0.0000000000000..."
1,../../data/val/PNEUMONIA/person301_bacteria_14...,PNEUMONIA,0,True,,150,150,"[\n 26,\n 25,\n 24,\n 24,\n 27,\n 26,\n ...","[\n 1.019607843137255e-01,\n 9.8039215686274...","[\n 1.019607843137255e-01,\n 9.8039215686274..."
2,../../data/train/PNEUMONIA/person1481_bacteria...,PNEUMONIA,0,True,,150,150,"[\n 3,\n 3,\n 2,\n 1,\n 1,\n 0,\n 0,\n ...","[\n 1.176470588235294e-02,\n 1.1764705882352...","[\n 1.176470588235294e-02,\n 1.1764705882352..."
3,../../data/train/PNEUMONIA/person1562_bacteria...,PNEUMONIA,0,True,,150,150,"[\n 61,\n 59,\n 78,\n 66,\n 86,\n 177,\n...","[\n 2.392156862745098e-01,\n 2.3137254901960...","[\n 2.392156862745098e-01,\n 2.3137254901960..."
4,../../data/train/PNEUMONIA/person490_bacteria_...,PNEUMONIA,0,True,,150,150,"[\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n ...","[\n 0.000000000000000e+00,\n 0.0000000000000...","[\n 0.000000000000000e+00,\n 0.0000000000000..."


--- 
### Closeout

    With that we are finished this section of the demo setup

In [8]:
sp_session.close()
print('Finished!!!')

Finished!!!
