# Data Processing Notebook
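Goal: Prepare training images for modeling by merging each sample's separate red, green, blue, and yellow channel files into a single RGB `.png`.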

In [13]:
import numpy as np
import pandas as pd
import time

import imageio # for opening the images into np arrays

from PIL import Image # for saving new rgb images

### Load in Data

In [5]:
# Folder holding the dataset; the cleaned training table is read from directly inside it.
data_filepath = '../protein_data/all/'
df = pd.read_csv(''.join([data_filepath, 'clean_train.csv']))

### Define Function for Converting and Saving Images:

In [14]:
def save_rgb_array(im_id,
                   train_dir='../protein_data/all/train/',
                   out_dir='../protein_data/all/rgb_images/'):
    """Merge the four channel images of one sample into a single RGB .png.

    Reads ``<train_dir><im_id>_{red,green,blue,yellow}.png``, adds half of the
    yellow layer to both the red and the green layer, rescales the result so
    that its brightest value becomes 255, and writes it to
    ``<out_dir><im_id>_rgb.png``.

    Parameters
    ----------
    im_id : str
        Image id shared by the four channel files.
    train_dir : str, optional
        Prefix of the folder holding the channel files.  It is concatenated
        directly with the file name, so it must end with a path separator.
    out_dir : str, optional
        Prefix of the folder the merged image is written to; same
        trailing-separator requirement as ``train_dir``.

    Returns
    -------
    None
        The merged image is written to disk.
    """
    # Stack the red, green and blue layers along a new last axis.  The shape
    # follows the files instead of being fixed, and float64 gives the yellow
    # addition below room to exceed the range of the stored pixel type.
    rgb = np.stack(
        [imageio.imread(train_dir + im_id + '_' + color + '.png')
         for color in ('red', 'green', 'blue')],
        axis=-1,
    ).astype(np.float64)

    # The yellow layer is split evenly between red and green to turn the
    # four-channel (RGBY) sample into a three-channel (RGB) one.
    half_yellow = imageio.imread(train_dir + im_id + '_yellow.png') / 2
    rgb[:, :, 0] += half_yellow  # red layer
    rgb[:, :, 1] += half_yellow  # green layer

    # Rescale so the brightest value maps to 255.  An all-black sample has a
    # peak of 0; dividing by it would produce NaNs, so it is left as zeros.
    peak = rgb.max()
    if peak > 0:
        rgb = rgb / peak * 255

    # The uint8 cast truncates fractional values (no rounding).
    rgb = rgb.astype(np.uint8)

    # Save array as .png
    Image.fromarray(rgb).save(out_dir + im_id + "_rgb.png")

### Convert all Images and Save:

In [20]:
# Convert every image id listed in the training table, timing the whole run.
start = time.time()
i = 0
for im_id in df['Id']:
    save_rgb_array(im_id)

    # Progress line on every 1000th pass.  `i` is the zero-based position of
    # the image just written, so the first line reports 0.
    if not i % 1000:
        elapsed_minutes = round((time.time() - start) / 60, 3)
        print(i, ' images saved, Runtime: ', elapsed_minutes, ' minutes')

    i += 1

0  images saved, Runtime:  0.002  minutes
1000  images saved, Runtime:  2.848  minutes
2000  images saved, Runtime:  5.858  minutes
3000  images saved, Runtime:  9.029  minutes
4000  images saved, Runtime:  12.153  minutes
5000  images saved, Runtime:  15.275  minutes
6000  images saved, Runtime:  18.278  minutes
7000  images saved, Runtime:  21.309  minutes
8000  images saved, Runtime:  24.43  minutes
9000  images saved, Runtime:  27.61  minutes
10000  images saved, Runtime:  30.769  minutes
11000  images saved, Runtime:  34.179  minutes
12000  images saved, Runtime:  37.926  minutes
13000  images saved, Runtime:  41.799  minutes
14000  images saved, Runtime:  45.666  minutes
15000  images saved, Runtime:  49.266  minutes
16000  images saved, Runtime:  52.655  minutes
17000  images saved, Runtime:  55.851  minutes
18000  images saved, Runtime:  59.125  minutes
19000  images saved, Runtime:  62.374  minutes
20000  images saved, Runtime:  65.632  minutes
21000  images saved, Runtime:  6

<hr>

<br>