## Preprocessing files

+ **This notebook aims to preprocess the source files taken from the camera. Specifically:**
    + Renaming all the files, setting each name to CLASS-COUNTER.png (where CLASS is OK or NOK) for each reference
    + Converting the files from original .tiff to .jpg
    + Removing the background to make it easier for the model to distinguish the defects (this step was done with other software, so it is not included in this notebook)
    + Resizing the files to (990,990) to reduce the dimensions
    + Splitting the data into TRAIN, VAL and TEST and copying the files into the folder of each set

In [None]:
# importing libraries
import glob
from shutil import copyfile
from PIL import Image
import os
import random

### Renaming files

In [34]:
# Select which files to rename by changing CLASS (OK/NOK) and REF (1/2/3/4/5).
CLASS="OK"
REF="1"

# You should change these directories to yours.
# Raw strings keep the Windows backslashes literal: "\E", "\F" and "\R" are
# invalid escape sequences in a normal string literal (deprecated, and they
# emit a warning on current Python versions). The resulting paths are unchanged.
source_list=r"E:\EJEMPLOS\FINALES\REF"+REF+"\\"+CLASS+"\\*"
target_list=r"E:\EJEMPLOS\FINALES_RENAMED\REF"+REF+"\\"+CLASS+"\\"

# glob returns entries in arbitrary order, so sort them to make the
# CLASS-COUNTER numbering reproducible between runs. Sub-directories matched
# by "*" are skipped because copyfile only handles regular files.
source_list_complete=sorted(
    path for path in glob.glob(source_list) if os.path.isfile(path)
)

if source_list_complete:
    # copyfile does not create missing folders, so make sure the target exists.
    os.makedirs(target_list, exist_ok=True)

# Copy every source file into the target folder as CLASS-<counter>.png.
# NOTE: copyfile duplicates the bytes unchanged; only the file name changes,
# the image data is not re-encoded here.
counter=0
for source_path in source_list_complete:
    copyfile(source_path, target_list+CLASS+"-"+str(counter)+".png")
    counter=counter+1

### Converting files

In [None]:
# Select which files to convert by changing CLASS (OK/NOK) and REF (1/2/3/4/5).
# This cell converts the FIRST pct_train share of the source files to JPEG.

CLASS="NOK"
REF="5"
SET="TRAIN"  # label only: SET is not read anywhere in this cell

# You should change these directories to yours.
# The raw string keeps the backslashes literal ("\E", "\F" and "\R" are invalid
# escape sequences in a normal string literal and emit a warning); the
# resulting path is unchanged.
source_list=r"E:\EJEMPLOS\FINALES_RENAMED\REF"+REF+"\\"+CLASS+"\\*"
target_list="C:\\Users\\adria\\Desktop\\IMAGENES_TFM\\REF"+REF+"\\"+CLASS+"\\"

# glob gives no ordering guarantee; sorting makes the slice below select the
# same files on every run.
source_list_complete=sorted(glob.glob(source_list))
pct_train=0.7
train_count=round(len(source_list_complete)*pct_train)

if train_count:
    # Image.save does not create missing folders.
    os.makedirs(target_list, exist_ok=True)

for source_path in source_list_complete[:train_count]:
    # File name up to its first dot, independent of how deep the folder is.
    name=os.path.basename(source_path).split(".")[0]
    # The context manager closes the source file as soon as it is converted.
    with Image.open(source_path) as im:
        # Convert to RGB and save as JPEG.
        im.convert('RGB').save(target_list+name+".jpg","JPEG")

In [None]:
# Select which files to convert by changing CLASS (OK/NOK) and REF (1/2/3/4/5).
# This cell converts the files that come AFTER the first pct_train share to JPEG.

CLASS="NOK"
REF="5"
SET="TEST"  # label only: SET is not read anywhere in this cell

# You should change these directories to yours.
# The raw string keeps the backslashes literal ("\E", "\F" and "\R" are invalid
# escape sequences in a normal string literal and emit a warning); the
# resulting path is unchanged.
source_list=r"E:\EJEMPLOS\FINALES_RENAMED\REF"+REF+"\\"+CLASS+"\\*"
target_list="C:\\Users\\adria\\Desktop\\IMAGENES_TFM\\REF"+REF+"\\"+CLASS+"\\"

# glob gives no ordering guarantee; sorting makes the slice below select the
# same files on every run.
source_list_complete=sorted(glob.glob(source_list))
pct_train=0.7
test_start=round(len(source_list_complete)*pct_train)
remaining_files=source_list_complete[test_start:]

if remaining_files:
    # Image.save does not create missing folders.
    os.makedirs(target_list, exist_ok=True)

for source_path in remaining_files:
    # File name up to its first dot, independent of how deep the folder is.
    name=os.path.basename(source_path).split(".")[0]
    # The context manager closes the source file as soon as it is converted.
    with Image.open(source_path) as im:
        # Convert to RGB and save as JPEG.
        im.convert('RGB').save(target_list+name+".jpg","JPEG")

### Resizing files

In [None]:
# Resize every image to (990,990), the minimum dimensions at which we believe
# the defects are still visible, and save the result as JPEG.
# You should change these directories to yours.
# NOTE(review): resize((990,990)) does not preserve the aspect ratio; if the
# sources are 2448x2048 as the folder name suggests, the images get
# stretched - confirm this is intended.

source_list="C:\\Users\\adria\\Desktop\\IMAGENES_TFM(2448x2048)\\REF6_SINFONDO\\NOK\\*"
target_list="C:\\Users\\adria\\Desktop\\IMAGENES_TFM(2448x2048)\\REF6_SINFONDO_990x990\\NOK\\"
source_list_complete=glob.glob(source_list)

if source_list_complete:
    # Image.save does not create missing folders.
    os.makedirs(target_list, exist_ok=True)

for source_path in source_list_complete:
    # File name up to its first dot, independent of how deep the folder is.
    name=os.path.basename(source_path).split(".")[0]
    # The context manager closes the source file once the resized copy is saved.
    with Image.open(source_path) as im:
        im_resize=im.resize((990,990))
        # Convert to RGB and save as JPEG.
        im_resize.convert('RGB').save(target_list+name+".jpg","JPEG")

### Splitting the files

In [None]:
# Split the data into TRAIN, VAL and TEST and copy each file to its set folder.
# You should change these directories to yours.

source_list="C:\\Users\\adria\\Desktop\\IMAGENES_TFM_990x990\\OK\\*"

train_list="C:\\Users\\adria\\Desktop\\IMAGENES_TFM_990x990\\TRAIN\\OK\\"
val_list="C:\\Users\\adria\\Desktop\\IMAGENES_TFM_990x990\\VAL\\OK\\"
test_list="C:\\Users\\adria\\Desktop\\IMAGENES_TFM_990x990\\TEST\\OK\\"

# Sort before shuffling with a fixed seed so the same files land in the same
# set on every run, whatever order glob returns them in.
source_list_complete=glob.glob(source_list)
source_list_complete.sort()
random.seed(200)
random.shuffle(source_list_complete)

# 75% train, 10% validation, the remainder (about 15%) test.
pct_train=0.75
pct_val=0.10
split_1 = int(pct_train * len(source_list_complete))
split_2 = int((pct_train+pct_val)* len(source_list_complete))

train_filenames = source_list_complete[:split_1]
val_filenames = source_list_complete[split_1:split_2]
test_filenames = source_list_complete[split_2:]

# Every file must belong to exactly one set.
assert len(train_filenames)+len(val_filenames)+len(test_filenames) == len(source_list_complete)

# One loop for the three sets. The files are copied byte-for-byte, so there is
# no need to open them as images first (the original opened every file with
# PIL and never used or closed the result).
for filenames, target_dir in (
    (train_filenames, train_list),
    (val_filenames, val_list),
    (test_filenames, test_list),
):
    if filenames:
        # copyfile does not create missing folders.
        os.makedirs(target_dir, exist_ok=True)
    for source_path in filenames:
        # File name up to its first dot, independent of how deep the folder is.
        name=os.path.basename(source_path).split(".")[0]
        copyfile(source_path, target_dir+name+'.jpg')