# Image_Preprocessing

<a class="anchor" id="0"></a>
# Table of Contents

1. [套件安裝與載入](#1)
1. [環境檢測與設定](#2)
1. [資料處理參數設定](#3)
1. [資料處理](#4)
    -  [載入CSV檔](#4.1)
    -  [檢查CSV檔缺失值](#4.2)
1. [圖片預處理](#5)
    -  [圖片對比度轉換視覺化](#5.1)
    -  [圖片轉換](#5.2)
1. [指定座標裁切圖片](#6)

# 1. 套件安裝與載入<a class="anchor" id="1"></a>
[Back to Table of Contents](#0)

In [None]:
# 資料處理套件
import os
import cv2
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm

# 2. 環境檢測與設定<a class="anchor" id="2"></a>
[Back to Table of Contents](#0)

In [None]:
'''執行環境參數設定'''

# (Boolean) Set to True when running on a local machine
LOCAL = False

# (Boolean) Set to True when running on Colab
COLAB = False


'''檔案路徑參數設定'''

# (String) Root path. When neither flag is set, the '/kaggle/working/'
# output folder is defined as well; OUTPUT_PATH exists only in that case.
if not (LOCAL or COLAB):
    PATH = r'../input/'
    OUTPUT_PATH = r'/kaggle/working/'
elif LOCAL:
    PATH = r'../'
else:
    PATH = r'/content/drive/My Drive/Colab Notebooks/'

# (String) Data root path
# Alternative: PATH+r'datasets/AI_CUP_2020_AIMango_Grade_Classification/'
DATA_ROOT_PATH = PATH + r'data/images_001/'

# (String) Training image folder
# Alternative: DATA_ROOT_PATH+r'C1-P2_Train Dev/Train'
TRAIN_DATA_PATH = DATA_ROOT_PATH + r'images'

# (String) Training CSV path; if None the CSV is not read
TRAIN_CSV_PATH = DATA_ROOT_PATH + r'C1-P2_Train Dev/train.csv'

# (String) Test image folder
TEST_DATA_PATH = DATA_ROOT_PATH + r'C1-P2_Train Dev/Dev'

# (String) Test CSV path; if None the CSV is not read
TEST_CSV_PATH = DATA_ROOT_PATH + r'C1-P2_Train Dev/dev.csv'

# Output folders: placed under OUTPUT_PATH when neither LOCAL nor COLAB is
# set, otherwise under the dataset folder. Only the selected branch of each
# conditional expression is evaluated, so OUTPUT_PATH is never touched when
# it was not defined above.

# (String) Folder for contrast-converted training images
TRAIN_PREPROCESSING_PATH = (
    DATA_ROOT_PATH + r'C1-P2_Train Dev/Train_Contrast'
    if LOCAL or COLAB
    else OUTPUT_PATH + r'Train_Contrast'
)

# (String) Folder for contrast-converted test images
TEST_PREPROCESSING_PATH = (
    DATA_ROOT_PATH + r'C1-P2_Train Dev/Test_Contrast'
    if LOCAL or COLAB
    else OUTPUT_PATH + r'Test_Contrast'
)

# (String) Folder for cropped training images
TRAIN_CROPPED_PATH = (
    DATA_ROOT_PATH + r'C1-P2_Train Dev/Train_Cropped'
    if LOCAL or COLAB
    else OUTPUT_PATH + r'Train_Cropped'
)

# (String) Folder for cropped test images
TEST_CROPPED_PATH = (
    DATA_ROOT_PATH + r'C1-P2_Train Dev/Test_Cropped'
    if LOCAL or COLAB
    else OUTPUT_PATH + r'Test_Cropped'
)

In [None]:
# Mount Google Drive at /content/drive when the Colab flag is set
# (and the local flag is not).
if not LOCAL and COLAB:
    from google.colab import drive
    drive.mount('/content/drive')

In [None]:
# Read the CSV files only when both paths are configured and exist on disk.
# A path may be set to None to skip CSV loading; os.path.isfile(None) raises
# TypeError, so None is filtered out before touching the filesystem.
LOAD_CSV = all(
    csv_path is not None and os.path.isfile(csv_path)
    for csv_path in (TRAIN_CSV_PATH, TEST_CSV_PATH)
)

# 3. 資料處理參數設定<a class="anchor" id="3"></a>
[Back to Table of Contents](#0)

In [None]:
'''客製參數設定'''


'''資料參數設定'''

# (Boolean) Whether to run the contrast-conversion visualization cells
VISUALIZATION_CONTRAST_IMGS = True

# (Boolean) Whether to run image preprocessing
PREPROCESSING_IMGS = True

# (Int) Side length of the initial center crop (used by preprocessing)
CROPPED_IMAGE_SIZE = 850

# (Boolean) Whether to apply contrast conversion (CLAHE) during preprocessing
CONTRAST_IMGS = False

# (Int) Final side length after resizing (used by preprocessing and by
# coordinate-based cropping)
IMAGE_SIZE = 800

# (Boolean) Whether to crop images at the coordinates listed in the CSV
CROPPED_IMGS = False

# (String) Image extension used when encoding coordinate-cropped images
IMAGE_NAME_EXTENSION = '.jpg'

# (String) Name of the label column in the CSV
LABEL_NAME = 'grade'

In [None]:
if PREPROCESSING_IMGS:
    # Create the output folders for preprocessed images. makedirs also creates
    # missing parent folders and, with exist_ok=True, is a no-op when the
    # folder is already there (no separate isdir check needed).
    os.makedirs(TRAIN_PREPROCESSING_PATH, exist_ok=True)
    os.makedirs(TEST_PREPROCESSING_PATH, exist_ok=True)

In [None]:
if CROPPED_IMGS:
    # Create the output folders for cropped images. makedirs also creates
    # missing parent folders and, with exist_ok=True, is a no-op when the
    # folder is already there (no separate isdir check needed).
    os.makedirs(TRAIN_CROPPED_PATH, exist_ok=True)
    os.makedirs(TEST_CROPPED_PATH, exist_ok=True)

# 4. 資料處理<a class="anchor" id="4"></a>
[Back to Table of Contents](#0)

## 4.1 載入CSV檔 <a class="anchor" id="4.1"></a>
[Back to Table of Contents](#0)

In [None]:
if LOAD_CSV:
    print('Reading data...')

    # Both files are read as UTF-8: training set first, then test set.
    train_csv = pd.read_csv(filepath_or_buffer=TRAIN_CSV_PATH, encoding="utf8")
    test_csv = pd.read_csv(filepath_or_buffer=TEST_CSV_PATH, encoding="utf8")

    print('Reading data completed')

In [None]:
if LOAD_CSV:
    # Show the first rows of the training CSV
    print(train_csv.head())

In [None]:
if LOAD_CSV:
    # Show the (rows, columns) shape of the training CSV
    print("Shape of train_data :", train_csv.shape)

In [None]:
if LOAD_CSV:
    # Show the first rows of the test CSV
    print(test_csv.head())

In [None]:
if LOAD_CSV:
    # Show the (rows, columns) shape of the test CSV
    print("Shape of test_data :", test_csv.shape)

## 4.2 檢查CSV檔缺失值 <a class="anchor" id="4.2"></a>
[Back to Table of Contents](#0)

In [None]:
if LOAD_CSV:
    # Boolean mask of missing cells, computed once and reused below
    null_mask = train_csv.isnull()
    null_counts = null_mask.sum()

    # Per-column missing count and missing percentage, largest first
    total = null_counts.sort_values(ascending=False)
    percent = (null_counts / null_mask.count() * 100).sort_values(ascending=False)

    missing_train_csv = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

    # Print every row of the report, not just the default first five
    print(missing_train_csv.head(len(missing_train_csv)))

In [None]:
if LOAD_CSV:
    # Number of training rows per distinct value of the label column
    print(train_csv[LABEL_NAME].value_counts())

# 5. 圖片預處理<a class="anchor" id="5"></a>
[Back to Table of Contents](#0)

## 5.1 圖片對比度轉換視覺化 <a class="anchor" id="5.1"></a>
[Back to Table of Contents](#0)

In [None]:
if VISUALIZATION_CONTRAST_IMGS:
    # Take the first entry returned by os.listdir and read it as grayscale
    sample_name = os.listdir(TRAIN_DATA_PATH)[0]
    sample_path = os.path.join(TRAIN_DATA_PATH, sample_name)
    img = cv2.imread(sample_path, cv2.IMREAD_GRAYSCALE)

In [None]:
if VISUALIZATION_CONTRAST_IMGS:
    # Print basic facts about the sample image, then display it in grayscale
    print(f"image shape: {img.shape}")
    print(f"data type: {img.dtype}")
    print(f"min: {img.min()}, max: {img.max()}")
    plt.imshow(img, cmap="gray")
    plt.show()

In [None]:
if VISUALIZATION_CONTRAST_IMGS:
    # Flatten the image so every pixel contributes one sample to the histogram
    pixel_values = img.ravel()
    axis_label_style = {"fontsize": 14}

    plt.figure(figsize=(7, 5))
    plt.hist(pixel_values, bins=50)
    plt.xlabel("pixel value", **axis_label_style)
    plt.ylabel("Frequency", **axis_label_style)
    plt.show()

In [None]:
if VISUALIZATION_CONTRAST_IMGS:
    # Histogram equalization of the grayscale sample image
    equalize_img = cv2.equalizeHist(img)

    plt.figure(figsize=(14, 9))

    # Top row: the original and the equalized image, titled, axes hidden
    for slot, (caption, picture) in enumerate(
        [("Original", img), ("Histogram", equalize_img)], start=1
    ):
        plt.subplot(2, 2, slot)
        plt.imshow(picture, cmap="gray")
        plt.axis("off")
        plt.title(caption)

    # Bottom row: pixel-value histogram of each image (untitled)
    for slot, picture in enumerate((img, equalize_img), start=3):
        plt.subplot(2, 2, slot)
        plt.hist(picture.reshape(-1), bins=50)
        plt.xlabel("pixel value", fontsize=14)
        plt.ylabel("Frequency", fontsize=14)

    plt.show()

In [None]:
if VISUALIZATION_CONTRAST_IMGS:
    # CLAHE with OpenCV's default settings, applied to the sample image
    clahe = cv2.createCLAHE()
    clahe_img = clahe.apply(img)

    # The three variants compared, in left-to-right column order
    panels = [
        ("Original", img),
        ("Histogram", equalize_img),
        ("CLAHE", clahe_img),
    ]

    plt.figure(figsize=(20, 12))

    # Top row (slots 1-3): the images themselves
    for slot, (caption, picture) in enumerate(panels, start=1):
        plt.subplot(2, 3, slot)
        plt.imshow(picture, cmap="gray")
        plt.axis("off")
        plt.title(caption)

    # Bottom row (slots 4-6): pixel-value histogram of each image
    for slot, (caption, picture) in enumerate(panels, start=4):
        plt.subplot(2, 3, slot)
        plt.hist(picture.reshape(-1), bins=50)
        plt.xlabel("pixel value", fontsize=14)
        plt.ylabel("Frequency", fontsize=14)
        plt.title(caption)

    plt.show()

## 5.2 圖片轉換 <a class="anchor" id="5.2"></a>
[Back to Table of Contents](#0)

In [None]:
if PREPROCESSING_IMGS:
    def img_preprocess(filepath, img_shape=(800, 800), clahe_or_not=True, crop_shape=None):
        """Read an image as grayscale, then center-crop, contrast-convert and resize it.

        The steps run in this order: crop, CLAHE, resize.

        Args:
            filepath: Path of the image file to read.
            img_shape: Target (height, width) of the returned image.
            clahe_or_not: When true, apply CLAHE (OpenCV default settings)
                after cropping.
            crop_shape: Optional (height, width) of a crop centered on the
                image; None skips cropping. A crop larger than the image is
                limited to the image bounds.

        Returns:
            The processed single-channel image array.

        Raises:
            ValueError: If the file cannot be read as an image.
        """
        img = cv2.imread(filepath, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising
            raise ValueError(f"Cannot read image: {filepath}")

        if crop_shape is not None:
            h, w = img.shape[:2]
            ch, cw = h // 2, w // 2
            l_h, l_w = crop_shape[0] // 2, crop_shape[1] // 2
            # Clamp the start to 0: a negative start index would wrap around
            # to the end of the array and yield a thin strip instead of the
            # centered region.
            top, left = max(ch - l_h, 0), max(cw - l_w, 0)
            img = img[top:ch + l_h, left:cw + l_w]

        if clahe_or_not:
            img = cv2.createCLAHE().apply(img)

        target_h, target_w = img_shape
        if img.shape[:2] != (target_h, target_w):
            # cv2.resize takes dsize as (width, height), the reverse of the
            # (height, width) order used by img.shape and img_shape.
            img = cv2.resize(img, (target_w, target_h))

        return img

In [None]:
if PREPROCESSING_IMGS:
    # Preprocess every file of the training folder, then of the test folder,
    # writing each result under the same file name in the matching output
    # folder. Files are read from the same folder that is listed (the test
    # loop previously built its input path from an undefined name).
    for source_dir, target_dir in (
        (TRAIN_DATA_PATH, TRAIN_PREPROCESSING_PATH),
        (TEST_DATA_PATH, TEST_PREPROCESSING_PATH),
    ):
        for each_file in tqdm(os.listdir(source_dir)):
            old_path = os.path.join(source_dir, each_file)
            img = img_preprocess(
                old_path,
                img_shape=(IMAGE_SIZE, IMAGE_SIZE),
                clahe_or_not=CONTRAST_IMGS,
                crop_shape=(CROPPED_IMAGE_SIZE, CROPPED_IMAGE_SIZE),
            )
            cv2.imwrite(os.path.join(target_dir, each_file), img)

# 6. 指定座標裁切圖片<a class="anchor" id="6"></a>
[Back to Table of Contents](#0)

In [None]:
def _crop_rows_to_folder(table, source_dir, target_dir):
    """Crop each image listed in *table* to its box and save it resized.

    For every row, the file named by 'image_id' is read from *source_dir*,
    cropped to the box given by 'pos_x', 'pos_y', 'width' and 'height',
    resized to IMAGE_SIZE x IMAGE_SIZE and written to *target_dir* under the
    same file name.

    Raises:
        ValueError: If an image file cannot be decoded.
    """
    for _, row in table.iterrows():
        filename = row['image_id']

        # Read the raw bytes and decode them in memory. The buffer must be
        # uint8; np.fromfile defaults to float64 otherwise.
        raw_bytes = np.fromfile(os.path.join(source_dir, filename), dtype=np.uint8)
        img = cv2.imdecode(raw_bytes, cv2.IMREAD_COLOR)
        if img is None:
            raise ValueError(f"Cannot decode image: {filename}")

        # Top-left corner and size of the crop box
        x, y = int(row['pos_x']), int(row['pos_y'])
        w, h = int(row['width']), int(row['height'])

        # Clip a box that starts outside the image: keep its right/bottom
        # edge and move the left/top edge to 0. The edges are computed from
        # the original x and y, before they are clamped.
        # NOTE(review): assumes the box is expressed in image pixel
        # coordinates -- confirm against the annotation format.
        right, bottom = x + w, y + h
        left, top = max(x, 0), max(y, 0)

        img = img[top:bottom, left:right]

        # Boxes have different aspect ratios, so force a common square size
        img = cv2.resize(img, (IMAGE_SIZE, IMAGE_SIZE), interpolation=cv2.INTER_LINEAR)

        # Encode in memory and write the bytes to the output folder
        encoded = cv2.imencode(IMAGE_NAME_EXTENSION, img, (cv2.IMWRITE_JPEG_QUALITY, 100))[1]
        encoded.tofile(os.path.join(target_dir, filename))


if CROPPED_IMGS and LOAD_CSV:
    print("Processing")

    _crop_rows_to_folder(train_csv, TRAIN_DATA_PATH, TRAIN_CROPPED_PATH)
    _crop_rows_to_folder(test_csv, TEST_DATA_PATH, TEST_CROPPED_PATH)

    print("Done")

[Go to Top](#0)