# Image_Preprocessing

<a class="anchor" id="0"></a>
# Table of Contents

1. [套件安裝與載入](#1)
1. [環境檢測與設定](#2)
1. [資料處理參數設定](#3)
1. [資料處理](#4)
    -  [載入CSV檔](#4.1)
    -  [檢查CSV檔缺失值](#4.2)
1. [裁切圖片](#5)

# 1. 套件安裝與載入<a class="anchor" id="1"></a>
[Back to Table of Contents](#0)

In [None]:
# 資料處理套件
import os
import cv2
import csv
import numpy as np
import pandas as pd

# 2. 環境檢測與設定<a class="anchor" id="2"></a>
[Back to Table of Contents](#0)

In [None]:
# ---- Execution environment settings ----

# (Boolean) Running on a local machine?
LOCAL = True

# (Boolean) Running on Colab?
COLAB = False


# ---- File path settings ----

# (String) Root path: local takes precedence over Colab; anything else
# falls back to the '../input/' layout.
if LOCAL:
    PATH = r'../'
else:
    if COLAB:
        PATH = r'/content/drive/My Drive/Colab Notebooks/'
    else:
        PATH = r'../input/'

# (String) Root path of the dataset
DATA_ROOT_PATH = PATH + r'datasets/AI_CUP_2020_AIMango_Grade_Classification/'

# Directory that holds both splits and their CSV files
_SPLIT_DIR = DATA_ROOT_PATH + r'C1-P2_Train Dev/'

# (String) Training images path
TRAIN_DATA_PATH = _SPLIT_DIR + r'Train'

# (String) Training CSV path; if None the CSV is not read
TRAIN_CSV_PATH = _SPLIT_DIR + r'train.csv'

# (String) Test images path
TEST_DATA_PATH = _SPLIT_DIR + r'Dev'

# (String) Test CSV path; if None the CSV is not read
TEST_CSV_PATH = _SPLIT_DIR + r'dev.csv'

# (String) Folder that receives the cropped training images
TRAIN_CROPPED_PATH = _SPLIT_DIR + r'Train_Cropped'

# (String) Folder that receives the cropped test images
TEST_CROPPED_PATH = _SPLIT_DIR + r'Test_Cropped'

In [None]:
# Mount Google Drive only when running on Colab (and not locally), since the
# Colab root path configured above lives under /content/drive.
if COLAB and not LOCAL:
    from google.colab import drive
    drive.mount('/content/drive')

In [None]:
# Load the CSV files only when both paths are configured and exist on disk.
# The path settings allow None to mean "do not read the CSV"; os.path.isfile
# raises TypeError on None, so None is filtered out before touching the
# filesystem.
LOAD_CSV = all(
    csv_path is not None and os.path.isfile(csv_path)
    for csv_path in (TRAIN_CSV_PATH, TEST_CSV_PATH)
)

In [None]:
# Create the output folders for the cropped images (training first, then
# test) when they do not exist yet.
for cropped_dir in (TRAIN_CROPPED_PATH, TEST_CROPPED_PATH):
    if os.path.isdir(cropped_dir):
        continue
    os.mkdir(cropped_dir)

# 3. 資料處理參數設定<a class="anchor" id="3"></a>
[Back to Table of Contents](#0)

In [None]:
# ---- Custom settings ----
# (none defined)


# ---- Data settings ----

# (Int) Side length, in pixels, of the square output images
IMAGE_SIZE = 512

# (Boolean) Whether the images should be cropped
CROPPED_IMGS = False

# (String) Image file extension
IMAGE_NAME_EXTENSION = ".jpg"

# (String) Name of the label column in the CSV
LABEL_NAME = "grade"

# 4. 資料處理<a class="anchor" id="4"></a>
[Back to Table of Contents](#0)

## 4.1 載入CSV檔 <a class="anchor" id="4.1"></a>
[Back to Table of Contents](#0)

In [None]:
if LOAD_CSV:
    print('Reading data...')

    # Both CSV files are read with the same options
    csv_options = {'encoding': "utf8"}

    # Training set CSV
    train_csv = pd.read_csv(TRAIN_CSV_PATH, **csv_options)

    # Test set CSV
    test_csv = pd.read_csv(TEST_CSV_PATH, **csv_options)

    print('Reading data completed')

In [None]:
if LOAD_CSV:
    # Preview the first rows of the training CSV
    train_preview = train_csv.head()
    print(train_preview)

In [None]:
if LOAD_CSV:
    # (rows, columns) of the training CSV
    train_shape = train_csv.shape
    print("Shape of train_data :", train_shape)

In [None]:
if LOAD_CSV:
    # Preview the first rows of the test CSV
    test_preview = test_csv.head()
    print(test_preview)

In [None]:
if LOAD_CSV:
    # (rows, columns) of the test CSV
    test_shape = test_csv.shape
    print("Shape of test_data :", test_shape)

## 4.2 檢查CSV檔缺失值 <a class="anchor" id="4.2"></a>
[Back to Table of Contents](#0)

In [None]:
if LOAD_CSV:
    # Boolean frame marking the missing cells of the training CSV
    null_mask = train_csv.isnull()

    # Missing cells per column, largest first
    total = null_mask.sum().sort_values(ascending=False)

    # Same figure as a percentage of the row count, largest first
    percent = (null_mask.sum() / null_mask.count() * 100).sort_values(ascending=False)

    # Side-by-side table of both measures
    missing_train_csv = pd.concat(
        [total, percent],
        axis=1,
        keys=['Total', 'Percent'],
    )
    print(missing_train_csv.head())

In [None]:
if LOAD_CSV:
    # Number of training rows per label value
    label_counts = train_csv[LABEL_NAME].value_counts()
    print(label_counts)

# 5. 裁切圖片<a class="anchor" id="5"></a>
[Back to Table of Contents](#0)

In [None]:
def _clip_crop_box(x, y, w, h):
    """Return ``(x0, y0, x1, y1)`` slice bounds for a crop box.

    ``x``/``y`` are the top-left corner and ``w``/``h`` the box size. The
    right/bottom edges are computed from the original corner *before* it is
    clamped, so a box that starts left of / above the image keeps its true
    far edge. All bounds are clamped to be non-negative because a negative
    index in a NumPy slice would count from the end of the axis.
    """
    x, y, w, h = int(x), int(y), int(w), int(h)
    right, bottom = x + w, y + h
    return max(x, 0), max(y, 0), max(right, 0), max(bottom, 0)


def _crop_dataset(csv_frame, source_dir, target_dir):
    """Crop, resize and save every image listed in ``csv_frame``.

    Each row supplies ``image_id`` (file name inside ``source_dir``) and the
    crop box columns ``pos_x``, ``pos_y``, ``width``, ``height``. The result
    is resized to ``IMAGE_SIZE`` x ``IMAGE_SIZE`` and written to
    ``target_dir`` under the same file name.

    Raises:
        ValueError: if an image cannot be decoded, its crop box selects no
            pixels, or the result cannot be encoded.
    """
    for _, row in csv_frame.iterrows():
        image_id = row['image_id']
        source_path = source_dir + '/' + image_id

        # Read the raw file bytes as uint8: np.fromfile defaults to float64,
        # which is not a valid encoded-image buffer for cv2.imdecode.
        encoded = np.fromfile(source_path, dtype=np.uint8)
        img = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
        if img is None:
            raise ValueError('Cannot decode image: ' + source_path)

        # Crop (rows are the y axis, columns the x axis)
        x0, y0, x1, y1 = _clip_crop_box(
            row['pos_x'], row['pos_y'], row['width'], row['height']
        )
        img = img[y0:y1, x0:x1]
        if img.size == 0:
            raise ValueError('Empty crop region for image: ' + source_path)

        # The crops have different sizes and aspect ratios, so squash each
        # one to a common square size.
        img = cv2.resize(img, (IMAGE_SIZE, IMAGE_SIZE), interpolation=cv2.INTER_LINEAR)

        # Encode in memory, then write the bytes to the target folder
        ok, buffer = cv2.imencode(
            IMAGE_NAME_EXTENSION, img, (cv2.IMWRITE_JPEG_QUALITY, 100)
        )
        if not ok:
            raise ValueError('Cannot encode image: ' + source_path)
        buffer.tofile(target_dir + "/" + image_id)


if CROPPED_IMGS and LOAD_CSV:
    print("Processing")

    _crop_dataset(train_csv, TRAIN_DATA_PATH, TRAIN_CROPPED_PATH)
    _crop_dataset(test_csv, TEST_DATA_PATH, TEST_CROPPED_PATH)

    print("Done")

[Go to Top](#0)