# CSV_Preprocessing (Object_Detection)

<a class="anchor" id="0"></a>
# Table of Contents

1. [套件安裝與載入](#1)
1. [環境檢測與設定](#2)
1. [資料處理參數設定](#3)
1. [資料處理](#4)
    -  [載入CSV檔](#4.1)
    -  [檢查CSV檔缺失值](#4.2)
    -  [從元數據提取圖片寬度和高度欄位](#4.3)

# 1. 套件安裝與載入<a class="anchor" id="1"></a>
[Back to Table of Contents](#0)

In [None]:
# 資料處理套件
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from tqdm import tqdm
from sklearn import preprocessing

# 2. 環境檢測與設定<a class="anchor" id="2"></a>
[Back to Table of Contents](#0)

In [None]:
# ---------------------------------------------------------------
# Execution environment settings
# ---------------------------------------------------------------

# (bool) True when the notebook runs on a local machine
LOCAL = False

# (bool) True when the notebook runs on Google Colab
COLAB = False


# ---------------------------------------------------------------
# File path settings
# ---------------------------------------------------------------

# (str) Root path. LOCAL takes precedence over COLAB; when neither flag is
# set, the fallback paths are used and OUTPUT_PATH is defined as well
# (OUTPUT_PATH is NOT defined for the LOCAL / COLAB cases).
if not (LOCAL or COLAB):
    PATH = '../input/'
    OUTPUT_PATH = '/kaggle/working/'
elif LOCAL:
    PATH = '../'
else:
    PATH = '/content/drive/My Drive/Colab Notebooks/'

# (str) Root directory of the original competition data
DATA_ROOT_PATH = f'{PATH}vinbigdata-chest-xray-abnormalities-detection/'

# (str) Root directory of the image metadata
META_DATA_ROOT_PATH = f'{PATH}vinbigdata-process-and-resize-to-png-256x256/'

# (str) Root directory prepended to the image paths stored in the CSV
CSV_IMAGE_ROOT_PATH = f'{PATH}vinbigdata-256-image-dataset/'

# (str) Directory holding the training images
TRAIN_DATA_PATH = f'{CSV_IMAGE_ROOT_PATH}train/'

# (str) Path of the training CSV.
# NOTE(review): the original note says None means "do not read the CSV",
# but no None check is visible in this notebook -- confirm before relying on it.
TRAIN_CSV_PATH = f'{DATA_ROOT_PATH}train.csv'

# (str) Path of the metadata CSV (same caveat about None as above)
TRAIN_META_CSV_PATH = f'{META_DATA_ROOT_PATH}train_meta.csv'

# (str) File name used when saving the processed CSV
TRAIN_CSV_SAVE_FILENAME = 'train_yolo.csv'

In [None]:
# Mount Google Drive at /content/drive, but only when the Colab flag is set
# and the notebook is not flagged as running locally.
if COLAB and not LOCAL:
    from google.colab import drive
    drive.mount('/content/drive')

# 3. 資料處理參數設定<a class="anchor" id="3"></a>
[Back to Table of Contents](#0)

In [None]:
# ---------------------------------------------------------------
# Custom parameter settings (none defined)
# ---------------------------------------------------------------


# ---------------------------------------------------------------
# Data parameter settings
# ---------------------------------------------------------------

# (str) Image file extension
IMAGE_NAME_EXTENSION = '.png'

# (str) CSV column holding the image file name (without extension)
IMAGE_NAME = 'image_id'

# (str) CSV column holding the label name
LABEL_NAME = 'class_name'

# (str) CSV column holding the label id
LABEL_ID = 'class_id'

# (bool) Whether to run LabelEncoder on the label column
LABEL_ENCODER = False

# (bool) Whether this run preprocesses the original data.
# In this notebook: False -> boxes are scaled by the per-image sizes read
# from the metadata CSV; True -> boxes are scaled by IMAGE_SIZE_W / IMAGE_SIZE_H.
META_PREPROCESSING = False

if META_PREPROCESSING:
    # (int, int) Fixed image width and height
    IMAGE_SIZE_W, IMAGE_SIZE_H = 256, 256

# (int) Position at which the progress bar is printed (0-based)
TQDM_POSITION = 0

# (bool) Keep all traces of the progress bar when iteration ends.
# If None, the bar is left only when position is 0.
TQDM_LEAVE = True


# ---------------------------------------------------------------
# Chart parameter settings
# ---------------------------------------------------------------

# (float) Font scale applied to every seaborn chart
ALL_SNS_FONT_SCALE = 1.0

# (int, int) Width and height of the CSV count-plot figure
CSV_COUNTPLOT_FIGSIZE_W, CSV_COUNTPLOT_FIGSIZE_H = 10, 10

# (int) Title font size of the CSV count plot
CSV_COUNTPLOT_TITLE_FONTSIZE = 20

# (int, int) X-axis and Y-axis label font sizes of the CSV count plot
CSV_COUNTPLOT_XLABEL_FONTSIZE, CSV_COUNTPLOT_YLABEL_FONTSIZE = 15, 15

In [None]:
# Apply the global font-scale factor to all seaborn charts
sns.set(font_scale = ALL_SNS_FONT_SCALE)

# 4. 資料處理<a class="anchor" id="4"></a>
[Back to Table of Contents](#0)

## 4.1 載入CSV檔 <a class="anchor" id="4.1"></a>
[Back to Table of Contents](#0)

In [None]:
# Load the training annotation CSV and add an 'image_path' column.
print('Reading data...')

train_csv = pd.read_csv(TRAIN_CSV_PATH, encoding="utf8")

# image_path = <training image directory> + <image id> + <file extension>
train_csv['image_path'] = (
    TRAIN_DATA_PATH
    + train_csv[IMAGE_NAME]
    + IMAGE_NAME_EXTENSION
)

print('Reading data completed')

In [None]:
# Display the first rows of the training CSV
train_csv.head()

In [None]:
# Report the (rows, columns) shape of the loaded training table
print("Shape of train_data :", train_csv.shape)

## 4.2 檢查CSV檔缺失值 <a class="anchor" id="4.2"></a>
[Back to Table of Contents](#0)

In [None]:
# Missing-value summary: per-column null count ('Total') and null
# percentage ('Percent'), each sorted from highest to lowest.
null_counts = train_csv.isnull().sum()
row_counts = train_csv.isnull().count()

total = null_counts.sort_values(ascending=False)
percent = (null_counts / row_counts * 100).sort_values(ascending=False)

missing_train_csv = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Percent'],
)

# Show every row of the summary (one row per column of train_csv)
missing_train_csv.head(len(missing_train_csv))

In [None]:
# Number of annotation rows per label value
train_csv[LABEL_NAME].value_counts()

In [None]:
# Bar chart of the number of annotation rows per label.
f, ax = plt.subplots(figsize=(CSV_COUNTPLOT_FIGSIZE_W, CSV_COUNTPLOT_FIGSIZE_H))

# The label column is passed explicitly as x=. seaborn 0.12+ accepts only
# `data` as a positional argument, so passing the Series positionally no
# longer means "x". (seaborn 0.11 emitted a FutureWarning about this, which
# this notebook suppresses via warnings.filterwarnings("ignore").)
# hue repeats the label column so each label gets its own colour and a
# legend entry.
sns.countplot(x=train_csv[LABEL_NAME], hue=train_csv[LABEL_NAME], ax=ax)

plt.title("LABEL COUNT", fontsize=CSV_COUNTPLOT_TITLE_FONTSIZE)
plt.xlabel(LABEL_NAME.upper(), fontsize=CSV_COUNTPLOT_XLABEL_FONTSIZE)
plt.ylabel("COUNT", fontsize=CSV_COUNTPLOT_YLABEL_FONTSIZE)
plt.legend()
plt.show()

In [None]:
# Optionally encode the label strings as integer ids with LabelEncoder and
# store them in the LABEL_ID column.
if LABEL_ENCODER:
    # fit() returns the encoder itself, so creation and fitting are chained
    le = preprocessing.LabelEncoder().fit(train_csv[LABEL_NAME])
    print(le.classes_)

    labels = le.transform(train_csv[LABEL_NAME])
    train_csv[LABEL_ID] = labels

    print(train_csv.head())

## 4.3 從元數據提取圖片寬度和高度欄位 <a class="anchor" id="4.3"></a>
[Back to Table of Contents](#0)

In [None]:
# The per-image metadata is only needed when META_PREPROCESSING is False.
if not META_PREPROCESSING:
    print('Reading data...')

    # Read the metadata CSV (not the training CSV)
    train_meta_csv = pd.read_csv(TRAIN_META_CSV_PATH, encoding="utf8")

    print('Reading data completed')

    # Preview the first metadata rows
    print(train_meta_csv.head())

In [None]:
# Copy each image's size from the metadata table onto every annotation row
# of that image: 'dim1' -> 'meta_image_width', 'dim0' -> 'meta_image_height'.
if not META_PREPROCESSING:
    # One lookup row per image id. keep='last' reproduces what a sequential
    # row-by-row overwrite would give if an image id appeared more than once.
    meta_dims = (
        train_meta_csv
        .drop_duplicates(subset=IMAGE_NAME, keep='last')
        .set_index(IMAGE_NAME)
    )

    # Vectorised lookup instead of scanning train_csv once per metadata row.
    # Images without metadata get NaN; the float cast keeps the columns
    # float-typed even when every image has metadata.
    image_ids = train_csv[IMAGE_NAME]
    train_csv['meta_image_width'] = image_ids.map(meta_dims['dim1']).astype('float64')
    train_csv['meta_image_height'] = image_ids.map(meta_dims['dim0']).astype('float64')

    print(train_csv.head())

In [None]:
# Convert the box corners to fractions of the image size, then derive the
# box centre, width, height and area from the normalised corners.

# Choose the divisor for each axis: per-row image sizes taken from the
# metadata columns, or the fixed image size when META_PREPROCESSING is set.
if not META_PREPROCESSING:
    x_scale = train_csv['meta_image_width']
    y_scale = train_csv['meta_image_height']
else:
    x_scale = IMAGE_SIZE_W
    y_scale = IMAGE_SIZE_H

# Column arithmetic gives the same values as a row-by-row apply, without the
# Python-level loop. Rows whose coordinates are NaN stay NaN.
train_csv['x_min'] = train_csv['x_min'] / x_scale
train_csv['y_min'] = train_csv['y_min'] / y_scale

train_csv['x_max'] = train_csv['x_max'] / x_scale
train_csv['y_max'] = train_csv['y_max'] / y_scale

# Box centre
train_csv['x_mid'] = (train_csv['x_max'] + train_csv['x_min']) / 2
train_csv['y_mid'] = (train_csv['y_max'] + train_csv['y_min']) / 2

# Box size
train_csv['width'] = train_csv['x_max'] - train_csv['x_min']
train_csv['height'] = train_csv['y_max'] - train_csv['y_min']

# Box area, as a fraction of the image area
train_csv['area'] = train_csv['width'] * train_csv['height']

train_csv.head()

In [None]:
# Save the processed table, without the index column, under the bare file
# name TRAIN_CSV_SAVE_FILENAME (i.e. relative to the current working directory).
train_csv.to_csv(TRAIN_CSV_SAVE_FILENAME, index=False)

[Go to Top](#0)