<a href="https://colab.research.google.com/github/brianfish888/Skin_Cance/blob/main/Project3_Skin_Cancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 資料來源
Skin Cancer MNIST: HAM10000 https://www.kaggle.com/kmader/skin-cancer-mnist-ham10000

### 資料準備

In [None]:
# keras.utils: used for one-hot encoding of the labels
# sklearn.model_selection: splitting data into training and test sets
# os: creating and deleting files
# PIL: (image-processing library) loading images
# seed: fix the random seed so that randomly generated data is the same on every run.
#       The number may be replaced with your own student ID (or any other number).
# NOTE(review): this call seeds NumPy's global generator only; whether the Keras
# backend needs its own seed for fully repeatable training should be confirmed.
import numpy as np
import pandas as pd
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import os
from PIL import Image
np.random.seed(409570389)

In [None]:
# Abbreviation -> full name for the seven skin-lesion classes.
# Keyword order is the insertion order of the resulting dict.
lesion_type_dict = dict(
    nv='Melanocytic nevi',
    mel='Melanoma',
    bkl='Benign keratosis-like lesions ',
    bcc='Basal cell carcinoma',
    akiec='Actinic keratoses',
    vasc='Vascular lesions',
    df='Dermatofibroma',
)

In [None]:
# Integer code of each abbreviation when the dict keys are treated as a categorical
# (codes follow the sorted order of the keys).
pd.Categorical(list(lesion_type_dict)).codes

array([5, 4, 2, 1, 0, 6, 3], dtype=int8)

In [None]:
# Remove any existing gdown installation, install it again from the package index,
# then print the version that ended up installed.
# NOTE(review): no version is pinned, so a later run may install a different release.
!pip uninstall gdown -y && pip install gdown
!gdown -V

Found existing installation: gdown 4.4.0
Uninstalling gdown-4.4.0:
  Successfully uninstalled gdown-4.4.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.6.0-py3-none-any.whl (14 kB)
Installing collected packages: gdown
Successfully installed gdown-4.6.0
gdown 4.6.0 at /usr/local/lib/python3.8/dist-packages


In [None]:
# Fetch the project archive from Google Drive and store it as project03.zip
# in the current working directory (progress output enabled).
import gdown
url = 'https://drive.google.com/uc?id=1kklF0GDZ-4Vh52MIdTexky6Bqzek7S-c'
output = 'project03.zip'
gdown.download(url=url, output=output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1kklF0GDZ-4Vh52MIdTexky6Bqzek7S-c
To: /content/project03.zip
100%|██████████| 26.6M/26.6M [00:00<00:00, 78.7MB/s]


'project03.zip'

In [None]:
!unzip project03.zip

Archive:  project03.zip
  inflating: project3_test.csv       
  inflating: project3_train.csv      


In [None]:
# Load the training images: 28*28*3 pixel-value columns (pixel0000-pixel2351)
# plus one class column named `label`.
load_img = pd.read_csv(filepath_or_buffer='project3_train.csv')

In [None]:
# Preview the first five rows of the table, including the label column.
load_img.head(n=5)

Unnamed: 0,pixel0000,pixel0001,pixel0002,pixel0003,pixel0004,pixel0005,pixel0006,pixel0007,pixel0008,pixel0009,...,pixel2343,pixel2344,pixel2345,pixel2346,pixel2347,pixel2348,pixel2349,pixel2350,pixel2351,label
0,30,15,20,35,19,27,94,69,73,152,...,22,9,13,11,2,4,9,1,0,0
1,1,0,0,7,1,5,103,56,68,192,...,127,72,74,24,5,6,0,1,1,0
2,129,91,92,182,145,145,205,169,168,189,...,64,39,41,5,2,6,2,2,1,0
3,9,8,8,11,10,10,10,9,9,24,...,11,9,10,8,7,7,5,5,5,0
4,26,13,19,25,10,17,24,6,5,23,...,22,6,9,27,9,10,23,5,6,0


In [None]:
# Check the size of the loaded table: (number of images, number of columns).
load_img.shape

(8008, 2353)

In [None]:
# Show the pixel values only: every column except the last (label) one, as a NumPy array.
load_img.iloc[:, :-1].to_numpy()

array([[ 30,  15,  20, ...,   9,   1,   0],
       [  1,   0,   0, ...,   0,   1,   1],
       [129,  91,  92, ...,   2,   2,   1],
       ...,
       [127, 101, 108, ..., 121, 108, 125],
       [157,  82,  86, ..., 210, 126, 130],
       [176, 149, 166, ..., 175, 142, 159]])

In [None]:
# Use positional indexing to separate the table into pixel values and class ids:
# all columns but the last are features, the last column is the label.
X_img = load_img.iloc[:, :-1].values
y_label = load_img.iloc[:, -1].values

In [None]:
# X_img is already a NumPy array (it comes from DataFrame.values), so it is used
# as-is instead of being converted to nested Python lists and back; that round trip
# allocated one Python object per pixel value without changing the resulting array.
X_img_train = np.asarray(X_img)

# Turn each flat row of pixel values into a 3-D image: 28 (height) x 28 (width) x 3 (RGB).
X_img_train=X_img_train.reshape(X_img_train.shape[0],28,28,3)

In [None]:
# Report how many training images there are, their size and channel count,
# together with the shape of the label vector.
print(f"train data: images: {X_img_train.shape}  labels: {y_label.shape}")

train data: images: (8008, 28, 28, 3)  labels: (8008,)


In [None]:
# Normalisation: pixel values lie in 0-255, so cast to float32 and divide by 255
# to bring them into the 0-1 range.
X_img_train_normalize = np.true_divide(X_img_train.astype(np.float32), 255.0)

In [None]:
# One-hot encode the integer class labels with np_utils.to_categorical()
# (each label becomes a vector of 0s with a single 1).
# num_classes is fixed to the number of lesion types (7) instead of being inferred
# from the largest label present, so the encoded width always matches the 7-unit
# softmax output layer even if the highest class is missing from the loaded rows.
y_label_train_OneHot = np_utils.to_categorical(y_label, num_classes=len(lesion_type_dict))

In [None]:
# Check how many classes the one-hot labels cover.
# Here there are 8008 samples in total, each encoded as a combination of seven 0/1 values.
y_label_train_OneHot.shape

(8008, 7)

### 建立與訓練CNN模型

In [None]:
# Import the Sequential model and the layer classes from Keras
# (Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D, ZeroPadding2D).
# NOTE(review): Activation and ZeroPadding2D are not used in the cells visible here.
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D, ZeroPadding2D

In [None]:
# Design your CNN model
# Start from an empty Sequential model and add the input convolution:
# 64 filters of size 5x5, ReLU activation, 'same' padding, fed with
# (height, width, 3) images whose height/width are read from the training array.
model_cnn = Sequential()
model_cnn.add(
    Conv2D(
        filters=64,
        kernel_size=(5, 5),
        padding='same',
        activation='relu',
        input_shape=(*X_img_train.shape[1:3], 3),
    )
)

In [None]:
# Remaining layers of the network, appended to the model created in the previous cell.
# Note: each run of this cell appends the layers again, so re-create the model first
# when re-running.

# Settings shared by every convolution below.
_conv_opts = dict(kernel_size=(5, 5), activation='relu', padding='same')

# Second 64-filter convolution, closing the first conv stage with 2x2 max pooling.
model_cnn.add(Conv2D(filters=64, **_conv_opts))
model_cnn.add(MaxPooling2D(pool_size=(2, 2)))

# Three more stages: two convolutions with the same filter count, then 2x2 max pooling.
for _filters in (128, 256, 512):
    model_cnn.add(Conv2D(filters=_filters, **_conv_opts))
    model_cnn.add(Conv2D(filters=_filters, **_conv_opts))
    model_cnn.add(MaxPooling2D(pool_size=(2, 2)))

# Classifier head: flatten, then shrinking dense layers, each followed by 25% dropout.
model_cnn.add(Flatten())
for _units in (256, 128, 64, 32):
    model_cnn.add(Dense(_units, activation='relu'))
    model_cnn.add(Dropout(0.25))

# Output layer: one softmax probability per lesion class (7 classes).
model_cnn.add(Dense(7, activation='softmax'))

In [None]:
# Print the layer-by-layer summary of the model (layer type, output shape, parameter count).
model_cnn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_8 (Conv2D)           (None, 28, 28, 64)        4864      
                                                                 
 conv2d_9 (Conv2D)           (None, 28, 28, 64)        102464    
                                                                 
 max_pooling2d_4 (MaxPooling  (None, 14, 14, 64)       0         
 2D)                                                             
                                                                 
 conv2d_10 (Conv2D)          (None, 14, 14, 128)       204928    
                                                                 
 conv2d_11 (Conv2D)          (None, 14, 14, 128)       409728    
                                                                 
 max_pooling2d_5 (MaxPooling  (None, 7, 7, 128)        0         
 2D)                                                  

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss (labels are one-hot),
# and categorical accuracy as the reported metric.
model_cnn.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

In [None]:
# Training hyper-parameters.
num_epoch=10
batch_size=32

# Hold out 20% of the training data for validation using a shuffled, stratified split.
# Keras' `validation_split` takes the LAST 20% of the rows before any shuffling, and the
# rows of this CSV appear to be grouped by label (the first rows are all class 0), so that
# slice would not represent every class. Stratifying on the integer labels keeps the
# class proportions the same in the fitting and validation parts.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_img_train_normalize, y_label_train_OneHot,
    test_size=0.2,
    stratify=y_label,
    random_state=409570389,  # same value as the NumPy seed set in the first code cell
)

# Train the model; per-epoch loss/accuracy for both parts are kept in train_history.
train_history=model_cnn.fit(X_fit, y_fit,
                        validation_data=(X_val, y_val),
                        epochs=num_epoch, batch_size=batch_size, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Predict on the test data with the final model: first load the test CSV
# and take its values as a NumPy array.
load_test_img = pd.read_csv(filepath_or_buffer='project3_test.csv')
img_test = load_test_img.to_numpy()

In [None]:
# Reshape every flat test row into a 28 x 28 x 3 image, then scale the pixels
# to 0-1 (float32 divided by 255), the same way as the training images.
x_test = img_test.reshape(len(img_test), 28, 28, 3)
x_test_normalize = np.true_divide(x_test.astype(np.float32), 255.0)

In [None]:
# Build the submission table in one step:
#   Id    - zero-padded four-digit row number of each test image
#   Label - index of the highest-probability class predicted by the model
df_submit = pd.DataFrame({
    'Id': [f'{i:04d}' for i in range(len(x_test_normalize))],
    'Label': np.argmax(model_cnn.predict(x_test_normalize), axis=-1),
})



In [None]:
# Write the predictions to submission_CNN.csv without the DataFrame index column.
df_submit.to_csv('submission_CNN.csv', index=False)