# A Semi-automatic Classification of Normal Karyotypes using Convolutional Neural Network Algorithm

- **Chuan Yang**, MD, PhD Student
- Mentor: **Yanyan Zhao**, MD, PhD
- Shengjing Hospital of China Medical University

## Modules

In [1]:
from PIL import Image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sb

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import KFold

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten
from tensorflow import keras

from os import walk

import time
import datetime

%matplotlib inline

## Samples


## Chromosome Label Conversion Using a Dictionary

In [2]:
# Map each class folder name to its integer label.
# Autosomes chr_01 .. chr_22 take their own number as the label.
chromConvert = {'chr_{:02d}'.format(number): number for number in range(1, 23)}

# Sex chromosomes: X is label 0, Y is label 23.
chromConvert['chr_x'] = 0
chromConvert['chr_y'] = 23

# Abnormal classes are numbered consecutively from 24, in the order listed.
chromConvert.update({
    name: label
    for label, name in enumerate(
        ('chr_18_p11_deletion',
         'chr_18_q10_isochromosome',
         'chr_9_inversion',
         'chr_5_p14_deletion',
         'chr_x_isochromosome',
         'chr_x_p22_cnv',
         'chr_x_q21_cnv',
         'chr_x_q22_deletion'),
        start=24)
})

In [None]:
# Map each integer label back to its class folder name.
# NOTE(review): this rebinds chromConvert, which the preceding cell defines as
# the opposite (name -> label) mapping; once this cell has run, only the
# label -> name direction is reachable under this name. Confirm which
# direction the later cells expect.
# Autosomes: labels 1 .. 22 map to 'chr_01' .. 'chr_22'.
chromConvert = {number: 'chr_{:02d}'.format(number) for number in range(1, 23)}

# Sex chromosomes: label 0 is X, label 23 is Y.
chromConvert[0] = 'chr_x'
chromConvert[23] = 'chr_y'

# Abnormal classes occupy the consecutive labels 24 .. 31, in the order listed.
chromConvert.update(enumerate(
    ('chr_18_p11_deletion',
     'chr_18_q10_isochromosome',
     'chr_9_inversion',
     'chr_5_p14_deletion',
     'chr_x_isochromosome',
     'chr_x_p22_cnv',
     'chr_x_q21_cnv',
     'chr_x_q22_deletion'),
    start=24))

## File Import

In [3]:
# Root folder of the image database. It is expected to hold one sub-folder per
# class, named exactly like the entries of classNames below. The path is
# machine-specific; earlier locations are kept for reference.
# pathBase = 'C:\\Users\\Chuan\\OneDrive\\Dowrun\\Database\\PhD\\KaryoTypes\\Arrangement\\'
# pathBase = 'I:\\Chuan\\Documents\\MyData\\PhD\\Karyotype\\Arrangement\\'
# pathBase = 'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement\\'
# ///////////////////////////////////////////////
# Merged Database
pathBase = 'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\'
#pathBase = 'I:\\Chuan\\Documents\\MyData\\PhD\\Karyotype\\Arrangement_Merged\\'

# Class folder names, in the order in which they are inserted into theWhole.
classNames = ['chr_%02d' % number for number in range(1, 23)]
classNames += ['chr_x', 'chr_y']
# ///////// Abnormal ones //////////////
classNames += ['chr_18_p11_deletion',
               'chr_18_q10_isochromosome',
               'chr_9_inversion',
               'chr_5_p14_deletion',
               'chr_x_isochromosome',
               'chr_x_p22_cnv',
               'chr_x_q21_cnv',
               'chr_x_q22_deletion']


def listClassFiles(classPath):
    """Return the paths of all files found under ``classPath``.

    The folder is traversed recursively with ``os.walk``. Each file name is
    prefixed with the directory it was actually found in (``dirpath``), so
    files that sit in a sub-folder get a path that really points at them
    instead of being attributed to the top-level class folder.

    Parameters
    ----------
    classPath : str
        Folder to scan.

    Returns
    -------
    list of str
        One ``<directory>\\<file name>`` string per file, in the order
        ``os.walk`` reports them. Empty if the folder does not exist or
        contains no files.
    """
    filePaths = []
    for (dirpath, dirnames, filenames) in walk(classPath):
        for filename in filenames:
            # The backslash separator is kept from the original code (the
            # data set lives on Windows drives).
            filePaths.append(dirpath + '\\' + filename)
    return filePaths


# theWhole maps every class name to the list of image file paths of that class.
theWhole = {}
for className in classNames:
    theWhole[className] = listClassFiles(pathBase + className)

# Backward compatibility: this cell used to leave one pair of variables per
# class in the namespace (f_01 / mypath_01 ... f_x_q22_deletion /
# mypath_x_q22_deletion). They are recreated here in case other cells still
# read them; new code should index theWhole instead.
for className, filePaths in theWhole.items():
    suffix = className[len('chr_'):]
    globals()['f_' + suffix] = filePaths
    globals()['mypath_' + suffix] = pathBase + className

In [4]:
# Display the file paths collected for the chr_x class.
theWhole['chr_x']

['I:\\Chuan\\Documents\\MyData\\PhD\\Karyotype\\Arrangement_Merged\\chr_x\\W17068~A.0003.K.jpg.X.bmp',
 'I:\\Chuan\\Documents\\MyData\\PhD\\Karyotype\\Arrangement_Merged\\chr_x\\W17068~A.0004.K.jpg.X.bmp',
 'I:\\Chuan\\Documents\\MyData\\PhD\\Karyotype\\Arrangement_Merged\\chr_x\\W17068~A.0006.K.jpg.X.bmp',
 'I:\\Chuan\\Documents\\MyData\\PhD\\Karyotype\\Arrangement_Merged\\chr_x\\W17068~A.0008.K.X.bmp',
 'I:\\Chuan\\Documents\\MyData\\PhD\\Karyotype\\Arrangement_Merged\\chr_x\\W17068~A.0009.K.X.bmp',
 'I:\\Chuan\\Documents\\MyData\\PhD\\Karyotype\\Arrangement_Merged\\chr_x\\W17068~A.0010.K.X.bmp',
 'I:\\Chuan\\Documents\\MyData\\PhD\\Karyotype\\Arrangement_Merged\\chr_x\\W17068~A.0011.K.X.bmp',
 'I:\\Chuan\\Documents\\MyData\\PhD\\Karyotype\\Arrangement_Merged\\chr_x\\W17068~A.0012.K.X.bmp',
 'I:\\Chuan\\Documents\\MyData\\PhD\\Karyotype\\Arrangement_Merged\\chr_x\\W17068~A.0014.K.X.bmp',
 'I:\\Chuan\\Documents\\MyData\\PhD\\Karyotype\\Arrangement_Merged\\chr_x\\W17068~A.0016.K.X.bmp'

In [5]:
# Each value stored in theWhole is a list of file paths.
type(theWhole['chr_x'])

list

In [6]:
# Number of classes (keys) held in theWhole.
len(theWhole)

32

In [7]:
# theWhole itself is a dict keyed by class name.
type(theWhole)

dict

In [8]:
# List the class names in insertion order.
theWhole.keys()

dict_keys(['chr_01', 'chr_02', 'chr_03', 'chr_04', 'chr_05', 'chr_06', 'chr_07', 'chr_08', 'chr_09', 'chr_10', 'chr_11', 'chr_12', 'chr_13', 'chr_14', 'chr_15', 'chr_16', 'chr_17', 'chr_18', 'chr_19', 'chr_20', 'chr_21', 'chr_22', 'chr_x', 'chr_y', 'chr_18_p11_deletion', 'chr_18_q10_isochromosome', 'chr_9_inversion', 'chr_5_p14_deletion', 'chr_x_isochromosome', 'chr_x_p22_cnv', 'chr_x_q21_cnv', 'chr_x_q22_deletion'])

## Cross Validation

### KFold

#### Split every class into 7 folds

In [48]:
# Shuffled 7-fold splitter with a fixed seed: the samples are divided into
# 7 subsets, and each round uses 6 subsets for training and 1 for testing.
# shuffle and random_state are passed by keyword because current scikit-learn
# releases accept them as keyword-only arguments.
kfold = KFold(n_splits=7, shuffle=True, random_state=1)

<generator object _BaseKFold.split at 0x0000029E3AC0B318>

In [None]:
# Split the chr_01 file list. kfold.split returns a generator that yields one
# (train indices, test indices) pair per fold.
generator_kFold_chr_01 = kfold.split(theWhole['chr_01'])

for train, test in generator_kFold_chr_01:
    print('Train: ', train, 'Test: ', test)

    # Translate the index arrays into file paths. Both lists are rebuilt on
    # every fold, so after the loop they hold the paths of the last fold only.
    X_train_name = [theWhole['chr_01'][item] for item in train]
    X_test_name = [theWhole['chr_01'][item] for item in test]

In [65]:
# One entry per fold: X_train_name_chr_01[k] and X_test_name_chr_01[k] are the
# chr_01 file paths used for training and testing in split number k.
X_train_name_chr_01 = []
X_test_name_chr_01 = []

# enumerate numbers the splits 0, 1, 2, ... so the counter advances with the
# loop instead of being reset on every iteration.
for split_method_number, (train, test) in enumerate(kfold.split(theWhole['chr_01'])):

    print('Split Method No. ', split_method_number)

    # train and test hold the sample indices of the current split.
    print('Train: ', train, 'Test: ', test)

    # Each index is an image number within theWhole['chr_01']; look up its path.
    # A new inner list is appended per split (assigning to an index of an empty
    # list would raise IndexError).
    X_train_name_chr_01.append(
        [theWhole['chr_01'][split_method_train] for split_method_train in train])
    X_test_name_chr_01.append(
        [theWhole['chr_01'][split_method_test] for split_method_test in test])

Train:  [ 0  1  2  3  4  5  6  7  8  9 11 12 13 14 15 16 17 18 20 21 22 23 24 25
 27 28 29 30 31 32 35 36 37 38 39 41 42 43 45 46 47 48 49 50 51 52 53 56
 57 58 59 61 62 63 64 65 66 67 68 70 71 72] Test:  [10 19 26 33 34 40 44 54 55 60 69]
Train:  [ 0  1  3  4  5  6  7  8  9 10 11 12 13 14 16 17 18 19 20 21 22 23 24 25
 26 28 29 30 31 32 33 34 36 37 38 40 41 42 44 45 46 48 49 50 51 52 54 55
 56 57 58 60 61 62 63 64 65 66 68 69 70 72] Test:  [ 2 15 27 35 39 43 47 53 59 67 71]
Train:  [ 0  1  2  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 22 23 24 25
 26 27 28 29 30 32 33 34 35 37 39 40 41 42 43 44 45 46 47 48 49 50 51 53
 54 55 59 60 61 62 63 64 66 67 69 70 71 72] Test:  [ 3 21 31 36 38 52 56 57 58 65 68]
Train:  [ 1  2  3  4  5  6  9 10 11 12 13 14 15 16 18 19 20 21 22 23 25 26 27 28
 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 47 49 50 51 52 53 54 55
 56 57 58 59 60 62 63 64 65 67 68 69 70 71 72] Test:  [ 0  7  8 17 24 45 46 48 61 66]
Train:  [ 0  1  2  3  5  6  7  8  9 10 1

In [66]:
# Number of training file paths in X_train_name (filled by the per-fold loop
# over chr_01; it keeps the most recent fold).
len(X_train_name)

63

In [67]:
# Number of test file paths in X_test_name (filled by the per-fold loop over
# chr_01; it keeps the most recent fold).
len(X_test_name)

10

In [64]:
# Total number of chr_01 images; the train and test counts of a fold add up to this.
len(theWhole['chr_01'])

73