# Facilitated Machine Learning Models for Karyotyping in the Patients with Chromosomal Abnormalities: Retrospective Study

- **Chuan Yang**, MD, PhD Student
- Mentor: **Yanyan Zhao**, MD, PhD
- Shengjing Hospital of China Medical University

# 0. Modules

In [1]:
from PIL import Image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sb

from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.model_selection import KFold

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten
from tensorflow import keras

from os import walk

import time
import datetime

import json

%matplotlib inline



# 1. Samples


## 1.2. File Import

### 1.2.1. Filename Assignments

In [2]:
# pathBase = 'C:\\Users\\Chuan\\OneDrive\\Dowrun\\Database\\PhD\\KaryoTypes\\Arrangement\\'
# pathBase = 'I:\\Chuan\\Documents\\MyData\\PhD\\Karyotype\\Arrangement\\'
# pathBase = 'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement\\'
# ///////////////////////////////////////////////
# Merged Database
pathBase = 'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\'

#pathBase = 'I:\\Chuan\\Documents\\MyData\\PhD\\Karyotype\\Arrangement_Merged\\'

# Maps each class label (chromosome / abnormality name) to its list of image paths.
theWhole = {}


def collect_file_paths(folder):
    """Return the path of every file found under *folder*.

    ``walk`` also descends into sub-directories, so each file name is joined
    to the directory that actually contains it (``dirpath``) rather than to
    the top-level folder; otherwise nested files would get paths that do not
    exist.  The Windows separator is kept because ``pathBase`` is a Windows
    path.  A folder that does not exist yields an empty list.
    """
    return [
        dirpath + '\\' + filename
        for dirpath, _dirnames, filenames in walk(folder)
        for filename in filenames
    ]


mypath_x = pathBase + 'chr_x'
f_x = collect_file_paths(mypath_x)

# ///////// Abnormal ones //////////////

mypath_x_p22_cnv = pathBase + 'chr_x_p22_cnv'
f_x_p22_cnv = collect_file_paths(mypath_x_p22_cnv)


theWhole['chr_x'] = f_x
theWhole['chr_x_p22_cnv'] = f_x_p22_cnv


In [3]:
# Show the container type of the class-label -> file-path-list mapping.
type(theWhole)

dict

In [4]:
# Show the class labels (chromosome / abnormality names) stored in the mapping.
theWhole.keys()

dict_keys(['chr_22', 'chr_x_p22_cnv'])

# 2. Cross Validation

## 2.1. KFold

- Split every class into 7 folds.

In [5]:
# shuffle / random_state are passed by keyword: recent scikit-learn releases
# accept them as keyword-only arguments and reject the positional form.
kfold = KFold(n_splits=7, shuffle=True, random_state=1)

# Split into 7 subsets; each time 6 subsets are used for training and 1 for testing.
# Every split gives one train/test iteration, so there are 7 iterations in total.

## 2.2. Assignment of filenames that have been split randomly

In [6]:
# Two dictionaries receive the file paths: the first-level key is the
# chromosome / abnormal-karyotype name, the second-level key is the index of
# the split (fold).
X_train_name = {}
X_test_name = {}

# The context manager guarantees that KFold.txt is flushed and closed even if
# an exception is raised part-way through the splitting.
with open('KFold.txt', 'w') as file_k_fold:

    for chrNo in theWhole:

        # chrNo is the chromosome / abnormal-karyotype label.  Each class is
        # split on its own because the classes have unbalanced sample sizes.
        class_files = theWhole[chrNo]

        X_train_name[chrNo] = {}
        X_test_name[chrNo] = {}

        print('Chromosome/Abnormality: ', chrNo)
        file_k_fold.write('Chromosome/Abnormality: %s\n' % chrNo)

        # kfold.split returns a generator of (train, test) pairs; enumerate
        # supplies the sequence number of each split.
        for split_method_number, (train, test) in enumerate(kfold.split(class_files)):

            print('Split Method No. ', split_method_number)
            file_k_fold.write('Split Method No. %s\n' % split_method_number)

            print('Train: ', train, 'Test: ', test, '\n')
            file_k_fold.write('Train: %s\n' % train)
            file_k_fold.write('Test: %s\n\n' % test)

            # train / test hold image indices into class_files; translate them
            # into the corresponding file paths for this split.
            X_train_name[chrNo][split_method_number] = [class_files[i] for i in train]
            X_test_name[chrNo][split_method_number] = [class_files[i] for i in test]

        file_k_fold.write('\n')

Chromosome/Abnormality:  chr_22
Split Method No.  0
Train:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 28 29 30 32 33 34 35 37 38 39 40 41 42 43 44 45 46 47 50 51 52 53
 55 56 57 58 59 61 62 63 64 67 68 69] Test:  [26 27 31 36 48 49 54 60 65 66] 

Split Method No.  1
Train:  [ 0  1  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 36 37 38 40 41 42 44 45 46 48 49 50 51 54
 56 58 59 60 61 63 64 65 66 67 68 69] Test:  [ 2 35 39 43 47 52 53 55 57 62] 

Split Method No.  2
Train:  [ 0  1  2  4  5  6  7  8  9 11 12 13 14 16 17 18 19 20 22 23 24 25 26 27
 28 29 30 31 32 35 36 37 39 40 41 42 43 44 45 46 47 48 49 50 52 53 54 55
 56 57 60 61 62 63 64 65 66 67 68 69] Test:  [ 3 10 15 21 33 34 38 51 58 59] 

Split Method No.  3
Train:  [ 1  2  3  4  5  6  7  9 10 11 12 13 14 15 16 18 20 21 22 23 25 26 27 28
 29 30 31 32 33 34 35 36 37 38 39 41 42 43 45 46 47 48 49 50 51 52 53 54
 55 56 57 58 59 60 62 64 65 66 68 6

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-no

In [7]:
# Display the nested mapping: class label -> split index -> training file paths.
X_train_name

{'chr_22': {0: ['D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w21000~A.0001.K.jpg.22L.bmp',
   'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w21000~A.0001.K.jpg.22R.bmp',
   'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w21001~A.0008.K.jpg.22L.bmp',
   'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w21001~A.0008.K.jpg.22R.bmp',
   'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w21002~A.0024.K.jpg.22L.bmp',
   'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w21002~A.0024.K.jpg.22R.bmp',
   'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w21008~A.0008.K.jpg.22L.bmp',
   'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w21008~A.0008.K.jpg.22R.bmp',
   'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\

# Save the Filenames of Train and Test to a File

In [8]:
# Persist the per-class, per-split training file lists as JSON.
# The context manager closes the file even if serialisation fails.
# NOTE: JSON object keys are strings, so the integer split indices are
# written as strings.
with open('data_X_train_name.json', 'w') as file_X_train_name:
    json.dump(X_train_name, file_X_train_name)

In [9]:
# Persist the per-class, per-split test file lists as JSON.
# The context manager closes the file even if serialisation fails.
# NOTE: JSON object keys are strings, so the integer split indices are
# written as strings.
with open('data_X_test_name.json', 'w') as file_X_test_name:
    json.dump(X_test_name, file_X_test_name)

In [10]:
# Persist the complete class label -> file path list mapping as JSON.
# The context manager closes the file even if serialisation fails.
with open('data_theWhole.json', 'w') as file_theWhole:
    json.dump(theWhole, file_theWhole)

# Read the File to Acquire the Filename

In [11]:
# Reload the training file lists from disk.
# The split indices come back as string keys (e.g. '0') rather than ints,
# because JSON object keys are always strings.
with open('data_X_train_name.json') as json_file:
    raw_text = json_file.read()
X_train_name = json.loads(raw_text)
X_train_name

{'chr_22': {'0': ['D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w21000~A.0001.K.jpg.22L.bmp',
   'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w21000~A.0001.K.jpg.22R.bmp',
   'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w21001~A.0008.K.jpg.22L.bmp',
   'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w21001~A.0008.K.jpg.22R.bmp',
   'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w21002~A.0024.K.jpg.22L.bmp',
   'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w21002~A.0024.K.jpg.22R.bmp',
   'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w21008~A.0008.K.jpg.22L.bmp',
   'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w21008~A.0008.K.jpg.22R.bmp',
   'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22

In [12]:
# Reload the test file lists from disk.
# The split indices come back as string keys (e.g. '0') rather than ints,
# because JSON object keys are always strings.
with open('data_X_test_name.json') as json_file:
    raw_text = json_file.read()
X_test_name = json.loads(raw_text)
X_test_name

{'chr_22': {'0': ['D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\W21576~A.0010.K.jpg.22L.bmp',
   'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\W21576~A.0010.K.jpg.22R.bmp',
   'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\W21587~A.0010.K.jpg.22R.bmp',
   'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\W21890~A.0005.K.jpg.22L.bmp',
   'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w22292~A.0027.K.jpg.22L.bmp',
   'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w22292~A.0027.K.jpg.22R.bmp',
   'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\W22666-1.0002.K.jpg.22L.bmp',
   'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w22881~A.0017.K.jpg.22L.bmp',
   'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22

In [13]:
# Reload the complete class label -> file path list mapping from disk.
with open('data_theWhole.json') as json_file:
    raw_text = json_file.read()
theWhole = json.loads(raw_text)
theWhole

{'chr_22': ['D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w21000~A.0001.K.jpg.22L.bmp',
  'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w21000~A.0001.K.jpg.22R.bmp',
  'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w21001~A.0008.K.jpg.22L.bmp',
  'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w21001~A.0008.K.jpg.22R.bmp',
  'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w21002~A.0024.K.jpg.22L.bmp',
  'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w21002~A.0024.K.jpg.22R.bmp',
  'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w21008~A.0008.K.jpg.22L.bmp',
  'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\w21008~A.0008.K.jpg.22R.bmp',
  'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\chr_22\\W21011~A.000