# Facilitated Machine Learning Models for Karyotyping in the Patients with Chromosomal Abnormalities: Retrospective Study

- **Chuan Yang**, MD, PhD Student
- Mentor: **Yanyan Zhao**, MD, PhD
- Shengjing Hospital of China Medical University

# 0. Modules

In [1]:
from PIL import Image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sb

from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.model_selection import KFold

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten
from tensorflow import keras

from os import walk

import time
import datetime

import json

%matplotlib inline

# Chr 09 vs Chr 09 Inversion

# 1. Samples


## 1.2. File Import

### 1.2.1. Filename Assignments

In [2]:
# Root folder that holds one sub-folder of chromosome images per class.
# Earlier dataset locations, kept for reference:
# pathBase = 'C:\\Users\\Chuan\\OneDrive\\Dowrun\\Database\\PhD\\KaryoTypes\\Arrangement\\'
# pathBase = 'I:\\Chuan\\Documents\\MyData\\PhD\\Karyotype\\Arrangement\\'
# pathBase = 'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement\\'
# Merged database:
# pathBase = 'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\'
# pathBase = 'I:\\Chuan\\Documents\\MyData\\PhD\\Karyotype\\Arrangement_Merged\\'
pathBase ='F:\\MyProject\\MachineLearning\\Mydata\\'


def list_image_paths(directory):
    """Return the path of every file found under *directory*.

    Sub-folders are traversed as well. Each path is built from the folder the
    file actually lives in, followed by a backslash and the file name, so
    files in nested folders get a correct path instead of being attributed
    to the top folder.

    Folders and file names are visited in sorted order, so the returned list
    (and therefore any index-based split of it) does not depend on the order
    in which the file system happens to list entries.

    A directory that does not exist yields an empty list, because ``walk``
    ignores listing errors by default.
    """
    paths = []
    for dirpath, dirnames, filenames in walk(directory):
        # Sorting in place also fixes the order in which walk descends.
        dirnames.sort()
        paths.extend(dirpath + '\\' + name for name in sorted(filenames))
    return paths


# Normal chromosome 9 images.
mypath_09 = pathBase + 'chr_09'
f_09 = list_image_paths(mypath_09)

# ///////// Abnormal ones //////////////

# Chromosome 9 inversion images.
mypath_9_inversion = pathBase + 'chr_9_inversion'
f_9_inversion = list_image_paths(mypath_9_inversion)

# Class name -> list of image file paths belonging to that class.
theWhole = {
    'chr_09': f_09,
    'chr_9_inversion': f_9_inversion,
}

In [3]:
# Sanity check: number of classes collected in theWhole.
len(theWhole)

2

In [4]:
# Sanity check: theWhole is a plain dict keyed by class name.
type(theWhole)

dict

In [5]:
# Show the class names used as keys of theWhole.
theWhole.keys()

dict_keys(['chr_09', 'chr_9_inversion'])

# 2. Cross Validation

## 2.1. KFold

- Each class is split into 5 folds.

In [6]:
# Earlier positional form, replaced by explicit keyword arguments:
# kfold = KFold(5, True, 1)
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
# Split into 5 subsets; each round uses 4 subsets for training and 1 for testing.
# Every split is used for one train/test iteration, 5 iterations in total.

## 2.2. Assignment of the filenames that have been split randomly

In [7]:
# Two nested dictionaries that receive the file paths of each split:
# the first key is the class name (chromosome or abnormal karyotype),
# the second key is the index of the split.
X_train_name = {}
X_test_name = {}

# The same information that is printed is also logged to KFold.txt.
# The context manager closes the log even if a split or a write fails.
with open('KFold.txt', 'w') as file_k_fold:

    # Classes are split one by one because their sample sizes are unbalanced.
    for chrNo in theWhole.keys():

        class_files = theWhole[chrNo]

        X_train_name[chrNo] = {}
        X_test_name[chrNo] = {}

        # Index of the current split within this class.
        split_method_number = 0

        print('Chromosome/Abnormality: ', chrNo)
        file_k_fold.write('Chromosome/Abnormality: %s\n' % chrNo)

        # kfold.split yields one (train, test) pair of index arrays per fold;
        # the indices refer to positions in this class's file list.
        for train, test in kfold.split(class_files):

            print('Split Method No. ', split_method_number)
            file_k_fold.write('Split Method No. %s\n' % split_method_number)

            print('Train: ', train, 'Test: ', test, '\n')
            file_k_fold.write('Train: %s\n' % train)
            file_k_fold.write('Test: %s\n\n' % test)

            # Translate the image indices into the corresponding file paths.
            X_train_name[chrNo][split_method_number] = [class_files[i] for i in train]
            X_test_name[chrNo][split_method_number] = [class_files[i] for i in test]

            # Move on to the next split.
            split_method_number = split_method_number + 1

        file_k_fold.write('\n')

Chromosome/Abnormality:  chr_09
Split Method No.  0
Train:  [  0   1   2   3   7   8   9  10  12  13  14  15  16  18  19  20  21  22
  23  24  25  26  27  28  30  32  33  35  36  37  38  39  40  41  42  43
  44  45  46  48  49  50  51  52  53  54  55  56  57  58  59  60  61  63
  64  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84
  86  87  88  89  91  93  94  95  96  97  98  99 100 101 103 104 105 106
 108 109 111 112 113 114 115 116 118 120 121 122 123 124 125 126 127 129
 130 131 132 133 134 136 137 138 139 140 141 143 144 145 148 149 150 151
 152 153 154 155 156 157 158 159 160 161 163 166 167 169 170 171 172 174
 175 176 177 178 180 181 182 183 184 185 186 188 190 191 193 194 195 196
 197 198 199 200 202 203 204 205 206 207 208 209 210 211 212 215 216 217
 218 219 220 222 224 225 226 227 228 229 230 231 232 234 235 236 237 239
 240 241 242 243 244 245 246 247 248 251 252 253 254 255 256 258 259 261
 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277

In [8]:
X_train_name

{'chr_09': {0: ['F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171303.12~A30.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171303.12~A49.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171303.13~A20.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171303.13~A40.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171303.4~A49.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171344.12~A13.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171344.12~A16.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171344.12~A27.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171344.13~A28.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171344.14~A18.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171344.1~A31.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171344.2~A18.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171344.2~A40.K

# Save the Filenames of Train and Test to a File

In [9]:
# Persist the training file paths of every class and split so the same folds
# can be reused later.
# NOTE: JSON object keys are always strings, so the integer split indices
# come back as '0', '1', ... when this file is loaded again.
with open('data_X_train_name.json', 'w') as file_X_train_name:
    # The context manager closes the file even if json.dump raises.
    json.dump(X_train_name, file_X_train_name)

In [10]:
# Persist the test file paths of every class and split so the same folds
# can be reused later.
# NOTE: JSON object keys are always strings, so the integer split indices
# come back as '0', '1', ... when this file is loaded again.
with open('data_X_test_name.json', 'w') as file_X_test_name:
    # The context manager closes the file even if json.dump raises.
    json.dump(X_test_name, file_X_test_name)

In [11]:
# Persist the complete class -> file path listing next to the fold files.
with open('data_theWhole.json', 'w') as file_theWhole:
    # The context manager closes the file even if json.dump raises.
    json.dump(theWhole, file_theWhole)

# Read the Files to Acquire the Filenames

In [12]:
# Restore the training file paths from disk.
# After the JSON round trip the split indices are string keys ('0', '1', ...).
with open('data_X_train_name.json') as json_file:
    X_train_name = json.loads(json_file.read())
# Echo the restored dictionary as the cell output.
X_train_name

{'chr_09': {'0': ['F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171303.12~A30.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171303.12~A49.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171303.13~A20.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171303.13~A40.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171303.4~A49.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171344.12~A13.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171344.12~A16.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171344.12~A27.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171344.13~A28.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171344.14~A18.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171344.1~A31.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171344.2~A18.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171344.2~A40

In [13]:
# Restore the test file paths from disk.
# After the JSON round trip the split indices are string keys ('0', '1', ...).
with open('data_X_test_name.json') as json_file:
    X_test_name = json.loads(json_file.read())
# Echo the restored dictionary as the cell output.
X_test_name

{'chr_09': {'0': ['F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171303.14~A16.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171303.2~A23.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171303.2~A27.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171344.13~A18.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171344.3~A9.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171787.14~A50.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171787.3~A16.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\172086.13~A8.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\172997.2~A46.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\173506.2~A15.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\173506.2~A35.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\173506.3~A26.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\173506.3~A27.K_9R.p

In [14]:
# Restore the complete class -> file path listing from disk.
with open('data_theWhole.json') as json_file:
    theWhole = json.loads(json_file.read())
# Echo the restored dictionary as the cell output.
theWhole

{'chr_09': ['F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171303.12~A30.K_9R.png',
  'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171303.12~A49.K_9R.png',
  'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171303.13~A20.K_9R.png',
  'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171303.13~A40.K_9R.png',
  'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171303.14~A16.K_9R.png',
  'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171303.2~A23.K_9R.png',
  'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171303.2~A27.K_9R.png',
  'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171303.4~A49.K_9R.png',
  'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171344.12~A13.K_9R.png',
  'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171344.12~A16.K_9R.png',
  'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171344.12~A27.K_9R.png',
  'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171344.13~A18.K_9R.png',
  'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\171344.13~A28.K_9R.png',
  'F: