# Facilitated Machine Learning Models for Karyotyping in the Patients with Chromosomal Abnormalities: Retrospective Study

- **Chuan Yang**, MD, PhD Student
- Mentor: **Yanyan Zhao**, MD, PhD
- Shengjing Hospital of China Medical University

# Chr 09 vs Chr 09 Inversion

# 0. Modules

In [101]:
from PIL import Image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sb

from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.model_selection import KFold

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten
from tensorflow import keras

from os import walk

import time
import datetime

import json

%matplotlib inline

# 1. Samples


## 1.1. Chromosome Label Conversion Using a Dictionary

In [104]:
# Class name -> integer label; labels follow the order of the tuple below.
chromConvert = {
    label_name: label_id
    for label_id, label_name in enumerate(('chr_09', 'chr_9_inversion'))
}

In [105]:
# Inverse lookup: integer label -> class name.
chromConvert_reverse = dict(enumerate(('chr_09', 'chr_9_inversion')))

## 1.2. File Import

### 1.2.1. Filename Assignments

In [108]:
# Root folder that holds one sub-folder of images per class.
# Earlier locations of the data set, kept for reference:
# pathBase = 'C:\\Users\\Chuan\\OneDrive\\Dowrun\\Database\\PhD\\KaryoTypes\\Arrangement\\'
# pathBase = 'I:\\Chuan\\Documents\\MyData\\PhD\\Karyotype\\Arrangement\\'
# pathBase = 'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement\\'
# ///////////////////////////////////////////////
# Merged Database
# pathBase = 'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\'
# pathBase = 'I:\\Chuan\\Documents\\MyData\\PhD\\Karyotype\\Arrangement_Merged\\'
pathBase = 'F:\\MyProject\\MachineLearning\\Mydata\\'


def list_image_paths(class_dir):
    """Return the sorted paths of every file found below *class_dir*.

    Each file name is joined to the directory in which ``walk`` actually
    found it, so files located in nested sub-folders still get a valid path
    (joining them to *class_dir* itself would point at files that do not
    exist).  The list is sorted because the order of a directory listing is
    not guaranteed, while the K-fold split further down selects samples from
    these lists by position.

    ``walk`` yields nothing for a folder that cannot be listed, so a missing
    folder produces an empty list rather than an exception.
    """
    return sorted(
        # '\\' keeps the same Windows-style separator used in pathBase.
        dirpath + '\\' + filename
        for dirpath, _dirnames, filenames in walk(class_dir)
        for filename in filenames
    )


mypath_09 = pathBase + 'chr_09'
f_09 = list_image_paths(mypath_09)

# ///////// Abnormal ones //////////////

mypath_9_inversion = pathBase + 'chr_9_inversion'
f_9_inversion = list_image_paths(mypath_9_inversion)

# Class name -> list of image file paths for that class.
theWhole = {
    'chr_09': f_09,
    'chr_9_inversion': f_9_inversion,
}


In [109]:
theWhole['chr_9_inversion']

['F:\\MyProject\\MachineLearning\\Mydata\\chr_9_inversion\\78034.12~A15.K_9L.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_9_inversion\\78034.12~A21.K_9L.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_9_inversion\\78034.12~A30.K_9L.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_9_inversion\\78034.13~A1.K_9L.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_9_inversion\\78034.13~A17.K_9L.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_9_inversion\\78034.13~A22.K_9L.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_9_inversion\\78034.13~A6.K_9L.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_9_inversion\\78034.14~A40.K_9L.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_9_inversion\\78034.2~A23.K_9L.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_9_inversion\\78034.2~A48.K_9L.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_9_inversion\\78034.3~A39.K_9L.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_9_inversion\\78034.4~A36.K_9L.png',
 'F:\\MyPr

In [110]:
type(theWhole['chr_09'])

list

In [111]:
len(theWhole)

2

In [112]:
type(theWhole)

dict

In [113]:
theWhole.keys()

dict_keys(['chr_09', 'chr_9_inversion'])

# 2. Cross Validation

## 2.1. KFold

- Split every class into 5 folds

In [117]:
# The earlier positional call KFold(5, True, 1) was replaced by the keyword form below.
# NOTE(review): recent scikit-learn releases presumably accept shuffle/random_state as keywords only — confirm.
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

# Each class is divided into 5 subsets; every round trains on 4 of them and tests on the remaining 1.
# Each way of splitting gives one train/test round, so there are 5 rounds in total.

## 2.2. Assignment of Filenames That Have Been Randomly Split

In [119]:
# Two nested dictionaries that receive the file paths: the first key is the
# class name (chromosome or abnormality), the second key is the split number.
X_train_name = {}
X_test_name = {}

# The context manager closes KFold.txt even if a split or a write fails
# part-way through the loop.
with open('KFold.txt', 'w') as file_k_fold:

    for chrNo in theWhole.keys():

        # Every class is split on its own, because the classes may not
        # contain the same number of samples.
        X_train_name[chrNo] = {}
        X_test_name[chrNo] = {}

        # kfold.split() returns a generator; each item is one
        # (train, test) pair of index arrays into theWhole[chrNo].
        generator_kFold = kfold.split(theWhole[chrNo])

        print('Chromosome/Abnormality: ', chrNo)
        file_k_fold.write('Chromosome/Abnormality: %s\n' % chrNo)

        # split_method_number counts the splits of this class from 0.
        for split_method_number, (train, test) in enumerate(generator_kFold):

            print('Split Method No. ', split_method_number)
            file_k_fold.write('Split Method No. %s\n' % split_method_number)

            print('Train: ', train, 'Test: ', test, '\n')
            file_k_fold.write('Train: %s\n' % train)
            file_k_fold.write('Test: %s\n\n' % test)

            # Turn the sample positions of this split into the
            # corresponding file paths.
            X_train_name[chrNo][split_method_number] = [
                theWhole[chrNo][sample_index] for sample_index in train
            ]
            X_test_name[chrNo][split_method_number] = [
                theWhole[chrNo][sample_index] for sample_index in test
            ]

        # Blank line between the classes in the log file.
        file_k_fold.write('\n')

Chromosome/Abnormality:  chr_09
Split Method No.  0
Train:  [ 0  1  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
 26 28 30 32 33 34 36 37 39 41 42 43 44 45 47 49] Test:  [ 2  3 27 29 31 35 38 40 46 48] 

Split Method No.  1
Train:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 14 15 16 17 18 20 23 24 25 27 28
 29 30 31 33 34 35 37 38 40 41 43 44 45 46 47 48] Test:  [13 19 21 22 26 32 36 39 42 49] 

Split Method No.  2
Train:  [ 0  1  2  3  5  6  7  8  9 11 12 13 15 16 18 19 20 21 22 25 26 27 28 29
 31 32 34 35 36 37 38 39 40 42 43 44 46 47 48 49] Test:  [ 4 10 14 17 23 24 30 33 41 45] 

Split Method No.  3
Train:  [ 0  2  3  4  5  8  9 10 11 12 13 14 15 16 17 19 21 22 23 24 26 27 29 30
 31 32 33 35 36 37 38 39 40 41 42 43 45 46 48 49] Test:  [ 1  6  7 18 20 25 28 34 44 47] 

Split Method No.  4
Train:  [ 1  2  3  4  6  7 10 13 14 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
 32 33 34 35 36 38 39 40 41 42 44 45 46 47 48 49] Test:  [ 0  5  8  9 11 12 15 16 37 43] 

Chromoso

In [120]:
len(X_train_name['chr_09'])

5

In [121]:
len(X_train_name['chr_09'][0])

40

In [122]:
len(X_test_name['chr_09'][0])

10

In [123]:
X_train_name['chr_09'][1] # Training-sample file paths of class 'chr_09' for split No. 1 (the second split; numbering starts at 0)

['F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.12~A15.K_9R.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.12~A21.K_9R.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.12~A30.K_9R.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.13~A1.K_9R.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.13~A17.K_9R.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.13~A22.K_9R.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.13~A6.K_9R.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.14~A40.K_9R.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.2~A23.K_9R.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.2~A48.K_9R.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.3~A39.K_9R.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.4~A36.K_9R.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78058.3~A19.K_9R.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr

In [124]:
X_test_name['chr_09'][1]

['F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78058.3~A2.K_9R.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78142.13~A20.K_9R.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78142.4~A44.K_9R.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78220.1~A45.K_9R.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78220.4~A38.K_9R.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78230.3~A5.K_9R.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78451.12~A38.K_9R.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78451.3~A37.K_9R.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78834.12~A25.K_9R.png',
 'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\79501.3~A30.K_9R.png']

In [125]:
X_train_name

{'chr_09': {0: ['F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.12~A15.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.12~A21.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.13~A17.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.13~A22.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.13~A6.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.14~A40.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.2~A23.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.2~A48.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.3~A39.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.4~A36.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78058.3~A19.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78058.3~A2.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78058.3~A26.K_9R.png',
   'F:\\

# Save the Filenames of Train and Test to a File

In [127]:
# Save the training-set file paths as JSON.  The context manager closes the
# file even if json.dump raises.
# NOTE: the integer split numbers used as keys are written as JSON strings.
with open('data_X_train_name.json', 'w') as file_X_train_name:
    json.dump(X_train_name, file_X_train_name)

In [128]:
# Save the test-set file paths as JSON.  The context manager closes the file
# even if json.dump raises.
# NOTE: the integer split numbers used as keys are written as JSON strings.
with open('data_X_test_name.json', 'w') as file_X_test_name:
    json.dump(X_test_name, file_X_test_name)

In [129]:
# Save the complete per-class list of file paths as JSON.  The context
# manager closes the file even if json.dump raises.
with open('data_theWhole.json', 'w') as file_theWhole:
    json.dump(theWhole, file_theWhole)

# Read the Files to Recover the Filenames

In [131]:
# Reload the training-set file paths from the JSON file.
# NOTE: JSON object keys are strings, so the split numbers come back as
# '0', '1', ... instead of the integers 0, 1, ...
with open('data_X_train_name.json', 'r') as json_file:
    X_train_name = json.loads(json_file.read())
X_train_name  # show the reloaded dictionary

{'chr_09': {'0': ['F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.12~A15.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.12~A21.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.13~A17.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.13~A22.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.13~A6.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.14~A40.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.2~A23.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.2~A48.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.3~A39.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.4~A36.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78058.3~A19.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78058.3~A2.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78058.3~A26.K_9R.png',
   'F:

In [132]:
# Reload the test-set file paths from the JSON file.
# NOTE: JSON object keys are strings, so the split numbers come back as
# '0', '1', ... instead of the integers 0, 1, ...
with open('data_X_test_name.json', 'r') as json_file:
    X_test_name = json.loads(json_file.read())
X_test_name  # show the reloaded dictionary

{'chr_09': {'0': ['F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.12~A30.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.13~A1.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78230.12~A47.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78230.14~A47.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78230.3~A14.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78451.12~A32.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78451.2~A30.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78451.3~A41.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78834.4~A24.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\79501.3~A10.K_9R.png'],
  '1': ['F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78058.3~A2.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78142.13~A20.K_9R.png',
   'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78142.4~A44.K_9R.png',


In [133]:
# Reload the complete per-class list of file paths from the JSON file.
with open('data_theWhole.json', 'r') as json_file:
    theWhole = json.loads(json_file.read())
theWhole  # show the reloaded dictionary

{'chr_09': ['F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.12~A15.K_9R.png',
  'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.12~A21.K_9R.png',
  'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.12~A30.K_9R.png',
  'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.13~A1.K_9R.png',
  'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.13~A17.K_9R.png',
  'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.13~A22.K_9R.png',
  'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.13~A6.K_9R.png',
  'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.14~A40.K_9R.png',
  'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.2~A23.K_9R.png',
  'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.2~A48.K_9R.png',
  'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.3~A39.K_9R.png',
  'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78034.4~A36.K_9R.png',
  'F:\\MyProject\\MachineLearning\\Mydata\\chr_09\\78058.3~A19.K_9R.png',
  'F:\\MyProject\\Mach