# Facilitated Machine Learning Models for Karyotyping in the Patients with Chromosomal Abnormalities: Retrospective Study

- **Chuan Yang**, MD, PhD Student
- Mentor: **Yanyan Zhao**, MD, PhD
- Shengjing Hospital of China Medical University

# 0. Modules

In [None]:
from PIL import Image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sb

from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.model_selection import KFold

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten
from tensorflow import keras

from os import walk

import time
import datetime

import json

%matplotlib inline



# Chr 09 vs Chr 09 Inversion

# 1. Samples


## 1.2. File Import

### 1.2.1. Filename Assignments

In [None]:
import os

# Root folder of the karyotype image database. The same layout lives under
# different roots on other machines; alternatives are kept for reference:
# pathBase = 'C:\\Users\\Chuan\\OneDrive\\Dowrun\\Database\\PhD\\KaryoTypes\\Arrangement\\'
# pathBase = 'I:\\Chuan\\Documents\\MyData\\PhD\\Karyotype\\Arrangement\\'
# pathBase = 'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement\\'
# pathBase = 'I:\\Chuan\\Documents\\MyData\\PhD\\Karyotype\\Arrangement_Merged\\'
# Merged database (active):
pathBase = 'D:\\Users\\Chuan\\Documents\\Database\\Karyotypes\\Arrangement_Merged\\'


def list_image_paths(directory):
    """Return the full path of every file found under *directory*.

    The tree is traversed with ``os.walk`` and each filename is joined to the
    directory it was actually found in, so files located in subfolders get a
    valid path instead of being attributed to the top-level folder.

    Args:
        directory: Folder to scan.

    Returns:
        A list of path strings in ``os.walk`` order. The list is empty when
        the folder does not exist, because ``os.walk`` ignores listing errors
        by default.
    """
    paths = []
    for dirpath, _dirnames, filenames in os.walk(directory):
        # os.path.join uses '\\' on Windows, which matches the separator the
        # hard-coded database paths above already use.
        paths.extend(os.path.join(dirpath, name) for name in filenames)
    return paths


# Normal chromosome 9 images
mypath_09 = pathBase + 'chr_09'
f_09 = list_image_paths(mypath_09)

# Abnormal ones: chromosome 9 inversion images
mypath_9_inversion = pathBase + 'chr_9_inversion'
f_9_inversion = list_image_paths(mypath_9_inversion)

# Class name -> list of image file paths for that class
theWhole = {
    'chr_09': f_09,
    'chr_9_inversion': f_9_inversion,
}

In [None]:
# Number of classes (top-level keys) stored in theWhole
len(theWhole)

In [None]:
# Show the container type of theWhole
type(theWhole)

In [None]:
# Show the class names used as keys of theWhole
theWhole.keys()

# 2. Cross Validation

## 2.1. KFold

- Each class is split into 7 folds.

In [None]:
# Split each class into 7 shuffled folds: every iteration trains on 6 folds and
# tests on the remaining one, giving 7 train/test iterations in total.
# shuffle and random_state are keyword-only in current scikit-learn releases,
# so they are passed by name; random_state=1 keeps the shuffle reproducible.
kfold = KFold(n_splits=7, shuffle=True, random_state=1)

## 2.2. Assignment of Filenames to the Randomly Split Folds

In [None]:
# Two nested dictionaries that receive the file paths: the outer key is the
# chromosome / abnormal-karyotype name, the inner key is the fold index.
X_train_name = {}
X_test_name = {}

# The context manager closes KFold.txt even if a split or a write fails.
with open('KFold.txt', 'w') as file_k_fold:

    for chrNo in theWhole.keys():

        # Each class is split on its own because the classes do not all have
        # the same number of samples.
        class_paths = theWhole[chrNo]

        X_train_name[chrNo] = {}
        X_test_name[chrNo] = {}

        print('Chromosome/Abnormality: ', chrNo)
        file_k_fold.write('Chromosome/Abnormality: %s\n' % chrNo)

        # kfold.split yields one (train, test) pair per fold; both are arrays
        # of sample indices into class_paths.
        for split_method_number, (train, test) in enumerate(kfold.split(class_paths)):

            print('Split Method No. ', split_method_number)
            file_k_fold.write('Split Method No. %s\n' % split_method_number)

            print('Train: ', train, 'Test: ', test, '\n')
            file_k_fold.write('Train: %s\n' % train)
            file_k_fold.write('Test: %s\n\n' % test)

            # Translate the sample indices of this fold into file paths.
            X_train_name[chrNo][split_method_number] = [class_paths[i] for i in train]
            X_test_name[chrNo][split_method_number] = [class_paths[i] for i in test]

        # Blank line between classes in the log file
        file_k_fold.write('\n')

In [None]:
# Display the training file paths per class and fold for inspection
X_train_name

# Save the Filenames of Train and Test to a File

In [None]:
# Persist the training file paths. The context manager closes the file even
# if json.dump raises. Note that JSON object keys are always strings, so the
# integer fold indices are written as "0", "1", ...
with open('data_X_train_name.json', 'w') as file_X_train_name:
    json.dump(X_train_name, file_X_train_name)

In [None]:
# Persist the test file paths. The context manager closes the file even if
# json.dump raises. Note that JSON object keys are always strings, so the
# integer fold indices are written as "0", "1", ...
with open('data_X_test_name.json', 'w') as file_X_test_name:
    json.dump(X_test_name, file_X_test_name)

In [None]:
# Persist the complete class -> file path mapping. The context manager closes
# the file even if json.dump raises.
with open('data_theWhole.json', 'w') as file_theWhole:
    json.dump(theWhole, file_theWhole)

# Read the File to Acquire the Filename

In [None]:
# Reload the training file paths from disk.
# NOTE(review): JSON object keys are strings, so the fold indices come back as
# '0', '1', ... rather than the integers used when the dictionary was built;
# confirm that downstream lookups use the matching key type.
with open('data_X_train_name.json') as json_file:
    serialized_train_names = json_file.read()
    X_train_name = json.loads(serialized_train_names)
X_train_name

In [None]:
# Reload the test file paths from disk.
# NOTE(review): JSON object keys are strings, so the fold indices come back as
# '0', '1', ... rather than the integers used when the dictionary was built;
# confirm that downstream lookups use the matching key type.
with open('data_X_test_name.json') as json_file:
    serialized_test_names = json_file.read()
    X_test_name = json.loads(serialized_test_names)
X_test_name

In [None]:
# Reload the complete class -> file path mapping from disk and display it.
with open('data_theWhole.json') as json_file:
    serialized_whole = json_file.read()
    theWhole = json.loads(serialized_whole)
theWhole