In [1]:
import json
import numpy as np
import os
from sklearn.model_selection import StratifiedGroupKFold
import pandas as pd
import csv
from tqdm import tqdm
import shutil



In [2]:
abnormal_file = "./data/train-abnormal.csv"
acl_file = "./data/train-acl.csv"
meniscus_file = "./data/train-meniscus.csv"

# Column names are supplied explicitly, so the first line of every file is
# read as data rather than as a header.
abnormal_df = pd.read_csv(abnormal_file, names=['name', 'abnormal'])
acl_df = pd.read_csv(acl_file, names=['name', 'acl'])
meniscus_df = pd.read_csv(meniscus_file, names=['name', 'meniscus'])

# Join the three label tables on the case id instead of gluing columns
# together by row position: a positional concat silently pairs labels with
# the wrong case if the files are ever ordered differently or differ in
# length. `validate` rejects duplicated case ids.
df = (
    abnormal_df
    .merge(acl_df, on='name', how='inner', validate='one_to_one')
    .merge(meniscus_df, on='name', how='inner', validate='one_to_one')
)
# An inner join drops unmatched cases; make that loud instead of silent.
assert len(df) == len(abnormal_df) == len(acl_df) == len(meniscus_df), \
    "label files do not describe the same set of cases"
df


Unnamed: 0,name,abnormal,acl,meniscus
0,0,1,0,0
1,1,1,1,1
2,2,1,0,0
3,3,1,0,1
4,4,1,0,0
...,...,...,...,...
1125,1125,1,0,1
1126,1126,1,0,1
1127,1127,0,0,0
1128,1128,1,0,0


In [3]:
classes = {0 : [0,0,0], 1 : [0,0,1], 2 : [0,1,0], 3 : [0,1,1], 4 : [1,0,0], 5 : [1,0,1], 6 : [1,1,0], 7 : [1,1,1]}
# [abnormal, acl, meniscus]

# Reverse mapping (abnormal, acl, meniscus) -> class id, so each row needs a
# single dictionary lookup instead of a scan over `classes` plus three
# positional `.iloc` reads.
class_of_labels = {tuple(labels): class_id for class_id, labels in classes.items()}

label_rows = list(zip(df['abnormal'].tolist(), df['acl'].tolist(), df['meniscus'].tolist()))

# Fail with the offending combinations rather than an anonymous IndexError.
unknown_labels = set(label_rows) - set(class_of_labels)
if unknown_labels:
    raise ValueError(f"label combinations without a class id: {sorted(unknown_labels, key=repr)}")

df['class'] = [class_of_labels[labels] for labels in label_rows]
df

Unnamed: 0,name,abnormal,acl,meniscus,class
0,0,1,0,0,4
1,1,1,1,1,7
2,2,1,0,0,4
3,3,1,0,1,5
4,4,1,0,0,4
...,...,...,...,...,...
1125,1125,1,0,1,5
1126,1126,1,0,1,5
1127,1127,0,0,0,0
1128,1128,1,0,0,4


In [4]:
# NOTE: `all` shadows the Python builtin of the same name; the name is kept
# because a later cell iterates over it.
all = df.to_numpy()
n_cases = len(all)

# Every entry of X is 1, so only its length carries information for the split.
X = np.ones(n_cases)
# Last column of each row -> stratification target, first column -> group id.
y = np.array(all[:, -1].tolist())
groups = np.array(all[:, 0].tolist())

cv = StratifiedGroupKFold(n_splits=10, shuffle=True, random_state=2024)

# Print the group ids and targets on both sides of every fold.
for train_idx, val_idx in cv.split(X, y, groups):
    for tag, fold_idx in (("TRAIN:", train_idx), (" TEST:", val_idx)):
        print(tag, groups[fold_idx])
        print(" ", y[fold_idx])

TRAIN: [   1    2    4 ... 1127 1128 1129]
  [7 4 4 ... 0 4 6]
 TEST: [   0    3   12   24   36   44   49   59   74   88   91  103  113  124
  139  145  151  162  172  187  201  211  213  225  240  248  251  263
  272  282  296  304  315  325  335  343  358  369  373  380  392  406
  420  422  432  445  457  472  473  484  495  507  517  531  537  548
  552  561  577  585  598  607  614  622  632  646  659  671  675  685
  697  712  716  720  733  747  756  768  780  792  798  805  820  823
  833  845  858  870  871  879  891  905  917  923  931  947  954  969
  981  989 1001 1002 1014 1027 1038 1051 1058 1068 1071 1088 1099 1103
 1108]
  [4 5 0 4 5 0 7 7 0 5 5 4 7 4 4 4 4 5 4 4 6 4 4 0 5 7 5 5 0 6 0 4 7 7 4 5 4
 4 5 7 7 0 0 4 5 4 5 0 0 4 0 5 7 5 4 0 4 5 5 5 6 7 4 4 4 5 4 0 0 0 6 0 5 5
 4 4 4 5 4 6 4 4 0 0 5 0 0 7 7 0 5 4 6 5 5 0 4 7 4 5 7 7 5 0 5 5 4 5 0 7 4
 4 5]
TRAIN: [   0    2    3 ... 1126 1127 1128]
  [4 4 5 ... 5 0 4]
 TEST: [   1   13   25   38   52   61   66   75   93   94  

In [5]:
# check distribution
from collections import Counter
import pandas as pd

def get_distribution(y, n_classes=None):
    """Return the share of each class id in ``y`` as percentage strings.

    Parameters
    ----------
    y : array-like of non-negative int
        Class ids, one per sample.
    n_classes : int, optional
        Number of class ids to report (ids ``0 .. n_classes - 1``). When
        omitted it is derived from the largest id present in ``y``, which
        means two subsets with different maxima yield lists of different
        length; pass it explicitly to get rows that line up.

    Returns
    -------
    list of str
        ``n_classes`` strings formatted with two decimals and a percent
        sign. For empty ``y`` every share is reported as ``'0.00%'`` (an
        empty list when ``n_classes`` is also omitted).
    """
    y = np.asarray(y)
    total = y.size

    if n_classes is None:
        # np.max raises on an empty array, so guard the derivation.
        n_classes = int(y.max()) + 1 if total else 0

    if total == 0:
        return [f'{0:.2%}'] * n_classes

    counts = Counter(y.tolist())
    return [f'{counts[class_id] / total:.2%}' for class_id in range(n_classes)]

# First row: distribution over every labelled case. Each fold then adds one
# row for its training side and one for its validation side.
distrs = [get_distribution(y)]
index = ['training set']

for fold_ind, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    train_gr, val_gr = groups[train_idx], groups[val_idx]
    # A group id must never sit on both sides of the same split.
    assert set(train_gr).isdisjoint(set(val_gr))

    train_y, val_y = y[train_idx], y[val_idx]
    distrs += [get_distribution(train_y), get_distribution(val_y)]
    index += [f'train - fold{fold_ind}', f'val - fold{fold_ind}']

categories = list(range(8))
column_labels = [categories[class_id] for class_id in range(np.max(y) + 1)]
pd.DataFrame(distrs, index=index, columns=column_labels)

Unnamed: 0,0,1,2,3,4,5,6,7
training set,19.20%,0.00%,0.00%,0.00%,38.32%,24.07%,7.35%,11.06%
train - fold0,18.98%,0.00%,0.00%,0.00%,39.04%,23.70%,7.57%,10.72%
val - fold0,21.24%,0.00%,0.00%,0.00%,31.86%,27.43%,5.31%,14.16%
train - fold1,19.09%,0.00%,0.00%,0.00%,38.58%,24.41%,7.38%,10.53%
val - fold1,20.18%,0.00%,0.00%,0.00%,35.96%,21.05%,7.02%,15.79%
train - fold2,19.47%,0.00%,0.00%,0.00%,38.64%,24.19%,6.69%,11.01%
val - fold2,16.81%,0.00%,0.00%,0.00%,35.40%,23.01%,13.27%,11.50%
train - fold3,19.08%,0.00%,0.00%,0.00%,38.35%,23.40%,7.57%,11.60%
val - fold3,20.35%,0.00%,0.00%,0.00%,38.05%,30.09%,5.31%,6.19%
train - fold4,18.88%,0.00%,0.00%,0.00%,38.64%,24.39%,7.18%,10.91%


In [6]:
# Walk the splitter once and keep, per fold, the group ids on each side.
fold_pairs = [
    (groups[train_idx], groups[val_idx])
    for train_idx, val_idx in cv.split(X, y, groups)
]
train_fold = [train_groups for train_groups, _ in fold_pairs]
valid_fold = [valid_groups for _, valid_groups in fold_pairs]

valid_fold[0].shape

(113,)

In [7]:
# Index of the fold whose train/validation ids are used by the cells below.
fold_num = 0
train_id, valid_id = train_fold[fold_num], valid_fold[fold_num]

In [8]:
new_path = "./new_data"
new_train_path = os.path.join(new_path, "train")
new_valid_path = os.path.join(new_path, "valid")

train_path = "./data/train"

# Mirror every plane sub-directory of `train_path` into the new train/valid
# trees, routing each case file by the ids selected for the chosen fold.
for plane in os.listdir(train_path):
    now_path = os.path.join(train_path, plane)

    # Skip hidden entries and anything that is not a directory: a stray file
    # here would otherwise be treated as a plane and break the copy below.
    if plane.startswith('.') or not os.path.isdir(now_path):
        continue
    print(plane)

    des_train_path = os.path.join(new_train_path, plane)
    des_valid_path = os.path.join(new_valid_path, plane)
    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    os.makedirs(des_train_path, exist_ok=True)
    os.makedirs(des_valid_path, exist_ok=True)

    for case_ids, des_path in ((train_id, des_train_path), (valid_id, des_valid_path)):
        for num in tqdm(case_ids):
            # Case files are named by the id zero-padded to four characters.
            file_name = str(num).zfill(4) + '.npy'
            shutil.copy(
                os.path.join(now_path, file_name),
                os.path.join(des_path, file_name)
            )


axial


100%|██████████| 1017/1017 [00:03<00:00, 315.93it/s]
100%|██████████| 113/113 [00:00<00:00, 284.01it/s]


coronal


100%|██████████| 1017/1017 [00:03<00:00, 306.76it/s]
100%|██████████| 113/113 [00:00<00:00, 330.99it/s]


sagittal


100%|██████████| 1017/1017 [00:03<00:00, 293.08it/s]
100%|██████████| 113/113 [00:00<00:00, 334.21it/s]


In [11]:
def name_change(x):
    """Return ``x`` as text, zero-filled on the left to at least four characters."""
    as_text = str(x)
    return as_text.zfill(4)

def _export_labels(frame, label, split):
    """Write the ``name`` and ``label`` columns of ``frame`` to
    ``<new_path>/<split>-<label>.csv`` (no header, no index) and return the
    two-column frame that was written."""
    subset = frame[['name', label]]
    subset.to_csv(os.path.join(new_path, f"{split}-{label}.csv"), header=False, index=False)
    return subset


# Hash-based membership: `value in ndarray` rescans the whole id array for
# every row, which makes the selection quadratic.
train_ids = set(train_id.tolist())
valid_ids = set(valid_id.tolist())

# `all` is the row array built from `df` in an earlier cell (it shadows the
# builtin there); column 0 of each row is the case id.
train_line = [line for line in all if line[0] in train_ids]
valid_line = [line for line in all if line[0] in valid_ids]
print(len(train_line))
print(len(valid_line))

train_csv = pd.DataFrame(train_line, columns=df.columns)
train_csv['name'] = train_csv['name'].apply(name_change)
valid_csv = pd.DataFrame(valid_line, columns=df.columns)
valid_csv['name'] = valid_csv['name'].apply(name_change)

# One file per (split, label) pair.
t_abnormal = _export_labels(train_csv, 'abnormal', 'train')
t_acl = _export_labels(train_csv, 'acl', 'train')
t_meniscus = _export_labels(train_csv, 'meniscus', 'train')

v_abnormal = _export_labels(valid_csv, 'abnormal', 'valid')
v_acl = _export_labels(valid_csv, 'acl', 'valid')
v_meniscus = _export_labels(valid_csv, 'meniscus', 'valid')

1017
113


In [10]:
test_path = "./data/valid"
new_test_path = "./new_data/test"

test_csv = ["./data/valid-abnormal.csv", "./data/valid-acl.csv", "./data/valid-meniscus.csv"]

# `new_path` is defined by the earlier cell that builds the train/valid trees;
# make sure the directory is there before copying label files into it.
os.makedirs(new_path, exist_ok=True)

for t in test_csv:
    # Derive the target name from the file name itself rather than from a
    # fixed character offset into the path, which breaks as soon as the
    # source directory changes: "valid-abnormal.csv" -> "test-abnormal.csv".
    file_name = os.path.basename(t).replace("valid-", "test-", 1)
    shutil.copy(
        t,
        os.path.join(new_path, file_name)
    )

# Copy every plane sub-directory of `test_path` unchanged into the test tree.
for plane in os.listdir(test_path):
    now_path = os.path.join(test_path, plane)

    # Skip hidden entries and stray files; only directories are planes.
    if plane.startswith('.') or not os.path.isdir(now_path):
        continue
    print(plane)

    des_test_path = os.path.join(new_test_path, plane)
    os.makedirs(des_test_path, exist_ok=True)

    # `num` is a file name inside the plane directory (name kept from the
    # original loop).
    for num in tqdm(os.listdir(now_path)):
        shutil.copy(
            os.path.join(now_path, num),
            os.path.join(des_test_path, num)
        )


axial


100%|██████████| 120/120 [00:00<00:00, 319.53it/s]


coronal


100%|██████████| 120/120 [00:00<00:00, 310.47it/s]


sagittal


100%|██████████| 120/120 [00:00<00:00, 332.76it/s]
