In [None]:
import os
import pandas as pd 
import shutil
import matplotlib.pyplot as plt
import SimpleITK as sitk
import glob


# Get Dataset Information

In [None]:
!pwd

In [None]:
# find all the data thats available
root_folder = '../../datasets/hecktor2022/'
root_image_folder = os.path.join(root_folder, 'imagesTr')
root_labels_folder = os.path.join(root_folder, 'labelsTr')

In [None]:
print(f'images folder has {len(os.listdir(root_image_folder))} images')
print(f'images folder has {len(os.listdir(root_labels_folder))} labels')


In [None]:
def extract_institute(file_name):
    # Assuming the institute name is the first 4 characters of the file name
    return file_name[:4]

In [161]:
def generate_summary(folder_path, img_type):
    # Initialize empty lists to store file names and institutes
    file_names = []
    institutes = []
    image_spacings = []
    image_sizes = []


    # Traverse the folder and its subdirectories
    for i, file_name in enumerate(os.listdir(folder_path)):
        print(i, ' image ', file_name)
        if file_name.endswith(f'__{img_type}.nii.gz'): # or file_name.endswith('__PT.nii.gz'):
            # Extract the institute name from the file name
            institute_name = extract_institute(file_name)

            # Append file name and institute name to the lists
            file_names.append(file_name)
            institutes.append(institute_name)

            # get image information
            selected_image = sitk.ReadImage(os.path.join(root_image_folder, file_name))
            image_spacings.append(str(selected_image.GetSpacing()))
            image_sizes.append(str(selected_image.GetSize()))

    # Create a DataFrame to store the summary
    summary_df = pd.DataFrame({'File Name': file_names, 
                               'Institute': institutes, 
                               'spacing': image_spacings, 
                               'size': image_sizes})
    
    
    # add type as CT 
    summary_df['modality'] = [img_type for x in range(summary_df.shape[0])]
    
    
    return summary_df

In [95]:
ct_df = generate_summary(root_image_folder, 'CT')

0  image  MDA-111__CT.nii.gz
1  image  CHUM-040__PT.nii.gz
2  image  HGJ-048__CT.nii.gz
3  image  MDA-126__CT.nii.gz
4  image  CHUP-002__PT.nii.gz
5  image  MDA-005__CT.nii.gz
6  image  CHUV-043__PT.nii.gz
7  image  CHUS-048__CT.nii.gz
8  image  CHUP-035__PT.nii.gz
9  image  MDA-032__CT.nii.gz
10  image  CHUM-015__PT.nii.gz
11  image  MDA-173__CT.nii.gz
12  image  MDA-144__CT.nii.gz
13  image  MDA-085__PT.nii.gz
14  image  CHUM-022__PT.nii.gz
15  image  CHUV-016__PT.nii.gz
16  image  MDA-067__CT.nii.gz
17  image  CHUP-060__PT.nii.gz
18  image  MDA-191__PT.nii.gz
19  image  MDA-050__CT.nii.gz
20  image  CHUP-057__PT.nii.gz
21  image  CHUV-021__PT.nii.gz
22  image  CHUM-008__CT.nii.gz
23  image  MDA-098__CT.nii.gz
24  image  HGJ-037__PT.nii.gz
25  image  MDA-159__PT.nii.gz
26  image  HGJ-062__PT.nii.gz
27  image  HGJ-055__PT.nii.gz
28  image  HMR-028__CT.nii.gz
29  image  CHUS-080__CT.nii.gz
30  image  MDA-018__PT.nii.gz
31  image  HGJ-080__CT.nii.gz
32  image  CHUP-028__CT.nii.gz
33  im

264  image  MDA-188__PT.nii.gz
265  image  MDA-049__CT.nii.gz
266  image  CHUV-038__PT.nii.gz
267  image  CHUS-033__CT.nii.gz
268  image  CHUS-061__CT.nii.gz
269  image  CHUS-056__CT.nii.gz
270  image  HGJ-083__PT.nii.gz
271  image  MDA-138__CT.nii.gz
272  image  CHUS-083__PT.nii.gz
273  image  CHUV-008__PT.nii.gz
274  image  CHUS-003__CT.nii.gz
275  image  MDA-079__CT.nii.gz
276  image  CHUP-049__PT.nii.gz
277  image  HGJ-034__CT.nii.gz
278  image  MDA-064__PT.nii.gz
279  image  CHUP-063__CT.nii.gz
280  image  CHUV-015__CT.nii.gz
281  image  MDA-192__CT.nii.gz
282  image  CHUV-022__CT.nii.gz
283  image  MDA-053__PT.nii.gz
284  image  CHUP-054__CT.nii.gz
285  image  CHUM-016__CT.nii.gz
286  image  MDA-170__PT.nii.gz
287  image  MDA-147__PT.nii.gz
288  image  CHUM-021__CT.nii.gz
289  image  MDA-086__CT.nii.gz
290  image  HGJ-029__PT.nii.gz
291  image  CHUP-001__CT.nii.gz
292  image  MDA-006__PT.nii.gz
293  image  CHUP-036__CT.nii.gz
294  image  MDA-031__PT.nii.gz
295  image  CHUV-040__C

528  image  HMR-021__CT.nii.gz
529  image  CHUM-054__CT.nii.gz
530  image  CHUS-089__CT.nii.gz
531  image  MDA-132__PT.nii.gz
532  image  MDA-105__PT.nii.gz
533  image  CHUM-063__CT.nii.gz
534  image  HMR-016__CT.nii.gz
535  image  MDA-185__CT.nii.gz
536  image  CHUV-035__CT.nii.gz
537  image  CHUP-043__CT.nii.gz
538  image  MDA-044__PT.nii.gz
539  image  CHUP-074__CT.nii.gz
540  image  MDA-073__PT.nii.gz
541  image  CHUS-009__PT.nii.gz
542  image  CHUV-002__CT.nii.gz
543  image  MDA-150__PT.nii.gz
544  image  CHUM-036__CT.nii.gz
545  image  MDA-091__CT.nii.gz
546  image  CHUM-001__CT.nii.gz
547  image  MDA-167__PT.nii.gz
548  image  MDA-059__CT.nii.gz
549  image  CHUV-028__PT.nii.gz
550  image  MDA-198__PT.nii.gz
551  image  CHUP-069__PT.nii.gz
552  image  CHUS-100__CT.nii.gz
553  image  CHUS-041__CT.nii.gz
554  image  CHUS-076__CT.nii.gz
555  image  CHUM-049__PT.nii.gz
556  image  CHUS-094__PT.nii.gz
557  image  HGJ-076__CT.nii.gz
558  image  MDA-118__CT.nii.gz
559  image  MDA-119__P

790  image  HGJ-010__PT.nii.gz
791  image  CHUM-018__CT.nii.gz
792  image  CHUS-097__CT.nii.gz
793  image  MDA-038__PT.nii.gz
794  image  CHUS-042__PT.nii.gz
795  image  CHUV-049__CT.nii.gz
796  image  CHUP-008__CT.nii.gz
797  image  MDA-179__PT.nii.gz
798  image  HGJ-017__PT.nii.gz
799  image  CHUS-020__PT.nii.gz
800  image  MDA-153__CT.nii.gz
801  image  HMR-040__PT.nii.gz
802  image  MDA-092__PT.nii.gz
803  image  CHUM-035__PT.nii.gz
804  image  CHUM-002__PT.nii.gz
805  image  MDA-164__CT.nii.gz
806  image  MDA-186__PT.nii.gz
807  image  CHUP-040__PT.nii.gz
808  image  MDA-047__CT.nii.gz
809  image  CHUV-036__PT.nii.gz
810  image  CHUV-001__PT.nii.gz
811  image  MDA-070__CT.nii.gz
812  image  CHUM-057__PT.nii.gz
813  image  MDA-131__CT.nii.gz
814  image  MDA-106__CT.nii.gz
815  image  CHUM-060__PT.nii.gz
816  image  MDA-025__CT.nii.gz
817  image  CHUP-022__PT.nii.gz
818  image  MDA-012__CT.nii.gz
819  image  CHUP-015__PT.nii.gz
820  image  CHUS-068__CT.nii.gz
821  image  MDA-013__PT

In [96]:
ct_df

Unnamed: 0,File Name,Institute,spacing,size,modality
0,MDA-111__CT.nii.gz,MDA-,"(0.9765625, 0.9765625, 5.0)","(512, 512, 195)",CT
1,HGJ-048__CT.nii.gz,HGJ-,"(0.9765620231628418, 0.9765620231628418, 3.269...","(512, 512, 91)",CT
2,MDA-126__CT.nii.gz,MDA-,"(0.9765620231628418, 0.9765620231628418, 3.269...","(512, 512, 127)",CT
3,MDA-005__CT.nii.gz,MDA-,"(0.9765620231628418, 0.9765620231628418, 3.269...","(512, 512, 267)",CT
4,CHUS-048__CT.nii.gz,CHUS,"(1.171875, 1.171875, 3.0)","(512, 512, 134)",CT
...,...,...,...,...,...
519,CHUV-032__CT.nii.gz,CHUV,"(1.3671879768371582, 1.3671879768371582, 3.269...","(512, 512, 335)",CT
520,CHUP-044__CT.nii.gz,CHUP,"(0.9765625, 0.9765625, 1.5)","(512, 512, 652)",CT
521,MDA-182__CT.nii.gz,MDA-,"(0.9765620231628418, 0.9765620231628418, 3.269...","(512, 512, 91)",CT
522,CHUP-073__CT.nii.gz,CHUP,"(0.9765625, 0.9765625, 1.5)","(512, 512, 568)",CT


In [101]:
ct_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 524 entries, 0 to 523
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   File Name  524 non-null    object
 1   Institute  524 non-null    object
 2   spacing    524 non-null    object
 3   size       524 non-null    object
 4   modality   524 non-null    object
dtypes: object(5)
memory usage: 20.6+ KB


In [162]:
pt_df = generate_summary(root_image_folder, 'PT')
pt_df.head()

0  image  MDA-111__CT.nii.gz
1  image  CHUM-040__PT.nii.gz
2  image  HGJ-048__CT.nii.gz
3  image  MDA-126__CT.nii.gz
4  image  CHUP-002__PT.nii.gz
5  image  MDA-005__CT.nii.gz
6  image  CHUV-043__PT.nii.gz
7  image  CHUS-048__CT.nii.gz
8  image  CHUP-035__PT.nii.gz
9  image  MDA-032__CT.nii.gz
10  image  CHUM-015__PT.nii.gz
11  image  MDA-173__CT.nii.gz
12  image  MDA-144__CT.nii.gz
13  image  MDA-085__PT.nii.gz
14  image  CHUM-022__PT.nii.gz
15  image  CHUV-016__PT.nii.gz
16  image  MDA-067__CT.nii.gz
17  image  CHUP-060__PT.nii.gz
18  image  MDA-191__PT.nii.gz
19  image  MDA-050__CT.nii.gz
20  image  CHUP-057__PT.nii.gz
21  image  CHUV-021__PT.nii.gz
22  image  CHUM-008__CT.nii.gz
23  image  MDA-098__CT.nii.gz
24  image  HGJ-037__PT.nii.gz
25  image  MDA-159__PT.nii.gz
26  image  HGJ-062__PT.nii.gz
27  image  HGJ-055__PT.nii.gz
28  image  HMR-028__CT.nii.gz
29  image  CHUS-080__CT.nii.gz
30  image  MDA-018__PT.nii.gz
31  image  HGJ-080__CT.nii.gz
32  image  CHUP-028__CT.nii.gz
33  im

267  image  CHUS-033__CT.nii.gz
268  image  CHUS-061__CT.nii.gz
269  image  CHUS-056__CT.nii.gz
270  image  HGJ-083__PT.nii.gz
271  image  MDA-138__CT.nii.gz
272  image  CHUS-083__PT.nii.gz
273  image  CHUV-008__PT.nii.gz
274  image  CHUS-003__CT.nii.gz
275  image  MDA-079__CT.nii.gz
276  image  CHUP-049__PT.nii.gz
277  image  HGJ-034__CT.nii.gz
278  image  MDA-064__PT.nii.gz
279  image  CHUP-063__CT.nii.gz
280  image  CHUV-015__CT.nii.gz
281  image  MDA-192__CT.nii.gz
282  image  CHUV-022__CT.nii.gz
283  image  MDA-053__PT.nii.gz
284  image  CHUP-054__CT.nii.gz
285  image  CHUM-016__CT.nii.gz
286  image  MDA-170__PT.nii.gz
287  image  MDA-147__PT.nii.gz
288  image  CHUM-021__CT.nii.gz
289  image  MDA-086__CT.nii.gz
290  image  HGJ-029__PT.nii.gz
291  image  CHUP-001__CT.nii.gz
292  image  MDA-006__PT.nii.gz
293  image  CHUP-036__CT.nii.gz
294  image  MDA-031__PT.nii.gz
295  image  CHUV-040__CT.nii.gz
296  image  MDA-112__PT.nii.gz
297  image  HMR-001__CT.nii.gz
298  image  CHUM-043__C

542  image  CHUV-002__CT.nii.gz
543  image  MDA-150__PT.nii.gz
544  image  CHUM-036__CT.nii.gz
545  image  MDA-091__CT.nii.gz
546  image  CHUM-001__CT.nii.gz
547  image  MDA-167__PT.nii.gz
548  image  MDA-059__CT.nii.gz
549  image  CHUV-028__PT.nii.gz
550  image  MDA-198__PT.nii.gz
551  image  CHUP-069__PT.nii.gz
552  image  CHUS-100__CT.nii.gz
553  image  CHUS-041__CT.nii.gz
554  image  CHUS-076__CT.nii.gz
555  image  CHUM-049__PT.nii.gz
556  image  CHUS-094__PT.nii.gz
557  image  HGJ-076__CT.nii.gz
558  image  MDA-118__CT.nii.gz
559  image  MDA-119__PT.nii.gz
560  image  HGJ-077__PT.nii.gz
561  image  CHUM-048__CT.nii.gz
562  image  CHUS-095__CT.nii.gz
563  image  CHUS-077__PT.nii.gz
564  image  CHUS-040__PT.nii.gz
565  image  HGJ-015__PT.nii.gz
566  image  CHUS-101__PT.nii.gz
567  image  CHUS-015__PT.nii.gz
568  image  CHUP-068__CT.nii.gz
569  image  MDA-199__CT.nii.gz
570  image  MDA-058__PT.nii.gz
571  image  CHUS-022__PT.nii.gz
572  image  CHUV-029__CT.nii.gz
573  image  MDA-166_

808  image  MDA-047__CT.nii.gz
809  image  CHUV-036__PT.nii.gz
810  image  CHUV-001__PT.nii.gz
811  image  MDA-070__CT.nii.gz
812  image  CHUM-057__PT.nii.gz
813  image  MDA-131__CT.nii.gz
814  image  MDA-106__CT.nii.gz
815  image  CHUM-060__PT.nii.gz
816  image  MDA-025__CT.nii.gz
817  image  CHUP-022__PT.nii.gz
818  image  MDA-012__CT.nii.gz
819  image  CHUP-015__PT.nii.gz
820  image  CHUS-068__CT.nii.gz
821  image  MDA-013__PT.nii.gz
822  image  CHUS-069__PT.nii.gz
823  image  MDA-024__PT.nii.gz
824  image  CHUP-023__CT.nii.gz
825  image  HMR-014__CT.nii.gz
826  image  HGJ-069__PT.nii.gz
827  image  CHUM-061__CT.nii.gz
828  image  MDA-107__PT.nii.gz
829  image  MDA-130__PT.nii.gz
830  image  CHUM-056__CT.nii.gz
831  image  HMR-023__CT.nii.gz
832  image  MDA-071__PT.nii.gz
833  image  CHUP-041__CT.nii.gz
834  image  MDA-046__PT.nii.gz
835  image  CHUV-037__CT.nii.gz
836  image  MDA-187__CT.nii.gz
837  image  MDA-165__PT.nii.gz
838  image  MDA-093__CT.nii.gz
839  image  CHUM-034__CT.n

Unnamed: 0,File Name,Institute,spacing,size,modality
0,CHUM-040__PT.nii.gz,CHUM,"(5.46875, 5.46875, 3.269923686981201)","(128, 128, 263)",PT
1,CHUP-002__PT.nii.gz,CHUP,"(4.0728302001953125, 4.0728302001953125, 2.026...","(200, 200, 481)",PT
2,CHUV-043__PT.nii.gz,CHUV,"(2.734375, 2.734375, 3.2699999809265137)","(256, 256, 335)",PT
3,CHUP-035__PT.nii.gz,CHUP,"(4.0728302001953125, 4.0728302001953125, 2.026...","(200, 200, 481)",PT
4,CHUM-015__PT.nii.gz,CHUM,"(4.0, 4.0, 4.0)","(144, 144, 87)",PT


**concatenate both data frames**

In [163]:
images_df = pd.concat([ct_df, pt_df])

In [164]:
images_df.groupby(by=['Institute', 'modality']).count()


Unnamed: 0_level_0,Unnamed: 1_level_0,File Name,spacing,size
Institute,modality,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CHUM,CT,56,56,56
CHUM,PT,56,56,56
CHUP,CT,72,72,72
CHUP,PT,72,72,72
CHUS,CT,72,72,72
CHUS,PT,72,72,72
CHUV,CT,53,53,53
CHUV,PT,53,53,53
HGJ-,CT,55,55,55
HGJ-,PT,55,55,55


**Add Some Columns**

In [165]:
# add spacing x,y,z coordinates
images_df['spacing_x'] = images_df['spacing'].apply(lambda x: x[1:-1].split(',')[0]).astype(float)
images_df['spacing_y'] = images_df['spacing'].apply(lambda x: x[1:-1].split(',')[1]).astype(float)
images_df['spacing_z'] = images_df['spacing'].apply(lambda x: x[1:-1].split(',')[2]).astype(float)

In [166]:
# fix institute
def fix_institute(institute):
    if institute.endswith('-'):
        return institute.split('-')[0]
    return institute

In [167]:
images_df['Institute'] = images_df['Institute'].apply(lambda x: fix_institute(x))

In [168]:
images_df.tail()

Unnamed: 0,File Name,Institute,spacing,size,modality,spacing_x,spacing_y,spacing_z
519,MDA-157__PT.nii.gz,MDA,"(5.46875, 5.46875, 3.2701196670532227)","(128, 128, 335)",PT,5.46875,5.46875,3.27012
520,MDA-160__PT.nii.gz,MDA,"(5.46875, 5.46875, 3.2699999809265137)","(128, 128, 127)",PT,5.46875,5.46875,3.27
521,CHUS-039__PT.nii.gz,CHUS,"(4.0, 4.0, 4.0)","(144, 144, 255)",PT,4.0,4.0,4.0
522,MDA-043__PT.nii.gz,MDA,"(5.46875, 5.46875, 3.2699999809265137)","(128, 128, 127)",PT,5.46875,5.46875,3.27
523,MDA-074__PT.nii.gz,MDA,"(4.296875, 4.296875, 4.25)","(128, 128, 259)",PT,4.296875,4.296875,4.25


In [169]:
spacing_info = images_df.groupby(by=['Institute', 'modality'])[['spacing_x', 'spacing_y', 'spacing_z']].mean()


In [170]:
image_count_info = images_df.groupby(by=['Institute', 'modality'])[['File Name']].count()


In [171]:
information = pd.concat([spacing_info, image_count_info], axis=1)
information

Unnamed: 0_level_0,Unnamed: 1_level_0,spacing_x,spacing_y,spacing_z,File Name
Institute,modality,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CHUM,CT,0.987026,0.987026,1.974107,56
CHUM,PT,4.009766,4.009766,3.805447,56
CHUP,CT,0.976562,0.976562,1.506944,72
CHUP,PT,4.07283,4.07283,2.027,72
CHUS,CT,1.165093,1.165093,2.819444,72
CHUS,PT,4.0,4.0,4.0,72
CHUV,CT,1.249263,1.249263,3.033962,53
CHUV,PT,2.844929,2.844929,3.362453,53
HGJ,CT,0.976562,0.976562,3.27,55
HGJ,PT,3.536932,3.536932,3.270225,55


# Select Source and Target Datasets

In [175]:
source = images_df[images_df['Institute']=='MDA']

In [176]:
target1 =  images_df[images_df['Institute']=='CHUP']
target2 = images_df[images_df['Institute']=='CHUS']

In [177]:
source.head()

Unnamed: 0,File Name,Institute,spacing,size,modality,spacing_x,spacing_y,spacing_z
0,MDA-111__CT.nii.gz,MDA,"(0.9765625, 0.9765625, 5.0)","(512, 512, 195)",CT,0.976562,0.976562,5.0
2,MDA-126__CT.nii.gz,MDA,"(0.9765620231628418, 0.9765620231628418, 3.269...","(512, 512, 127)",CT,0.976562,0.976562,3.27
3,MDA-005__CT.nii.gz,MDA,"(0.9765620231628418, 0.9765620231628418, 3.269...","(512, 512, 267)",CT,0.976562,0.976562,3.27
5,MDA-032__CT.nii.gz,MDA,"(0.9765620231628418, 0.9765620231628418, 3.269...","(512, 512, 267)",CT,0.976562,0.976562,3.27
6,MDA-173__CT.nii.gz,MDA,"(0.9765620231628418, 0.9765620231628418, 3.269...","(512, 512, 91)",CT,0.976562,0.976562,3.27


In [196]:
source['label_name'] = source['File Name'].apply(lambda x: x.replace('__CT', '').replace('__PT', ''))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  source['label_name'] = source['File Name'].apply(lambda x: x.replace('__CT', '').replace('__PT', ''))


In [197]:
source.head()

Unnamed: 0,File Name,Institute,spacing,size,modality,spacing_x,spacing_y,spacing_z,label_name
0,MDA-111__CT.nii.gz,MDA,"(0.9765625, 0.9765625, 5.0)","(512, 512, 195)",CT,0.976562,0.976562,5.0,MDA-111.nii.gz
2,MDA-126__CT.nii.gz,MDA,"(0.9765620231628418, 0.9765620231628418, 3.269...","(512, 512, 127)",CT,0.976562,0.976562,3.27,MDA-126.nii.gz
3,MDA-005__CT.nii.gz,MDA,"(0.9765620231628418, 0.9765620231628418, 3.269...","(512, 512, 267)",CT,0.976562,0.976562,3.27,MDA-005.nii.gz
5,MDA-032__CT.nii.gz,MDA,"(0.9765620231628418, 0.9765620231628418, 3.269...","(512, 512, 267)",CT,0.976562,0.976562,3.27,MDA-032.nii.gz
6,MDA-173__CT.nii.gz,MDA,"(0.9765620231628418, 0.9765620231628418, 3.269...","(512, 512, 91)",CT,0.976562,0.976562,3.27,MDA-173.nii.gz


In [198]:
source['label_name'].nunique()

198

In [199]:
source['File Name'].nunique()

396

In [215]:
target1['label_name'] = target1['File Name'].apply(lambda x: x.replace('__CT', '').replace('__PT', ''))
target2['label_name'] = target2['File Name'].apply(lambda x: x.replace('__CT', '').replace('__PT', ''))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target1['label_name'] = target1['File Name'].apply(lambda x: x.replace('__CT', '').replace('__PT', ''))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target2['label_name'] = target2['File Name'].apply(lambda x: x.replace('__CT', '').replace('__PT', ''))


### create CSV list of source and targets

In [200]:
# choose where to save the images 
original_folder = '../../datasets/ADDA/original'

In [203]:
source[['File Name', 'label_name']].to_csv(os.path.join(original_folder, 'source.csv'),
                           header=True, index=False)

In [216]:
target1[['File Name', 'label_name']].to_csv(os.path.join(original_folder, 'target1.csv'),
                           header=True, index=False)

In [217]:
target2[['File Name', 'label_name']].to_csv(os.path.join(original_folder, 'target2.csv'),
                           header=True, index=False)

In [204]:
source_list = pd.read_csv(os.path.join(original_folder, 'source.csv'))
source_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 396 entries, 0 to 395
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   File Name   396 non-null    object
 1   label_name  396 non-null    object
dtypes: object(2)
memory usage: 6.3+ KB


In [218]:
target1_list = pd.read_csv(os.path.join(original_folder, 'target1.csv'))
target1_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   File Name   144 non-null    object
 1   label_name  144 non-null    object
dtypes: object(2)
memory usage: 2.4+ KB


In [219]:
target2_list = pd.read_csv(os.path.join(original_folder, 'target2.csv'))
target2_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   File Name   144 non-null    object
 1   label_name  144 non-null    object
dtypes: object(2)
memory usage: 2.4+ KB


In [232]:
def copy_images(df, src_folder, trgt_folder, what):
    if what == 'images':
        col = 'File Name'
    elif what == 'labels':
        col = 'label_name'
    else:
        return 


    for img in df[col]:
        img_src = os.path.join(src_folder,img)
        img_trgt = os.path.join(trgt_folder+f'/{what}',img)

        shutil.copy(img_src, img_trgt)
    
    if what == 'labels':
        print(f'copied {df[col].nunique()} files')
    
    else:
        print(f'copied {df[col].shape[0]} files')

In [230]:
copy_images(source_list, root_image_folder, original_folder, 'images')

copied 396 files


In [233]:
copy_images(source_list, root_labels_folder, original_folder, 'labels')

copied 198 files


In [234]:
copy_images(target1_list, root_image_folder, original_folder, 'images')

copied 144 files


In [235]:
copy_images(target1_list, root_labels_folder, original_folder, 'labels')

copied 72 files


In [236]:
copy_images(target2_list, root_image_folder, original_folder, 'images')

copied 144 files


In [237]:
copy_images(target2_list, root_labels_folder, original_folder, 'labels')

copied 72 files


# Test and Train Split

In [264]:
src_lbl_names = source['label_name']

In [265]:
src_ct_names = source['label_name'].apply(lambda x: x.replace('.nii.gz', '__CT.nii.gz'))

In [266]:
src_pt_names = source['label_name'].apply(lambda x: x.replace('.nii.gz', '__PT.nii.gz'))

In [267]:
df = pd.DataFrame({'ct_image':src_ct_names,
                  'pt_image':src_pt_names,
                  'label':src_lbl_names})
df.head()

Unnamed: 0,ct_image,pt_image,label
0,MDA-111__CT.nii.gz,MDA-111__PT.nii.gz,MDA-111.nii.gz
2,MDA-126__CT.nii.gz,MDA-126__PT.nii.gz,MDA-126.nii.gz
3,MDA-005__CT.nii.gz,MDA-005__PT.nii.gz,MDA-005.nii.gz
5,MDA-032__CT.nii.gz,MDA-032__PT.nii.gz,MDA-032.nii.gz
6,MDA-173__CT.nii.gz,MDA-173__PT.nii.gz,MDA-173.nii.gz


In [268]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 396 entries, 0 to 523
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ct_image  396 non-null    object
 1   pt_image  396 non-null    object
 2   label     396 non-null    object
dtypes: object(3)
memory usage: 12.4+ KB


In [278]:
df.drop_duplicates(inplace=True)

In [279]:
from sklearn.model_selection import train_test_split

In [281]:
# First, split into training (60%) and temporary (40%)
source_train, temp_df = train_test_split(df, test_size=0.3, random_state=42)

# Then, split the temporary set into validation (20%) and testing (20%)
source_valid, source_test = train_test_split(temp_df, test_size=0.5, random_state=42)


print('got: ')
print(f'{source_train.shape[0]} training images ')
print(f'{source_valid.shape[0]} validation images ')
print(f'{source_test.shape[0]} testing images ')

got: 
138 training images 
30 validation images 
30 testing images 


In [282]:
# save dataframe info
source_train.to_csv(os.path.join(original_folder, 'source_train.csv'), index=False)
source_valid.to_csv(os.path.join(original_folder, 'source_valid.csv'), index=False)
source_test.to_csv(os.path.join(original_folder, 'source_test.csv'), index=False)



### Target 1

In [293]:
trgt1_lbl_names = target1['label_name']

In [294]:
trgt1_ct_names = target1['label_name'].apply(lambda x: x.replace('.nii.gz', '__CT.nii.gz'))

In [295]:
trgt1_pt_names = target1['label_name'].apply(lambda x: x.replace('.nii.gz', '__PT.nii.gz'))

In [296]:
df = pd.DataFrame({'ct_image':trgt1_ct_names,
                  'pt_image':trgt1_pt_names,
                  'label':trgt1_lbl_names})
df.head()

Unnamed: 0,ct_image,pt_image,label
15,CHUP-028__CT.nii.gz,CHUP-028__PT.nii.gz,CHUP-028.nii.gz
20,CHUP-056__CT.nii.gz,CHUP-056__PT.nii.gz,CHUP-056.nii.gz
24,CHUP-061__CT.nii.gz,CHUP-061__PT.nii.gz,CHUP-061.nii.gz
29,CHUP-034__CT.nii.gz,CHUP-034__PT.nii.gz,CHUP-034.nii.gz
30,CHUP-003__CT.nii.gz,CHUP-003__PT.nii.gz,CHUP-003.nii.gz


In [297]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 144 entries, 15 to 512
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ct_image  144 non-null    object
 1   pt_image  144 non-null    object
 2   label     144 non-null    object
dtypes: object(3)
memory usage: 4.5+ KB


In [298]:
df.drop_duplicates(inplace=True)

In [299]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 72 entries, 15 to 522
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ct_image  72 non-null     object
 1   pt_image  72 non-null     object
 2   label     72 non-null     object
dtypes: object(3)
memory usage: 2.2+ KB


In [300]:
# First, split into training (60%) and temporary (40%)
trgt1_train, temp_df = train_test_split(df, test_size=0.3, random_state=42)

# Then, split the temporary set into validation (20%) and testing (20%)
trgt1_valid, trgt1_test = train_test_split(temp_df, test_size=0.5, random_state=42)


print('got: ')
print(f'{trgt1_train.shape[0]} training images ')
print(f'{trgt1_valid.shape[0]} validation images ')
print(f'{trgt1_test.shape[0]} testing images ')

got: 
50 training images 
11 validation images 
11 testing images 


In [301]:
# save dataframe info
trgt1_train.to_csv(os.path.join(original_folder, 'target1(CHUP)_train.csv'), index=False)
trgt1_valid.to_csv(os.path.join(original_folder, 'target1(CHUP)_valid.csv'), index=False)
trgt1_test.to_csv(os.path.join(original_folder, 'target1(CHUP)_test.csv'), index=False)



### Target 2

In [302]:
trgt2_lbl_names = target2['label_name']

In [303]:
trgt2_ct_names = target2['label_name'].apply(lambda x: x.replace('.nii.gz', '__CT.nii.gz'))

In [304]:
trgt2_pt_names = target2['label_name'].apply(lambda x: x.replace('.nii.gz', '__PT.nii.gz'))

In [305]:
df = pd.DataFrame({'ct_image':trgt2_ct_names,
                  'pt_image':trgt2_pt_names,
                  'label':trgt2_lbl_names})
df.head()

Unnamed: 0,ct_image,pt_image,label
4,CHUS-048__CT.nii.gz,CHUS-048__PT.nii.gz,CHUS-048.nii.gz
13,CHUS-080__CT.nii.gz,CHUS-080__PT.nii.gz,CHUS-080.nii.gz
17,CHUS-036__CT.nii.gz,CHUS-036__PT.nii.gz,CHUS-036.nii.gz
48,CHUS-010__CT.nii.gz,CHUS-010__PT.nii.gz,CHUS-010.nii.gz
49,CHUS-027__CT.nii.gz,CHUS-027__PT.nii.gz,CHUS-027.nii.gz


In [306]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 144 entries, 4 to 521
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ct_image  144 non-null    object
 1   pt_image  144 non-null    object
 2   label     144 non-null    object
dtypes: object(3)
memory usage: 4.5+ KB


In [307]:
df.drop_duplicates(inplace=True)

In [308]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 72 entries, 4 to 510
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ct_image  72 non-null     object
 1   pt_image  72 non-null     object
 2   label     72 non-null     object
dtypes: object(3)
memory usage: 2.2+ KB


In [309]:
# First, split into training (60%) and temporary (40%)
trgt2_train, temp_df = train_test_split(df, test_size=0.3, random_state=42)

# Then, split the temporary set into validation (20%) and testing (20%)
trgt2_valid, trgt2_test = train_test_split(temp_df, test_size=0.5, random_state=42)


print('got: ')
print(f'{trgt2_train.shape[0]} training images ')
print(f'{trgt2_valid.shape[0]} validation images ')
print(f'{trgt2_test.shape[0]} testing images ')

got: 
50 training images 
11 validation images 
11 testing images 


In [310]:
# save dataframe info
trgt2_train.to_csv(os.path.join(original_folder, 'target2(CHUS)_train.csv'), index=False)
trgt2_valid.to_csv(os.path.join(original_folder, 'target2(CHUS)_valid.csv'), index=False)
trgt2_test.to_csv(os.path.join(original_folder, 'target2(CHUS)_test.csv'), index=False)

