In [52]:
import os
from PIL import Image
import pandas as pd
import tifffile
import numpy as np

In [13]:

def scan_tiff_stacks(root_dir):
    images_dir = os.path.join(root_dir, 'images')
    # print(images_dir)
    image_files = [f for f in os.listdir(images_dir) if f.endswith('.tiff') or f.endswith('.tif')]
    # print(image_files)
    data = []

    for image_name in image_files:
        image_path = os.path.join(images_dir, image_name)
        with tifffile.TiffFile(image_path) as tif:
            # Assuming the first page/series represents the dimensions of interest
            series = tif.series[0]  # This might need adjustment based on your specific files
            # print(series.shape)
            width = series.shape[0]
            height = series.shape[1]
            depth = series.shape[2]
            
        # contains_ct_or_pt = 'CT' in image_name or 'PT' in image_name
        if 'CT' in image_name:
            type = 'CT'
        elif 'PT' in image_name:
            type = 'PT'
        else:
            type = 'other'
            
        data.append({
            'image_name': image_name,
            'width': width,
            'height':height,
            'depth': depth,
            'image_type': type
        })

    df = pd.DataFrame(data)
    return df

In [14]:
# Replace 'root_directory_path' with the actual path to your root directory
root_directory_path = '../../datasets/ADDA/tiff/'
df_images = scan_tiff_stacks(root_directory_path)
# print(df_images)

In [19]:
df_images.head()

Unnamed: 0,image_name,width,height,depth,image_type
0,MDA-004__PT.tiff,500,500,100,PT
1,MDA-005__PT.tiff,500,500,100,PT
2,CHUP-030__PT.tiff,500,500,100,PT
3,MDA-190__CT.tiff,500,500,100,CT
4,MDA-191__CT.tiff,500,500,100,CT


In [30]:
irregular = df_images[df_images['width']!=500]
irregular.sort_values('image_name')

Unnamed: 0,image_name,width,height,depth,image_type
47,MDA-001__CT.tiff,498,498,100,CT
445,MDA-001__PT.tiff,498,498,100,PT
459,MDA-031__CT.tiff,350,350,100,CT
61,MDA-031__PT.tiff,350,350,100,PT
230,MDA-032__CT.tiff,498,498,100,CT
372,MDA-032__PT.tiff,498,498,100,PT
229,MDA-033__CT.tiff,498,498,100,CT
371,MDA-033__PT.tiff,498,498,100,PT
139,MDA-052__CT.tiff,698,698,100,CT
286,MDA-052__PT.tiff,698,698,100,PT


In [26]:
irregular[irregular['image_name'].str.contains('MDA')].shape

(28, 5)

In [27]:
df_images[df_images['image_name'].str.contains('MDA')].shape

(396, 5)

In [35]:
good_images = df_images[~df_images['image_name'].isin(irregular['image_name'])]
good_images.describe()

Unnamed: 0,width,height,depth
count,512.0,512.0,512.0
mean,500.0,500.0,100.0
std,0.0,0.0,0.0
min,500.0,500.0,100.0
25%,500.0,500.0,100.0
50%,500.0,500.0,100.0
75%,500.0,500.0,100.0
max,500.0,500.0,100.0


In [41]:
good_images['volume'] = good_images['image_name'].apply(lambda x: x.split('__')[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  good_images['volume'] = good_images['image_name'].apply(lambda x: x.split('__')[0])


In [42]:
good_images

Unnamed: 0,image_name,width,height,depth,image_type,volume
0,MDA-004__PT.tiff,500,500,100,PT,MDA-004
1,MDA-005__PT.tiff,500,500,100,PT,MDA-005
2,CHUP-030__PT.tiff,500,500,100,PT,CHUP-030
3,MDA-190__CT.tiff,500,500,100,CT,MDA-190
4,MDA-191__CT.tiff,500,500,100,CT,MDA-191
...,...,...,...,...,...,...
535,MDA-026__CT.tiff,500,500,100,CT,MDA-026
536,CHUP-012__CT.tiff,500,500,100,CT,CHUP-012
537,CHUP-013__CT.tiff,500,500,100,CT,CHUP-013
538,MDA-121__PT.tiff,500,500,100,PT,MDA-121


In [54]:
good_images['dataset'] = None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  good_images['dataset'] = None


In [55]:
good_images.loc[good_images['volume'].str.contains('MDA'), 'dataset'] = 'source'
good_images.loc[good_images['volume'].str.contains('CHUP'), 'dataset'] = 'target'

In [58]:
good_images.sort_values('volume').head()

Unnamed: 0,image_name,width,height,depth,image_type,volume,dataset
430,CHUP-000__PT.tiff,500,500,100,PT,CHUP-000,target
16,CHUP-000__CT.tiff,500,500,100,CT,CHUP-000,target
429,CHUP-001__PT.tiff,500,500,100,PT,CHUP-001,target
15,CHUP-001__CT.tiff,500,500,100,CT,CHUP-001,target
266,CHUP-002__PT.tiff,500,500,100,PT,CHUP-002,target


In [57]:
good_images['dataset'].value_counts()

dataset
source    368
target    144
Name: count, dtype: int64

In [59]:
good_images.to_csv('../dataset/good.csv', index=False)

### split file

In [99]:
img_splits = pd.read_csv('../dataset/mapped_images_non_stratified.csv')

In [100]:
img_splits[img_splits['subject_id']=='CHUP-073.tiff']

Unnamed: 0,subject_id,image_path,image_modality,set
385,CHUP-073.tiff,CHUP-073__PT.tiff,PT,train
511,CHUP-073.tiff,CHUP-073__CT.tiff,CT,test


In [67]:
img_splits

Unnamed: 0,subject_id,image_path,image_modality,set
0,MDA-020.tiff,MDA-020__PT.tiff,PT,train
1,MDA-054.tiff,MDA-054__CT.tiff,CT,train
2,MDA-201.tiff,MDA-201__CT.tiff,CT,train
3,MDA-184.tiff,MDA-184__CT.tiff,CT,train
4,MDA-065.tiff,MDA-065__PT.tiff,PT,train
...,...,...,...,...
507,CHUP-044.tiff,CHUP-044__CT.tiff,CT,test
508,CHUP-026.tiff,CHUP-026__CT.tiff,CT,test
509,CHUP-068.tiff,CHUP-068__CT.tiff,CT,test
510,CHUP-042.tiff,CHUP-042__CT.tiff,CT,test


In [68]:
img_splits['dataset'] = None

In [69]:
img_splits.loc[img_splits['subject_id'].str.contains('MDA'), 'dataset'] = 'source'
img_splits.loc[img_splits['subject_id'].str.contains('CHUP'), 'dataset'] = 'target'

In [70]:
img_splits

Unnamed: 0,subject_id,image_path,image_modality,set,dataset
0,MDA-020.tiff,MDA-020__PT.tiff,PT,train,source
1,MDA-054.tiff,MDA-054__CT.tiff,CT,train,source
2,MDA-201.tiff,MDA-201__CT.tiff,CT,train,source
3,MDA-184.tiff,MDA-184__CT.tiff,CT,train,source
4,MDA-065.tiff,MDA-065__PT.tiff,PT,train,source
...,...,...,...,...,...
507,CHUP-044.tiff,CHUP-044__CT.tiff,CT,test,target
508,CHUP-026.tiff,CHUP-026__CT.tiff,CT,test,target
509,CHUP-068.tiff,CHUP-068__CT.tiff,CT,test,target
510,CHUP-042.tiff,CHUP-042__CT.tiff,CT,test,target


In [74]:
img_splits[['dataset', 'set']].value_counts().sort_index()

dataset  set  
source   test      56
         train    256
         val       56
target   test      22
         train    100
         val       22
Name: count, dtype: int64

In [98]:
img_splits[img_splits['subject_id']=='CHUP-073.tiff']

Unnamed: 0,subject_id,image_path,image_modality,set,dataset,ct_path,pet_path,label_path
385,CHUP-073.tiff,CHUP-073__PT.tiff,PT,train,target,../datasets/ADDA/tiff/images/CHUP-073__CT.tiff,../datasets/ADDA/tiff/images/CHUP-073__PT.tiff,../datasets/ADDA/tiff/labels/CHUP-073.tiff
511,CHUP-073.tiff,CHUP-073__CT.tiff,CT,test,target,../datasets/ADDA/tiff/images/CHUP-073__CT.tiff,../datasets/ADDA/tiff/images/CHUP-073__PT.tiff,../datasets/ADDA/tiff/labels/CHUP-073.tiff


## Create training CSV

In [81]:
# add route path 
img_splits['ct_path'] = img_splits['subject_id'].apply(lambda x: '../datasets/ADDA/tiff/images/'+x.split('.')[0]+'__CT.tiff')
img_splits['pet_path'] = img_splits['subject_id'].apply(lambda x: '../datasets/ADDA/tiff/images/'+x.split('.')[0]+'__PT.tiff')
img_splits['label_path'] = img_splits['subject_id'].apply(lambda x: '../datasets/ADDA/tiff/labels/'+x)

In [84]:
img_splits.sort_values('subject_id')

Unnamed: 0,subject_id,image_path,image_modality,set,dataset,ct_path,pet_path,label_path
390,CHUP-000.tiff,CHUP-000__PT.tiff,PT,train,target,../datasets/ADDA/tiff/images/CHUP-000__CT.tiff,../datasets/ADDA/tiff/images/CHUP-000__PT.tiff,../datasets/ADDA/tiff/labels/CHUP-000.tiff
484,CHUP-000.tiff,CHUP-000__CT.tiff,CT,val,target,../datasets/ADDA/tiff/images/CHUP-000__CT.tiff,../datasets/ADDA/tiff/images/CHUP-000__PT.tiff,../datasets/ADDA/tiff/labels/CHUP-000.tiff
463,CHUP-001.tiff,CHUP-001__CT.tiff,CT,train,target,../datasets/ADDA/tiff/images/CHUP-001__CT.tiff,../datasets/ADDA/tiff/images/CHUP-001__PT.tiff,../datasets/ADDA/tiff/labels/CHUP-001.tiff
406,CHUP-001.tiff,CHUP-001__PT.tiff,PT,train,target,../datasets/ADDA/tiff/images/CHUP-001__CT.tiff,../datasets/ADDA/tiff/images/CHUP-001__PT.tiff,../datasets/ADDA/tiff/labels/CHUP-001.tiff
461,CHUP-002.tiff,CHUP-002__PT.tiff,PT,train,target,../datasets/ADDA/tiff/images/CHUP-002__CT.tiff,../datasets/ADDA/tiff/images/CHUP-002__PT.tiff,../datasets/ADDA/tiff/labels/CHUP-002.tiff
...,...,...,...,...,...,...,...,...
213,MDA-199.tiff,MDA-199__PT.tiff,PT,train,source,../datasets/ADDA/tiff/images/MDA-199__CT.tiff,../datasets/ADDA/tiff/images/MDA-199__PT.tiff,../datasets/ADDA/tiff/labels/MDA-199.tiff
80,MDA-200.tiff,MDA-200__PT.tiff,PT,train,source,../datasets/ADDA/tiff/images/MDA-200__CT.tiff,../datasets/ADDA/tiff/images/MDA-200__PT.tiff,../datasets/ADDA/tiff/labels/MDA-200.tiff
176,MDA-200.tiff,MDA-200__CT.tiff,CT,train,source,../datasets/ADDA/tiff/images/MDA-200__CT.tiff,../datasets/ADDA/tiff/images/MDA-200__PT.tiff,../datasets/ADDA/tiff/labels/MDA-200.tiff
288,MDA-201.tiff,MDA-201__PT.tiff,PT,val,source,../datasets/ADDA/tiff/images/MDA-201__CT.tiff,../datasets/ADDA/tiff/images/MDA-201__PT.tiff,../datasets/ADDA/tiff/labels/MDA-201.tiff


In [85]:
img_splits.columns

Index(['subject_id', 'image_path', 'image_modality', 'set', 'dataset',
       'ct_path', 'pet_path', 'label_path'],
      dtype='object')

In [89]:
img_splits[['subject_id', 
            'dataset',
            'set', 
            'ct_path', 
            'pet_path', 
            'label_path']].duplicated().value_counts()

False    388
True     124
Name: count, dtype: int64

In [91]:
training_csv = img_splits[['subject_id', 
            'dataset',
            'set', 
            'ct_path', 
            'pet_path', 
            'label_path']]

In [94]:
training_csv = training_csv[~training_csv['subject_id'].duplicated()]

In [95]:
training_csv.info()

<class 'pandas.core.frame.DataFrame'>
Index: 256 entries, 0 to 499
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   subject_id  256 non-null    object
 1   dataset     256 non-null    object
 2   set         256 non-null    object
 3   ct_path     256 non-null    object
 4   pet_path    256 non-null    object
 5   label_path  256 non-null    object
dtypes: object(6)
memory usage: 14.0+ KB


In [97]:
training_csv[['dataset', 'set']].value_counts().sort_index()

dataset  set  
source   test       4
         train    172
         val        8
target   test       2
         train     69
         val        1
Name: count, dtype: int64

In [101]:
training_csv[['dataset']].value_counts()

dataset
source     184
target      72
Name: count, dtype: int64

In [105]:
# split train test 
training_csv = training_csv[['subject_id', 'dataset', 'ct_path', 'pet_path', 'label_path']]

In [109]:
training_csv.to_csv('../dataset/training_csv.csv', index=False)

In [110]:
source_csv = training_csv[training_csv['dataset']=='source']
target_csv = training_csv[training_csv['dataset']=='target']

print(source_csv.shape)
print(target_csv.shape)

(184, 5)
(72, 5)


In [111]:
training_csv.head()

Unnamed: 0,subject_id,dataset,ct_path,pet_path,label_path
0,MDA-020.tiff,source,../datasets/ADDA/tiff/images/MDA-020__CT.tiff,../datasets/ADDA/tiff/images/MDA-020__PT.tiff,../datasets/ADDA/tiff/labels/MDA-020.tiff
1,MDA-054.tiff,source,../datasets/ADDA/tiff/images/MDA-054__CT.tiff,../datasets/ADDA/tiff/images/MDA-054__PT.tiff,../datasets/ADDA/tiff/labels/MDA-054.tiff
2,MDA-201.tiff,source,../datasets/ADDA/tiff/images/MDA-201__CT.tiff,../datasets/ADDA/tiff/images/MDA-201__PT.tiff,../datasets/ADDA/tiff/labels/MDA-201.tiff
3,MDA-184.tiff,source,../datasets/ADDA/tiff/images/MDA-184__CT.tiff,../datasets/ADDA/tiff/images/MDA-184__PT.tiff,../datasets/ADDA/tiff/labels/MDA-184.tiff
4,MDA-065.tiff,source,../datasets/ADDA/tiff/images/MDA-065__CT.tiff,../datasets/ADDA/tiff/images/MDA-065__PT.tiff,../datasets/ADDA/tiff/labels/MDA-065.tiff


In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split

# Assuming training_csv is your DataFrame containing the data

# Shuffle the DataFrame
training_csv_shuffled = training_csv.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the shuffled DataFrame into train, validation, and test sets
train_df, temp_df = train_test_split(training_csv_shuffled, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Add a column to indicate the split
train_df['split'] = 'train'
val_df['split'] = 'val'
test_df['split'] = 'test'

# Concatenate the split DataFrames back together
final_df = pd.concat([train_df, val_df, test_df], ignore_index=True)

# Print the distribution of splits
print(final_df['split'].value_counts())

# Now final_df contains the original data split into train, validation, and test sets with a column indicating the split
