## Setup

### Imports

In [1]:
# imports
import os
from os.path import isdir, join
import shutil
import re
import glob
import pandas as pd
from tqdm import tqdm
from pypdf import PdfReader
from src.utils import *

### Settings

In [2]:
# Show up to 100 characters per cell when DataFrames are displayed
pd.options.display.max_colwidth = 100

### Restructure TCGA data folders by case

In [5]:
%%script false --no-raise-error
# Reorganise the download tree so every file whose name contains a case id lives in
# <base_dir>/<case id>/, park the leftover download folders under <base_dir>/misc,
# and finally report how many case folders exist.


def _is_empty_dir(path):
    """Return True if the directory at `path` contains no entries."""
    # scandir is used as a context manager so the directory handle is closed
    # immediately instead of lingering until garbage collection.
    with os.scandir(path) as entries:
        return next(entries, None) is None


# define the base directory
base_dir = '/mnt/disks/ext/data/gdc/tcga/brca/'
# Define the pattern for the case id ('TCGA-', two word chars, '-', four word chars)
pattern = r"TCGA-\w{2}-\w{4}"

# --- Step 1: move every file into the folder of the case id found in its name ---
for dirpath, dirnames, filenames in os.walk(base_dir):
    for filename in filenames:
        # Find the case id in the filename; files without one are left where they are
        match = re.search(pattern, filename)
        if match:
            case_id = match.group()
            # Create a new directory for this case id, if it doesn't exist
            new_dir = os.path.join(base_dir, case_id)
            os.makedirs(new_dir, exist_ok=True)
            # Move the file to the new directory
            shutil.move(os.path.join(dirpath, filename), os.path.join(new_dir, filename))

# --- Step 2: move other folders, except for the case folders, to the misc folder ---
# Same case-id pattern as above (single definition), plus the pattern for folder
# names that start with 8 lowercase-alphanumeric characters followed by a hyphen
case_pattern = pattern
misc_pattern = r"^[a-z0-9]{8}-"

# Create the 'misc' directory if it doesn't exist
misc_dir = os.path.join(base_dir, 'misc')
os.makedirs(misc_dir, exist_ok=True)

# Create a list to store directories to be moved
dirs_to_move = []

# Snapshot all directories below base_dir (skipping anything under 'misc') before
# the tree is modified
all_dirs = [x[0] for x in os.walk(base_dir) if not x[0].startswith(misc_dir)]

for dirpath in all_dirs:
    # If the directory is empty, remove it (base_dir itself is never removed)
    if dirpath != base_dir and _is_empty_dir(dirpath):
        os.rmdir(dirpath)
    else:
        # Non-empty folder whose name matches the misc pattern and is not a case
        # folder: remember it so it can be moved afterwards
        dirname = os.path.basename(dirpath)
        if re.match(misc_pattern, dirname) and not re.match(case_pattern, dirname):
            dirs_to_move.append(dirpath)

# Move the directories in the list to the 'misc' directory
for dirpath in dirs_to_move:
    # A nested folder may already have been moved together with its parent
    if os.path.exists(dirpath):
        dest_dir = os.path.join(misc_dir, os.path.basename(dirpath))
        # An existing folder of the same name in 'misc' is replaced
        if os.path.exists(dest_dir):
            shutil.rmtree(dest_dir)
        shutil.move(dirpath, dest_dir)

# --- Step 3: count the case directories below base_dir ---
case_count = 0
for dirpath, dirnames, _ in os.walk(base_dir):
    # If the directory name matches the case id pattern, increment the counter
    dirname = os.path.basename(dirpath)
    if re.match(case_pattern, dirname):
        case_count += 1

print(f"total number of case directories: {case_count}")

### Convert PDF to text


In [6]:
%%script false --no-raise-error
# Run the PDF text extraction helper on the TCGA-BRCA data directory.
# NOTE(review): extract_text_from_pdf is not defined in this notebook — presumably it
# comes from the `from src.utils import *` star import; confirm there what it reads
# and where it writes its output.
extract_text_from_pdf('/mnt/disks/ext/data/gdc/tcga/brca/')

### Create manifest file for feature extraction

In [23]:
# %%script false --no-raise-error
# Build the list of annotated whole-slide images (WSIs) for feature extraction:
# one '<folder name>/<file>.svs' entry per folder that contains an annotated slide.

# set data dir (defined once, without a trailing slash, and reused for every step)
data_dir = '/mnt/disks/ext/data/gdc/tcga/brca'
wsi_ext = '.svs'

# list all case folders in data dir (i.e. all folders that start with 'TCGA-')
case_dirs = [f for f in os.listdir(data_dir) if isdir(join(data_dir, f)) and f.startswith('TCGA-')]

# get list of all .svs files in each case folder
all_wsis = []
for case_dir in case_dirs:
    all_wsis.extend(f for f in os.listdir(join(data_dir, case_dir)) if f.endswith(wsi_ext))
print(f'total # of wsis: {len(all_wsis)}')
# strip the '.svs' suffix (only the suffix, not other occurrences) to get slide ids
all_wsis = [f[:-len(wsi_ext)] for f in all_wsis]

# slide ids that have a sTIL annotation
stils_tcga_elg_path = 'data/stils/stils_tcga_ellogon.tsv'
wsis_stils_elg = pd.read_csv(stils_tcga_elg_path, sep='\t')['wsi_id'].tolist()
print(f'# annotations: {len(wsis_stils_elg)}')

# keep the annotated slide ids that are present in the data dir
# (set lookup instead of scanning the whole slide list for every annotation)
wsis_on_disk = set(all_wsis)
annotated_wsis = [f for f in wsis_stils_elg if f in wsis_on_disk]
print(f'# annotated wsis: {len(annotated_wsis)}')

# reference_file = '/home/neil/multimodal/data/stils_tcga_brca_annotated.txt'
wsi_stils_feats_manifest_path = 'data/stils/wsi_stils_feats_manifest.txt'

# Manifest entries and the folder (case id) each entry came from
wsi_stils_annot_paths = []
case_ids_annot = []
annotated_lookup = set(annotated_wsis)

# Loop through all folders below data_dir
for root, dirs, files in os.walk(data_dir):
    for wsi_file in files:
        # Only .svs files whose slide id is annotated qualify
        if wsi_file.endswith(wsi_ext) and wsi_file[:-len(wsi_ext)] in annotated_lookup:
            wsi_stils_annot_paths.append(os.path.join(os.path.basename(root), wsi_file))
            case_ids_annot.append(os.path.basename(root))
            # NOTE(review): the break keeps only the first annotated slide per folder,
            # which is why the manifest can be shorter than the annotated slide list —
            # confirm that one slide per case is intended.
            break
print(f'# annotated cases: {len(set(case_ids_annot))}')

# Save the list of files to the manifest path (the write itself is currently disabled)
print(f'# annotated wsis in manifest: {len(wsi_stils_annot_paths)}')
# with open(wsi_stils_feats_manifest_path, 'w') as f:
#     for item in wsi_stils_annot_paths:
#         f.write("%s\n" % item)

total # of wsis: 3110
# annotations: 700
# annotated wsis: 700
# annotated cases: 688
# annotated wsis in manifest: 688


### Copy feature embeddings to/from case folders

In [33]:
%%script false --no-raise-error
# Distribute the WSI and report feature embeddings into the per-case folders
# Source folders holding the feature embedding files
wsi_feats_dir = "data/wsi_feats"
report_feats_dir = "data/report_feats"

# Root folder that contains the case folders
dst_dir = "/mnt/disks/ext/data/gdc/tcga/brca"

# Same procedure for both sources: WSI features first, then report features
for feats_dir in (wsi_feats_dir, report_feats_dir):
    for src_file in glob.glob(os.path.join(feats_dir, "TCGA-*.pt")):
        # File name without its directory part
        base_name = os.path.basename(src_file)

        # Target: <dst_dir>/<first 12 characters of the file name>/<file name>
        dst_file = os.path.join(dst_dir, base_name[:12], base_name)

        # Files that are already in place are left untouched
        if os.path.exists(dst_file):
            continue
        shutil.copy(src_file, dst_file)

In [24]:
%%script false --no-raise-error
# Gather the WSI and report feature embeddings from the case folders into
# one flat folder per kind of feature
# Root folder that contains the case folders
src_dir = "/mnt/disks/ext/data/gdc/tcga/brca"

# Folders the feature embeddings are collected in
wsi_feats_dir = "data/wsi_feats"
report_feats_dir = "data/report_feats"

# (file glob, destination folder) pairs, handled in this order for every case folder
feat_targets = (
    ("*.wsi.pt", wsi_feats_dir),
    ("*.report.pt", report_feats_dir),
)

# Make sure both destination folders exist
for _, feats_dir in feat_targets:
    os.makedirs(feats_dir, exist_ok=True)

# Visit every case folder and copy its feature files (existing copies are overwritten)
for case_folder in glob.glob(os.path.join(src_dir, "TCGA-*")):
    for file_glob, feats_dir in feat_targets:
        for src_file in glob.glob(os.path.join(case_folder, file_glob)):
            # Keep the file name, change only the folder
            base_name = os.path.basename(src_file)
            dst_file = os.path.join(feats_dir, base_name)
            shutil.copy(src_file, dst_file)

### Create .csv file for loading dataset

In [None]:
# Assemble the dataset table: one row per WSI feature file that has a report feature
# file for the same case, joined with the slide's split label and sTIL score.

# Load the annotations
wsi_stils_annot = pd.read_csv('data/stils/stils_tcga_ellogon.tsv', sep='\t')

# Set data dirs
wsi_feats_dir = 'data/wsi_feats'
report_feats_dir = 'data/report_feats'

# Define the pattern for the case id (not used further down in this cell)
case_pattern = r"TCGA-\w{2}-\w{4}"

# File name suffixes of the two kinds of feature files
wsi_suffix = '.wsi.pt'
report_suffix = '.report.pt'

# Width of one sTIL bucket
stil_bin_width = 0.1


def stil_level(score):
    """Return the index of the `stil_bin_width`-wide bucket that `score` falls into.

    The quotient is rounded to 8 decimals before flooring because plain float
    floor division mis-bins exact multiples of the width (0.3 // 0.1 == 2.0).
    """
    return int(round(score / stil_bin_width, 8) // 1)


# List the report feature files once instead of re-reading the folder for every slide
report_feat_files = [f for f in os.listdir(report_feats_dir) if f.endswith(report_suffix)]

# Collect one record per usable slide
rows = []
for root, dirs, files in os.walk(wsi_feats_dir):
    for wsi_feat_file in files:
        # Check if the file is a feature file
        if not wsi_feat_file.endswith(wsi_suffix):
            continue

        # Slide id = file name without the suffix; case id = its first 12 characters
        slide_id = wsi_feat_file[:-len(wsi_suffix)]
        case_id = slide_id[:12]

        # Find the matching row in the annotations; slides without an annotation get
        # (None, None) and are dropped below instead of raising an IndexError
        annot = wsi_stils_annot[wsi_stils_annot['wsi_id'] == slide_id]
        if annot.empty:
            split, stil_score = None, None
        else:
            split, stil_score = annot['split'].values[0], annot['stil_score'].values[0]

        # Find the first report feature file of the same case
        report_feat_file = next((f for f in report_feat_files if f.startswith(case_id)), None)
        if report_feat_file is not None:
            report_feat_path = os.path.join(report_feats_dir, report_feat_file)
            # Join with the walked folder so files in subfolders get a valid path
            wsi_feat_path = os.path.join(root, wsi_feat_file)
            rows.append([case_id, wsi_feat_path, report_feat_path, split, stil_score])

# Replace 'Training' with 'train' and 'Test' with 'test' in the 'set' column
# df['set'] = df['set'].replace({'Training': 'train', 'Test': 'test', 'Validation': 'val'})

# Convert the records to a DataFrame, drop rows w/ no sTIL score, bucketize the scores
df = (
    pd.DataFrame(rows, columns=['case_id', 'wsi_feat_path', 'report_feat_path', 'split', 'stil_score'])
    .dropna(subset=['stil_score'])
    .assign(stil_lvl=lambda frame: frame['stil_score'].apply(stil_level))
)

df.head()

In [31]:
# Write the assembled dataset table to disk, leaving out the row index
data_path = 'data/stils/data_stils.csv'
df.to_csv(data_path, index=False)

# Sanity check: number of rows with a sTIL score / a split label out of all rows
print(f"number of cases w sTIL scores: {df['stil_score'].count()} / {len(df)}")
print(f"number of cases w split labels: {df['split'].count()} / {len(df)}")

number of cases w sTIL scores: 684 / 684
number of cases w split labels: 684 / 684
