In [2]:
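# %%capture
# %pip install git+https://github.com/bowang-lab/MedSAM.git
# One-time environment setup: uncomment and run only on a fresh environment.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# %pip install idc-index --upgrade
# %pip install ipywidgets
# %pip install itk
# %pip install wsidicom
# %pip install pydicom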

In [None]:
from google.cloud import storage
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
import matplotlib.pyplot as plt
import pandas as pd
from google.cloud import bigquery
from io import BytesIO
from PIL import Image
import numpy as np
from idc_index import index
import ipywidgets as widgets
from IPython.display import display
import io
import requests

class TCGAImageRNAIntegrator:
    """Attach features computed from IDC DICOM images to an RNA-Seq table.

    The class looks up image URLs in BigQuery for a GDC case id, downloads the
    DICOM objects from Google Cloud Storage, derives simple image features and
    writes them into the rows of ``rna_seq_data`` whose ``'Case ID'`` matches.
    """

    def __init__(self, rna_seq_data=None):
        """Create the BigQuery, Cloud Storage and IDC clients.

        Args:
            rna_seq_data: Optional RNA-Seq table, given either as a
                ``pandas.DataFrame`` or as a path accepted by ``pd.read_csv``.
                The merge methods expect it to contain a ``'Case ID'`` column.
                When omitted, only the image lookup/viewing methods are usable.
        """
        # Always define the attribute so the merge methods can report a clear
        # error instead of failing with AttributeError.
        if rna_seq_data is None or isinstance(rna_seq_data, pd.DataFrame):
            self.rna_seq_data = rna_seq_data
        else:
            self.rna_seq_data = pd.read_csv(rna_seq_data)

        # Initialize BigQuery client
        self.client = bigquery.Client()
        # Initialize Google Cloud Storage client
        self.storage_client = storage.Client()

        # Initialize IDC client and collection picker
        self.idc_client = index.IDCClient()
        self.collection_picker = None
        self.selected_collection = None

    def select_collection(self):
        """Display a dropdown listing the collections that contain 'SM' modality data."""
        idc_index = self.idc_client.index
        collections = idc_index[idc_index['Modality'] == 'SM']['collection_id'].unique()
        self.collection_picker = widgets.Dropdown(options=collections)

        # The dropdown already shows a value before the user touches it, and no
        # change event fires for that initial value, so mirror it here.
        self.selected_collection = self.collection_picker.value

        # Register the observer before displaying so no early change is missed.
        self.collection_picker.observe(self.on_collection_selected, names='value')
        display(self.collection_picker)

    def on_collection_selected(self, change):
        """Store and echo the collection chosen in the dropdown.

        Args:
            change: The widget change dictionary; its ``'new'`` entry is the
                newly selected collection id.
        """
        self.selected_collection = change['new']
        print(f"Selected collection: {self.selected_collection}")

    def fetch_image_data(self, case_id):
        """Return the image rows of the selected collection for one GDC case.

        Args:
            case_id: GDC case id to look up.

        Returns:
            A DataFrame with the image metadata columns selected below plus
            ``case_gdc_id``; empty when nothing matches.

        Raises:
            ValueError: If no collection has been selected yet.
        """
        if not self.selected_collection:
            raise ValueError("No collection selected. Please select a collection using the `select_collection` method.")

        # Values are passed as query parameters rather than interpolated into
        # the SQL text, so quotes in `case_id` cannot alter the statement.
        # NOTE(review): the clinical table is hard-coded to TCGA-KICH, so other
        # collections will presumably return no rows - confirm intended scope.
        query = """
        WITH 
        img_data AS (
          SELECT PatientID, idc_case_id, collection_id, collection_name, collection_cancerType, Modality, collection_tumorLocation, gcs_url  
          FROM `bigquery-public-data.idc_current.dicom_all_view`
          WHERE collection_id = @collection_id
        ),
        gdc_data AS (
          SELECT dicom_patient_id, case_gdc_id 
          FROM `bigquery-public-data.idc_current_clinical.tcga_kich_clinical`
        )

        SELECT 
          img_data.PatientID,
          img_data.idc_case_id,
          img_data.collection_id,
          img_data.collection_name,
          img_data.collection_cancerType,
          img_data.Modality,
          img_data.collection_tumorLocation,
          img_data.gcs_url,
          gdc_data.case_gdc_id
        FROM 
          img_data
        JOIN 
          gdc_data
        ON 
          img_data.PatientID = gdc_data.dicom_patient_id
        WHERE 
          gdc_data.case_gdc_id = @case_id;
        """
        job_config = bigquery.QueryJobConfig(
            query_parameters=[
                bigquery.ScalarQueryParameter("collection_id", "STRING", str(self.selected_collection)),
                bigquery.ScalarQueryParameter("case_id", "STRING", str(case_id)),
            ]
        )
        # Run the query
        return self.client.query(query, job_config=job_config).to_dataframe()

    @staticmethod
    def _split_gcs_url(gcs_url):
        """Split ``gs://bucket/path/to/object`` into ``(bucket, object_path)``.

        Raises:
            ValueError: If the URL does not have the ``gs://bucket/object`` form.
        """
        scheme = 'gs://'
        if not gcs_url.startswith(scheme):
            raise ValueError(f"Expected a gs:// URL, got: {gcs_url!r}")
        bucket_name, _, file_path = gcs_url[len(scheme):].partition('/')
        if not bucket_name or not file_path:
            raise ValueError(f"GCS URL must name a bucket and an object: {gcs_url!r}")
        return bucket_name, file_path

    def download_dicom_from_gcs(self, gcs_url):
        """Download one object from Cloud Storage into memory.

        Args:
            gcs_url: Object location in ``gs://bucket/path`` form.

        Returns:
            A ``BytesIO`` positioned at the start of the downloaded content.

        Raises:
            ValueError: If ``gcs_url`` is not a ``gs://bucket/object`` URL.
        """
        bucket_name, file_path = self._split_gcs_url(gcs_url)

        blob = self.storage_client.bucket(bucket_name).blob(file_path)
        dicom_file = BytesIO()
        blob.download_to_file(dicom_file)
        dicom_file.seek(0)  # Move to the beginning of the file
        return dicom_file

    @staticmethod
    def _normalize(image):
        """Min-max scale an array to [0, 1] as floats.

        A constant (or empty) array has no range to scale by; it is returned as
        zeros instead of dividing by zero and producing NaNs.
        """
        image = np.asarray(image).astype(float)
        if image.size == 0:
            return image
        lowest = image.min()
        value_range = image.max() - lowest
        if value_range == 0:
            return np.zeros_like(image)
        return (image - lowest) / value_range

    @staticmethod
    def _first_displayable_frame(image):
        """Reduce an array to something ``plt.imshow`` accepts.

        ``imshow`` takes 2-D arrays or 3-D arrays whose last axis has 3 or 4
        channels. Any other 3-D/4-D array is treated as a stack of frames and
        its first frame is returned.
        """
        if image.ndim == 4:
            return image[0]
        if image.ndim == 3 and image.shape[-1] not in (3, 4):
            return image[0]
        return image

    def _load_normalized_image(self, gcs_url):
        """Download a DICOM object and return its pixel data scaled to [0, 1]."""
        dicom_file = self.download_dicom_from_gcs(gcs_url)

        # Read the DICOM file using pydicom
        dicom_data = pydicom.dcmread(dicom_file)

        # Apply VOI LUT (if available) to convert pixel data to human-visible format
        image = apply_voi_lut(dicom_data.pixel_array, dicom_data)

        return self._normalize(image)

    def view_dicom_image(self, gcs_url):
        """Download a DICOM object and show it (first frame only) with matplotlib.

        Args:
            gcs_url: Object location in ``gs://bucket/path`` form.
        """
        image = self._first_displayable_frame(self._load_normalized_image(gcs_url))

        # Display the image using matplotlib
        plt.imshow(image, cmap=plt.cm.gray)
        plt.title(f"DICOM Image from {gcs_url}")
        plt.axis('off')
        plt.show()

    def process_image(self, gcs_url):
        """Download a DICOM object and compute its image features.

        Args:
            gcs_url: Object location in ``gs://bucket/path`` form.

        Returns:
            A dict of feature name to value. Currently only ``'mean_intensity'``:
            the mean of the normalized pixels after quantising to 0-255.
        """
        image = self._load_normalized_image(gcs_url)

        # Quantise to 8 bits exactly as before, but stay in NumPy: the mean does
        # not need a PIL image, and this works for arrays of any rank.
        image_8bit = (image * 255).astype(np.uint8)

        features = {
            'mean_intensity': float(np.mean(image_8bit))
            # Add more features as needed
        }

        return features

    def _require_rna_seq_data(self):
        """Return the RNA-Seq table or raise if none was supplied."""
        if self.rna_seq_data is None:
            raise ValueError(
                "No RNA-Seq data loaded. Pass `rna_seq_data` to the constructor "
                "or assign `rna_seq_data` before merging."
            )
        return self.rna_seq_data

    def integrate_image_features(self, case_id, features):
        """Write feature values into the RNA-Seq rows of one case.

        Args:
            case_id: Value matched against the ``'Case ID'`` column.
            features: Mapping of column name to the value to store.

        Raises:
            ValueError: If no RNA-Seq data has been loaded.
        """
        rna_seq_data = self._require_rna_seq_data()
        case_rows = rna_seq_data['Case ID'] == case_id
        for feature_name, feature_value in features.items():
            rna_seq_data.loc[case_rows, feature_name] = feature_value

    def process_and_merge(self):
        """Compute and store image features for every case in the RNA-Seq table.

        When a case has several images, each one overwrites the previous
        values, so the features of the last image returned by the query win.

        Raises:
            ValueError: If no RNA-Seq data has been loaded.
        """
        rna_seq_data = self._require_rna_seq_data()
        for case_id in rna_seq_data['Case ID'].unique():
            image_data = self.fetch_image_data(case_id)
            for gcs_url in image_data['gcs_url']:
                features = self.process_image(gcs_url)
                self.integrate_image_features(case_id, features)

    def get_merged_data(self):
        """Return the RNA-Seq table, including any integrated image features.

        Raises:
            ValueError: If no RNA-Seq data has been loaded.
        """
        return self._require_rna_seq_data()

In [None]:
# Keep only the DGE rows whose case_id appears among the TCGA-KICH cases, then save them.
# NOTE(review): `rna_seq_DGE_data` is not defined in any cell visible in this notebook,
# so this cell depends on earlier kernel state - confirm where it is loaded.
tcga_kich_data = pd.read_csv('./personal_docs/data/idc_data/tcga_kich.csv')
in_tcga_kich = rna_seq_DGE_data['case_id'].isin(tcga_kich_data['case_gdc_id'])
rna_seq_DGE_data[in_tcga_kich].to_csv('./personal_docs/data/ml_data/rna_seq_DGE_data_tcga_kich_idc_matching.csv')

In [2]:
# Create the integrator. Its constructor builds BigQuery, Cloud Storage and IDC clients,
# so this presumably needs Google Cloud credentials to be configured - verify before running.
tcga_image_extractor = TCGAImageRNAIntegrator()

In [3]:
# Show the collection dropdown; choosing an entry stores it on the integrator and prints it.
# NOTE(review): later cells depend on this manual choice (tcga_kich in the saved output),
# so the result is not reproducible from code alone.
tcga_image_extractor.select_collection()

Dropdown(options=('tcga_thym', 'tcga_skcm', 'tcga_brca', 'tcga_tgct', 'htan_vanderbilt', 'tcga_coad', 'htan_wu…

Selected collection: tcga_kich


In [4]:
# Look up the image rows for one GDC case in the selected collection (BigQuery query).
# Raises ValueError when no collection has been selected.
image_data = tcga_image_extractor.fetch_image_data(case_id='02979422-5149-4750-ad5f-483e0bec6ac5')

I0000 00:00:1725178842.483070 3827620 config.cc:230] gRPC experiments enabled: call_status_override_on_cancellation, event_engine_dns, event_engine_listener, http2_stats_fix, monitoring_experiment, pick_first_new, trace_record_callops, work_serializer_clears_time_cache
I0000 00:00:1725178842.497308 3827620 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported


In [11]:
# Take the first image URL of the case; fail with a clear message when the query matched nothing
# (indexing an empty result would otherwise raise an opaque IndexError).
if image_data.empty:
    raise ValueError("No images were returned for this case; check the selected collection and case id.")
gcs_url = image_data['gcs_url'].iloc[0]

In [12]:
# Convert the gs:// URL to its HTTPS equivalent (only the leading scheme is rewritten)
https_url = gcs_url.replace("gs://", "https://storage.googleapis.com/", 1)

# Fetch the DICOM object; the timeout keeps a stalled connection from blocking the kernel forever
response = requests.get(https_url, timeout=60)

# Stop on an HTTP error instead of wrapping the error body as if it were a DICOM file
response.raise_for_status()

dcm = io.BytesIO(response.content)

In [5]:
# Rewind so the cell can be re-run on the same buffer, and read only the header:
# the printout is metadata, so the pixel data does not need to be parsed.
dcm.seek(0)
print(pydicom.dcmread(dcm, stop_before_pixels=True))

KeyboardInterrupt: 