In [1]:
import pandas as pd
import numpy as np
import glob
import os
from datetime import datetime
import json

## Collect the conversion relation between bcr_patient_uuid and bcr_sample_barcode

In [2]:
uuid2barcode = {} # store the conversion in dict
path = "../data/biomed_clinic_data/01_csv_data"
all_files = glob.glob(os.path.join(path, "*.csv"))
for f in all_files:
    df = pd.read_csv(f, sep=",")
    if "bcr_patient_uuid" in df and "bcr_patient_barcode" in df:
        for i in range(2, df.shape[0]):
            if df["bcr_patient_uuid"][i] not in uuid2barcode:
                uuid2barcode[df["bcr_patient_uuid"][i]] = (
                    df["bcr_patient_barcode"][i] + "-01"
                )

In [3]:
# store the conversion if wanted
with open("uuid2barcode.json", "w") as file:
    json.dump(uuid2barcode, file)

## Merge the biomed-clinical and multi-omics data using bcr_sample_barcode as index

In [4]:
def merge_omics_biomed(omic_df, biomed_df, uuid2barcode, merged_name):
    (num_patients, num_features) = omic_df.shape
    print(
        f"Number of features: {num_features} \t Number of patients: {num_patients} in omics data"
    )

    (num_patients, num_features) = biomed_df.shape
    print(
        f"Number of features: {num_features} \t Number of patients: {num_patients} in biomed data"
    )

    # move "RECURRENCE" to the last column
    temp_cols = biomed_df.columns.tolist()
    index = biomed_df.columns.get_loc("RECURRENCE")
    new_cols = (
        temp_cols[0:index] + temp_cols[index + 1 :] + temp_cols[index : index + 1]
    )
    biomed_df = biomed_df[new_cols]

    # change uuid to barcode
    biomed_df["bcr_patient_barcode"] = "TEST"
    for i in range(biomed_df.shape[0]):
        if biomed_df["bcr_patient_uuid"][i] in uuid2barcode:
            biomed_df["bcr_patient_barcode"][i] = uuid2barcode[
                biomed_df["bcr_patient_uuid"][i]
            ]
        else:
            biomed_df["bcr_patient_barcode"][i] = None
    biomed_df.drop(columns=["bcr_patient_uuid"], inplace=True)

    # Temporary only take the "RECURRENCE" data
    target_df = biomed_df
    target_df = target_df.drop_duplicates()
    target_df = target_df.set_index("bcr_patient_barcode")

    merged_df = pd.merge(omic_df, target_df, left_index=True, right_index=True)

    merged_df.to_csv("./{}".format(merged_name))
    
    return merged_df

In [9]:
# Load multi-omics data into dataframe
omics = "cnv_methyl_mrna"
omic_df = pd.read_csv(
        "../data/omics_data/1_csv_data/{}.csv".format(omics), index_col=0
    ).T
omic_df = omic_df.astype("float32")

# Load biomed-clinical data into dataframe
biomed = "85_features_w_barcodes"
biomed_df = (
    pd.read_csv("../data/biomed_clinic_data/02_combined_data/{}.csv".format(biomed), index_col=0,)
    .drop_duplicates()
    .drop(columns=["bcr_sample_barcode"])
    .reset_index()
)

# Load the conversion between uuid and barcode
with open("uuid2barcode.json", "r") as file:
    uuid2barcode = json.load(file)
    
# merge multi-omics and biomed-clinical data
merged_name = "{}_biomed_clinical_85_features.csv".format(omics)
merged_df = merge_omics_biomed(omic_df, biomed_df, uuid2barcode, merged_name)

Number of features: 58512 	 Number of patients: 555 in omics data
Number of features: 85 	 Number of patients: 956 in biomed data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [14]:
merged_df.head()

Unnamed: 0,ACAP3,ACTRT2,AGRN,ANKRD65,ATAD3A,ATAD3B,ATAD3C,AURKAIP1,B3GALT6,C1orf159,...,Stage IB,Stage IC,Stage IIA,Stage IIB,Stage IIC,Stage IIIA,Stage IIIB,Stage IIIC,Stage IV,RECURRENCE
TCGA-04-1331-01,-0.703,-0.703,-0.703,-0.703,-0.703,-0.703,-0.703,-0.703,-0.703,-0.703,...,0,0,0,0,0,0,0,1,0,0
TCGA-04-1336-01,0.101,0.101,0.101,0.101,0.101,0.101,0.101,0.101,0.101,0.101,...,0,0,0,0,0,0,1,0,0,0
TCGA-04-1341-01,-0.421,-0.421,-0.421,-0.421,-0.421,-0.421,-0.421,-0.421,-0.421,-0.421,...,0,0,0,0,0,0,0,0,0,0
TCGA-04-1342-01,0.089,0.089,0.089,0.089,0.089,0.089,0.089,0.089,0.089,0.089,...,0,0,0,0,0,0,0,0,1,0
TCGA-04-1343-01,0.279,0.279,0.279,0.279,0.279,0.279,0.279,0.279,0.279,0.279,...,0,0,0,0,0,0,0,0,1,0
