## Imports

In [1]:
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

## Getting Started

In [2]:
# Cap the thread pools of the common numerical backends at 16 threads each.
# NOTE(review): pandas is already imported in the cell above; these variables are
# usually read when the native libraries load, so confirm they still take effect here.
for thread_env_var in (
    "OMP_NUM_THREADS",
    "OPENBLAS_NUM_THREADS",
    "MKL_NUM_THREADS",
    "VECLIB_MAXIMUM_THREADS",
    "NUMEXPR_NUM_THREADS",
):
    os.environ[thread_env_var] = "16"

## Getting relevant probes

In [3]:
# CSV to read the probe IDs from.
file_path_csv = "../_OUTPUTS_/dif_res_primary_window_shifted.csv"

# Read the full table, then keep the first 250 "probe_id" values in file order.
file_csv = pd.read_csv(file_path_csv)
probe_ids = file_csv["probe_id"].head(250).tolist()

# Show the selected IDs as the cell output.
probe_ids

['cg08327371',
 'cg12233379',
 'cg15103584',
 'cg25001305',
 'cg06497198',
 'cg01950958',
 'cg17015994',
 'cg21778810',
 'cg00621240',
 'cg08566882',
 'cg24222995',
 'cg22645961',
 'cg09239150',
 'cg11523905',
 'cg24955682',
 'cg24809640',
 'cg24515341',
 'cg15855336',
 'cg00829845',
 'cg06629999',
 'cg04889688',
 'cg22959611',
 'cg08645400',
 'cg01057268',
 'cg03322779',
 'cg09228949',
 'cg16765387',
 'cg08028651',
 'cg08582801',
 'cg26233914',
 'cg09136346',
 'cg20697984',
 'cg14383135',
 'cg26873164',
 'cg22396033',
 'cg13624833',
 'cg26227569',
 'cg14234898',
 'cg04198941',
 'cg10848367',
 'cg25440497',
 'cg15999609',
 'cg04335207',
 'cg17880816',
 'cg14994183',
 'cg02690571',
 'cg16454902',
 'cg26464945',
 'cg19590591',
 'cg17601658',
 'cg24221853',
 'cg02805871',
 'cg22596049',
 'cg21724798',
 'cg27200315',
 'cg12050706',
 'cg14732248',
 'cg22089890',
 'cg03372852',
 'cg22677556',
 'cg10500909',
 'cg02081938',
 'cg17378803',
 'cg18753619',
 'cg04722059',
 'cg23385492',
 'cg064950

In [4]:
# Columns to look up in the wide table: the two ICGC identifier columns
# followed by the probe IDs selected in the previous cell.
column_names_to_find = [
    "icgc_sample_id",
    "icgc_specimen_id",
] + probe_ids

# Read the TSV file whose columns are searched.
file_path = "../_INPUTS_/first_line_wide.tsv"
data = pd.read_csv(file_path, sep='\t')


def find_column_index(column):
    """Look up the position of ``column`` among the columns of ``data``.

    Args:
        column: Column name to search for.

    Returns:
        A ``(column, index)`` tuple. ``index`` is the value returned by
        ``data.columns.get_loc(column)``, or ``None`` when ``column`` is not
        one of the columns of ``data``.
    """
    if column in data.columns:
        return column, data.columns.get_loc(column)
    return column, None


# Resolve the names one by one, in list order. Each lookup is a cheap index
# query, so no thread pool is used; iterating in order also keeps
# `found_columns` (and the file written below) in the same order as
# `column_names_to_find` on every run.
found_columns = {}
for requested_name in column_names_to_find:
    column, index = find_column_index(requested_name)
    if index is not None:
        # Names that are absent from `data` are skipped.
        found_columns[column] = index

# Save the results to a text file, one "Column: ..., Index: ..." line per match.
output_file_path = "../_OUTPUTS_/found_columns_prim.txt"
with open(output_file_path, 'w') as output_file:
    output_file.writelines(
        f"Column: {column}, Index: {index}\n"
        for column, index in found_columns.items()
    )

print(f"Output saved to {output_file_path}")


Output saved to ../_OUTPUTS_/found_columns_prim.txt


In [9]:
# CSV to read the probe IDs from.
file_path_csv = "../_OUTPUTS_/dif_res_corr_SITH_shifted.csv"

# Read the full table, then keep the first 250 "probe_id" values in file order.
file_csv = pd.read_csv(file_path_csv)
probe_ids = file_csv["probe_id"].head(250).tolist()

# Show the selected IDs as the cell output.
probe_ids

['cg12164232',
 'cg27086014',
 'cg23346773',
 'cg09159022',
 'cg22799420',
 'cg05431842',
 'cg07167872',
 'cg00148114',
 'cg14159672',
 'cg08564601',
 'cg21806273',
 'cg08883213',
 'cg17669009',
 'cg17131579',
 'cg10947611',
 'cg24161793',
 'cg12859429',
 'cg18507732',
 'cg17877220',
 'cg02507726',
 'cg24503407',
 'cg26651514',
 'cg02401556',
 'cg08548559',
 'cg16723180',
 'cg08205236',
 'cg12255995',
 'cg10081681',
 'cg17178900',
 'cg20328167',
 'cg23214071',
 'cg01079658',
 'cg21209395',
 'cg00590029',
 'cg23837623',
 'cg20439022',
 'cg03039315',
 'cg21007852',
 'cg11423822',
 'cg02803847',
 'cg24917131',
 'cg02403180',
 'cg16578537',
 'cg11138227',
 'cg04083712',
 'cg02460141',
 'cg25969123',
 'cg13217590',
 'cg17945429',
 'cg09773473',
 'cg26317555',
 'cg20442232',
 'cg11862731',
 'cg09319828',
 'cg15579816',
 'cg11538407',
 'cg27143872',
 'cg19477190',
 'cg00870279',
 'cg10822582',
 'cg14893161',
 'cg00155314',
 'cg03034634',
 'cg03888083',
 'cg02363934',
 'cg15967169',
 'cg001874

In [10]:
# Columns to look up in the wide table: the two ICGC identifier columns
# followed by the probe IDs selected in the previous cell.
column_names_to_find = [
    "icgc_sample_id",
    "icgc_specimen_id",
] + probe_ids

# `data` (the frame read from "../_INPUTS_/first_line_wide.tsv") and
# `find_column_index` are reused from cell In [4]; that cell must have been
# run before this one.

# Resolve the names one by one, in list order. Each lookup is a cheap index
# query, so no thread pool is used; iterating in order also keeps
# `found_columns` (and the file written below) in the same order as
# `column_names_to_find` on every run.
found_columns = {}
for requested_name in column_names_to_find:
    column, index = find_column_index(requested_name)
    if index is not None:
        # Names that are absent from `data` are skipped.
        found_columns[column] = index

# Save the results to a text file, one "Column: ..., Index: ..." line per match.
output_file_path = "../_OUTPUTS_/found_columns_sith_corr.txt"
with open(output_file_path, 'w') as output_file:
    output_file.writelines(
        f"Column: {column}, Index: {index}\n"
        for column, index in found_columns.items()
    )

print(f"Output saved to {output_file_path}")


Output saved to ../_OUTPUTS_/found_columns_sith_corr.txt


In [11]:
# CSV to read the probe IDs from.
file_path_csv = "../_OUTPUTS_/dif_res_corr_INT_IQR_shifted.csv"

# Read the full table, then keep the first 250 "probe_id" values in file order.
file_csv = pd.read_csv(file_path_csv)
probe_ids = file_csv["probe_id"].head(250).tolist()

# Show the selected IDs as the cell output.
probe_ids

['cg25706012',
 'cg12294026',
 'cg16338877',
 'cg12197752',
 'cg05298224',
 'cg17142722',
 'cg07475178',
 'cg10318458',
 'cg09406615',
 'cg05411132',
 'cg07584855',
 'cg18886062',
 'cg16060204',
 'cg07702853',
 'cg16127145',
 'cg12666263',
 'cg14333779',
 'cg26351001',
 'cg19715094',
 'cg23405696',
 'cg16264722',
 'cg13054358',
 'cg21410293',
 'cg00960580',
 'cg09578353',
 'cg15694715',
 'cg21142743',
 'cg20001829',
 'cg06300469',
 'cg24004532',
 'cg14459046',
 'cg09048334',
 'cg14313275',
 'cg03510349',
 'cg27090029',
 'cg14166284',
 'cg13976219',
 'cg13765206',
 'cg03651613',
 'cg01861555',
 'cg02418439',
 'cg05740418',
 'cg12164232',
 'cg01703581',
 'cg22620414',
 'cg16401270',
 'cg26788570',
 'cg27217748',
 'cg13613965',
 'cg05537653',
 'cg18472787',
 'cg17753475',
 'cg10783797',
 'cg03895404',
 'cg24595261',
 'cg13085030',
 'cg11685843',
 'cg10013645',
 'cg25754926',
 'cg25830696',
 'cg10153942',
 'cg08727443',
 'cg08457232',
 'cg03757398',
 'cg08377768',
 'cg09944626',
 'cg109829

In [12]:
# Columns to look up in the wide table: the two ICGC identifier columns
# followed by the probe IDs selected in the previous cell.
column_names_to_find = [
    "icgc_sample_id",
    "icgc_specimen_id",
] + probe_ids

# `data` (the frame read from "../_INPUTS_/first_line_wide.tsv") and
# `find_column_index` are reused from cell In [4]; that cell must have been
# run before this one.

# Resolve the names one by one, in list order. Each lookup is a cheap index
# query, so no thread pool is used; iterating in order also keeps
# `found_columns` (and the file written below) in the same order as
# `column_names_to_find` on every run.
found_columns = {}
for requested_name in column_names_to_find:
    column, index = find_column_index(requested_name)
    if index is not None:
        # Names that are absent from `data` are skipped.
        found_columns[column] = index

# Save the results to a text file, one "Column: ..., Index: ..." line per match.
output_file_path = "../_OUTPUTS_/found_columns_iqr_corr.txt"
with open(output_file_path, 'w') as output_file:
    output_file.writelines(
        f"Column: {column}, Index: {index}\n"
        for column, index in found_columns.items()
    )

print(f"Output saved to {output_file_path}")


Output saved to ../_OUTPUTS_/found_columns_iqr_corr.txt


## Finding common probe IDs across SITH window, SITH correlation, and IQR correlation

In [13]:
# Result files to compare; each holds "Column: <name>, Index: <n>" lines.
file_paths = [
    '../_OUTPUTS_/found_columns_iqr_corr.txt',
    '../_OUTPUTS_/found_columns_prim.txt',
    '../_OUTPUTS_/found_columns_sith_corr.txt'
]

# Maps "file_1", "file_2", ... (numbered in the order listed above) to the set
# of column names found in that file.
probe_ids = {}

for file_number, file_path in enumerate(file_paths, start=1):
    names_in_file = set()
    with open(file_path, 'r') as file:
        for line in file:
            if 'Column:' not in line:
                continue
            # Keep the text between "Column: " and the first comma.
            names_in_file.add(line.split('Column: ')[1].split(',')[0].strip())
    probe_ids[f'file_{file_number}'] = names_in_file

# Names present in every file. The two ICGC identifier columns are looked up
# for every list, so they can appear here alongside the shared probes.
common_probe_ids = set.intersection(*probe_ids.values())
common_probe_ids


{'cg27086014', 'icgc_sample_id', 'icgc_specimen_id'}

In [14]:
# Pairwise overlap of the per-file name sets built in the previous cell.
from itertools import combinations

# Keys of the per-file sets, in insertion order ("file_1", "file_2", ...).
file_keys = list(probe_ids.keys())

# For every unordered pair of files, keep the names that occur in both.
pairwise_common = {
    (first_key, second_key): probe_ids[first_key] & probe_ids[second_key]
    for first_key, second_key in combinations(file_keys, 2)
}

pairwise_common

{('file_1', 'file_2'): {'cg27086014', 'icgc_sample_id', 'icgc_specimen_id'},
 ('file_1', 'file_3'): {'cg00870279',
  'cg01842321',
  'cg01861555',
  'cg06805320',
  'cg07235253',
  'cg08532569',
  'cg08598483',
  'cg10982913',
  'cg12164232',
  'cg14387626',
  'cg16272777',
  'cg16338877',
  'cg16401270',
  'cg17163967',
  'cg19477190',
  'cg21142743',
  'cg22635155',
  'cg27086014',
  'icgc_sample_id',
  'icgc_specimen_id'},
 ('file_2', 'file_3'): {'cg27086014', 'icgc_sample_id', 'icgc_specimen_id'}}

In [17]:
# Read the three `meth_cols_pcawg_*` tables and the `pcawg_sith` table (all
# tab-separated). Each path below is paired, in order, with the variable that
# receives its DataFrame.
(
    pcawg_prim_window_meth_df,
    pcawg_sith_corr_meth_df,
    pcawg_iqr_corr_meth_df,
    pcawg_sith_df,
) = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        "../_INPUTS_/meth_cols_pcawg_prim_window.tsv",
        "../_INPUTS_/meth_cols_pcawg_sith_corr.tsv",
        "../_INPUTS_/meth_cols_pcawg_iqr_corr.tsv",
        "../_INPUTS_/pcawg_sith.tsv",
    )
)

In [20]:
def _merge_save_and_preview(sith_df, meth_df, output_path):
    """Merge a methylation table onto the SITH table, save it, and print a preview.

    Args:
        sith_df: Left-hand DataFrame; its first column is used as the merge key.
        meth_df: Right-hand DataFrame; must contain a column with the same name
            as the first column of ``sith_df``.
        output_path: Destination of the tab-separated merged table (written
            without the index).

    Returns:
        The merged DataFrame (pandas' default inner join on the key column).
    """
    # NOTE(review): the printed preview shows an `icgc_specimen_id_x` column,
    # i.e. `icgc_specimen_id` exists in both inputs and pandas suffixes the two
    # copies with `_x`/`_y`. Decide whether it should be part of the merge key.
    merged = pd.merge(sith_df, meth_df, on=sith_df.columns[0])
    merged.to_csv(output_path, sep='\t', index=False)
    # Display the first few rows of the merged dataset
    print(merged.head())
    return merged


# Primary-window probes.
pcawg_prim_window_meth_df_merged = _merge_save_and_preview(
    pcawg_sith_df,
    pcawg_prim_window_meth_df,
    "../_OUTPUTS_/merged_sith_meth_pcawg_prim_window.tsv",
)

# SITH-correlation probes.
pcawg_sith_corr_meth_df_merged = _merge_save_and_preview(
    pcawg_sith_df,
    pcawg_sith_corr_meth_df,
    "../_OUTPUTS_/merged_sith_meth_pcawg_sith_corr.tsv",
)

# IQR-correlation probes.
pcawg_iqr_corr_meth_df_merged = _merge_save_and_preview(
    pcawg_sith_df,
    pcawg_iqr_corr_meth_df,
    "../_OUTPUTS_/merged_sith_meth_pcawg_iqr_corr.tsv",
)


  icgc_sample_id icgc_specimen_id_x sample_type  sample_code  \
0       SA528682           SP110770     primary            1   
1       SA506693           SP102597     primary            1   
2       SA411018            SP71576     primary            1   
3       SA514921           SP106753     primary            1   
4       SA505863           SP101678     primary            1   

  donor_vital_status  survival_code      SITH   INT_IQR  CLTR_IQR   maxSITH  \
0           deceased              1  0.622069  0.292670  0.224504  0.622069   
1              alive              0  0.581406  0.213151  0.316149  0.581406   
2              alive              0  0.684766  0.247213  0.257797  0.684766   
3              alive              0  0.715409  0.216627  0.351754  0.715409   
4           deceased              1  0.435784  0.290722  0.365138  0.435784   

   ...  cg09882550  cg09363443  cg03076771  cg06083652  cg03951877 cg10281502  \
0  ...    0.189549    0.095719    0.379200    0.560477    0

In [21]:
# Summarise the merged primary-window table with pandas' default `describe()`
# output and keep the result for later inspection.
summary_statistics = pcawg_prim_window_meth_df_merged.describe()

# Print the summary table as plain text.
print(summary_statistics)

       sample_code  survival_code        SITH     INT_IQR    CLTR_IQR  \
count   193.000000     193.000000  193.000000  193.000000  188.000000   
mean      1.119171       0.647668    0.642976    0.258625    0.325562   
std       0.490849       0.478939    0.142042    0.071064    0.128388   
min       1.000000       0.000000    0.310901    0.000006    0.000000   
25%       1.000000       0.000000    0.530214    0.225575    0.242920   
50%       1.000000       1.000000    0.632354    0.267920    0.323711   
75%       1.000000       1.000000    0.760912    0.310190    0.398614   
max       4.000000       1.000000    0.999212    0.380745    0.749930   

          maxSITH  is_maxSITH  maxINT_IQR  is_maxINT_IQR  is_maxCLTR_IQR  ...  \
count  193.000000       193.0  193.000000          193.0      193.000000  ...   
mean     0.642976         1.0    0.258625            1.0        0.974093  ...   
std      0.142042         0.0    0.071064            0.0        0.159270  ...   
min      0.310901 