## Imports

In [1]:
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

## Getting Started

In [2]:
# Give every thread-count environment variable the same value ("16").
# NOTE(review): pandas is imported in an earlier cell; such variables are
# typically read when the numeric libraries load, so confirm that setting
# them at this point actually takes effect.
for thread_var in (
    "OMP_NUM_THREADS",
    "OPENBLAS_NUM_THREADS",
    "MKL_NUM_THREADS",
    "VECLIB_MAXIMUM_THREADS",
    "NUMEXPR_NUM_THREADS",
):
    os.environ[thread_var] = "16"

## Getting relevant probes

In [3]:
# Primary-window results table; its "probe_id" column supplies the probes.
file_path_csv = "../_OUTPUTS_/dif_res_primary_window_shifted.csv"

# Read the whole table, then keep the first 250 "probe_id" entries
# (in file order) as a plain Python list.
file_csv = pd.read_csv(file_path_csv)
probe_ids = file_csv["probe_id"].head(250).tolist()

# Display the selected probe IDs as the cell output.
probe_ids

['cg22626041',
 'cg20511194',
 'cg18870413',
 'cg27109614',
 'cg02274500',
 'cg14826215',
 'cg12336999',
 'cg17295864',
 'cg10044466',
 'cg26015683',
 'cg24145210',
 'cg06967703',
 'cg23333220',
 'cg25508118',
 'cg04900915',
 'cg17026874',
 'cg18756868',
 'cg06085713',
 'cg24258013',
 'cg19857633',
 'cg24882097',
 'cg22376582',
 'cg24870846',
 'cg17149431',
 'cg05815906',
 'cg21319693',
 'cg08268266',
 'cg00572356',
 'cg19408535',
 'cg23724117',
 'cg08081036',
 'cg06030649',
 'cg15602841',
 'cg01198434',
 'cg14265813',
 'cg00196475',
 'cg03953257',
 'cg01573732',
 'cg18419694',
 'cg26971330',
 'cg24509331',
 'cg09506515',
 'cg09304272',
 'cg13980308',
 'cg02443062',
 'cg16720675',
 'cg08592733',
 'cg12530017',
 'cg03157226',
 'cg09874600',
 'cg18892537',
 'cg14290451',
 'cg07050602',
 'cg14189304',
 'cg23220823',
 'cg23474190',
 'cg19708576',
 'cg04212239',
 'cg19888787',
 'cg25997426',
 'cg00155593',
 'cg06186683',
 'cg16590546',
 'cg26551688',
 'cg04928129',
 'cg14342823',
 'cg164118

In [4]:
# Probe IDs (selected in the previous cell) to locate among the TSV columns.
column_names_to_find = probe_ids

# Read the TSV file; only its column labels are consulted below.
# `data` and `find_column_index` are reused by later cells.
file_path = "../_INPUTS_/tgca_first_line_wide.tsv"
data = pd.read_csv(file_path, sep='\t')


def find_column_index(column):
    """Look up ``column`` among the labels of the global ``data`` frame.

    Parameters
    ----------
    column : label to search for in ``data.columns``.

    Returns
    -------
    tuple
        ``(column, data.columns.get_loc(column))`` when the label exists,
        otherwise ``(column, None)``.

    NOTE(review): ``get_loc`` presumably returns a slice or mask rather than
    an integer if the header contains duplicate labels -- confirm that the
    TSV header is unique.
    """
    if column in data.columns:
        return column, data.columns.get_loc(column)
    return column, None


# Resolve the probes one after another, in list order. Each lookup is a
# cheap in-memory label check, so a thread pool adds nothing here, and
# collecting results with `as_completed` made the order of `found_columns`
# (and of the lines written below) vary from run to run.
found_columns = {}
for column, index in map(find_column_index, column_names_to_find):
    if index is not None:
        found_columns[column] = index

# Save the results to a text file, one "Column: ..., Index: ..." line per hit.
output_file_path = "../_OUTPUTS_/found_columns_prim.txt"
with open(output_file_path, 'w') as output_file:
    output_file.writelines(
        f"Column: {column}, Index: {index}\n"
        for column, index in found_columns.items()
    )

print(f"Output saved to {output_file_path}")


Output saved to ../_OUTPUTS_/found_columns_prim.txt


In [5]:
# SITH-correlation results table; its "probe_id" column supplies the probes.
file_path_csv = "../_OUTPUTS_/dif_res_corr_SITH_shifted.csv"

# Read the table and take the leading 250 "probe_id" values as a list.
file_csv = pd.read_csv(file_path_csv)
probe_id_column = file_csv.loc[:, "probe_id"]
probe_ids = probe_id_column.iloc[:250].tolist()

# Display the selected probe IDs as the cell output.
probe_ids

['cg04211309',
 'cg06815826',
 'cg06476438',
 'cg27292099',
 'cg10758824',
 'cg12592365',
 'cg13258831',
 'cg15869649',
 'cg23555395',
 'cg13861244',
 'cg24522654',
 'cg14871010',
 'cg21236153',
 'cg18096251',
 'cg13028819',
 'cg04368843',
 'cg10525642',
 'cg16881960',
 'cg17385247',
 'cg24860169',
 'cg00685314',
 'cg25466245',
 'cg16637458',
 'cg00075956',
 'cg10437224',
 'cg08578641',
 'cg05181941',
 'cg13977620',
 'cg01404122',
 'cg07887608',
 'cg19238349',
 'cg21179255',
 'cg14524643',
 'cg15521790',
 'cg25198599',
 'cg07125112',
 'cg01883662',
 'cg25161029',
 'cg17924044',
 'cg10759936',
 'cg00598204',
 'cg13663644',
 'cg15770553',
 'cg07155455',
 'cg09926486',
 'cg10135474',
 'cg07384894',
 'cg13778336',
 'cg01462537',
 'cg24159636',
 'cg17273588',
 'cg15975171',
 'cg03191504',
 'cg01558916',
 'cg03354616',
 'cg15476396',
 'cg19082559',
 'cg04021672',
 'cg06743811',
 'cg14025373',
 'cg25214376',
 'cg16901566',
 'cg16686960',
 'cg14408823',
 'cg13589810',
 'cg15067713',
 'cg251276

In [6]:
# Probe IDs (selected in the previous cell) to locate among the TSV columns.
column_names_to_find = probe_ids

# This cell relies on `data` and `find_column_index` from the earlier cell
# that reads the TSV file; run that cell first.

# Resolve the probes one after another, in list order. Each lookup is a
# cheap in-memory label check, so a thread pool adds nothing here, and
# collecting results with `as_completed` made the order of `found_columns`
# (and of the lines written below) vary from run to run.
found_columns = {}
for probe in column_names_to_find:
    column, index = find_column_index(probe)
    if index is not None:
        found_columns[column] = index

# Save the results to a text file, one "Column: ..., Index: ..." line per hit.
output_file_path = "../_OUTPUTS_/found_columns_sith_corr.txt"
with open(output_file_path, 'w') as output_file:
    for column, index in found_columns.items():
        output_file.write(f"Column: {column}, Index: {index}\n")

print(f"Output saved to {output_file_path}")


Output saved to ../_OUTPUTS_/found_columns_sith_corr.txt


In [7]:
# IQR-correlation results table; its "probe_id" column supplies the probes.
file_path_csv = "../_OUTPUTS_/dif_res_corr_INT_IQR_shifted.csv"

# Read the table, then slice off the first 250 "probe_id" values.
file_csv = pd.read_csv(file_path_csv)
probe_ids = (
    file_csv["probe_id"]
    .iloc[:250]
    .to_list()
)

# Display the selected probe IDs as the cell output.
probe_ids

['cg04234597',
 'cg25008147',
 'cg17163729',
 'cg13028819',
 'cg01708202',
 'cg09337427',
 'cg02589576',
 'cg23619936',
 'cg21711862',
 'cg19905414',
 'cg17772793',
 'cg07125967',
 'cg15521790',
 'cg23251701',
 'cg16278514',
 'cg06552160',
 'cg04152326',
 'cg20649298',
 'cg20064139',
 'cg00111778',
 'cg07075840',
 'cg15696460',
 'cg07155455',
 'cg00491523',
 'cg23944920',
 'cg22626548',
 'cg20040972',
 'cg11697660',
 'cg20957776',
 'cg07125112',
 'cg17924044',
 'cg16172619',
 'cg25481636',
 'cg07877987',
 'cg19942237',
 'cg14935163',
 'cg09062961',
 'cg23239637',
 'cg07376374',
 'cg00040862',
 'cg10325383',
 'cg08933467',
 'cg24472495',
 'cg21072251',
 'cg26912251',
 'cg07910488',
 'cg27583071',
 'cg18565809',
 'cg24838949',
 'cg27044861',
 'cg06460888',
 'cg13973124',
 'cg10118717',
 'cg27498197',
 'cg22330924',
 'cg26990023',
 'cg19033555',
 'cg26246590',
 'cg01445411',
 'cg20679626',
 'cg20971474',
 'cg23063740',
 'cg23487759',
 'cg20596629',
 'cg12680131',
 'cg25493276',
 'cg064556

In [8]:
# Probe IDs (selected in the previous cell) to locate among the TSV columns.
column_names_to_find = probe_ids

# This cell relies on `data` and `find_column_index` from the earlier cell
# that reads the TSV file; run that cell first.

# Resolve the probes in list order and keep only those that were found.
# Each lookup is a cheap in-memory label check, so a thread pool adds
# nothing here, and collecting results with `as_completed` made the order
# of `found_columns` (and of the lines written below) vary from run to run.
found_columns = {
    column: index
    for column, index in map(find_column_index, column_names_to_find)
    if index is not None
}

# Save the results to a text file, one "Column: ..., Index: ..." line per hit.
output_file_path = "../_OUTPUTS_/found_columns_iqr_corr.txt"
with open(output_file_path, 'w') as output_file:
    for column, index in found_columns.items():
        output_file.write(f"Column: {column}, Index: {index}\n")

print(f"Output saved to {output_file_path}")


Output saved to ../_OUTPUTS_/found_columns_iqr_corr.txt


## Finding common probe IDs across SITH window, SITH correlation, and IQR correlation

In [9]:
# Result files written by the lookup cells above, one per analysis.
file_paths = [
    '../_OUTPUTS_/found_columns_iqr_corr.txt',
    '../_OUTPUTS_/found_columns_prim.txt',
    '../_OUTPUTS_/found_columns_sith_corr.txt'
]

# Maps 'file_1', 'file_2', 'file_3' (numbered in `file_paths` order) to the
# set of probe IDs parsed from the corresponding file.
probe_ids = {}

for number, file_path in enumerate(file_paths, start=1):
    ids = []
    with open(file_path, 'r') as handle:
        for line in handle:
            # Only "Column: <id>, Index: <n>" lines carry a probe ID.
            if 'Column:' not in line:
                continue
            after_label = line.split('Column: ')[1]
            ids.append(after_label.split(',')[0].strip())
    probe_ids[f'file_{number}'] = set(ids)

# Probe IDs present in every one of the files.
id_sets = list(probe_ids.values())
common_probe_ids = id_sets[0].intersection(*id_sets[1:])
common_probe_ids


set()

In [10]:
# Shared probe IDs for every unordered pair of files in `probe_ids`.
from itertools import combinations

# Keys of `probe_ids`, in insertion order, to pair up.
file_keys = list(probe_ids.keys())

# Maps each (first_key, second_key) pair to the intersection of the two
# probe-ID sets.
pairwise_common = {
    (first_key, second_key): probe_ids[first_key] & probe_ids[second_key]
    for first_key, second_key in combinations(file_keys, 2)
}

pairwise_common

{('file_1', 'file_2'): set(),
 ('file_1', 'file_3'): {'cg00075956',
  'cg00685314',
  'cg00778993',
  'cg06455686',
  'cg07125112',
  'cg07155455',
  'cg09926486',
  'cg12130607',
  'cg12340343',
  'cg13028819',
  'cg13258831',
  'cg14186245',
  'cg15521790',
  'cg15696460',
  'cg16901566',
  'cg17831440',
  'cg17924044',
  'cg18565809',
  'cg19033555',
  'cg19407717',
  'cg20064139',
  'cg21962907',
  'cg27604145'},
 ('file_2', 'file_3'): set()}