## Random subsets of samples and SNPs

In [1]:
import pandas as pd
import random
import os

In [2]:
# Run from the project root so that the relative paths used below resolve
project_root = '/media/HDD_4TB_1/jordi/cfuses_gnn_enrollhd_2024/'
os.chdir(project_root)

# Folder that the subsetting inputs are read from and the outputs written to
data_dir = "data/features/subsetting/"

In [3]:
# Table of sample identifiers
samples_path = data_dir + "id_samples.txt"
samples_col = pd.read_csv(samples_path)

# SNP IDs in file order: only the first line is read, split on tabs
with open(data_dir + 'refIDs_row.txt', 'r') as snps_file:
    snps_vector = snps_file.readline().strip().split('\t')

In [4]:
# Number of samples drawn at random
N = 900

# Total number of SNPs kept in the subset (HD modifiers + random ones)
j = 70_000

In [30]:
# All sample IDs, in the order they appear in the samples table
all_sample_ids = samples_col['FID_IID'].tolist()

# Draw N distinct samples at random (raises ValueError if N exceeds the
# number of available samples)
random_samples = random.sample(all_sample_ids, k=N)

# Position of the first occurrence of each ID in the original table.
# Built once so that sorting does not rebuild the list and rescan it for
# every sampled element (which made the sort quadratic).
original_position = {}
for position, sample_id in enumerate(all_sample_ids):
    original_position.setdefault(sample_id, position)

# Restore the original table order among the sampled IDs
random_samples = sorted(random_samples, key=original_position.__getitem__)

# Construct the new samples column, with the column name as first entry
random_samples_col = ['FID_IID'] + random_samples

In [31]:
# Write the sampled IDs (header entry included), one entry per line
with open(data_dir + 'random_samples_pc10.txt', 'w') as out_file:
    out_file.writelines(entry + '\n' for entry in random_samples_col)

In [6]:
# Genes described in the literature as HD modifiers; the file has no header
# and the gene names are taken from its first column
HDmodifiers = pd.read_csv('data/genes/HD_modifiers.txt', header=None)

# Load SNP lookup table (columns used here: 'gene' and 'refsnp_id')
lookuptab = pd.read_csv('data/biomart/revised_filtered_snp_gene_lookup_tab.txt', sep='\t')

# Map every SNP ID to the position of its first occurrence in snps_vector
# (header of the feature matrix). One dictionary lookup replaces the two
# linear list scans (`in` and `.index`) previously done for every SNP.
snp_position = {}
for position, snp_id in enumerate(snps_vector):
    snp_position.setdefault(snp_id, position)

# Names and snps_vector positions of the HD-modifier SNPs found in our data
HDmodifiers_snps = []
HDmodifiers_snps_idxs = []

# Find SNPs corresponding to HD modifiers
for gene in HDmodifiers[0]:
    mod_snps = lookuptab[lookuptab['gene'] == gene]

    # Counts how many SNPs from this gene are found in the feature matrix
    snps_gene_count = 0

    for snp in mod_snps['refsnp_id']:
        position = snp_position.get(snp)

        # Skip SNPs that are not part of our data
        if position is None:
            continue

        # Keep both the position in snps_vector and the SNP name
        HDmodifiers_snps_idxs.append(position)
        HDmodifiers_snps.append(snp)
        snps_gene_count += 1

    # Print what gene was included and how many snps
    print(gene, ':', snps_gene_count)

HTT : 537
MLH1 : 212
MLH3 : 97
GRIK2 : 2618
GRIN2A : 2216
GRIN2B : 1909
UCHL1 : 54
APOE : 7
ASK1 : 0
MAP3K5 : 661
PPARGC1A : 593


In [9]:
# Tab-separated string with the names of the HD-modifier SNPs
HDmodifiers_snps_string = '\t'.join(str(snp_name) for snp_name in HDmodifiers_snps)

# Write the modifier SNP names to disk as a single line
with open(data_dir + 'snps_modifiers.txt', 'w') as out_file:
    out_file.write(HDmodifiers_snps_string)

In [8]:
# 0-based positions of every SNP in snps_vector
snps_vector_idxs = list(range(len(snps_vector)))

# Distinct HD-modifier positions. A set makes the membership test below
# O(1) per SNP, and collapses any repeated position so that it is neither
# counted twice nor emitted twice.
modifier_idxs = set(HDmodifiers_snps_idxs)

# Draw the remaining SNPs at random among the non-modifier ones, so that
# the final selection holds exactly j distinct SNPs (random.sample raises
# ValueError if there are more modifiers than j or too few candidates)
candidate_idxs = [x for x in snps_vector_idxs if x not in modifier_idxs]
random_snps_idxs = random.sample(candidate_idxs, k=j - len(modifier_idxs))

# Merge HD-modifier and random positions, in ascending order
random_snps_idxs = sorted(modifier_idxs.union(random_snps_idxs))

# Add 4 considering that the subset is done on the big matrix where we start
# from index 1 (here we start from 0), and the first 3 cols aren't snps.
random_snps_idxs = [x + 4 for x in random_snps_idxs]

# Add first three columns
random_feature_matrix_columns_idxs = [1, 2, 3] + random_snps_idxs

In [9]:
# Tab-separated string of the selected feature-matrix column indices
random_feature_matrix_columns_idxs_string = '\t'.join(
    str(column_idx) for column_idx in random_feature_matrix_columns_idxs
)

# Write the column indices to disk as a single line
with open(data_dir + 'snps_idxs_pc10.txt', 'w') as out_file:
    out_file.write(random_feature_matrix_columns_idxs_string)