# Use LD Proxy API to find the relevant positions for each of the PRS313 SNPs

In [3]:
import requests
import pandas as pd
import os
import io


# API key
api_key = "ac16be4ad92d"

# Base URL for LDProxy API
base_url = "https://ldlink.nih.gov/LDlinkRest/ldproxy"

population = "ALL"

window = 1000000

r2_threshold = 0.01

# Read the chromosome positions from the text file
PRS313_LD = pd.read_excel("../../Data/PRS313_with_23andMe.xlsx")


In [4]:

# Create a folder to store the CSV files
output_folder = "../ld_variants"
os.makedirs(output_folder, exist_ok=True)

# Iterate over each position
for sample in PRS313_LD:
    if sample["in_23andMe"] == True:
         continue

    # Extract chromosome and position from the line
    chrom = sample["Chromosome"]
    pos = sample["Positionb"]
    
    # Construct the API request URL
    url = f"{base_url}?var={chrom}:{pos}&pop={population}&r2_d=r2&window={window}&genome_build=grch37&token={api_key}"
    
    # Send the API request
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code == 200:
            # Create a StringIO object from the data
            data_io = io.StringIO(response.text)

            # Read the data into a DataFrame using read_csv
            df = pd.read_csv(data_io, sep='\\t')
            
            # # Filter variants with high LD scores (e.g., R2 >= 0.8)
            # high_ld_variants = df[df["R2"].astype(float) >= 0.8]
            
            # Generate a unique filename for the CSV file
            output_file = os.path.join(output_folder, f"{chrom}_{pos}.csv")
            
            # Save the high LD variants to a CSV file
            df.to_csv(output_file, index=False)
            
            print(f"Saved high LD variants for {chrom}:{pos} to {output_file}")
    else:
        print(f"Failed to retrieve data for {chrom}:{pos}. Status code: {response.status_code}")

TypeError: string indices must be integers, not 'str'

In [None]:
chrom, pos = positions[0].strip().split()

# Construct the API request URL
url = f"{base_url}?var={chrom}:{pos}&pop={population}&r2_d=r2&window={window}&genome_build=grch37&token={api_key}"

# Send the API request
response = requests.get(url)

In [None]:


# Remove the leading single quote
data = response.text

# Create a StringIO object from the data
data_io = io.StringIO(data)

# Read the data into a DataFrame using read_csv
df = pd.read_csv(data_io, sep='\\t')
df

  df = pd.read_csv(data_io, sep='\\t')


Unnamed: 0,RS_Number,Coord,Alleles,MAF,Distance,Dprime,R2,Correlated_Alleles,FORGEdb,RegulomeDB,Function
0,rs612683,chr1:100880328,(A/T),0.4004,0,1.0000,1.0000,"A=A,T=T",9.0,2b,
1,rs12568038,chr1:100879914,(C/T),0.3415,-414,0.8908,0.6162,"A=C,T=T",6.0,7,
2,rs12564838,chr1:100883188,(A/G),0.3496,2860,0.8609,0.5968,"A=A,T=G",6.0,5,
3,rs7517704,chr1:100885419,(A/G),0.3494,5091,0.8609,0.5962,"A=A,T=G",6.0,6,
4,rs12760924,chr1:100898600,(A/T),0.3546,18272,0.8357,0.5748,"A=A,T=T",6.0,6,
...,...,...,...,...,...,...,...,...,...,...,...
1214,rs111334083,chr1:100617033,(G/A),0.0212,-263295,0.8351,0.0101,,7.0,5,
1215,rs141366089,chr1:100950426,(-/T),0.0575,70098,0.4970,0.0101,,,.,
1216,rs10493930,chr1:101079706,(T/C),0.0833,199378,0.4070,0.0100,,6.0,5,
1217,rs57411954,chr1:100712556,(AC/-),0.0276,-167772,0.7285,0.0100,,,.,


# Find overlap b/w 23AndMe and LD


In [5]:
# Separate files in one directory into multiple folders based on chromosome number. The files are named chromosomeNumber_chromosomePosition.csv

directory = "../../Data/ld_variants"

# Create a folder to store the CSV files
output_folder = "../../Data/ld_variants"
os.makedirs(output_folder, exist_ok=True)

# Iterate over each file in the directory

for filename in os.listdir(directory):
    # Check if the file is a CSV file
    if filename.endswith(".csv"):
        # Extract the chromosome number and position from the filename
        chrom, pos = filename.split("_")
        pos = pos.split(".")[0]
        
        # Create a folder for the chromosome number
        chrom_folder = os.path.join(output_folder, "chr" + chrom)
        os.makedirs(chrom_folder, exist_ok=True)
        
        # Move the file to the chromosome folder
        src = os.path.join(directory, filename)
        dest = os.path.join(chrom_folder, filename)
        os.rename(src, dest)
        
        print(f"Moved {filename} to {chrom_folder}")

Moved 12_83064195.csv to ../../Data/ld_variants\chr12


ValueError: not enough values to unpack (expected 2, got 1)

# Creating Final Training Data with LD Proxy 

In [4]:
import os
import pandas as pd
import re

directory = "../../Data/ld_variants"

# Create a folder to store the CSV files
output_folder = "../../Data/ld_variants"
os.makedirs(output_folder, exist_ok=True)


def extract_coord_alleles(col_name):
    match = re.search(r'chr(\d+)_(\d+)_([ACGT]+)_([ACGT,]+)', col_name)
    if match:
        chr_num, position, ref_allele, alt_alleles = match.groups()
        return f'chr{chr_num}:{position}', f'({ref_allele}/{alt_alleles})'
    return None, None

# Create an empty list to store the matching variants across all chromosomes
matching_variants_all = []

# Iterate over each chromosome folder
for chrom_folder in os.listdir(output_folder):
    # Check if the folder is a chromosome folder
    if chrom_folder.startswith("chr"):
        # Extract the chromosome number
        chrom = chrom_folder[3:]

        files_folder = os.path.join(output_folder, chrom_folder)
        
        # Load the training data for the chromosome
        training_data = pd.read_parquet(f"../../Data/Raw_training_data_23andMe_union/23AndMe_PRS313_merged_chr{chrom}.parquet")
        
        # Create empty lists to store the matching columns and variants for the chromosome
        matching_columns = []
        matching_variants_chrom = []
        not_found_snps_chrom = []

        # Read all the files in the folder
        for filename in os.listdir(files_folder):

            # Check if the file is a CSV file
            if filename.endswith(".csv"):

                # Check if the position is an unknown position or a known position 
                position = filename.split('.')[0]

                # Find the column name with the position
                position_column = training_data.columns[training_data.columns.str.contains(position)]

                if (position_column[0].split("_")[-1] == "Known"):
                    # If the position is a known position, skip the file, because we don't need LD proxies for known positions
                    print(f"Skipping LD for {filename} because it is a known position. Column added to training data")

                    if position_column[0] not in matching_columns:

                        # Add the matching column to the list
                        matching_columns.append(position_column[0])

                    continue


                # Load the variants for the file
                variants = pd.read_csv(os.path.join(files_folder, filename))
                
                # Find the matching columns in the training data
                for column in training_data.columns:
                    coord, alleles = extract_coord_alleles(column)
                    if coord is not None and alleles is not None:
                        try:
                            # Got rid of allele matching, because of coding differences. GC--> C (in PRS313) is coded as (C/-) in dbSNP and LDProxy
                            mask = (variants['Coord'] == coord)
                            # mask = (variants['Coord'] == coord) & (variants['Alleles'] == alleles)
                                
                            if mask.any():
                                # Only append column if it has not been added before
                                if column not in matching_columns:

                                    # Check column added is an unknown position
                                    if (column.split("_")[-1] == "Unknown"):
                                        print(column)


                                    matching_columns.append(column)
                                    matching_variants_chrom.append(variants[mask])
                        except KeyError:
                            error_position = filename.split('.')[0]
                            not_found_snps_chrom.append(error_position)
                            print(f"SNP {error_position} not found in dbSNP and cannot be proxied using LDProxy")

                            # Find the columns in training_data.columns with positions within +/- 500K BP of the error_position
                            counter_error_added = 0
                            for col in training_data.columns:
                                coord, _ = extract_coord_alleles(col)
                                if coord is not None:
                                    col_position = int(coord.split(':')[1])
                                    error_bp = int(error_position.split('_')[1])
                                    if abs(col_position - error_bp) <= 500000 and col not in matching_columns:
                                        counter_error_added += 1
                                        matching_columns.append(col)

                            print(f"Added {counter_error_added} columns to matching_columns for missing data position: {error_position}")
                            
                            break

                print(f"Processed {filename}")
                print(f"Found {len(matching_columns)} matching columns")
                

        # Get the matching columns from the training data
        matching_data = training_data[matching_columns]

        folder = "../../Data/Filtered_training_data_union/"
        os.makedirs(folder, exist_ok=True)
        save_path = f"{folder}/23AndMe_PRS313_merged_chr{chrom}_matching.parquet"
        # Save the matching data for the chromosome
        matching_data.to_parquet(save_path)

        # Concatenate the matching variants for the chromosome into a single dataframe
        matching_variants_chrom_df = pd.concat(matching_variants_chrom, ignore_index=True)
        
        # Append the matching variants for the chromosome to the overall list
        matching_variants_all.append(matching_variants_chrom_df)

        print(f"Saved to file {save_path}")
        print(f"Found {len(matching_columns)} matching columns")
        print(f"Found {len(matching_variants_chrom_df)} matching variants")

# Concatenate the matching variants from all chromosomes into a single dataframe
matching_variants_all_df = pd.concat(matching_variants_all, ignore_index=True)

# Save the dataframe with matching variants across all chromosomes
matching_variants_all_df.to_csv("../../Data/Filtered_training_data/23AndMe_matching_variants.csv", index=False)

Skipping LD for 12_14413931.csv because it is a known position. Column added to training data
chr12_115796577_A_G_PRS313_Unknown
Processed 12_115796577.csv
Found 27 matching columns
Skipping LD for 12_115835836.csv because it is a known position. Column added to training data
Skipping LD for 12_120832146.csv because it is a known position. Column added to training data
chr12_103097887_C_T_PRS313_Unknown
Processed 12_103097887.csv
Found 60 matching columns
Skipping LD for 12_96027759.csv because it is a known position. Column added to training data
chr12_85004551_C_T_PRS313_Unknown
Processed 12_85004551.csv
Found 121 matching columns
chr12_111600134_G_T_PRS313_Unknown
Processed 12_111600134.csv
Found 191 matching columns
chr12_28149568_C_T_PRS313_Unknown
Processed 12_28149568.csv
Found 224 matching columns
chr12_29140260_G_A_PRS313_Unknown
Processed 12_29140260.csv
Found 264 matching columns
Skipping LD for 12_57146069.csv because it is a known position. Column added to training data
ch

OSError: Cannot save file into a non-existent directory: '../../Data/Filtered_training_data_union'

In [5]:
import requests
import pandas as pd
import os
import io

# API key
api_key = "ac16be4ad92d"

# Base URL for LDProxy API
base_url = "https://ldlink.nih.gov/LDlinkRest/ldproxy"

population = "ALL"
window = 1000000
r2_threshold = 0.01

# Read the RSIDs from the text file
with open("../positions/Missing_ld.txt", "r") as file:
    rsids = file.read().strip().split("\n")

# Create a folder to store the CSV files
output_folder = "../ld_variants"
os.makedirs(output_folder, exist_ok=True)

# Iterate over each RSID
for rsid in rsids:
    # Construct the API request URL
    url = f"{base_url}?var={rsid}&pop={population}&r2_d=r2&window={window}&genome_build=grch37&token={api_key}"

    # Send the API request
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Create a StringIO object from the data
        data_io = io.StringIO(response.text)

        # Read the data into a DataFrame using read_csv
        df = pd.read_csv(data_io, sep='\\t')

        # Generate a unique filename for the CSV file
        output_file = os.path.join(output_folder, f"{rsid}.csv")

        # Save the high LD variants to a CSV file
        df.to_csv(output_file, index=False)

        print(f"Saved high LD variants for {rsid} to {output_file}")
    else:
        print(f"Failed to retrieve data for {rsid}. Status code: {response.status_code}")

FileNotFoundError: [Errno 2] No such file or directory: '../positions/Missing_ld.txt'

# Create Final Training Data with +/- 500K BP Window

In [19]:
import os
import pandas as pd
import re

window_size = 250000
pattern = re.compile(r"chr\d+_(\d+)_")

# Create an empty list to store the matching variants across all chromosomes
matching_variants_all = []

for chrom in range(1, 23):
    # Load the training data for the chromosome
    training_data = pd.read_parquet(f"../../Data/Raw_training_data/23AndMe_PRS313_merged_chr{chrom}.parquet")
    
    # Get all columns with "PRS313" in the name
    prs313_unknown_columns = [col for col in training_data.columns if "PRS313_Unknown" in col]
    prs313_unknown_positions = [int(pattern.search(col).group(1)) for col in prs313_unknown_columns]
    prs313_unknown_positions_set = set(prs313_unknown_positions)
    
    # Get all columns in training_data that contain a number within +/- 500k of the PRS313_Unknown position
    filtered_columns = [col for col in training_data.columns if any(abs(int(pattern.search(col).group(1)) - pos) <= window_size for pos in prs313_unknown_positions_set)]
    
    training_data_filtered = training_data[filtered_columns]

    print(len(filtered_columns))

    # Save the filtered training data for the chromosome
    # training_data_filtered.to_parquet(f"../../Data/500k_window_filtered_data/23AndMe_PRS313_merged_chr{chrom}_filtered.parquet")

4138
3266
3217
2468
5178
3592
2290
2740
1953
2854
3720
2333
861
806
1252
3042
1565
1537
1847
1127
575
2007


In [28]:
chrom = 7

training_data_window = pd.read_parquet(f"../../Data/500k_window_filtered_data/23AndMe_PRS313_merged_chr{chrom}_filtered.parquet")
print(training_data_window.shape)

training_data_ld_proxy = pd.read_parquet(f"../../Data/filtered_training_data/23AndMe_PRS313_merged_chr{chrom}_matching.parquet")
print(training_data_ld_proxy.shape)

(2504, 2290)
(2504, 444)


In [17]:
training_data.columns

Index(['chr1_88177403_G_A', 'chr1_88127152_T_C', 'chr1_88208135_G_A',
       'chr1_88109828_G_A', 'chr1_88086894_C_A', 'chr1_88091734_C_T',
       'chr1_88116467_A_C', 'chr1_88220810_AC_A', 'chr1_88073752_G_T',
       'chr1_88120134_T_C',
       ...
       'chr1_172366806_A_G', 'chr1_172419651_T_G', 'chr1_172316842_G_A',
       'chr1_171934790_G_A', 'chr1_172632057_A_G', 'chr1_172627498_C_T',
       'chr1_172464519_T_G', 'chr1_172328767_T_TA_PRS313_Unknown',
       'chr1_121280485_A_G', 'chr1_121137155_A_G'],
      dtype='object', length=1132)

In [None]:
'chr1_88177403_G_A', 'chr1_88127152_T_C', 'chr1_88208135_G_A',
       'chr1_88109828_G_A', 'chr1_88086894_C_A', 'chr1_88091734_C_T',
       'chr1_88116467_A_C', 'chr1_88220810_AC_A', 'chr1_88073752_G_T',
       'chr1_88120134_T_C',
       ...
       'chr1_172366806_