# Use the LDproxy API to find the positions of variants in LD with each of the PRS313 SNPs

In [13]:
import requests
import pandas as pd
import os
import io


# LDlink API token. Prefer the LDLINK_TOKEN environment variable so the
# credential does not have to live in the notebook; the literal is the
# previously hard-coded value, kept only as a backward-compatible fallback.
# NOTE(review): this token is stored in plain text -- consider rotating it.
api_key = os.environ.get("LDLINK_TOKEN", "ac16be4ad92d")

# Base URL for LDProxy API
base_url = "https://ldlink.nih.gov/LDlinkRest/ldproxy"

# Query parameters sent with every request
population = "ALL"
window = 500000

# NOTE(review): r2_threshold is defined but is neither sent to the API nor
# used to filter the results in this cell.
r2_threshold = 0.01

# Seconds to wait for the server before giving up on a single request.
# Without a timeout a stalled connection blocks the loop indefinitely.
request_timeout = 300

# Read the chromosome positions from the text file (one "<chrom> <pos>" pair
# per line); blank lines are dropped so they cannot break the unpacking below.
with open("../positions/locations.txt", "r") as file:
    positions = [line for line in file.read().strip().split("\n") if line.strip()]

# Create a folder to store the CSV files
output_folder = "../ld_variants"
os.makedirs(output_folder, exist_ok=True)

# Query LDproxy once per position and store each result table as its own CSV
for position in positions:
    # Extract chromosome and position from the line
    chrom, pos = position.strip().split()

    # Construct the API request URL
    url = f"{base_url}?var={chrom}:{pos}&pop={population}&r2_d=r2&window={window}&genome_build=grch37&token={api_key}"

    # Send the API request; a network failure skips this position instead of
    # aborting the whole run.
    try:
        response = requests.get(url, timeout=request_timeout)
    except requests.exceptions.RequestException as error:
        print(f"Failed to retrieve data for {chrom}:{pos}. Error: {error}")
        continue

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to retrieve data for {chrom}:{pos}. Status code: {response.status_code}")
        continue

    # NOTE(review): LDlink appears to report some errors as a JSON object in
    # the body of a 200 response -- confirm against the API docs. A result
    # table never starts with "{", so such a body is treated as a failure
    # rather than being written out as a CSV.
    if response.text.lstrip().startswith("{"):
        print(f"Failed to retrieve data for {chrom}:{pos}. API response: {response.text.strip()}")
        continue

    # Create a StringIO object from the data
    data_io = io.StringIO(response.text)

    # The response is tab-separated. A literal tab keeps pandas on the fast C
    # parser; the two-character string '\\t' is treated as a regular
    # expression and forces the slower python engine (with a ParserWarning).
    df = pd.read_csv(data_io, sep="\t")

    # # Filter variants with high LD scores (e.g., R2 >= 0.8)
    # high_ld_variants = df[df["R2"].astype(float) >= 0.8]

    # Generate a unique filename for the CSV file
    output_file = os.path.join(output_folder, f"{chrom}_{pos}.csv")

    # Save every returned variant (no R2 filter is applied here)
    df.to_csv(output_file, index=False)

    print(f"Saved LD variants for {chrom}:{pos} to {output_file}")

KeyboardInterrupt: 

In [None]:
# Fetch the LDproxy result for the first position only, so that the raw
# response can be inspected on its own.
chrom, pos = positions[0].strip().split()

# Construct the API request URL
url = f"{base_url}?var={chrom}:{pos}&pop={population}&r2_d=r2&window={window}&genome_build=grch37&token={api_key}"

# Send the API request. The timeout (in seconds) stops a stalled connection
# from hanging the cell until it is interrupted by hand.
response = requests.get(url, timeout=300)

In [None]:


# Raw text body of the LDproxy response (used unmodified)
data = response.text

# Create a StringIO object from the data
data_io = io.StringIO(data)

# Read the tab-separated data into a DataFrame. A literal tab keeps pandas on
# the C parser; the two-character string '\\t' is interpreted as a regular
# expression, which forces the python engine and emits a ParserWarning.
df = pd.read_csv(data_io, sep="\t")
df

  df = pd.read_csv(data_io, sep='\\t')


Unnamed: 0,RS_Number,Coord,Alleles,MAF,Distance,Dprime,R2,Correlated_Alleles,FORGEdb,RegulomeDB,Function
0,rs612683,chr1:100880328,(A/T),0.4004,0,1.0000,1.0000,"A=A,T=T",9.0,2b,
1,rs12568038,chr1:100879914,(C/T),0.3415,-414,0.8908,0.6162,"A=C,T=T",6.0,7,
2,rs12564838,chr1:100883188,(A/G),0.3496,2860,0.8609,0.5968,"A=A,T=G",6.0,5,
3,rs7517704,chr1:100885419,(A/G),0.3494,5091,0.8609,0.5962,"A=A,T=G",6.0,6,
4,rs12760924,chr1:100898600,(A/T),0.3546,18272,0.8357,0.5748,"A=A,T=T",6.0,6,
...,...,...,...,...,...,...,...,...,...,...,...
1214,rs111334083,chr1:100617033,(G/A),0.0212,-263295,0.8351,0.0101,,7.0,5,
1215,rs141366089,chr1:100950426,(-/T),0.0575,70098,0.4970,0.0101,,,.,
1216,rs10493930,chr1:101079706,(T/C),0.0833,199378,0.4070,0.0100,,6.0,5,
1217,rs57411954,chr1:100712556,(AC/-),0.0276,-167772,0.7285,0.0100,,,.,


# Find the overlap between 23andMe and LD variants


In [None]:
# Sort the per-SNP CSV files of one directory into one sub-folder per
# chromosome. File names follow the pattern "<chromosome>_<position>.csv".

directory = "../ld_variants"

# The per-chromosome folders are created inside this folder
output_folder = "../ld_variants"
os.makedirs(output_folder, exist_ok=True)

for filename in os.listdir(directory):
    # Anything that is not a CSV file stays where it is
    if not filename.endswith(".csv"):
        continue

    # Split "<chrom>_<pos>.csv" into its two parts and drop the extension
    chrom, pos = filename.split("_")
    pos = pos.partition(".")[0]

    # Make sure the target folder "chr<chrom>" exists
    chrom_folder = os.path.join(output_folder, f"chr{chrom}")
    os.makedirs(chrom_folder, exist_ok=True)

    # Relocate the file, keeping its name
    src = os.path.join(directory, filename)
    dest = os.path.join(chrom_folder, filename)
    os.rename(src, dest)

    print(f"Moved {filename} to {chrom_folder}")

Moved 6_152432902.csv to ../ld_variants/chr6
Moved 5_79180995.csv to ../ld_variants/chr5
Moved 7_25569548.csv to ../ld_variants/chr7
Moved 6_152055978.csv to ../ld_variants/chr6
Moved 5_44508264.csv to ../ld_variants/chr5
Moved 12_14413931.csv to ../ld_variants/chr12
Moved 4_106069013.csv to ../ld_variants/chr4
Moved 3_55970777.csv to ../ld_variants/chr3
Moved 3_59373745.csv to ../ld_variants/chr3
Moved 1_88156923.csv to ../ld_variants/chr1
Moved 6_16399557.csv to ../ld_variants/chr6
Moved 12_115796577.csv to ../ld_variants/chr12
Moved 5_131640536.csv to ../ld_variants/chr5
Moved 2_218292158.csv to ../ld_variants/chr2
Moved 10_38523626.csv to ../ld_variants/chr10
Moved 1_120257110.csv to ../ld_variants/chr1
Moved 8_106358620.csv to ../ld_variants/chr8
Moved 22_46283297.csv to ../ld_variants/chr22
Moved 3_141112859.csv to ../ld_variants/chr3
Moved 16_6963972.csv to ../ld_variants/chr16
Moved 5_71965007.csv to ../ld_variants/chr5
Moved 8_143669254.csv to ../ld_variants/chr8
Moved 22_3934

In [None]:
import os
import pandas as pd
import re

# Matches "chr<number>_<position>_<ref>_<alt>" anywhere in a column name.
# Alleles may span several bases; matching them in full prevents a multi-base
# alt such as "CT" from being silently truncated to its first letter.
_COORD_ALLELES_RE = re.compile(r'chr(\d+)_(\d+)_([ACGT]+)_([ACGT]+)')


def extract_coord_alleles(col_name: str):
    """Derive the coordinate and allele labels encoded in a column name.

    Args:
        col_name: Column name expected to contain
            ``chr<number>_<position>_<ref>_<alt>``, where ``ref`` and ``alt``
            consist of the letters A, C, G and T.

    Returns:
        A ``(coord, alleles)`` tuple formatted as ``'chr<number>:<position>'``
        and ``'(<ref>/<alt>)'``, or ``(None, None)`` when the name does not
        contain the pattern.
    """
    match = _COORD_ALLELES_RE.search(col_name)
    if match is None:
        return None, None

    chr_num, position, ref, alt = match.groups()
    return f'chr{chr_num}:{position}', f'({ref}/{alt})'

# Matching LD variants collected across all chromosomes (one DataFrame each)
matching_variants_all = []

# Iterate over each chromosome folder
for chrom_folder in os.listdir(output_folder):
    files_folder = os.path.join(output_folder, chrom_folder)

    # Only the "chr<N>" sub-folders hold per-chromosome LD files
    if not (chrom_folder.startswith("chr") and os.path.isdir(files_folder)):
        continue

    # Extract the chromosome number
    chrom = chrom_folder[3:]

    # Load the training data for the chromosome
    training_data = pd.read_parquet(f"../Final_training_data/23AndMe_PRS313_merged_chr{chrom}.parquet")

    # Parse every training column name once per chromosome; the result does
    # not depend on the LD file, so it must not be recomputed for each file.
    column_keys = []
    for column in training_data.columns:
        coord, alleles = extract_coord_alleles(column)
        if coord is not None and alleles is not None:
            column_keys.append((column, coord, alleles))

    # Matching columns and variants for the chromosome
    matching_columns = []
    matching_variants_chrom = []

    # Read all the files in the folder
    for filename in os.listdir(files_folder):
        # Skip anything that is not a CSV file
        if not filename.endswith(".csv"):
            continue

        # Load the variants for the file
        variants = pd.read_csv(os.path.join(files_folder, filename))

        # Set of (Coord, Alleles) pairs present in this file: a cheap
        # membership test that avoids building a full boolean mask for every
        # training column that has no counterpart here.
        present = set(zip(variants['Coord'], variants['Alleles']))

        for column, coord, alleles in column_keys:
            if (coord, alleles) not in present:
                continue
            mask = (variants['Coord'] == coord) & (variants['Alleles'] == alleles)
            matching_columns.append(column)
            matching_variants_chrom.append(variants[mask])
            print(f"Found matching variant for {variants[mask]}")

    # The same column can match in several LD files. Keep each column once
    # (first-seen order) so the selection below has no duplicate column
    # labels, which to_parquet is expected to reject.
    matching_columns = list(dict.fromkeys(matching_columns))

    # Get the matching columns from the training data
    matching_data = training_data[matching_columns]

    # Save the matching data for the chromosome
    matching_data.to_parquet(f"../Final_training_data/23AndMe_PRS313_merged_chr{chrom}_matching.parquet")

    # Concatenate the matching variants for the chromosome into a single
    # dataframe; pd.concat raises on an empty list, so a chromosome without
    # any match yields an empty frame instead.
    if matching_variants_chrom:
        matching_variants_chrom_df = pd.concat(matching_variants_chrom, ignore_index=True)
    else:
        matching_variants_chrom_df = pd.DataFrame()

    # Append the matching variants for the chromosome to the overall list
    matching_variants_all.append(matching_variants_chrom_df)

    print(f"Processed chromosome {chrom}")
    print(f"Found {len(matching_columns)} matching columns")
    print(f"Found {len(matching_variants_chrom_df)} matching variants")

# Concatenate the matching variants from all chromosomes into a single
# dataframe (empty when no chromosome folder was processed)
if matching_variants_all:
    matching_variants_all_df = pd.concat(matching_variants_all, ignore_index=True)
else:
    matching_variants_all_df = pd.DataFrame()

# Save the dataframe with matching variants across all chromosomes
matching_variants_all_df.to_csv("../Final_training_data/23AndMe_matching_variants.csv", index=False)

KeyboardInterrupt: 