# Use LD Proxy API to find the relevant positions for each of the PRS313 SNPs

In [1]:
import requests
import pandas as pd
import os
import io


# API key
api_key = "ac16be4ad92d"

# Base URL for LDProxy API
base_url = "https://ldlink.nih.gov/LDlinkRest/ldproxy"

population = "ALL"

window = 1000000

r2_threshold = 0.01

# Read the chromosome positions from the text file
PRS313_LD = pd.read_excel("../../../Data/PRS313_with_23andMe.xlsx")


In [3]:
# Create a folder to store the CSV files
output_folder = "../../../Data/ld_variants"
os.makedirs(output_folder, exist_ok=True)

# Iterate over each position
for index, sample in PRS313_LD.iterrows():
    # if sample["in_23andMe"] == True:
    #      continue

    # Extract chromosome and position from the line
    chrom = sample["Chromosome"]
    pos = sample["Positionb"]
    
    # Construct the API request URL
    url = f"{base_url}?var={chrom}:{pos}&pop={population}&r2_d=r2&window={window}&genome_build=grch37&token={api_key}"
    
    # Send the API request
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code == 200:
            # Create a StringIO object from the data
            data_io = io.StringIO(response.text)

            # Read the data into a DataFrame using read_csv
            df = pd.read_csv(data_io, sep='\\t')
            
            # # Filter variants with high LD scores (e.g., R2 >= 0.8)
            # high_ld_variants = df[df["R2"].astype(float) >= 0.8]
            
            # Generate a unique filename for the CSV file
            output_file = os.path.join(output_folder, f"{chrom}_{pos}.csv")
            
            # Save the high LD variants to a CSV file
            df.to_csv(output_file, index=False)
            
            print(f"Saved high LD variants for {chrom}:{pos} to {output_file}")
    else:
        print(f"Failed to retrieve data for {chrom}:{pos}. Status code: {response.status_code}")

  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:100880328 to ../../../Data/ld_variants/1_100880328.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:10566215 to ../../../Data/ld_variants/1_10566215.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:110198129 to ../../../Data/ld_variants/1_110198129.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:114445880 to ../../../Data/ld_variants/1_114445880.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:118141492 to ../../../Data/ld_variants/1_118141492.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:120257110 to ../../../Data/ld_variants/1_120257110.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:121280613 to ../../../Data/ld_variants/1_121280613.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:121287994 to ../../../Data/ld_variants/1_121287994.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:145604302 to ../../../Data/ld_variants/1_145604302.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:149906413 to ../../../Data/ld_variants/1_149906413.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:155556971 to ../../../Data/ld_variants/1_155556971.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:168171052 to ../../../Data/ld_variants/1_168171052.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:172328767 to ../../../Data/ld_variants/1_172328767.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:18807339 to ../../../Data/ld_variants/1_18807339.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:201437832 to ../../../Data/ld_variants/1_201437832.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:202184600 to ../../../Data/ld_variants/1_202184600.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:203770448 to ../../../Data/ld_variants/1_203770448.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:204502514 to ../../../Data/ld_variants/1_204502514.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:208076291 to ../../../Data/ld_variants/1_208076291.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:217053815 to ../../../Data/ld_variants/1_217053815.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:217220574 to ../../../Data/ld_variants/1_217220574.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:220671050 to ../../../Data/ld_variants/1_220671050.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:242034263 to ../../../Data/ld_variants/1_242034263.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:41380440 to ../../../Data/ld_variants/1_41380440.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:41389220 to ../../../Data/ld_variants/1_41389220.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:46670206 to ../../../Data/ld_variants/1_46670206.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:51467096 to ../../../Data/ld_variants/1_51467096.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:7917076 to ../../../Data/ld_variants/1_7917076.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:88156923 to ../../../Data/ld_variants/1_88156923.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 1:88428199 to ../../../Data/ld_variants/1_88428199.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 2:10138983 to ../../../Data/ld_variants/2_10138983.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 2:121058254 to ../../../Data/ld_variants/2_121058254.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 2:121089731 to ../../../Data/ld_variants/2_121089731.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 2:121159205 to ../../../Data/ld_variants/2_121159205.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 2:121246568 to ../../../Data/ld_variants/2_121246568.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 2:172974566 to ../../../Data/ld_variants/2_172974566.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 2:174212910 to ../../../Data/ld_variants/2_174212910.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 2:192381934 to ../../../Data/ld_variants/2_192381934.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 2:19315675 to ../../../Data/ld_variants/2_19315675.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 2:202204741 to ../../../Data/ld_variants/2_202204741.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 2:217920769 to ../../../Data/ld_variants/2_217920769.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 2:217955896 to ../../../Data/ld_variants/2_217955896.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 2:218292158 to ../../../Data/ld_variants/2_218292158.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 2:218714845 to ../../../Data/ld_variants/2_218714845.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 2:241388857 to ../../../Data/ld_variants/2_241388857.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 2:25129473 to ../../../Data/ld_variants/2_25129473.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 2:29179452 to ../../../Data/ld_variants/2_29179452.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 2:29615233 to ../../../Data/ld_variants/2_29615233.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 2:39699510 to ../../../Data/ld_variants/2_39699510.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 2:70172587 to ../../../Data/ld_variants/2_70172587.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 2:88358825 to ../../../Data/ld_variants/2_88358825.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 3:141112859 to ../../../Data/ld_variants/3_141112859.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 3:172285237 to ../../../Data/ld_variants/3_172285237.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 3:189774456 to ../../../Data/ld_variants/3_189774456.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 3:27353716 to ../../../Data/ld_variants/3_27353716.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 3:27388664 to ../../../Data/ld_variants/3_27388664.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 3:29294845 to ../../../Data/ld_variants/3_29294845.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 3:30684907 to ../../../Data/ld_variants/3_30684907.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 3:46888198 to ../../../Data/ld_variants/3_46888198.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 3:4742251 to ../../../Data/ld_variants/3_4742251.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 3:49709912 to ../../../Data/ld_variants/3_49709912.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 3:55970777 to ../../../Data/ld_variants/3_55970777.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 3:59373745 to ../../../Data/ld_variants/3_59373745.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 3:63887449 to ../../../Data/ld_variants/3_63887449.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 3:71620370 to ../../../Data/ld_variants/3_71620370.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 3:87037543 to ../../../Data/ld_variants/3_87037543.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 3:99403877 to ../../../Data/ld_variants/3_99403877.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 4:106069013 to ../../../Data/ld_variants/4_106069013.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 4:126752992 to ../../../Data/ld_variants/4_126752992.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 4:143467195 to ../../../Data/ld_variants/4_143467195.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 4:151218296 to ../../../Data/ld_variants/4_151218296.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 4:175842495 to ../../../Data/ld_variants/4_175842495.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 4:175847436 to ../../../Data/ld_variants/4_175847436.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 4:187503758 to ../../../Data/ld_variants/4_187503758.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 4:38784633 to ../../../Data/ld_variants/4_38784633.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 4:84370124 to ../../../Data/ld_variants/4_84370124.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 4:89240476 to ../../../Data/ld_variants/4_89240476.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 4:92594859 to ../../../Data/ld_variants/4_92594859.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:104300273 to ../../../Data/ld_variants/5_104300273.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:122478676 to ../../../Data/ld_variants/5_122478676.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:122705244 to ../../../Data/ld_variants/5_122705244.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:1279790 to ../../../Data/ld_variants/5_1279790.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:1296255 to ../../../Data/ld_variants/5_1296255.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:131640536 to ../../../Data/ld_variants/5_131640536.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:132407058 to ../../../Data/ld_variants/5_132407058.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:1353077 to ../../../Data/ld_variants/5_1353077.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:158244083 to ../../../Data/ld_variants/5_158244083.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:16231194 to ../../../Data/ld_variants/5_16231194.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:169591460 to ../../../Data/ld_variants/5_169591460.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:173358154 to ../../../Data/ld_variants/5_173358154.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:176134882 to ../../../Data/ld_variants/5_176134882.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:2777029 to ../../../Data/ld_variants/5_2777029.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:32579616 to ../../../Data/ld_variants/5_32579616.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:345109 to ../../../Data/ld_variants/5_345109.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:44508264 to ../../../Data/ld_variants/5_44508264.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:44619502 to ../../../Data/ld_variants/5_44619502.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:44649944 to ../../../Data/ld_variants/5_44649944.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:44706498 to ../../../Data/ld_variants/5_44706498.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:44853593 to ../../../Data/ld_variants/5_44853593.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:52679539 to ../../../Data/ld_variants/5_52679539.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:55662540 to ../../../Data/ld_variants/5_55662540.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:55965167 to ../../../Data/ld_variants/5_55965167.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:56023083 to ../../../Data/ld_variants/5_56023083.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:56042972 to ../../../Data/ld_variants/5_56042972.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:56045081 to ../../../Data/ld_variants/5_56045081.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:58241712 to ../../../Data/ld_variants/5_58241712.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:71965007 to ../../../Data/ld_variants/5_71965007.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:73234583 to ../../../Data/ld_variants/5_73234583.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:77155397 to ../../../Data/ld_variants/5_77155397.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:79180995 to ../../../Data/ld_variants/5_79180995.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:81512947 to ../../../Data/ld_variants/5_81512947.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 5:90789470 to ../../../Data/ld_variants/5_90789470.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 6:130341728 to ../../../Data/ld_variants/6_130341728.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 6:13713366 to ../../../Data/ld_variants/6_13713366.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 6:149595505 to ../../../Data/ld_variants/6_149595505.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 6:151949806 to ../../../Data/ld_variants/6_151949806.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 6:151955914 to ../../../Data/ld_variants/6_151955914.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 6:152022664 to ../../../Data/ld_variants/6_152022664.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 6:152023191 to ../../../Data/ld_variants/6_152023191.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 6:152055978 to ../../../Data/ld_variants/6_152055978.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 6:152432902 to ../../../Data/ld_variants/6_152432902.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 6:16399557 to ../../../Data/ld_variants/6_16399557.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 6:169006947 to ../../../Data/ld_variants/6_169006947.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 6:170332621 to ../../../Data/ld_variants/6_170332621.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 6:18783140 to ../../../Data/ld_variants/6_18783140.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 6:20537845 to ../../../Data/ld_variants/6_20537845.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 6:21923810 to ../../../Data/ld_variants/6_21923810.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 6:27425644 to ../../../Data/ld_variants/6_27425644.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 6:43227141 to ../../../Data/ld_variants/6_43227141.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 6:82263549 to ../../../Data/ld_variants/6_82263549.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 6:85912194 to ../../../Data/ld_variants/6_85912194.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 6:87803819 to ../../../Data/ld_variants/6_87803819.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 7:101552440 to ../../../Data/ld_variants/7_101552440.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 7:102481842 to ../../../Data/ld_variants/7_102481842.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 7:130656911 to ../../../Data/ld_variants/7_130656911.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 7:130674481 to ../../../Data/ld_variants/7_130674481.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 7:139943702 to ../../../Data/ld_variants/7_139943702.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 7:144048902 to ../../../Data/ld_variants/7_144048902.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 7:21940960 to ../../../Data/ld_variants/7_21940960.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 7:25569548 to ../../../Data/ld_variants/7_25569548.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 7:28869017 to ../../../Data/ld_variants/7_28869017.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 7:55192256 to ../../../Data/ld_variants/7_55192256.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 7:91459189 to ../../../Data/ld_variants/7_91459189.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 7:94113799 to ../../../Data/ld_variants/7_94113799.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 7:98005235 to ../../../Data/ld_variants/7_98005235.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 7:99948655 to ../../../Data/ld_variants/7_99948655.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 8:102483100 to ../../../Data/ld_variants/8_102483100.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 8:106358620 to ../../../Data/ld_variants/8_106358620.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 8:117209548 to ../../../Data/ld_variants/8_117209548.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 8:120862186 to ../../../Data/ld_variants/8_120862186.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 8:124563705 to ../../../Data/ld_variants/8_124563705.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 8:124571581 to ../../../Data/ld_variants/8_124571581.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 8:124739913 to ../../../Data/ld_variants/8_124739913.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 8:128213561 to ../../../Data/ld_variants/8_128213561.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 8:128370949 to ../../../Data/ld_variants/8_128370949.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 8:128372172 to ../../../Data/ld_variants/8_128372172.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 8:129199566 to ../../../Data/ld_variants/8_129199566.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 8:143669254 to ../../../Data/ld_variants/8_143669254.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 8:170692 to ../../../Data/ld_variants/8_170692.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 8:17787610 to ../../../Data/ld_variants/8_17787610.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 8:23447496 to ../../../Data/ld_variants/8_23447496.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 8:23663653 to ../../../Data/ld_variants/8_23663653.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 8:29509616 to ../../../Data/ld_variants/8_29509616.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 8:36858483 to ../../../Data/ld_variants/8_36858483.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 8:76230943 to ../../../Data/ld_variants/8_76230943.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 8:76333056 to ../../../Data/ld_variants/8_76333056.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 8:76378165 to ../../../Data/ld_variants/8_76378165.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 9:110303808 to ../../../Data/ld_variants/9_110303808.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 9:110837073 to ../../../Data/ld_variants/9_110837073.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 9:110837176 to ../../../Data/ld_variants/9_110837176.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 9:110849525 to ../../../Data/ld_variants/9_110849525.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 9:110885479 to ../../../Data/ld_variants/9_110885479.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 9:119313486 to ../../../Data/ld_variants/9_119313486.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 9:129424719 to ../../../Data/ld_variants/9_129424719.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 9:136146597 to ../../../Data/ld_variants/9_136146597.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 9:21964882 to ../../../Data/ld_variants/9_21964882.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 9:22041998 to ../../../Data/ld_variants/9_22041998.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 9:36928288 to ../../../Data/ld_variants/9_36928288.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 9:6880263 to ../../../Data/ld_variants/9_6880263.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 9:87782211 to ../../../Data/ld_variants/9_87782211.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 9:98362587 to ../../../Data/ld_variants/9_98362587.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 10:114777670 to ../../../Data/ld_variants/10_114777670.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 10:115128491 to ../../../Data/ld_variants/10_115128491.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 10:123095209 to ../../../Data/ld_variants/10_123095209.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 10:123340107 to ../../../Data/ld_variants/10_123340107.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 10:123340431 to ../../../Data/ld_variants/10_123340431.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 10:123349324 to ../../../Data/ld_variants/10_123349324.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 10:13892298 to ../../../Data/ld_variants/10_13892298.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 10:22032942 to ../../../Data/ld_variants/10_22032942.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 10:22477776 to ../../../Data/ld_variants/10_22477776.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 10:22861490 to ../../../Data/ld_variants/10_22861490.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 10:38523626 to ../../../Data/ld_variants/10_38523626.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 10:5794652 to ../../../Data/ld_variants/10_5794652.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 10:64299890 to ../../../Data/ld_variants/10_64299890.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 10:64819996 to ../../../Data/ld_variants/10_64819996.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 10:71335574 to ../../../Data/ld_variants/10_71335574.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 10:80851257 to ../../../Data/ld_variants/10_80851257.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 10:80886726 to ../../../Data/ld_variants/10_80886726.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 10:95292187 to ../../../Data/ld_variants/10_95292187.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 11:103614438 to ../../../Data/ld_variants/11_103614438.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 11:108267402 to ../../../Data/ld_variants/11_108267402.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 11:111696440 to ../../../Data/ld_variants/11_111696440.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 11:116727936 to ../../../Data/ld_variants/11_116727936.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 11:122966626 to ../../../Data/ld_variants/11_122966626.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 11:129243417 to ../../../Data/ld_variants/11_129243417.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 11:129461016 to ../../../Data/ld_variants/11_129461016.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 11:18664241 to ../../../Data/ld_variants/11_18664241.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 11:1895708 to ../../../Data/ld_variants/11_1895708.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 11:42844441 to ../../../Data/ld_variants/11_42844441.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 11:433617 to ../../../Data/ld_variants/11_433617.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 11:44368892 to ../../../Data/ld_variants/11_44368892.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 11:46318032 to ../../../Data/ld_variants/11_46318032.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 11:65553492 to ../../../Data/ld_variants/11_65553492.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 11:65572431 to ../../../Data/ld_variants/11_65572431.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 11:69328130 to ../../../Data/ld_variants/11_69328130.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 11:69330983 to ../../../Data/ld_variants/11_69330983.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 11:69331418 to ../../../Data/ld_variants/11_69331418.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 11:803017 to ../../../Data/ld_variants/11_803017.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 12:103097887 to ../../../Data/ld_variants/12_103097887.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 12:111600134 to ../../../Data/ld_variants/12_111600134.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 12:115108136 to ../../../Data/ld_variants/12_115108136.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 12:115796577 to ../../../Data/ld_variants/12_115796577.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 12:115835836 to ../../../Data/ld_variants/12_115835836.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 12:120832146 to ../../../Data/ld_variants/12_120832146.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 12:14413931 to ../../../Data/ld_variants/12_14413931.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 12:28149568 to ../../../Data/ld_variants/12_28149568.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 12:28174817 to ../../../Data/ld_variants/12_28174817.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 12:28347382 to ../../../Data/ld_variants/12_28347382.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 12:29140260 to ../../../Data/ld_variants/12_29140260.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 12:293626 to ../../../Data/ld_variants/12_293626.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 12:57146069 to ../../../Data/ld_variants/12_57146069.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 12:70798355 to ../../../Data/ld_variants/12_70798355.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 12:83064195 to ../../../Data/ld_variants/12_83064195.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 12:85004551 to ../../../Data/ld_variants/12_85004551.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 12:96027759 to ../../../Data/ld_variants/12_96027759.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 13:32839990 to ../../../Data/ld_variants/13_32839990.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 13:32972626 to ../../../Data/ld_variants/13_32972626.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 13:43501356 to ../../../Data/ld_variants/13_43501356.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 13:73806982 to ../../../Data/ld_variants/13_73806982.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 13:73960952 to ../../../Data/ld_variants/13_73960952.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 14:105213978 to ../../../Data/ld_variants/14_105213978.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 14:37128564 to ../../../Data/ld_variants/14_37128564.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 14:37228504 to ../../../Data/ld_variants/14_37228504.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 14:68660428 to ../../../Data/ld_variants/14_68660428.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 14:68979835 to ../../../Data/ld_variants/14_68979835.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 14:91751788 to ../../../Data/ld_variants/14_91751788.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 14:91841069 to ../../../Data/ld_variants/14_91841069.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 14:93070286 to ../../../Data/ld_variants/14_93070286.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 15:100905819 to ../../../Data/ld_variants/15_100905819.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 15:46680811 to ../../../Data/ld_variants/15_46680811.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 15:50694306 to ../../../Data/ld_variants/15_50694306.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 15:66630569 to ../../../Data/ld_variants/15_66630569.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 15:67457698 to ../../../Data/ld_variants/15_67457698.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 15:75750383 to ../../../Data/ld_variants/15_75750383.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 15:91512267 to ../../../Data/ld_variants/15_91512267.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 16:10706580 to ../../../Data/ld_variants/16_10706580.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 16:23007047 to ../../../Data/ld_variants/16_23007047.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 16:4008542 to ../../../Data/ld_variants/16_4008542.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 16:4106788 to ../../../Data/ld_variants/16_4106788.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 16:52538825 to ../../../Data/ld_variants/16_52538825.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 16:52599188 to ../../../Data/ld_variants/16_52599188.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 16:53809123 to ../../../Data/ld_variants/16_53809123.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 16:53861139 to ../../../Data/ld_variants/16_53861139.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 16:53861592 to ../../../Data/ld_variants/16_53861592.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 16:54682064 to ../../../Data/ld_variants/16_54682064.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 16:6963972 to ../../../Data/ld_variants/16_6963972.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 16:80648296 to ../../../Data/ld_variants/16_80648296.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 16:85145977 to ../../../Data/ld_variants/16_85145977.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 16:87086492 to ../../../Data/ld_variants/16_87086492.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 17:29168077 to ../../../Data/ld_variants/17_29168077.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 17:39251123 to ../../../Data/ld_variants/17_39251123.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 17:40127060 to ../../../Data/ld_variants/17_40127060.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 17:40485239 to ../../../Data/ld_variants/17_40485239.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 17:40744470 to ../../../Data/ld_variants/17_40744470.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 17:43212339 to ../../../Data/ld_variants/17_43212339.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 17:44283858 to ../../../Data/ld_variants/17_44283858.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 17:53209774 to ../../../Data/ld_variants/17_53209774.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 17:77781725 to ../../../Data/ld_variants/17_77781725.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 18:11696613 to ../../../Data/ld_variants/18_11696613.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 18:20634253 to ../../../Data/ld_variants/18_20634253.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 18:24125857 to ../../../Data/ld_variants/18_24125857.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 18:24337424 to ../../../Data/ld_variants/18_24337424.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 18:24518050 to ../../../Data/ld_variants/18_24518050.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 18:25407513 to ../../../Data/ld_variants/18_25407513.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 18:29981526 to ../../../Data/ld_variants/18_29981526.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 18:42411803 to ../../../Data/ld_variants/18_42411803.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 18:42888797 to ../../../Data/ld_variants/18_42888797.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 19:13249921 to ../../../Data/ld_variants/19_13249921.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 19:17393925 to ../../../Data/ld_variants/19_17393925.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 19:18569492 to ../../../Data/ld_variants/19_18569492.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 19:19517054 to ../../../Data/ld_variants/19_19517054.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 19:44283031 to ../../../Data/ld_variants/19_44283031.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 19:46166073 to ../../../Data/ld_variants/19_46166073.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 19:55816678 to ../../../Data/ld_variants/19_55816678.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 20:11379842 to ../../../Data/ld_variants/20_11379842.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 20:41613706 to ../../../Data/ld_variants/20_41613706.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 20:52296849 to ../../../Data/ld_variants/20_52296849.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 20:5948227 to ../../../Data/ld_variants/20_5948227.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 21:16364756 to ../../../Data/ld_variants/21_16364756.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 21:16566350 to ../../../Data/ld_variants/21_16566350.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 21:16574455 to ../../../Data/ld_variants/21_16574455.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 21:47762932 to ../../../Data/ld_variants/21_47762932.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 22:19766137 to ../../../Data/ld_variants/22_19766137.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 22:29121087 to ../../../Data/ld_variants/22_29121087.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 22:29135543 to ../../../Data/ld_variants/22_29135543.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 22:29203724 to ../../../Data/ld_variants/22_29203724.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 22:29551872 to ../../../Data/ld_variants/22_29551872.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 22:38583315 to ../../../Data/ld_variants/22_38583315.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 22:39343916 to ../../../Data/ld_variants/22_39343916.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 22:40904707 to ../../../Data/ld_variants/22_40904707.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 22:43433100 to ../../../Data/ld_variants/22_43433100.csv


  df = pd.read_csv(data_io, sep='\\t')


Saved high LD variants for 22:45319953 to ../../../Data/ld_variants/22_45319953.csv
Saved high LD variants for 22:46283297 to ../../../Data/ld_variants/22_46283297.csv


  df = pd.read_csv(data_io, sep='\\t')


In [6]:
import os
import shutil

# Base folder where the CSV files are currently stored
base_folder = "../../../Data/ld_variants"

# Get a list of all CSV files in the base folder
csv_files = [f for f in os.listdir(base_folder) if f.endswith('.csv')]

# Iterate over each CSV file
for file in csv_files:
    # Extract the chromosome number from the file name
    chrom = file.split('_')[0]
    
    # Create a folder for the current chromosome if it doesn't exist
    chrom_folder = os.path.join(base_folder, f"chr{chrom}")
    os.makedirs(chrom_folder, exist_ok=True)
    
    # Move the file to the corresponding chromosome folder
    source_file = os.path.join(base_folder, file)
    destination_file = os.path.join(chrom_folder, file)
    shutil.move(source_file, destination_file)
    
    print(f"Moved {file} to {chrom_folder}")


Moved 6_152432902.csv to ../../../Data/ld_variants/chr6
Moved 5_79180995.csv to ../../../Data/ld_variants/chr5
Moved 7_25569548.csv to ../../../Data/ld_variants/chr7
Moved 6_152055978.csv to ../../../Data/ld_variants/chr6
Moved 5_44508264.csv to ../../../Data/ld_variants/chr5
Moved 12_14413931.csv to ../../../Data/ld_variants/chr12
Moved 4_106069013.csv to ../../../Data/ld_variants/chr4
Moved 3_55970777.csv to ../../../Data/ld_variants/chr3
Moved 3_59373745.csv to ../../../Data/ld_variants/chr3
Moved 1_88156923.csv to ../../../Data/ld_variants/chr1
Moved 6_16399557.csv to ../../../Data/ld_variants/chr6
Moved 12_115796577.csv to ../../../Data/ld_variants/chr12
Moved 5_131640536.csv to ../../../Data/ld_variants/chr5
Moved 2_218292158.csv to ../../../Data/ld_variants/chr2
Moved 10_38523626.csv to ../../../Data/ld_variants/chr10
Moved 1_120257110.csv to ../../../Data/ld_variants/chr1
Moved 8_106358620.csv to ../../../Data/ld_variants/chr8
Moved 22_46283297.csv to ../../../Data/ld_variants/

# Find overlap b/w 23AndMe and LD


In [3]:
import os
import pandas as pd
import re

directory = "../../../Data/ld_variants"

# Create a folder to store the CSV files
output_folder = "../../../Data/ld_variants"
os.makedirs(output_folder, exist_ok=True)


def extract_coord_alleles(col_name):
    match = re.search(r'chr(\d+)_(\d+)_([ACGT]+)_([ACGT,]+)', col_name)
    if match:
        chr_num, position, ref_allele, alt_alleles = match.groups()
        return f'chr{chr_num}:{position}', f'({ref_allele}/{alt_alleles})'
    return None, None

# Create an empty list to store the matching variants across all chromosomes
matching_variants_all = []

training_data_folder = "../../../Data/Raw_training_data_23andMe_union/"
folder = "../../../Data/Data/Filtered_raw_training_data_union/23AndMe_PRS313_merged_chr1_matching.parquet/"


# Iterate over each chromosome folder
for chrom_folder in os.listdir(output_folder):
    # Check if the folder is a chromosome folder
    if chrom_folder.startswith("chr"):
        # Extract the chromosome number
        chrom = chrom_folder[3:]

        files_folder = os.path.join(output_folder, chrom_folder)
        
        # Load the training data for the chromosome
        training_data = pd.read_parquet(f"{training_data_folder}23AndMe_PRS313_merged_chr{chrom}.parquet")
        
        # Create empty lists to store the matching columns and variants for the chromosome
        matching_columns = []
        matching_variants_chrom = []
        not_found_snps_chrom = []

        # Read all the files in the folder
        for filename in os.listdir(files_folder):

            # Check if the file is a CSV file
            if filename.endswith(".csv"):

                # Check if the position is an unknown position or a known position 
                position = filename.split('.')[0]

                # Find the column name with the position
                position_column = training_data.columns[training_data.columns.str.contains(position)]

                # if (position_column[0].split("_")[-1] == "Known"):
                #     # If the position is a known position, skip the file, because we don't need LD proxies for known positions
                #     print(f"Skipping LD for {filename} because it is a known position. Column added to training data")

                #     if position_column[0] not in matching_columns:

                #         # Add the matching column to the list
                #         matching_columns.append(position_column[0])

                #     continue


                # Load the variants for the file
                variants = pd.read_csv(os.path.join(files_folder, filename))
                
                # Find the matching columns in the training data
                for column in training_data.columns:
                    coord, alleles = extract_coord_alleles(column)
                    if coord is not None and alleles is not None:
                        try:
                            # Got rid of allele matching, because of coding differences. GC--> C (in PRS313) is coded as (C/-) in dbSNP and LDProxy
                            mask = (variants['Coord'] == coord)
                            # mask = (variants['Coord'] == coord) & (variants['Alleles'] == alleles)
                                
                            if mask.any():
                                # Only append column if it has not been added before
                                if column not in matching_columns:

                                    # Check column added is an unknown position
                                    if (column.split("_")[-1] == "Unknown"):
                                        print(column)


                                    matching_columns.append(column)
                                    matching_variants_chrom.append(variants[mask])
                        except KeyError:
                            error_position = filename.split('.')[0]
                            not_found_snps_chrom.append(error_position)
                            print(f"SNP {error_position} not found in dbSNP and cannot be proxied using LDProxy")

                            # Find the columns in training_data.columns with positions within +/- 500K BP of the error_position
                            counter_error_added = 0
                            for col in training_data.columns:
                                coord, _ = extract_coord_alleles(col)
                                if coord is not None:
                                    col_position = int(coord.split(':')[1])
                                    error_bp = int(error_position.split('_')[1])
                                    if abs(col_position - error_bp) <= 500000 and col not in matching_columns:
                                        counter_error_added += 1
                                        matching_columns.append(col)

                            print(f"Added {counter_error_added} columns to matching_columns for missing data position: {error_position}")
                            
                            break

                print(f"Processed {filename}")
                print(f"Found {len(matching_columns)} matching columns")
                

        # Get the matching columns from the training data
        matching_data = training_data[matching_columns]

        os.makedirs(folder, exist_ok=True)
        save_path = f"{folder}/23AndMe_PRS313_merged_chr{chrom}_matching.parquet"
        # Save the matching data for the chromosome
        matching_data.to_parquet(save_path)

        # Concatenate the matching variants for the chromosome into a single dataframe
        matching_variants_chrom_df = pd.concat(matching_variants_chrom, ignore_index=True)
        
        # Append the matching variants for the chromosome to the overall list
        matching_variants_all.append(matching_variants_chrom_df)

        print(f"Saved to file {save_path}")
        print(f"Found {len(matching_columns)} matching columns")
        print(f"Found {len(matching_variants_chrom_df)} matching variants")

# Concatenate the matching variants from all chromosomes into a single dataframe
matching_variants_all_df = pd.concat(matching_variants_all, ignore_index=True)

# Save the dataframe with matching variants across all chromosomes
matching_variants_all_df.to_csv(f"{folder}/23AndMe_matching_variants.csv", index=False)

KeyboardInterrupt: 

# Save Reference and Alternate allele information to the file

In [4]:

matching_variants_all_df = pd.read_csv(f"{folder}/23AndMe_matching_variants.csv")

In [5]:
matching_variants_all_df

Unnamed: 0,RS_Number,Coord,Alleles,MAF,Distance,Dprime,R2,Correlated_Alleles,FORGEdb,RegulomeDB,Function
0,rs10845923,chr12:14368479,(T/C),0.3946,-45452,0.4107,0.1306,"G=C,C=T",6.0,7,
1,rs73056715,chr12:14417744,(G/A),0.1168,3813,0.9898,0.0654,,10.0,3a,
2,rs74069412,chr12:14275947,(T/G),0.1745,-137984,0.1618,0.0110,,5.0,5,
3,rs17834211,chr12:14402358,(T/G),0.3215,-11573,0.3275,0.0256,,6.0,4,
4,rs75031954,chr12:14431168,(C/A),0.0286,17237,0.6108,0.0217,,6.0,5,
...,...,...,...,...,...,...,...,...,...,...,...
13969,rs115461551,chr1:120690218,(C/T),0.1078,-597776,0.3968,0.1165,"A=T,G=C",6.0,7,
13970,rs115706173,chr1:121280485,(A/G),0.0048,-7509,1.0000,0.0295,,8.0,4,
13971,rs79368528,chr1:120689085,(C/T),0.0871,-598909,0.5304,0.1643,"A=T,G=C",4.0,5,
13972,rs587774691,chr1:121137155,(A/G),0.0028,-150839,1.0000,0.0172,,,.,


In [9]:
import os
import csv

# Directory containing the VCF files
vcf_directory = "../../../Data/23AndMePositionsUnion"

# Read in the 23andMe_matching_variants.csv file
with open(f"{folder}/23andMe_matching_variants.csv", "r") as csv_file:
    reader = csv.DictReader(csv_file)
    data = list(reader)

# Process each row in the CSV
for row in data:
    # Extract the chromosome and position from the Coord column
    chrom, pos = row["Coord"].split(":")
    pos = pos.replace(",", "")  # Remove any commas from the position
    
    # Construct the path to the corresponding VCF file
    vcf_file = os.path.join(vcf_directory, chrom, f"{chrom}_pos{pos}.vcf")
    
    # Check if the VCF file exists
    if os.path.isfile(vcf_file):
        # Read the VCF file and extract the reference and alternate alleles
        with open(vcf_file, "r") as file:
            for line in file:
                if not line.startswith("#"):
                    fields = line.strip().split("\t")
                    ref = fields[3]
                    alt = fields[4].split(",")  # Split alternate alleles by comma
                    break
        
        # Update the row with the reference and alternate alleles
        row["Ref"] = ref
        row["Alt"] = ",".join(alt)  # Join alternate alleles with comma
    else:
        # If the VCF file doesn't exist, set the reference and alternate alleles to empty strings
        row["Ref"] = ""
        row["Alt"] = ""

# Write the updated data to a new CSV file
with open(f"{folder}/23andMe_matching_variants_updated.csv", "w", newline="") as csv_file:
    fieldnames = data[0].keys()
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data)

# Optional - Create Final Training Data with +/- 500K BP Window

In [8]:
import os
import pandas as pd
import re

window_size = 250000
pattern = re.compile(r"chr\d+_(\d+)_")

# Create an empty list to store the matching variants across all chromosomes
matching_variants_all = []

for chrom in range(1, 23):
    # Load the training data for the chromosome
    training_data = pd.read_parquet(f"../../../Data/Raw_training_data/23AndMe_PRS313_merged_chr{chrom}.parquet")
    
    # Get all columns with "PRS313" in the name
    prs313_unknown_columns = [col for col in training_data.columns if "PRS313_Unknown" in col]
    prs313_unknown_positions = [int(pattern.search(col).group(1)) for col in prs313_unknown_columns]
    prs313_unknown_positions_set = set(prs313_unknown_positions)
    
    # Get all columns in training_data that contain a number within +/- 500k of the PRS313_Unknown position
    filtered_columns = [col for col in training_data.columns if any(abs(int(pattern.search(col).group(1)) - pos) <= window_size for pos in prs313_unknown_positions_set)]
    
    training_data_filtered = training_data[filtered_columns]

    print(len(filtered_columns))

    # Save the filtered training data for the chromosome
    # training_data_filtered.to_parquet(f"../../Data/500k_window_filtered_data/23AndMe_PRS313_merged_chr{chrom}_filtered.parquet")

FileNotFoundError: [Errno 2] No such file or directory: '../../Data/Raw_training_data/23AndMe_PRS313_merged_chr1.parquet'

In [16]:
chrom = 1

# training_data_window = pd.read_parquet(f"../../../Data/500k_window_filtered_data/23AndMe_PRS313_merged_chr{chrom}_filtered.parquet")
# print(training_data_window.shape)

training_data_ld_proxy = pd.read_parquet(f"../../../Data/Data/Filtered_raw_training_data_union/23AndMe_PRS313_merged_chr1_matching.parquet/23AndMe_PRS313_merged_chr{chrom}_matching.parquet")
print(training_data_ld_proxy.shape)

(2504, 2039)
