## Extract 20k window around every significant SNP

Routine to extract a window around every significant SNP: up to 20 kb on each side of the SNP (at most 40,001 bp per window).

In [21]:
import pandas as pd

In [22]:
# Load the table of significant loci from CSV.
# The cells below read its "CHR" and "BP" columns.
df = pd.read_csv("./significant_loci.csv")

In [61]:
# Show every row whose CHR value occurs more than once in the table
# (keep=False marks all members of a duplicated group), ordered by CHR.
df.loc[lambda frame: frame["CHR"].duplicated(keep=False)].sort_values("CHR")

Unnamed: 0,SNP,CHR,BP,P,zscore,trait
24,loc17357_pos75,9534,19598845,4.038489e-07,-614.2631,abaxial_trichomes
29,loc15949_pos141,9534,2850933,8.764997e-07,-639.4628,adaxial_trichomes
7,loc17815_pos41,9666,5105659,3.366722e-11,-403.2628,blade_area
32,loc17612_pos122,9666,639163,6.948563e-06,-641.4501,adaxial_trichomes
4,loc68546_pos257,59870,8709701,1.367609e-11,-402.3808,blade_area
14,loc68029_pos67,59870,2152335,4.121656e-06,-381.3673,teeth
18,loc119342_pos200,63200,3462418,2.671646e-09,-609.3962,abaxial_trichomes
22,loc119365_pos112,63200,3809754,1.030289e-07,-612.9429,abaxial_trichomes
15,loc130774_pos89,63552,13207322,4.296808e-06,-381.4072,teeth
31,loc130375_pos101,63552,4859480,6.457438e-06,-641.3799,adaxial_trichomes


In [87]:
# Build a lookup from each CHR value to the list of BP values found on it,
# so that scaffolds carrying more than one SNP keep all of their positions.
scaffold_info = {}

for row_label, record in df.iterrows():
    # setdefault creates the empty list the first time a CHR value is seen
    scaffold_info.setdefault(record["CHR"], []).append(record["BP"])

scaffold_info

{59695: [80628],
 61473: [1045000],
 17365: [7371028],
 61434: [8380992],
 59870: [8709701, 2152335],
 62359: [273498],
 62189: [1927409],
 9666: [5105659],
 19888: [5505553],
 47881: [2774374],
 63432: [2670146],
 61828: [5464180],
 24083: [9244149],
 63552: [13207322, 4859480],
 39367: [232203],
 15681: [2806641],
 63200: [3462418, 3809754],
 61248: [2151777],
 62932: [4009105],
 62050: [2398634],
 9534: [19598845, 2850933],
 23621: [2087672],
 62125: [3731290],
 60920: [7982493],
 63323: [2650799],
 4057: [1779621],
 52789: [2593286],
 61262: [4536745]}

In [110]:
# Stream the genome and extract a window around every significant SNP.
# The result is a multi-FASTA; each record is named Scaffold_<scaffold>_SNP_<position>
# (the trait is not part of the record name).

# Genome file. This is the "oneline" version of the assembly: the code below
# treats every non-header line as the complete sequence of the preceding header.
genome_file = "/home/carlos/GDRIVE/viburnumThings/Viburnum-Annotation/MAKER/reducedL50/genome/final_assembly_v3_oneline.fa"
# Number of bases kept on EACH side of the SNP, so a full window is
# 2 * window_size + 1 bases long (shorter when clipped at a scaffold end).
window_size = 20000

# The genome is opened first so that a wrong path fails before the output is touched.
# The output uses "w" (not "a"): re-running the cell rebuilds the file instead of
# appending a second copy of every record.
with open(genome_file, "r") as genome, open("20kwindows.fasta", "w") as file_window:

    # Scaffold of the most recent header. Reset here so that a sequence line
    # preceding any header is skipped instead of reusing a value left over
    # from an earlier run of this cell.
    scaffold_number = None

    # Iterate over the file lazily: only one line is held in memory at a time.
    for line in genome:
        if not line.strip():
            continue  # ignore blank lines

        # check if current line is a header or a sequence
        if line.startswith(">"):
            # keep only the scaffold number (second "_"-separated field of the
            # text before the first ";") so it can be matched against the CSV
            scaffold_number = int(line[1:].strip().split(";")[0].split("_")[1])
            continue  # the sequence follows on the next line

        if scaffold_number not in scaffold_info:
            continue  # no significant SNP on this scaffold

        seq = line.strip()  # get the sequence

        for snp in scaffold_info[scaffold_number]:
            new_header = f"Scaffold_{scaffold_number}_SNP_{snp}"  # header with scaffold and SNP position

            # Define the limits of the window and clip them to the scaffold.
            # NOTE(review): snp is used directly as a 0-based index; if BP is
            # 1-based the SNP sits one base left of the window centre - confirm.
            start_window = max(snp - window_size, 0)
            end_window = min(snp + window_size + 1, len(seq))

            # get only the window around the snp
            window = seq[start_window:end_window]

            if not window:
                # the position lies beyond the end of the scaffold: do not
                # write an empty FASTA record, report it instead
                print(f"{new_header}: position is outside the scaffold (length {len(seq)}), skipped")
                continue

            print(new_header)
            print(window[0:10], len(window), snp)

            # save every window as an independent record in the multi-FASTA file
            file_window.write(f">{new_header}\n{window}\n")

Scaffold_4057_SNP_1779621
AAAATATTCA 40001 1779621
Scaffold_9534_SNP_19598845
GATTACTAAG 40001 19598845
Scaffold_9534_SNP_2850933
ACTAGCTAAT 40001 2850933
Scaffold_9666_SNP_5105659
GACGACAATG 40001 5105659
Scaffold_15681_SNP_2806641
ATTCCATAAC 40001 2806641
Scaffold_17365_SNP_7371028
gtcctcgcct 40001 7371028
Scaffold_19888_SNP_5505553
ATTATTTCCA 40001 5505553
Scaffold_23621_SNP_2087672
TTCGAAAAGT 40001 2087672
Scaffold_24083_SNP_9244149
NNNNNNNNNN 40001 9244149
Scaffold_39367_SNP_232203
GGCATGATCC 40001 232203
Scaffold_47881_SNP_2774374
TAAAAATTTG 40001 2774374
Scaffold_52789_SNP_2593286
TCCTTGATCA 40001 2593286
Scaffold_59695_SNP_80628
TTTTATATAT 24180 80628
Scaffold_59870_SNP_8709701
TATTTTTGGA 40001 8709701
Scaffold_59870_SNP_2152335
ACCATGCAGG 40001 2152335
Scaffold_60920_SNP_7982493
GGTTCTTCTT 40001 7982493
Scaffold_61248_SNP_2151777
ACACCTCACC 40001 2151777
Scaffold_61262_SNP_4536745
TAATGCATGG 40001 4536745
Scaffold_61434_SNP_8380992
GTGTCATAGA 40001 8380992
Scaffold_61473_SNP_1

## Extraction for trichome significant SNPs

This is the same procedure as the 20 kb window extraction above, applied only to a subset of SNPs that are significant for trichomes.

In [1]:
import pandas as pd

In [4]:
# Load the tab-separated table of trichome-branching scaffolds.
# The cells below read its "CHR" and "BP" columns.
df = pd.read_csv("./trichome_branching_sig_scaffolds.tsv", sep="\t")

In [6]:
# Some scaffolds carry more than one SNP, which the extraction has to handle.
# List every row whose CHR value occurs more than once (keep=False marks all
# members of a duplicated group), ordered by CHR.
df.loc[lambda frame: frame["CHR"].duplicated(keep=False)].sort_values("CHR")

Unnamed: 0,SNP,CHR,BP,trait
18,loc104719_pos121,62540,3710393,margin_branch
19,loc104619_pos172,62540,1173139,ab_branch+ad_branch+margin_branch


In [7]:
# Build a lookup from each CHR value to the list of BP values found on it,
# so that scaffolds carrying more than one SNP keep all of their positions.
scaffold_info = {}

for row_label, record in df.iterrows():
    # setdefault creates the empty list the first time a CHR value is seen
    scaffold_info.setdefault(record["CHR"], []).append(record["BP"])

scaffold_info

{3589: [9252545],
 4506: [8488253],
 5036: [12188081],
 5277: [211152],
 17057: [457058],
 17618: [8223181],
 19888: [3195966],
 23621: [2202981],
 28087: [43758310],
 31062: [604045],
 47900: [1597869],
 51506: [1002237],
 56543: [14798389],
 61305: [2136144],
 61434: [9194665],
 61496: [980939],
 62480: [2374442],
 62491: [17612429],
 62540: [3710393, 1173139],
 62878: [7116797],
 62942: [1452977],
 63514: [229274]}

In [8]:
# Stream the genome and extract a window around every trichome-significant SNP.
# The result is a multi-FASTA; each record is named Scaffold_<scaffold>_SNP_<position>
# (the trait is not part of the record name).

# Genome file. This is the "oneline" version of the assembly: the code below
# treats every non-header line as the complete sequence of the preceding header.
genome_file = "/home/carlos/GDRIVE/viburnumThings/Viburnum-Annotation/MAKER/reducedL50/genome/final_assembly_v3_oneline.fa"
# Number of bases kept on EACH side of the SNP, so a full window is
# 2 * window_size + 1 bases long (shorter when clipped at a scaffold end).
window_size = 20000

# The genome is opened first so that a wrong path fails before the output is touched.
# The output uses "w" (not "a"): re-running the cell rebuilds the file instead of
# appending a second copy of every record.
with open(genome_file, "r") as genome, open("20kwindows_trichomes.fasta", "w") as file_window:

    # Scaffold of the most recent header. Reset here so that a sequence line
    # preceding any header is skipped instead of reusing a value left over
    # from an earlier run of this cell.
    scaffold_number = None

    # Iterate over the file lazily: only one line is held in memory at a time.
    for line in genome:
        if not line.strip():
            continue  # ignore blank lines

        # check if current line is a header or a sequence
        if line.startswith(">"):
            # keep only the scaffold number (second "_"-separated field of the
            # text before the first ";") so it can be matched against the table
            scaffold_number = int(line[1:].strip().split(";")[0].split("_")[1])
            continue  # the sequence follows on the next line

        if scaffold_number not in scaffold_info:
            continue  # no significant SNP on this scaffold

        seq = line.strip()  # get the sequence

        for snp in scaffold_info[scaffold_number]:
            new_header = f"Scaffold_{scaffold_number}_SNP_{snp}"  # header with scaffold and SNP position

            # Define the limits of the window and clip them to the scaffold.
            # NOTE(review): snp is used directly as a 0-based index; if BP is
            # 1-based the SNP sits one base left of the window centre - confirm.
            start_window = max(snp - window_size, 0)
            end_window = min(snp + window_size + 1, len(seq))

            # get only the window around the snp
            window = seq[start_window:end_window]

            if not window:
                # the position lies beyond the end of the scaffold: do not
                # write an empty FASTA record, report it instead
                print(f"{new_header}: position is outside the scaffold (length {len(seq)}), skipped")
                continue

            print(new_header)
            print(window[0:10], len(window), snp)

            # save every window as an independent record in the multi-FASTA file
            file_window.write(f">{new_header}\n{window}\n")

Scaffold_3589_SNP_9252545
atacaatgac 40001 9252545
Scaffold_4506_SNP_8488253
CTTGCCGACA 40001 8488253
Scaffold_5036_SNP_12188081
TCAGCACCTG 40001 12188081
Scaffold_5277_SNP_211152
TCCTCGAAAT 40001 211152
Scaffold_17057_SNP_457058
AAACTATTGA 40001 457058
Scaffold_17618_SNP_8223181
tagttttgag 40001 8223181
Scaffold_19888_SNP_3195966
NNNNNNNNNN 40001 3195966
Scaffold_23621_SNP_2202981
GAGCTCGGGT 40001 2202981
Scaffold_28087_SNP_43758310
ATGTTATTTT 40001 43758310
Scaffold_31062_SNP_604045
CTCCCACAAA 40001 604045
Scaffold_47900_SNP_1597869
cgtactcagc 40001 1597869
Scaffold_51506_SNP_1002237
GATTCATGTG 40001 1002237
Scaffold_56543_SNP_14798389
GATTGTTGCA 40001 14798389
Scaffold_61305_SNP_2136144
AGCCCTCTAA 40001 2136144
Scaffold_61434_SNP_9194665
TTCCTATAAT 40001 9194665
Scaffold_61496_SNP_980939
ATTATGGGGG 40001 980939
Scaffold_62480_SNP_2374442
NNNNNNNNNN 40001 2374442
Scaffold_62491_SNP_17612429
AAAACAAAAG 40001 17612429
Scaffold_62540_SNP_3710393
ATAAGAGAAG 40001 3710393
Scaffold_62540_S