# Produce final table with all information about each significant SNP

In [1]:
import pandas as pd

In [None]:
#Define final table
final_table = pd.DataFrame(columns = ["Scaffold",
            "SNP",
            "P",
            "Zscore",
            "Trait"
            "Gene_Model",
            "Start",
            "End",
            "SNP_in_geneModel_check",
            "ID_in_GFF",
            "Annotation_round"])

In [89]:
#load annotation final tables
big_annotation_table = pd.DataFrame()

annotation_tables = {"20kwindows.all.round1_snp_location.csv":"round_1", 
                     "20kwindows.all.round2_snp_location.csv":"round_2", 
                     "../20kwindows_trichomes/20kwindows_trichomes.all.round1_snp_location.csv":"round_1T",
                     "../20kwindows_trichomes/20kwindows_trichomes.all.round2_snp_location.csv":"round_2T"
            }

#Process each result table
for t in annotation_tables.keys():
    #load result table
    ind_table = pd.read_csv(t)
    ind_table['Annotation_round'] = annotation_tables[t]
    #split info column into independent columns
    ind_table[['_','Scaffold', '_', 'SNP']] = ind_table['window'].str.split('_',expand=True)
    
    #remove unwanted columns
    ind_table.drop('_', axis=1, inplace=True)
    ind_table.drop('window', axis=1, inplace=True)
    ind_table.drop(ind_table.columns[0], axis=1, inplace=True)
    
    #change some names
    ind_table.rename(columns = {"geneModel":"Gene_Model", 
                                  "start":"Start",
                                  "end":"End",
                                  "ID":"ID_in_GFF"}, inplace = True)
    
    big_annotation_table = big_annotation_table.append(ind_table)
    
#resolving some annoying formating
big_annotation_table['Scaffold'] = big_annotation_table['Scaffold'].astype(int)
big_annotation_table['SNP'] = big_annotation_table['SNP'].astype(int)


big_annotation_table

Unnamed: 0,Gene_Model,Start,End,SNP_in_geneModel,ID_in_GFF,Annotation_round,Scaffold,SNP
0,gene 1,19098,20634,True,maker-Scaffold_24083_SNP_9244149-exonerate_est...,round_1,24083,9244149
1,gene 1,24307,25092,False,maker-Scaffold_61262_SNP_4536745-exonerate_est...,round_1,61262,4536745
2,gene 1,16553,21721,True,maker-Scaffold_61434_SNP_8380992-exonerate_est...,round_1,61434,8380992
3,gene 1,19158,28766,True,maker-Scaffold_4057_SNP_1779621-exonerate_est2...,round_1,4057,1779621
4,gene 1,227,1673,False,maker-Scaffold_62050_SNP_2398634-exonerate_est...,round_1,62050,2398634
...,...,...,...,...,...,...,...,...
37,gene 1,10139,33811,True,maker-Scaffold_62480_SNP_2374442-exonerate_est...,round_2T,62480,2374442
38,gene 2,497,817,False,maker-Scaffold_62480_SNP_2374442-exonerate_pro...,round_2T,62480,2374442
39,gene 1,4483,19962,False,maker-Scaffold_62491_SNP_17612429-exonerate_pr...,round_2T,62491,17612429
40,gene 2,25571,26426,False,maker-Scaffold_62491_SNP_17612429-exonerate_pr...,round_2T,62491,17612429


In [90]:
significant_snps_tables = ["significant_loci.csv", "../20kwindows_trichomes/trichome_branching_sig_scaffolds.csv"]
big_sigloci = pd.DataFrame()

for sst in significant_snps_tables: 
    
    sigloci = pd.read_csv(sst)


    sigloci.rename(columns = {"SNP":"loci_info",
                              "CHR":"Scaffold",
                              "BP":"SNP",
                              "trait":"Trait"
                             }, inplace = True)
    
    big_sigloci = big_sigloci.append(sigloci)


big_annotation_table = pd.merge(big_annotation_table, big_sigloci, how ='inner', on = ["Scaffold", "SNP"])
big_annotation_table

Unnamed: 0,Gene_Model,Start,End,SNP_in_geneModel,ID_in_GFF,Annotation_round,Scaffold,SNP,loci_info,P,zscore,Trait
0,gene 1,19098,20634,True,maker-Scaffold_24083_SNP_9244149-exonerate_est...,round_1,24083,9244149,loc36106_pos11,7.360199e-07,-379.7118,teeth
1,gene 1,19098,20634,True,maker-Scaffold_24083_SNP_9244149-exonerate_est...,round_2,24083,9244149,loc36106_pos11,7.360199e-07,-379.7118,teeth
2,gene 1,24307,25092,False,maker-Scaffold_61262_SNP_4536745-exonerate_est...,round_1,61262,4536745,loc79614_pos166,9.021954e-06,-641.6997,adaxial_trichomes
3,gene 1,24307,25092,False,maker-Scaffold_61262_SNP_4536745-exonerate_est...,round_2,61262,4536745,loc79614_pos166,9.021954e-06,-641.6997,adaxial_trichomes
4,gene 2,19654,20522,True,maker-Scaffold_61262_SNP_4536745-exonerate_pro...,round_2,61262,4536745,loc79614_pos166,9.021954e-06,-641.6997,adaxial_trichomes
...,...,...,...,...,...,...,...,...,...,...,...,...
138,gene 5,31690,31866,False,maker-Scaffold_17057_SNP_457058-exonerate_prot...,round_2T,17057,457058,loc25109_pos70,,,vein_branch
139,gene 6,33707,33715,False,maker-Scaffold_17057_SNP_457058-exonerate_prot...,round_2T,17057,457058,loc25109_pos70,,,vein_branch
140,gene 1,4483,19962,False,maker-Scaffold_62491_SNP_17612429-exonerate_pr...,round_2T,62491,17612429,loc103938_pos137,,,vein_branch
141,gene 2,25571,26426,False,maker-Scaffold_62491_SNP_17612429-exonerate_pr...,round_2T,62491,17612429,loc103938_pos137,,,vein_branch


In [93]:
big_annotation_table.drop(["P","zscore","loci_info"], axis=1, inplace=True)

In [102]:
big_annotation_table = big_annotation_table[['Annotation_round','Scaffold', 'SNP', 'Trait', 'Gene_Model', 'Start', 'End', 'SNP_in_geneModel', 'ID_in_GFF']]

In [103]:
big_annotation_table.to_csv("structural_annotation_result.csv", index=False)

In [121]:
big_annotation_table.columns

Index(['Annotation_round', 'Scaffold', 'SNP', 'Trait', 'Gene_Model', 'Start',
       'End', 'SNP_in_geneModel', 'ID_in_GFF'],
      dtype='object')

In [123]:
big_annotation_table.drop('Annotation_round', axis=1, inplace=True)

In [127]:
big_annotation_table.drop_duplicates().to_csv("structural_annotation_result.csv", index=False)

In [1]:
#some stats, count number of genes in structural annotation 
# and check check the number of genes where the snp if in the actual gene model
import pandas as pd

big_annotation_table = pd.read_csv("structural_annotation_result.csv")

In [8]:
#number gene models where the SNP is in the mode
big_annotation_table[big_annotation_table["SNP_in_geneModel"] == True].drop_duplicates("SNP").shape

(27, 8)

In [26]:
# load second rounds for initial annotation and trichome SNPs
tables = ["./Results/MAKER/20kwindows/round_2/20kwindows.all.round2.gff",
          "./Results/MAKER/20kwindows_trichomes/round_2/20kwindows_trichomes.all.round2.gff"
          ]

In [27]:
import pandas as pd

#load tables inline 
individual_dfs = [pd.read_csv(table, sep="\t", comment='#', header=None) for table in tables]


# and concatenate them
df = pd.concat(individual_dfs, ignore_index=True)

df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Scaffold_60920_SNP_7982493,.,contig,1.0,40001.0,.,.,.,ID=Scaffold_60920_SNP_7982493;Name=Scaffold_60...
1,Scaffold_60920_SNP_7982493,maker,gene,18502.0,21032.0,.,+,.,ID=maker-Scaffold_60920_SNP_7982493-exonerate_...
2,Scaffold_60920_SNP_7982493,maker,mRNA,18502.0,21032.0,37.611,+,.,ID=maker-Scaffold_60920_SNP_7982493-exonerate_...
3,Scaffold_60920_SNP_7982493,maker,exon,18502.0,18507.0,.,+,.,ID=maker-Scaffold_60920_SNP_7982493-exonerate_...
4,Scaffold_60920_SNP_7982493,maker,exon,18992.0,19300.0,.,+,.,ID=maker-Scaffold_60920_SNP_7982493-exonerate_...
...,...,...,...,...,...,...,...,...,...
52025,TTTGCATCCTGTAGTTTGTTCTTCACGGCAATTTGGTTGAGAGAAT...,,,,,,,,
52026,AATATTTGTAGAAATTTTCAAGCTTTCAGTAAAAGGTAACTATGTG...,,,,,,,,
52027,TAGGGTGATTCACTAAGATGATCTGGTCAATATGCAATTGAATCCC...,,,,,,,,
52028,CACGAGTAGAGGTGTCTCTTTGCCTGTGTGTACGTGTAAGAGAGAG...,,,,,,,,


In [28]:
#count number of genes annotated in second round for both subsets
df[df[2] == "gene"].shape

(95, 9)