# Double-checking significant SNPs in gene models after annotation

Routine to see if the significant SNP falls into a predicted gene model.

By exploring the GFF of the structural annotation, we can detect whether the SNP is inside the gene model.

1. Detect whether the `contig` has the size of the window (40001 for 20k).
2. Detect whether the gene model (or gene models) contains the SNP. The SNP is in the middle of the window, so a gene model that includes this middle point contains the SNP. For example: gene model 1 spans from 6553 to 11721, so the SNP located at 10001 is IN the gene model.

In [4]:
import pandas as pd

def find_snp_in_genemodel(gff, window_size):
    """List every gene model in a GFF file and flag those covering the central SNP.

    Each sequence in the GFF is treated as a window made of ``window_size``
    flanking bases on each side of the SNP (e.g. 40001 bases for
    ``window_size=20000``), so the SNP sits at 1-based position
    ``window_size + 1``. A gene model contains the SNP when
    ``start <= window_size + 1 <= end`` (GFF coordinates are 1-based and
    inclusive on both ends).

    Args:
        gff: Path to the tab-separated GFF annotation file.
        window_size: Number of flanking bases on each side of the SNP.

    Returns:
        pandas.DataFrame with one row per gene feature and the columns
        ``window`` (sequence id), ``geneModel`` (feature type plus a 1-based
        counter that restarts for every window), ``start`` and ``end`` (as
        written in the file, i.e. strings), ``SNP_in_geneModel`` (bool) and
        ``ID``. The frame is empty, with the same columns, when the file
        holds no gene features.

    Raises:
        OSError: If ``gff`` cannot be opened.
        ValueError: If a gene feature has non-integer coordinates.
    """
    columns = ["window", "geneModel", "start", "end", "SNP_in_geneModel", "ID"]
    snp_position = window_size + 1

    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append no longer exists in pandas >= 2.0 and copied the
    # whole frame on every call.
    rows = []
    n_gene_models = 0
    current_window = None

    with open(gff, "r") as handle:
        for line in handle:
            # Comments/directives carry no feature.
            if line.startswith("#"):
                continue

            # Strip the line ending so it does not leak into the last column.
            parts = line.rstrip("\r\n").split("\t")

            # A feature line has nine columns; anything shorter (blank
            # lines, embedded FASTA, truncated records) is skipped.
            if len(parts) < 9:
                continue

            window, feature = parts[0], parts[2]

            # Restart numbering on each contig record, and also when the
            # sequence id changes, so files without contig records (or with
            # genes listed before them) are still numbered per window.
            if "contig" in feature or window != current_window:
                n_gene_models = 0
                current_window = window

            if "gene" in feature:
                n_gene_models += 1
                snp_in_gene = int(parts[3]) <= snp_position <= int(parts[4])

                rows.append({"window": window,
                             "geneModel": f"{feature} {n_gene_models}",
                             "start": parts[3],
                             "end": parts[4],
                             "SNP_in_geneModel": snp_in_gene,
                             # Assumes the attribute column starts with "ID=".
                             "ID": parts[8].split(";")[0][3:]})

    return pd.DataFrame(rows, columns=columns)

In [5]:
# Round-1 annotation of the 20k windows: one row per gene model, flagged by
# whether it spans the SNP in the middle of its window.
result = find_snp_in_genemodel(gff="./round_1/20kwindows.all.round1.gff", window_size=20000)
# Bare expression as the last statement so the table is shown as the cell output.
result

Unnamed: 0,window,geneModel,start,end,SNP_in_geneModel,ID
0,Scaffold_24083_SNP_9244149,gene 1,19098,20634,True,maker-Scaffold_24083_SNP_9244149-exonerate_est...
1,Scaffold_61262_SNP_4536745,gene 1,24307,25092,False,maker-Scaffold_61262_SNP_4536745-exonerate_est...
2,Scaffold_61434_SNP_8380992,gene 1,16553,21721,True,maker-Scaffold_61434_SNP_8380992-exonerate_est...
3,Scaffold_4057_SNP_1779621,gene 1,19158,28766,True,maker-Scaffold_4057_SNP_1779621-exonerate_est2...
4,Scaffold_62050_SNP_2398634,gene 1,227,1673,False,maker-Scaffold_62050_SNP_2398634-exonerate_est...
5,Scaffold_62050_SNP_2398634,gene 2,11114,16070,False,maker-Scaffold_62050_SNP_2398634-exonerate_est...
6,Scaffold_9666_SNP_5105659,gene 1,12343,18354,False,maker-Scaffold_9666_SNP_5105659-exonerate_est2...
7,Scaffold_9666_SNP_5105659,gene 2,19225,21893,True,maker-Scaffold_9666_SNP_5105659-exonerate_est2...
8,Scaffold_9666_SNP_5105659,gene 3,29372,35927,False,maker-Scaffold_9666_SNP_5105659-exonerate_est2...
9,Scaffold_62932_SNP_4009105,gene 1,9271,15022,False,maker-Scaffold_62932_SNP_4009105-exonerate_est...


In [6]:
# Save the round-1 table as CSV; the relative path resolves against the current working directory.
result.to_csv("20kwindows.all.round1_snp_location.csv")

In [7]:
# Same check on the round-2 annotation of the 20k windows; `result` is overwritten.
result = find_snp_in_genemodel(gff="./round_2/20kwindows.all.round2.gff", window_size=20000)
# Bare expression as the last statement so the table is shown as the cell output.
result

Unnamed: 0,window,geneModel,start,end,SNP_in_geneModel,ID
0,Scaffold_60920_SNP_7982493,gene 1,18502,21032,True,maker-Scaffold_60920_SNP_7982493-exonerate_pro...
1,Scaffold_24083_SNP_9244149,gene 1,19098,20634,True,maker-Scaffold_24083_SNP_9244149-exonerate_est...
2,Scaffold_61262_SNP_4536745,gene 1,24307,25092,False,maker-Scaffold_61262_SNP_4536745-exonerate_est...
3,Scaffold_61262_SNP_4536745,gene 2,19654,20522,True,maker-Scaffold_61262_SNP_4536745-exonerate_pro...
4,Scaffold_61262_SNP_4536745,gene 3,15736,18966,False,maker-Scaffold_61262_SNP_4536745-exonerate_pro...
5,Scaffold_61262_SNP_4536745,gene 4,9970,10131,False,maker-Scaffold_61262_SNP_4536745-exonerate_pro...
6,Scaffold_63432_SNP_2670146,gene 1,16898,21420,True,maker-Scaffold_63432_SNP_2670146-exonerate_pro...
7,Scaffold_63432_SNP_2670146,gene 2,35861,36136,False,maker-Scaffold_63432_SNP_2670146-exonerate_pro...
8,Scaffold_61828_SNP_5464180,gene 1,3074,6845,False,maker-Scaffold_61828_SNP_5464180-exonerate_pro...
9,Scaffold_61828_SNP_5464180,gene 2,19739,21926,True,maker-Scaffold_61828_SNP_5464180-exonerate_pro...


In [8]:
# Save the round-2 table as CSV; the relative path resolves against the current working directory.
result.to_csv("20kwindows.all.round2_snp_location.csv")

## Double check for trichome significant SNPs

In [9]:
import pandas as pd

def find_snp_in_genemodel(gff, window_size):
    """List every gene model in a GFF file and flag those covering the central SNP.

    Each sequence in the GFF is treated as a window made of ``window_size``
    flanking bases on each side of the SNP (e.g. 40001 bases for
    ``window_size=20000``), so the SNP sits at 1-based position
    ``window_size + 1``. A gene model contains the SNP when
    ``start <= window_size + 1 <= end`` (GFF coordinates are 1-based and
    inclusive on both ends).

    Args:
        gff: Path to the tab-separated GFF annotation file.
        window_size: Number of flanking bases on each side of the SNP.

    Returns:
        pandas.DataFrame with one row per gene feature and the columns
        ``window`` (sequence id), ``geneModel`` (feature type plus a 1-based
        counter that restarts for every window), ``start`` and ``end`` (as
        written in the file, i.e. strings), ``SNP_in_geneModel`` (bool) and
        ``ID``. The frame is empty, with the same columns, when the file
        holds no gene features.

    Raises:
        OSError: If ``gff`` cannot be opened.
        ValueError: If a gene feature has non-integer coordinates.
    """
    columns = ["window", "geneModel", "start", "end", "SNP_in_geneModel", "ID"]
    snp_position = window_size + 1

    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append no longer exists in pandas >= 2.0 and copied the
    # whole frame on every call.
    rows = []
    n_gene_models = 0
    current_window = None

    with open(gff, "r") as handle:
        for line in handle:
            # Comments/directives carry no feature.
            if line.startswith("#"):
                continue

            # Strip the line ending so it does not leak into the last column.
            parts = line.rstrip("\r\n").split("\t")

            # A feature line has nine columns; anything shorter (blank
            # lines, embedded FASTA, truncated records) is skipped.
            if len(parts) < 9:
                continue

            window, feature = parts[0], parts[2]

            # Restart numbering on each contig record, and also when the
            # sequence id changes, so files without contig records (or with
            # genes listed before them) are still numbered per window.
            if "contig" in feature or window != current_window:
                n_gene_models = 0
                current_window = window

            if "gene" in feature:
                n_gene_models += 1
                snp_in_gene = int(parts[3]) <= snp_position <= int(parts[4])

                rows.append({"window": window,
                             "geneModel": f"{feature} {n_gene_models}",
                             "start": parts[3],
                             "end": parts[4],
                             "SNP_in_geneModel": snp_in_gene,
                             # Assumes the attribute column starts with "ID=".
                             "ID": parts[8].split(";")[0][3:]})

    return pd.DataFrame(rows, columns=columns)

In [10]:
# Round-1 annotation of the trichome 20k windows: one row per gene model,
# flagged by whether it spans the SNP in the middle of its window.
result = find_snp_in_genemodel(gff="./round_1/20kwindows_trichomes.all.round1.gff", window_size=20000)
# Bare expression as the last statement so the table is shown as the cell output.
result

Unnamed: 0,window,geneModel,start,end,SNP_in_geneModel,ID
0,Scaffold_62942_SNP_1452977,gene 1,16545,23466,True,maker-Scaffold_62942_SNP_1452977-exonerate_est...
1,Scaffold_51506_SNP_1002237,gene 1,36677,37864,False,maker-Scaffold_51506_SNP_1002237-exonerate_est...
2,Scaffold_56543_SNP_14798389,gene 1,24479,25737,False,maker-Scaffold_56543_SNP_14798389-exonerate_es...
3,Scaffold_23621_SNP_2202981,gene 1,4803,5891,False,maker-Scaffold_23621_SNP_2202981-exonerate_est...
4,Scaffold_23621_SNP_2202981,gene 2,26596,26867,False,maker-Scaffold_23621_SNP_2202981-exonerate_est...
5,Scaffold_19888_SNP_3195966,gene 1,18692,36002,True,maker-Scaffold_19888_SNP_3195966-exonerate_est...
6,Scaffold_62540_SNP_3710393,gene 1,19666,21219,True,maker-Scaffold_62540_SNP_3710393-exonerate_est...
7,Scaffold_62540_SNP_3710393,gene 2,21825,22360,False,maker-Scaffold_62540_SNP_3710393-exonerate_est...
8,Scaffold_62540_SNP_3710393,gene 3,21314,26032,False,maker-Scaffold_62540_SNP_3710393-exonerate_est...
9,Scaffold_62540_SNP_3710393,gene 4,29321,29713,False,maker-Scaffold_62540_SNP_3710393-exonerate_est...


In [3]:
# Save the trichome round-1 table as CSV; the relative path resolves against the current working directory.
result.to_csv("20kwindows_trichomes.all.round1_snp_location.csv")

In [4]:
# Same check on the round-2 annotation of the trichome 20k windows; `result` is overwritten.
result = find_snp_in_genemodel(gff="./round_2/20kwindows_trichomes.all.round2.gff", window_size=20000)
# Bare expression as the last statement so the table is shown as the cell output.
result

Unnamed: 0,window,geneModel,start,end,SNP_in_geneModel,ID
0,Scaffold_62942_SNP_1452977,gene 1,16545,23466,True,maker-Scaffold_62942_SNP_1452977-exonerate_est...
1,Scaffold_62942_SNP_1452977,gene 2,31121,31204,False,maker-Scaffold_62942_SNP_1452977-exonerate_pro...
2,Scaffold_51506_SNP_1002237,gene 1,36677,37864,False,maker-Scaffold_51506_SNP_1002237-exonerate_est...
3,Scaffold_56543_SNP_14798389,gene 1,24479,25737,False,maker-Scaffold_56543_SNP_14798389-exonerate_es...
4,Scaffold_56543_SNP_14798389,gene 2,19545,20135,True,maker-Scaffold_56543_SNP_14798389-exonerate_pr...
5,Scaffold_56543_SNP_14798389,gene 3,28023,29138,False,maker-Scaffold_56543_SNP_14798389-exonerate_pr...
6,Scaffold_56543_SNP_14798389,gene 4,29979,30260,False,maker-Scaffold_56543_SNP_14798389-exonerate_pr...
7,Scaffold_23621_SNP_2202981,gene 1,4803,5891,False,maker-Scaffold_23621_SNP_2202981-exonerate_est...
8,Scaffold_23621_SNP_2202981,gene 2,26596,26867,False,maker-Scaffold_23621_SNP_2202981-exonerate_est...
9,Scaffold_23621_SNP_2202981,gene 3,5887,6270,False,maker-Scaffold_23621_SNP_2202981-exonerate_pro...


In [5]:
# Save the trichome round-2 table as CSV; the relative path resolves against the current working directory.
result.to_csv("20kwindows_trichomes.all.round2_snp_location.csv")