In [None]:
import collections
import pandas as pd
import pathlib
import csv
import os
from BCBio import GFF
from BCBio.GFF import GFFExaminer
import pprint
import gffpandas.gffpandas as gffpd

In [None]:
# Function for truncating floats
def truncate(f, n):
    '''Truncate (or zero-pad) a number to n decimal places without rounding.

    The value is cut on its decimal string representation, so no binary
    floating-point arithmetic is involved in the truncation itself.

    Parameters
    ----------
    f : number
        Value to truncate.
    n : int
        Number of decimal places to keep.

    Returns
    -------
    float
        The truncated value. NaN and infinities are returned unchanged
        (as float) because they have no decimal expansion to cut.
    '''
    # Local import keeps this cell runnable on its own
    from decimal import Decimal

    s = '{}'.format(f)
    # Non-finite values would otherwise produce e.g. 'nan.00' and crash in float()
    if s.lower().lstrip('+-') in ('nan', 'inf', 'infinity'):
        return float(f)
    # Expand scientific notation into plain fixed-point digits so the same
    # string cut applies (no rounding, and the result stays a float)
    if 'e' in s or 'E' in s:
        s = format(Decimal(s), 'f')
    i, p, d = s.partition('.')
    # Pad with zeros so that fewer than n existing decimals are handled too
    inter = '.'.join([i, (d + '0' * n)[:n]])
    return float(inter)

In [None]:
# Per-chromosome LD value, kept in a plain dict keyed by chromosome name.
# Values according to [Sharma et al., 2018] (https://doi.org/10.1534/g3.118.200377).
# Cells further down multiply these values by 10**6 to get a window half-width.
LDs = dict(
    chr01=0.89,
    chr02=1.05,
    chr03=1.14,
    chr04=0.86,
    chr05=0.70,
    chr06=1.11,
    chr07=0.82,
    chr08=0.93,
    chr09=0.67,
    chr10=0.96,
    chr11=0.71,
    chr12=0.66,
)

In [None]:
# Exploratory load of one score table (scoresGen.csv)

# Input location, relative to the notebook
relPathString = "../../data/GWAS_results/"
path = relPathString + "scoresGen.csv"

# resolve(strict=True) raises if the file does not exist
inPathString = pathlib.Path(path)
inAbsPath = str(inPathString.resolve(strict=True))
# Read the table and order it by the column at position 5, highest value first
scoreFrame = pd.read_csv(inAbsPath)
scoreFrame = scoreFrame.sort_values(by=[scoreFrame.columns[5]], ascending=False)

In [None]:
# Keep the first ten rows of the sorted table and print them
scoreFrameBest = scoreFrame.head(10)
print(scoreFrameBest, '\n')

In [None]:
# Rank the table once per score column (positions 5 and 6), highest first
scoreFrameAlt = scoreFrame.sort_values(by=[scoreFrame.columns[5]], ascending=False)
scoreFrameRef = scoreFrame.sort_values(by=[scoreFrame.columns[6]], ascending=False)
# Preview the ten top rows of each ranking
for ranked in (scoreFrameAlt, scoreFrameRef):
    print(ranked.head(10), '\n')

In [None]:
# Rank once by each score column (position 5 = ALT, position 6 = REF), highest first
scoreSortedAlt = scoreFrame.sort_values(by=[scoreFrame.columns[5]], ascending=False)
scoreSortedRef = scoreFrame.sort_values(by=[scoreFrame.columns[6]], ascending=False)
# Check for lowest score between REF and ALT scores: the ranking whose 5th-best
# score is lower provides the threshold, taken midway between its 5th and 6th score
if truncate(scoreSortedAlt.iloc[4, 5], 2) < truncate(scoreSortedRef.iloc[4, 6], 2):
    bestTenThresh = truncate((scoreSortedAlt.iloc[4, 5] + scoreSortedAlt.iloc[5, 5]) / 2, 2)
else:
    bestTenThresh = truncate((scoreSortedRef.iloc[4, 6] + scoreSortedRef.iloc[5, 6]) / 2, 2)
print(f"Threshold for 10 best SNPs in the {list(scoreSortedRef.columns)[5][:5]} model: {bestTenThresh}")
# Series.append / DataFrame.append were removed in pandas 2.0; pd.concat stacks
# the five best rows of each ranking in the same order
bestList = pd.concat([scoreSortedAlt.iloc[:5, 0], scoreSortedRef.iloc[:5, 0]])
bestListTemplate = pd.concat([scoreSortedAlt.iloc[:5], scoreSortedRef.iloc[:5]])

In [None]:
# Display the stacked best-SNP rows held in bestListTemplate
bestListTemplate

## Determining and saving the 10 best SNPs per model and their associated LD SNPs

In [None]:
############################################
#                                          #
# Reading the score files for all 6 models #
#                                          #
############################################

# Relative input string for the pandas module
relPathString = "../../data/GWAS_results/"
pathAdd = relPathString + "scoresAdd.csv"
pathOne = relPathString + "scoresOne.csv"
pathTwo = relPathString + "scoresTwo.csv"
pathDipGen = relPathString + "scoresDipGen.csv"
pathDipAdd = relPathString + "scoresDipAdd.csv"
pathGen = relPathString + "scoresGen.csv"


#########################
#                       #
# Defining output paths #
#                       #
#########################

# Defining path for best scoring SNPs output
relSNPPath = "../../analysis/GWAS_results/"
pathSNPAdd = relSNPPath + "bestSNPsAdd.csv"
pathSNPOne = relSNPPath + "bestSNPsOne.csv"
pathSNPTwo = relSNPPath + "bestSNPsTwo.csv"
pathSNPDipGen = relSNPPath + "bestSNPsDipGen.csv"
pathSNPDipAdd = relSNPPath + "bestSNPsDipAdd.csv"
pathSNPGen = relSNPPath + "bestSNPsGen.csv"

# Defining path for LD coupled best scoring SNPs output
relLDSNPPath = "../../analysis/GWAS_results/"
pathLDSNPAdd = relLDSNPPath + "LDSNPsAdd.csv"
pathLDSNPOne = relLDSNPPath + "LDSNPsOne.csv"
pathLDSNPTwo = relLDSNPPath + "LDSNPsTwo.csv"
pathLDSNPDipGen = relLDSNPPath + "LDSNPsDipGen.csv"
pathLDSNPDipAdd = relLDSNPPath + "LDSNPsDipAdd.csv"
pathLDSNPGen = relLDSNPPath + "LDSNPsGen.csv"

# Creating dictionary of in- and output path triples:
# model key -> [score input, best-SNP output, LD-SNP output]
pathDict = {
    "Add": [pathAdd, pathSNPAdd, pathLDSNPAdd],
    "One": [pathOne, pathSNPOne, pathLDSNPOne],
    "Two": [pathTwo, pathSNPTwo, pathLDSNPTwo],
    "DipGen": [pathDipGen, pathSNPDipGen, pathLDSNPDipGen],
    "DipAdd": [pathDipAdd, pathSNPDipAdd, pathLDSNPDipAdd],
    "Gen": [pathGen, pathSNPGen, pathLDSNPGen]
}


def _snps_in_ld(candidates, seeds, score_idx):
    """Collect, for every seed SNP, the scored candidate rows inside its LD window.

    Column position 1 is read as the chromosome and position 2 as the SNP
    position, for both frames. The window half-width is LDs[chromosome] * 10**6.

    Parameters
    ----------
    candidates : pandas.DataFrame
        Score table to search.
    seeds : pandas.DataFrame
        High-scoring SNP rows whose neighbourhoods are searched.
    score_idx : int
        Column position of the score; rows whose score is NaN are skipped.

    Returns
    -------
    list of pandas.DataFrame
        One frame of hits per seed, in seed order, rows in `candidates` order.
    """
    # NaN must be tested with notna(); comparing a float with the string 'nan'
    # is always unequal and would keep every row
    scored = candidates[candidates.iloc[:, score_idx].notna()]
    chroms = scored.iloc[:, 1]
    positions = scored.iloc[:, 2]
    hits = []
    for _, seed in seeds.iterrows():
        seedChrom = seed.iloc[1]
        seedPos = seed.iloc[2]
        window = LDs[seedChrom] * 10**6
        inWindow = (
            (chroms == seedChrom)
            & (positions >= seedPos - window)
            & (positions <= seedPos + window)
        )
        hits.append(scored[inWindow])
    return hits


def _write_marker_row(outPath, markers):
    """Write the given marker names as one comma-separated row to outPath."""
    with open(outPath, 'w', newline='') as file:
        csv.writer(file, delimiter=',').writerow(markers)


# Sorting by score and identify 10 best SNPs and threshold for suggestive line in manhattan plot
for key, values in pathDict.items():
    # converting in- and output path to absolute path strings
    # (strict resolve raises if the input file is missing)
    inAbsPath = str(pathlib.Path(values[0]).resolve(strict=True))
    outAbsPath = os.path.abspath(values[1])
    outAbsPathLD = os.path.abspath(values[2])
    # Creating data frame from input file
    scoreFrame = pd.read_csv(inAbsPath)

    # All models with only one score column
    if values[0] in [pathAdd, pathDipAdd, pathDipGen, pathGen]:
        # Sorting the SNPs descending, NAs at the bottom
        scoreFrame = scoreFrame.sort_values(by=[scoreFrame.columns[5]], ascending=False)
        # Find and print the suggested threshold between 10th and 11th best ranking SNPs
        bestTenThresh = truncate((scoreFrame.iloc[9, 5] + scoreFrame.iloc[10, 5]) / 2, 2)
        print(f"Threshold for 10 best SNPs in the {list(scoreFrame.columns)[5]} model: {bestTenThresh}")
        bestListTemplate = scoreFrame.iloc[:10]
        ldHits = _snps_in_ld(scoreFrame, bestListTemplate, 5)

    # Simplex and duplex models
    else:
        scoreSortedAlt = scoreFrame.sort_values(by=[scoreFrame.columns[5]], ascending=False)
        scoreSortedRef = scoreFrame.sort_values(by=[scoreFrame.columns[6]], ascending=False)
        # Check for lowest score between REF and ALT scores
        if truncate(scoreSortedAlt.iloc[4, 5], 2) < truncate(scoreSortedRef.iloc[4, 6], 2):
            bestTenThresh = truncate((scoreSortedAlt.iloc[4, 5] + scoreSortedAlt.iloc[5, 5]) / 2, 2)
        else:
            bestTenThresh = truncate((scoreSortedRef.iloc[4, 6] + scoreSortedRef.iloc[5, 6]) / 2, 2)
        print(f"Threshold for 10 best SNPs in the {list(scoreSortedRef.columns)[5][:5]} model: {bestTenThresh}")
        bestAlt = scoreSortedAlt.iloc[:5]
        bestRef = scoreSortedRef.iloc[:5]
        bestListTemplate = pd.concat([bestAlt, bestRef])
        # Seeds picked via the ALT ranking are matched against ALT-scored SNPs,
        # seeds picked via the REF ranking against REF-scored SNPs
        ldHits = _snps_in_ld(scoreFrame, bestAlt, 5) + _snps_in_ld(scoreFrame, bestRef, 6)

    # Write output of best scoring SNPs (marker names only) for each model to file
    bestList = bestListTemplate.iloc[:, 0]
    _write_marker_row(outAbsPath, bestList)

    # Best SNPs first, then everything found in their LD windows. A seed always
    # lies in its own window and windows can overlap, so repeated rows are
    # dropped (first occurrence kept) to list every SNP once.
    LDBestList = pd.concat([bestListTemplate] + ldHits)
    LDBestList = LDBestList[~LDBestList.index.duplicated(keep='first')]
    # Write all SNPs in LD to output file
    markerNames = LDBestList.iloc[:, 0]
    _write_marker_row(outAbsPathLD, markerNames)

## Deriving table with best SNPs and additional info for MA

In [None]:
def LDlower(pos, chrom):
    """Return the lower edge of the LD window around `pos`, never below 0.

    Parameters
    ----------
    pos : number
        Position on the chromosome.
    chrom : str
        Chromosome name, "chr01" to "chr12"; any other key raises KeyError.

    Returns
    -------
    number
        pos minus the chromosome's LD value times 10**6, or 0 if that is negative.
    """
    # LD value for each chromosome
    ld_by_chrom = {
        "chr01": 0.89, "chr02": 1.05, "chr03": 1.14, "chr04": 0.86,
        "chr05": 0.70, "chr06": 1.11, "chr07": 0.82, "chr08": 0.93,
        "chr09": 0.67, "chr10": 0.96, "chr11": 0.71, "chr12": 0.66,
    }
    # Clamp at the chromosome start
    return max(pos - ld_by_chrom[chrom] * 10**6, 0)

def LDupper(pos, chrom):
    """Return the upper edge of the LD window around `pos`, capped at the chromosome length.

    Parameters
    ----------
    pos : number
        Position on the chromosome.
    chrom : str
        Chromosome name, "chr01" to "chr12"; any other key raises KeyError.

    Returns
    -------
    number
        pos plus the chromosome's LD value times 10**6, or the chromosome
        length if the sum would exceed it.
    """
    # LD value for each chromosome
    ld_by_chrom = {
        "chr01": 0.89, "chr02": 1.05, "chr03": 1.14, "chr04": 0.86,
        "chr05": 0.70, "chr06": 1.11, "chr07": 0.82, "chr08": 0.93,
        "chr09": 0.67, "chr10": 0.96, "chr11": 0.71, "chr12": 0.66,
    }
    # Chromosome lengths (according to reference genome), so the window
    # cannot run past the end of a chromosome
    chrom_lengths = {
        "chr01": 88663952, "chr02": 48614681, "chr03": 62290286,
        "chr04": 72208621, "chr05": 52070158, "chr06": 59532096,
        "chr07": 56760843, "chr08": 56938457, "chr09": 61540751,
        "chr10": 59756223, "chr11": 45475667, "chr12": 61165649,
    }
    return min(pos + ld_by_chrom[chrom] * 10**6, chrom_lengths[chrom])

In [None]:
# Look up the 'Position' value stored under index key 24042 in bestTenAdd.
# NOTE(review): bestTenAdd is first assigned in a cell further down, so this
# fails on a fresh top-to-bottom run; 24042 is a hard-coded row key - confirm.
bestTenAdd['Position'][24042]

In [None]:
# Lower (x) and upper (y) LD window bounds for the SNP stored under index key
# 24042, obtained from LDlower / LDupper (the notebook defines no function `LD`).
# NOTE(review): bestTenAdd is first assigned in a cell further down, so this
# still needs that cell to have run.
x = LDlower(bestTenAdd['Position'][24042], bestTenAdd['Chrom'][24042])
y = LDupper(bestTenAdd['Position'][24042], bestTenAdd['Chrom'][24042])

In [None]:
# List the column names of bestTenAdd
list(bestTenAdd.columns)

In [None]:
############################################
#                                          #
# Reading the score files for all 6 models #
#                                          #
############################################

# Relative input string for the pandas module
relPathString = "../../data/GWAS_results/"
pathAdd = relPathString + "scoresAdd.csv"
pathOne = relPathString + "scoresOne.csv"
pathTwo = relPathString + "scoresTwo.csv"
pathDipGen = relPathString + "scoresDipGen.csv"
pathDipAdd = relPathString + "scoresDipAdd.csv"
pathGen = relPathString + "scoresGen.csv"


def _readScores(path):
    """Read one score CSV into a DataFrame; raises if the file does not exist."""
    return pd.read_csv(str(pathlib.Path(path).resolve(strict=True)))


def _bestTable(sortedScores, scoreIdx, nBest):
    """Build the annotated table of the nBest top rows of an already sorted score frame.

    Adds a 'Model' column holding the name of the score column at position
    scoreIdx, renames that score column to 'Score' (truncated to two decimals)
    and adds the 'LD-lower' / 'LD-upper' window bounds computed from the
    'Position' and 'Chrom' columns.
    """
    scoreName = sortedScores.columns[scoreIdx]
    # Work on a copy: assigning columns on an iloc slice triggers
    # SettingWithCopyWarning and may not stick
    best = sortedScores.iloc[:nBest].copy()
    best['Model'] = scoreName
    # Rename by the actual column name instead of a hard-coded header
    best = best.rename(columns={scoreName: "Score"})
    best['Score'] = best['Score'].map(lambda score: truncate(score, 2))
    best['LD-lower'] = best.apply(lambda row: LDlower(row['Position'], row['Chrom']), axis=1)
    best['LD-upper'] = best.apply(lambda row: LDupper(row['Position'], row['Chrom']), axis=1)
    return best


def _singleScoreModel(path):
    """Return (ten-best table, threshold) for a model with one score column (position 5)."""
    scores = _readScores(path)
    scores = scores.sort_values(by=[scores.columns[5]], ascending=False)
    # Suggested threshold: midway between the 10th and 11th best ranking SNPs
    thresh = truncate((scores.iloc[9, 5] + scores.iloc[10, 5]) / 2, 2)
    print(f"Threshold for 10 best SNPs in the {list(scores.columns)[5]} model: {thresh}")
    return _bestTable(scores, 5, 10), thresh


def _altRefModel(path):
    """Return (five-best ALT table, five-best REF table, threshold) for a model
    with an alternative-dominant (position 5) and a reference-dominant
    (position 6) score column."""
    scores = _readScores(path)
    byAlt = scores.sort_values(by=[scores.columns[5]], ascending=False)
    byRef = scores.sort_values(by=[scores.columns[6]], ascending=False)
    # The ranking whose 5th-best score is lower provides the threshold,
    # taken midway between its 5th and 6th score
    if truncate(byAlt.iloc[4, 5], 2) < truncate(byRef.iloc[4, 6], 2):
        thresh = truncate((byAlt.iloc[4, 5] + byAlt.iloc[5, 5]) / 2, 2)
    else:
        thresh = truncate((byRef.iloc[4, 6] + byRef.iloc[5, 6]) / 2, 2)
    print(f"Threshold for 10 best SNPs in the {list(byRef.columns)[5][:5]} model: {thresh}")
    return _bestTable(byAlt, 5, 5), _bestTable(byRef, 6, 5), thresh


##################
#                #
# Additive model #
#                #
##################

bestTenAdd, bestTenThreshAdd = _singleScoreModel(pathAdd)

#################
#               #
# Simplex model #
#               #
#################

# Alternative dominant first, then reference dominant
bestTenOneAlt, bestTenOneRef, bestTenThresh = _altRefModel(pathOne)
# Concatenating both models
bestTenOne = pd.concat([bestTenOneAlt, bestTenOneRef])

################
#              #
# Duplex model #
#              #
################

bestTenTwoAlt, bestTenTwoRef, bestTenThresh = _altRefModel(pathTwo)
# Concatenating both models
bestTenTwo = pd.concat([bestTenTwoAlt, bestTenTwoRef])

#########################
#                       #
# Diploid general model #
#                       #
#########################

bestTenDipGen, bestTenThresh = _singleScoreModel(pathDipGen)

##########################
#                        #
# Diploid additive model #
#                        #
##########################

bestTenDipAdd, bestTenThresh = _singleScoreModel(pathDipAdd)

#################
#               #
# General model #
#               #
#################

bestTenGen, bestTenThresh = _singleScoreModel(pathGen)

# Stack all models. A single-score table comes first, so its columns lead the
# combined frame; keeping the first nine columns then drops whatever only the
# simplex/duplex tables contribute.
# NOTE(review): assumes the single-score tables have exactly nine columns after
# annotation (six in the input file) - confirm against the CSV headers.
bigFrame = pd.concat([bestTenAdd, bestTenOne, bestTenTwo, bestTenDipGen, bestTenDipAdd, bestTenGen])
bigFrame = bigFrame.iloc[:, 0:9]

In [None]:
# Show the combined table ordered by its second and third columns, ascending
bigFrame.sort_values(by=list(bigFrame.columns[1:3]), ascending=True)

In [None]:
# Count how many times each marker occurs in the combined table
bigFrame['Marker'].value_counts()

In [None]:
# Show the general-model table ordered by its second and third columns, ascending
bestTenGen.sort_values(by=list(bestTenGen.columns[1:3]), ascending=True)

## Identifying candidate genes from the annotation file

In [None]:
# Defining 'high scoring clusters'
# One row per cluster: name, chromosome, start, end and length


def chrom(string):
    """Return the first five characters of a cluster name, i.e. its chromosome prefix."""
    return string[:5]


clusters = pd.DataFrame({
    'Name': [
        'chr01.1', 'chr01.2', 'chr01.3',
        'chr02.1', 'chr02.2', 'chr02.3', 'chr02.4', 'chr02.5',
        'chr03.1', 'chr03.2',
        'chr07.1',
        'chr09.1', 'chr09.2', 'chr09.3', 'chr09.4',
        'chr10.1', 'chr10.2',
        'chr11.1',
        'chr12.1', 'chr12.2', 'chr12.3', 'chr12.4',
    ]
})
# Chromosome name derived from the cluster name
clusters['Chromosome'] = clusters['Name'].map(chrom)

# Start and end of the clusters, in the same order as the names above
clusters['Start'] = [
    42240780, 66206226, 67554192,
    24815002, 27617140, 37655544, 42028257, 43512360,
    45083029, 51893032,
    52226902,
    4882536, 5819191, 53405466, 54899227,
    49070040, 53903541,
    42632639,
    750038, 58560145, 59432597, 60343136,
]
clusters['End'] = [
    44020780, 68228593, 69510680,
    26915002, 30659098, 39755544, 44128257, 45612361,
    47363029, 54173032,
    53866902,
    6266242, 7780360, 54745466, 56467680,
    50990040, 55823541,
    44052639,
    2070038, 59880145, 60752597, 61165649,
]
clusters['Length'] = clusters['End'].sub(clusters['Start'])

In [None]:
# Display the cluster table
clusters

In [None]:
# Storing starch-related genes and transcripts from [Van Harsselaar et al., 2017] (https://doi.org/10.1186/s12864-016-3381-z)
# Each entry is a three-item list of strings:
#   [0] gene identifier       (PGSC0003DMG...)
#   [1] transcript identifier (PGSC0003DMT...)
#   [2] description with the short gene name in parentheses
# Only item [0] is read in this cell (to build PGSCs at the bottom).
starchGenes = [
    ['PGSC0003DMG400009026', 'PGSC0003DMT400023304', 'ADP-glucose pyrophosphorylaselarge subunit 1 (AGPL1)'],
    ['PGSC0003DMG400015952', 'PGSC0003DMT400041215', 'ADP-glucose pyrophosphorylaselarge subunit 2 (AGPL2)'],
    ['PGSC0003DMG400000735', 'PGSC0003DMT400001935', 'ADP-glucose pyrophosphorylaselarge subunit 3 (AGPL3)'],
    ['PGSC0003DMG400031084', 'PGSC0003DMT400079823', 'ADP-glucose pyrophosphorylasesmall subunit 1.1 (AGPS1.1)'],
    ['PGSC0003DMG400046891', 'PGSC0003DMT400097320', 'ADP-glucose pyrophosphorylasesmall subunit 1.2 (AGPS1.2)'],
    ['PGSC0003DMG400025218', 'PGSC0003DMT400064936', 'ADP-glucose pyrophosphorylasesmall subunit 2 (AGPS2)'],
    ['PGSC0003DMG400007974', 'PGSC0003DMT400020591', 'Alpha-amylase 1.1 (AMY1.1)'],
    ['PGSC0003DMG400020603', 'PGSC0003DMT400053110', 'Alpha-amylase 1.2 (AMY1.2)'],
    ['PGSC0003DMG400009891', 'PGSC0003DMT400025601', 'Alpha-amylase 2 (AMY23)'],
    ['PGSC0003DMG401017626', 'PGSC0003DMT400045435', 'Alpha-amylase 3 (AMY3)'],
    ['PGSC0003DMG400007782', 'PGSC0003DMT400020094', 'Alpha-glucan phosphorylase 1a(PHO1a)'],
    ['PGSC0003DMG400003495', 'PGSC0003DMT400008970', 'Alpha-glucan phosphorylase 1a(PHO1a)'],
    ['PGSC0003DMG400002479', 'PGSC0003DMT400006337', 'Alpha-glucan phosphorylase 1a(PHO1a)'],
    ['PGSC0003DMG400028382', 'PGSC0003DMT400072963', 'Alpha-glucan phosphorylase 1b(PHO1b)'],
    ['PGSC0003DMG400031765', 'PGSC0003DMT400081273', 'Alpha-glucan phosphorylase 2b(PHO2b)'],
    ['PGSC0003DMG400005612', 'PGSC0003DMT400014304', 'ATP-ADP antiporter 1 (NTT1)'],
    ['PGSC0003DMG400028641', 'PGSC0003DMT400073724', 'ATP-ADP antiporter 2 (NTT2)'],
    ['PGSC0003DMG400001549', 'PGSC0003DMT400003933', 'Beta-amylase 1 (BAM1)'],
    ['PGSC0003DMG400024145', 'PGSC0003DMT400062050', 'Beta-amylase 2 (BAM2)'],
    ['PGSC0003DMG400001855', 'PGSC0003DMT400004686', 'Beta-amylase 3.1 (BAM3.1)'],
    ['PGSC0003DMG402020509', 'PGSC0003DMT400052839', 'Beta-amylase 3.2 (BAM3.2)'],
    ['PGSC0003DMG400012129', 'PGSC0003DMT400031627', 'Beta-amylase 4 (BAM4)'],
    ['PGSC0003DMG400026199', 'PGSC0003DMT400067403', 'Beta-amylase 6.1 (BAM6.1)'],
    ['PGSC0003DMG400026166', 'PGSC0003DMT400067289', 'Beta-amylase 6.2 (BAM6.2)'],
    ['PGSC0003DMG400026198', 'PGSC0003DMT400067400', 'Beta-amylase 6.3 (BAM6.3)'],
    ['PGSC0003DMG400000169', 'PGSC0003DMT400000485', 'Beta-amylase 7 (BAM7)'],
    ['PGSC0003DMG400010664', 'PGSC0003DMT400027659', 'Beta-amylase 9 (BAM9)'],
    ['PGSC0003DMG400022307', 'PGSC0003DMT400057446', 'Branching enzyme I.1 (SBE1.1)'],
    ['PGSC0003DMG400009981', 'PGSC0003DMT400025846', 'Branching enzyme III (SBE3)'],
    ['PGSC0003DMG400016589', 'PGSC0003DMT400042739', 'Disproportionating enzyme 1 (DPE1)'],
    ['PGSC0003DMG400007677', 'PGSC0003DMT400019845', 'Glucan water dikinase (GWD)'],
    ['PGSC0003DMG400026402', 'PGSC0003DMT400067884', 'Glucose transporter (GLT1)'],
    ['PGSC0003DMG400001041', 'PGSC0003DMT400002701', 'Glucose-6-phosphate translocator 1.1(GPT1.1)'],
    ['PGSC0003DMG400005602', 'PGSC0003DMT400014284', 'Glucose-6-phosphate translocator 1.1(GPT1.1)'],
    ['PGSC0003DMG400005269', 'PGSC0003DMT400013500', 'Glucose-6-phosphate translocator 2.1(GPT2.1)'],
    ['PGSC0003DMG400025495', 'PGSC0003DMT400065527', 'Glucose-6-phosphate translocator 2.2(GPT2.2)'],
    ['PGSC0003DMG400012111', 'PGSC0003DMT400031568', 'Granule bound starch synthase 1(GBSS1)'],
    ['PGSC0003DMG400003103', 'PGSC0003DMT400008028', 'Inorganic pyrophosphatase (PPase)'],
    ['PGSC0003DMG400026784', 'PGSC0003DMT400068875', 'Inorganic pyrophosphatase-like(PPase-like)'],
    ['PGSC0003DMG400020699', 'PGSC0003DMT400053345', 'Isoamylase 1.1 (ISA1.1)'],
    ['PGSC0003DMG400030253', 'PGSC0003DMT400077770', 'Isoamylase 1.2 (ISA 1.2)'],
    ['PGSC0003DMG400000954', 'PGSC0003DMT400002502', 'Isoamylase 2 (ISA2)'],
    ['PGSC0003DMG402007274', 'PGSC0003DMT400018766', 'Isoamylase 3 (ISA3)'],
    ['PGSC0003DMG401007274', 'PGSC0003DMT400018765', 'Isoamylase 3 (ISA3)'],
    ['PGSC0003DMG400024812', 'PGSC0003DMT400063824', 'Maltose excess 1 (MEX1)'],
    ['PGSC0003DMG400030092', 'PGSC0003DMT400077364', 'Phosphoglucan phosphatase(like SEX four 1, LSF1)'],
    ['PGSC0003DMG400029073', 'PGSC0003DMT400074765', 'Phosphoglucan phosphatase(like SEX four 2, LSF2)'],
    ['PGSC0003DMG400015246', 'PGSC0003DMT400039423', 'Phosphoglucan phosphatase (SEX4)'],
    ['PGSC0003DMG400027327', 'PGSC0003DMT400070294', 'Phosphoglucan phosphatase(SEX4-like)'],
    ['PGSC0003DMG400016613', 'PGSC0003DMT400042818', 'Phosphoglucan water dikinase(PWD)'],
    ['PGSC0003DMG400012910', 'PGSC0003DMT400033620', 'Phosphoglucoisomerase (PGI)'],
    ['PGSC0003DMG400015341', 'PGSC0003DMT400039665', 'Phosphoglucoisomerase-like 1(PGI-like1)'],
    ['PGSC0003DMG400030128', 'PGSC0003DMT400077470', 'Phosphoglucoisomerase-like 2(PGI-like2)'],
    ['PGSC0003DMG402018552', 'PGSC0003DMT400047731', 'Starch Synthase I (SS1)'],
    ['PGSC0003DMG400001328', 'PGSC0003DMT400003356', 'Starch Synthase II (SS2)'],
    ['PGSC0003DMG400016481', 'PGSC0003DMT400042496', 'Starch Synthase III (SS3)'],
    ['PGSC0003DMG400008322', 'PGSC0003DMT400021444', 'Starch Synthase IV (SS4)'],
    ['PGSC0003DMG400030619', 'PGSC0003DMT400078688', 'Starch Synthase V (SS5)'],
    ['PGSC0003DMG402013540', 'PGSC0003DMT400035218', 'Starch Synthase VI (SS6)'],
    ['PGSC0003DMG400013547', 'PGSC0003DMT400035264', 'Sucrose Synthase 1 (SuSy1)'],
    ['PGSC0003DMG400013546', 'PGSC0003DMT400035262', 'Sucrose Synthase 2 (SuSy2)'],
    ['PGSC0003DMG400006672', 'PGSC0003DMT400017087', 'Sucrose Synthase 3 (SuSy3)'],
    ['PGSC0003DMG400002895', 'PGSC0003DMT400007506', 'Sucrose Synthase 4 (SuSy4)'],
    ['PGSC0003DMG400031046', 'PGSC0003DMT400079728', 'Sucrose Synthase 6 (SuSy6)'],
    ['PGSC0003DMG400016730', 'PGSC0003DMT400043117', 'Sucrose Synthase 7 (SuSy7)'],
    ['PGSC0003DMG400022832', 'PGSC0003DMT400058772', 'Triose-phosphate/phosphatetranslocator (TPT)'],
    ['PGSC0003DMG401013333', 'PGSC0003DMT400034699', 'UDP-glucose pyrophosphorylase 2(UGPase2)'],
    ['PGSC0003DMG401010374', 'PGSC0003DMT400026885', 'Vacuolar Glucose Transporter 3-like(VGT3-like)']
]

# Gene identifiers only (item [0] of every entry), used further down to pick
# the starch genes out of the gene IDs parsed from the GFF3 file
PGSCs = [gene[0] for gene in starchGenes]

In [None]:
# Location of the annotation file, relative to the notebook
inPath = "../../data/Reference_genome_and_annotation_file/pgsc_unducked.gff"

# Read the annotation from its absolute path
absInPath = os.path.abspath(inPath)
gffFile = gffpd.read_gff3(absInPath)

# Keep only the rows whose feature type is 'gene'
isGene = gffFile.df['type'].eq('gene')
gffFileGenes = gffFile.df[isGene]

In [None]:
# Display the annotation rows whose feature type is 'gene'
gffFile.df[gffFile.df['type'] == 'gene']

In [None]:
def IDextract(attrString):
    """Return the value of the ID attribute of a GFF3 attributes string.

    The string is a ';'-separated list of key=value pairs. The 'ID' key is
    looked up wherever it occurs, because the attribute order is not fixed.
    If no 'ID' key is present, the value of the first pair is returned (the
    behaviour this function had before the key lookup was added).

    Parameters
    ----------
    attrString : str
        Attributes column of one GFF3 record, e.g. 'ID=gene1;Name=abc'.

    Returns
    -------
    str
        The identifier.

    Raises
    ------
    IndexError
        If there is no 'ID' key and the first field contains no '='.
    """
    fields = attrString.split(';')
    for field in fields:
        key, sep, value = field.strip().partition('=')
        if sep and key == 'ID':
            return value
    # Fallback: value of the first key=value pair
    return fields[0].split('=')[1]

In [None]:
# For every cluster, print the starch-related genes that lie completely inside it.
# Columns are addressed by name (integer keys on a row Series rely on a
# deprecated positional fallback), and the loop variables are named so that
# they do not overwrite the chrom() helper defined with the cluster table.
for cluster in clusters.itertuples(index=False):
    # Extracting genes only belonging to the chromosome and region of interest
    inCluster = (
        (gffFileGenes['seq_id'] == cluster.Chromosome)
        & (gffFileGenes['start'] >= cluster.Start)
        & (gffFileGenes['end'] <= cluster.End)
    )
    # Copy before adding a column so the assignment does not target a slice
    gffFileGenesSub = gffFileGenes[inCluster].copy()
    # Create new data frame column containing only the gene IDs for filtering starch genes
    # (Series.map also works when the cluster contains no genes)
    gffFileGenesSub['geneID'] = gffFileGenesSub['attributes'].map(IDextract)
    gffFileGenesSubStarch = gffFileGenesSub[gffFileGenesSub['geneID'].isin(PGSCs)]
    print(cluster.Name)
    print(gffFileGenesSubStarch.shape)
    print(gffFileGenesSubStarch, '\n')

In [None]:
# Number of annotated genes lying completely inside each cluster (same
# chromosome, start and end within the cluster bounds). The division below
# needs this column, and no other cell creates it.
clusters['numGenes'] = [
    int((
        (gffFileGenes['seq_id'] == row.Chromosome)
        & (gffFileGenes['start'] >= row.Start)
        & (gffFileGenes['end'] <= row.End)
    ).sum())
    for row in clusters.itertuples(index=False)
]
# Gene count divided by the cluster length expressed in units of 10**6
clusters['GenperMB'] = clusters['numGenes'] / (clusters['Length']/10**6)

In [None]:
# Display the cluster table
clusters

In [None]:
# Display the cluster lengths divided by 10**6
clusters['Length']/(10**6)

In [None]:
# Getting some basic stats from the GFF3 file
# Creating in- and output file names
inPath = "../../data/Reference_genome_and_annotation_file/pgsc_unducked.gff"

# Create absolute path and read the file
absInPath = os.path.abspath(inPath)

examiner = GFFExaminer()
# The with-block closes the file even if the examiner raises
with open(absInPath) as in_handle:
    pprint.pprint(examiner.available_limits(in_handle))