Purpose: Test whether any of the core genes overlap with the meta-QTL identified in this paper: https://link.springer.com/article/10.1007/s11032-022-01294-9#Sec22<br>
Author: Anna Pardo<br>
Date initiated: Sept. 6, 2023

In [21]:
# import modules
import pandas as pd
import scipy.stats as stats

In [5]:
# load meta-QTL data
# NOTE: this table must be read from the Excel workbook, not from a CSV export
mqtl_path = "../../data/metaQTL.xlsx"
mqtl = pd.read_excel(mqtl_path)

In [6]:
# preview the first five rows of the meta-QTL table
mqtl.head(n=5)

Unnamed: 0,MQTL,Gene_name,Chr,Gene_description
0,MQTL1.1,Zm00001d027702,1.0,Zinc finger C2H2 type family protein expressed
1,MQTL1.1,Zm00001d027703,1.0,L-ornithine N5-acetyltransferase NATA1
2,MQTL1.1,Zm00001d027705,1.0,hypothetical protein ZEAMMB73.Zm00001d027705
3,MQTL1.1,Zm00001d027706,1.0,transducin family protein / WD-40 repeat famil...
4,MQTL1.1,Zm00001d027707,1.0,Cell division control protein 48 homolog D


In [7]:
# read the tab-separated core gene table (V5 gene IDs with their V4 equivalents)
cg = pd.read_csv(
    "../../data/All_coregenes_V5-to-V4.txt",
    sep="\t",
    header="infer",
)

In [8]:
# convert the core gene table into a dictionary:
#   key   = value in the first column (presumably GeneID_V5 -- confirm column order)
#   value = list of the string entries found in columns 1-5 of the same row
# Non-string cells (e.g. missing values) are skipped, so a gene with no V4
# match maps to an empty list. Rows are walked once with itertuples rather
# than doing one scalar .iloc lookup per cell, and the slice row[1:6] simply
# yields fewer values if the table has fewer than six columns.
cgdict = {}
for row in cg.itertuples(index=False, name=None):
    cgdict[row[0]] = [v4_id for v4_id in row[1:6] if isinstance(v4_id, str)]

In [9]:
# find core genes that fall inside a meta-QTL
#   cgqtl   -- V4 IDs of core genes that appear in mqtl["Gene_name"]
#   cgqtlv5 -- the V5 ID that each of those V4 IDs was mapped from (parallel list)
# The set of meta-QTL gene names is built once up front; membership tests
# against it are O(1) instead of rebuilding and scanning a list per V4 ID.
mqtl_gene_names = set(mqtl["Gene_name"].unique())

cgqtl = []
cgqtlv5 = []
for v5_id, v4_ids in cgdict.items():
    for v4_id in v4_ids:
        if v4_id in mqtl_gene_names:
            cgqtl.append(v4_id)
            cgqtlv5.append(v5_id)

In [10]:
# keep only the meta-QTL rows whose gene is one of the core-gene hits,
# then report how many rows remain
is_core_hit = mqtl["Gene_name"].isin(cgqtl)
cqtl = mqtl[is_core_hit]
cqtl.shape[0]

43

In [11]:
# read the tab-separated raw TPM table and preview its first rows
tpm = pd.read_csv(
    "../../data/rawtpm_bptreat_noPEG.tsv",
    sep="\t",
    header="infer",
)
tpm.head(n=5)

Unnamed: 0,Sample,BioProject,Treatment,Zm00001eb000010,Zm00001eb000020,Zm00001eb000050,Zm00001eb000060,Zm00001eb000070,Zm00001eb000080,Zm00001eb000100,...,Zm00001eb442810,Zm00001eb442820,Zm00001eb442840,Zm00001eb442850,Zm00001eb442870,Zm00001eb442890,Zm00001eb442910,Zm00001eb442960,Zm00001eb442980,Zm00001eb443030
0,SRR11933261,PRJNA637522,Drought,12.553818,2.321077,0.04252,12.932676,5.253755,11.105837,0.409268,...,0.171184,0.0,0.0,0.0,0.0,0.309501,0.0,0.0,0.0,0.0
1,SRR11933272,PRJNA637522,Drought,16.255838,3.110372,0.405226,7.214039,1.902461,2.346186,0.170305,...,0.108052,0.127878,0.0,0.0,0.0,6.703281,0.0,0.0,0.0,0.0
2,SRR11933250,PRJNA637522,Drought,9.028815,2.984479,0.0,3.092442,2.586555,16.186141,0.0,...,0.0,0.0,0.0,0.0,0.0,0.417565,0.0,0.254123,0.0,1.213349
3,SRR11933029,PRJNA637522,Control,8.20134,2.385748,0.0,1.726808,1.926412,19.600487,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.370075
4,SRR11933040,PRJNA637522,Drought,10.371251,2.799099,0.0,1.280629,3.771234,19.717683,0.143764,...,0.178304,0.012158,0.0,0.0,0.0,9.625225,0.0,0.0,0.0,2.352959


In [12]:
# reshape the TPM table: index by sample, drop the metadata columns,
# then transpose so each row is a gene and each column is a sample
tpm_by_sample = tpm.set_index("Sample").drop(columns=["BioProject", "Treatment"])
ttpm = tpm_by_sample.T.reset_index().rename(columns={"index": "GeneID"})
ttpm.head(n=5)

Sample,GeneID,SRR11933261,SRR11933272,SRR11933250,SRR11933029,SRR11933040,SRR11932822,SRR11932811,SRR11933230,SRR11932879,...,Ms71D3C,Ki3D1C,CML228D1D,CML333D3D,MO18WD3C,B73D3C,NC358D3C,P39D3D,M162WD3D,M162WD1D
0,Zm00001eb000010,12.553818,16.255838,9.028815,8.20134,10.371251,37.430009,39.925873,30.677016,23.393003,...,1.417104,1.923525,1.427602,9.580153,1.2281,2.966207,1.791556,4.286976,3.435711,3.498243
1,Zm00001eb000020,2.321077,3.110372,2.984479,2.385748,2.799099,27.508819,22.44068,24.648455,7.595576,...,0.0,1.799671,0.0,0.0,1.925157,0.561768,0.176413,0.781353,0.379497,0.463832
2,Zm00001eb000050,0.04252,0.405226,0.0,0.0,0.0,0.0,0.0,0.0,0.304751,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Zm00001eb000060,12.932676,7.214039,3.092442,1.726808,1.280629,29.510498,22.148225,22.170584,14.727189,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Zm00001eb000070,5.253755,1.902461,2.586555,1.926412,3.771234,7.005587,7.590336,5.274585,2.177748,...,0.0,0.451827,0.0,1.018369,0.0,0.0,0.0,0.0,1.660372,0.748587


In [13]:
# collect the distinct gene IDs from the transposed TPM table as a plain list
gid = ttpm["GeneID"].unique().tolist()

In [14]:
# export the distinct meta-QTL gene names, one per line, as the input file
# for the Ensembl Plants ID Converter
mqtl_unique_genes = mqtl["Gene_name"].unique()
with open("../../data/metaQTL_genes.txt","w+") as outfile:
    outfile.writelines(gene_name + "\n" for gene_name in mqtl_unique_genes)

In [15]:
# read the comma-separated results exported by the ID converter and preview them
idres = pd.read_csv(
    "../../data/Results-Zea_mays_Tools_IDMapper_-1.csv",
    sep=",",
    header="infer",
)
idres.head(n=5)

Unnamed: 0,Requested ID,Matched ID(s),Releases
0,Zm00001d052022,Zm00001eb192290,110: Zm00001eb192290.1104: Zm00001eb192290.1
1,Zm00001d005798,Zm00001eb099980,110: Zm00001eb099980.1104: Zm00001eb099980.1
2,Zm00001d021532,Zm00001eb321870,110: Zm00001eb321870.1104: Zm00001eb321870.1
3,Zm00001d027815,Zm00001eb005170,110: Zm00001eb005170.1104: Zm00001eb005170.1
4,Zm00001d022155,Zm00001eb327580,110: Zm00001eb327580.1104: Zm00001eb327580.1


In [16]:
# convert to a dictionary with V5 as key and V4 as value:
# the second column of idres supplies the keys and the first column the values;
# if a key occurs in several rows, the last row wins
qtldict = dict(zip(idres.iloc[:, 1], idres.iloc[:, 0]))

In [17]:
# flag every gene in the TPM table as core / non-core and as meta-QTL hit / non-hit
# Both lookup collections are turned into sets once, so each membership test
# is O(1) instead of rebuilding and scanning a list for every gene in gid.
core_v5_ids = set(cgdict)
mqtl_v5_ids = set(idres["Matched ID(s)"].unique())
# NOTE(review): matching is on the exact cell value of "Matched ID(s)"; if the
# converter ever returns several IDs in one cell they would not match -- confirm.

iscore = ["Y" if gene_id in core_v5_ids else "N" for gene_id in gid]
ismqtl = ["Y" if gene_id in mqtl_v5_ids else "N" for gene_id in gid]

# make dataframe: one row per gene with its two Y/N flags
gdf = pd.DataFrame({"GeneID": gid, "isCore": iscore, "isMQTLhit": ismqtl})
gdf.head()

Unnamed: 0,GeneID,isCore,isMQTLhit
0,Zm00001eb000010,N,N
1,Zm00001eb000020,N,N
2,Zm00001eb000050,N,N
3,Zm00001eb000060,N,N
4,Zm00001eb000070,N,N


In [19]:
# contingency table of gene counts: rows are meta-QTL hit status,
# columns are core-gene status
data = pd.crosstab(gdf["isMQTLhit"], gdf["isCore"])
data

isCore,N,Y
isMQTLhit,Unnamed: 1_level_1,Unnamed: 2_level_1
N,36624,821
Y,1521,38


In [22]:
# Fisher's exact test on the contingency table built above;
# prints the odds ratio and the p-value
odds_ratio, p_value = stats.fisher_exact(data)
print(f"odd ratio is : {odds_ratio!s}")
print(f"p value is : {p_value!s}")

odd ratio is : 1.1144921164596981
p value is : 0.48167573386593854
