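Purpose: Convert the corrected log TPM tables back to TPM and write them out, in full and subset to core genes, in the formats needed for WGCNA.<br>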
Author: Anna Pardo<br>
Date initiated: Apr. 1, 2024

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Read the corrected log TPM table for the "psyn_tissues" sample subset
psyncor = pd.read_csv(
    "../../data/BPcombat_logTPM_psyn_tissues_forRF_1-Mar-2024.tsv",
    sep="\t",
)
psyncor.head()

Unnamed: 0,Sample,Zm00001eb000010,Zm00001eb000020,Zm00001eb000050,Zm00001eb000060,Zm00001eb000070,Zm00001eb000080,Zm00001eb000100,Zm00001eb000110,Zm00001eb000120,...,Zm00001eb442840,Zm00001eb442850,Zm00001eb442870,Zm00001eb442890,Zm00001eb442910,Zm00001eb442960,Zm00001eb442980,Zm00001eb443030,Treatment,BioProject
0,SRR11933261,3.676968,1.425305,0.019366,3.454238,2.46477,3.181359,0.480547,-0.043836,1.14804,...,0.004322,-6.9e-05,-0.025771,0.197944,-0.006917,0.030309,-0.000506,0.127112,Drought,PRJNA637522
1,SRR11933272,4.041835,1.692175,0.546391,2.778114,1.452964,1.410374,0.260699,-0.043836,1.765632,...,0.004322,-6.9e-05,-0.025771,2.373048,-0.006917,0.030309,-0.000506,0.127112,Drought,PRJNA637522
2,SRR11933250,3.221863,1.653241,-0.05414,1.886622,1.731918,3.663976,0.074623,-0.043836,1.59469,...,0.004322,-6.9e-05,-0.025771,0.295277,-0.006917,0.303918,-0.000506,1.162604,Drought,PRJNA637522
3,SRR11933029,3.09175,1.449443,-0.05414,1.367097,1.463797,3.913553,0.074623,-0.043836,1.446212,...,0.004322,-6.9e-05,-0.025771,-0.133042,-0.006917,0.030309,-0.000506,1.25177,Control,PRJNA637522
4,SRR11933040,3.411677,1.593612,-0.05414,1.138459,2.108122,3.921366,0.233557,-0.043836,1.758469,...,0.004322,-6.9e-05,-0.025771,2.767788,-0.006917,0.030309,-0.000506,1.703918,Drought,PRJNA637522


In [3]:
# Read the corrected log TPM table covering all samples
allcor = pd.read_csv(
    "../../data/BPcombat_logTPM_forRF_1-Mar-2024.tsv",
    sep="\t",
)
allcor.head()

Unnamed: 0,Sample,Zm00001eb000010,Zm00001eb000020,Zm00001eb000050,Zm00001eb000060,Zm00001eb000070,Zm00001eb000080,Zm00001eb000100,Zm00001eb000110,Zm00001eb000120,...,Zm00001eb442840,Zm00001eb442850,Zm00001eb442870,Zm00001eb442890,Zm00001eb442910,Zm00001eb442960,Zm00001eb442980,Zm00001eb443030,BioProject,Treatment
0,SRR11933261,3.594095,2.519998,-0.007294,3.905214,1.951832,3.598633,0.449177,0.114911,0.941737,...,-0.003964,0.000153,-0.023113,0.303027,-0.006205,0.060129,-0.003772,-0.020035,PRJNA637522,Drought
1,SRR11933272,3.89444,2.732825,0.381698,3.24937,1.059086,1.839096,0.24881,0.114911,1.555088,...,-0.003964,0.000153,-0.023113,3.022989,-0.006205,0.060129,-0.003772,-0.020035,PRJNA637522,Drought
2,SRR11933250,3.219469,2.701776,-0.061548,2.384618,1.305216,4.078129,0.07922,0.114911,1.38532,...,-0.003964,0.000153,-0.023113,0.424741,-0.006205,0.332574,-0.003772,1.026435,PRJNA637522,Drought
3,SRR11933029,3.112365,2.539248,-0.061548,1.880675,1.068644,4.326093,0.07922,0.114911,1.237862,...,-0.003964,0.000153,-0.023113,-0.110871,-0.006205,0.060129,-0.003772,1.116546,PRJNA637522,Control
4,SRR11933040,3.375717,2.654222,-0.061548,1.658896,1.637151,4.333856,0.224072,0.114911,1.547974,...,-0.003964,0.000153,-0.023113,3.516611,-0.006205,0.060129,-0.003772,1.573487,PRJNA637522,Drought


In [4]:
# replace DroughtRepeat with Drought
# Assign through .loc on the parent frame instead of calling
# Series.mask(..., inplace=True) on df["Treatment"]: an inplace method on a
# selected column is chained assignment, which raises a FutureWarning in
# pandas 2.2 and does not update the frame under Copy-on-Write.
allcor.loc[allcor["Treatment"] == "DroughtRepeat", "Treatment"] = "Drought"
psyncor.loc[psyncor["Treatment"] == "DroughtRepeat", "Treatment"] = "Drought"

In [5]:
# Drop the BioProject and Treatment metadata columns, then index each table by Sample
bpall_log = allcor.drop(columns=["BioProject", "Treatment"]).set_index("Sample")
bpsyn_log = psyncor.drop(columns=["BioProject", "Treatment"]).set_index("Sample")

In [6]:
# undo the log2 transformation: TPM = 2**x - 1
# (assumes the stored values are log2(TPM + 1) -- TODO confirm against the upstream notebook)
# Frame-level arithmetic is already element-wise, so no per-column apply/lambda is needed.
# NOTE(review): any corrected value below 0 in log space back-transforms to a
# negative TPM; confirm that is acceptable for the downstream WGCNA input.
bpall = 2 ** bpall_log - 1

In [7]:
# undo the log2 transformation for the psyn-tissue table: TPM = 2**x - 1
# (assumes the stored values are log2(TPM + 1) -- TODO confirm against the upstream notebook)
# Frame-level arithmetic is already element-wise, so no per-column apply/lambda is needed.
bpsyn = 2 ** bpsyn_log - 1

In [8]:
# load core genes lists
psyn_cg_info = pd.read_csv("../../data/core_genes_psyn_info_30-Mar-2024.tsv",sep="\t",header="infer")
all_cg_info = pd.read_csv("../../data/core_genes_all_info_30-Mar-2024.tsv",sep="\t",header="infer")

In [9]:
# transpose dataframes
bpallt = bpall.transpose().reset_index().rename(columns={"index":"GeneID"})
bpsyntt = bpsyn.transpose().reset_index().rename(columns={"index":"GeneID"})

In [12]:
# save bpallt and bpsyntt as tab-separated files, without the row index
bpallt.to_csv("../../data/BPcombat_TPM_allsamp_WGCNA.tsv", sep="\t", index=False)
bpsyntt.to_csv("../../data/BPcombat_TPM_psyntissues_WGCNA.tsv", sep="\t", index=False)

In [13]:
# preview the first rows of the transposed psyn-tissue table
bpsyntt.head()

Sample,GeneID,SRR11933261,SRR11933272,SRR11933250,SRR11933029,SRR11933040,SRR11932822,SRR11932811,SRR11933230,SRR11932879,...,Ms71D3C,Ki3D1C,CML228D1D,CML333D3D,MO18WD3C,B73D3C,NC358D3C,P39D3D,M162WD3D,M162WD1D
0,Zm00001eb000010,11.790213,15.470756,8.329908,7.525298,9.641847,37.097675,39.692909,30.117299,22.667614,...,7.396586,8.390767,7.418021,19.012579,7.003887,10.236449,8.139028,12.306526,11.000854,11.100092
1,Zm00001eb000020,1.685713,2.231434,2.145396,1.731026,2.018041,16.34113,13.632856,14.82129,5.128416,...,0.546177,4.394101,0.546177,0.546177,4.688934,1.65616,0.883209,2.116003,1.284758,1.455377
2,Zm00001eb000050,0.013514,0.460427,-0.036832,-0.036832,-0.036832,-0.036832,-0.036832,-0.036832,0.333701,...,0.167602,0.167602,0.167602,0.167602,0.167602,0.167602,0.167602,0.167602,0.167602,0.167602
3,Zm00001eb000060,9.960474,5.859552,2.697684,1.579509,1.201458,20.966336,16.194328,16.209057,11.20386,...,5.449363,5.449363,5.449363,5.449363,5.449363,5.449363,5.449363,5.449363,5.449363,5.449363
4,Zm00001eb000070,4.520388,1.7377,2.321692,1.758333,3.311296,5.917663,6.377891,4.537185,1.973996,...,1.138125,2.779467,1.138125,5.252562,1.138125,1.138125,1.138125,1.138125,8.535047,4.021651


In [14]:
# Keep only the rows whose GeneID appears in the psyn core-gene table
bpsyn_core = bpsyntt.loc[bpsyntt["GeneID"].isin(psyn_cg_info["GeneID"])]

In [15]:
# Keep only the rows whose GeneID appears in the all-sample core-gene table
bpall_core = bpallt.loc[bpallt["GeneID"].isin(all_cg_info["GeneID"])]

In [16]:
# preview the first rows of the core-gene subset of the all-sample table
bpall_core.head()

Sample,GeneID,SRR11933261,SRR11933272,SRR11933250,SRR11933029,SRR11933040,SRR11932822,SRR11932811,SRR11933230,SRR11932879,...,Ms71D3C,Ki3D1C,CML228D1D,CML333D3D,MO18WD3C,B73D3C,NC358D3C,P39D3D,M162WD3D,M162WD1D
192,Zm00001eb002180,27.358806,23.628469,17.077853,7.486076,11.193371,5.465076,4.959726,3.754283,49.83846,...,28.288995,13.192527,59.114059,32.307182,9.897771,15.339723,8.468196,53.206111,46.325966,25.647938
200,Zm00001eb002270,115.443475,655.980132,235.618335,326.659903,129.208929,198.615136,596.873895,393.75822,196.602598,...,274.062747,278.463392,232.42384,62.020463,310.641977,260.056856,209.640569,86.086054,45.665955,352.904684
201,Zm00001eb002280,2.999069,4.058413,7.757464,21.431672,5.889466,3.49155,7.241103,9.429843,1.692326,...,201.55751,1.77863,0.516377,5.582236,0.516377,0.516377,6.030679,1.60268,0.516377,6.038208
243,Zm00001eb002760,0.073183,0.18785,2.142044,4.818136,0.5533,5.476585,3.901386,2.133869,0.480287,...,1.193351,1.193351,1.193351,1.193351,1.729543,6.755235,1.632834,1.193351,1.193351,1.193351
262,Zm00001eb002950,46.091991,2.349171,76.22375,65.657682,9.566306,147.703008,34.760385,42.172344,14.160471,...,12.798931,12.798931,33.734127,15.898042,15.550946,45.562139,17.213429,34.010404,12.798931,12.798931


In [17]:
# Write the core-gene subsets as tab-separated files, without the row index
bpsyn_core.to_csv(
    "../../data/BPcombat_TPM_coregenes_psyntissues_WGCNA.tsv",
    sep="\t",
    index=False,
)
bpall_core.to_csv(
    "../../data/BPcombat_TPM_coregenes_allsamp_WGCNA.tsv",
    sep="\t",
    index=False,
)

In [18]:
# preview the first rows of the back-transformed all-sample table (indexed by Sample)
bpall.head()

Unnamed: 0_level_0,Zm00001eb000010,Zm00001eb000020,Zm00001eb000050,Zm00001eb000060,Zm00001eb000070,Zm00001eb000080,Zm00001eb000100,Zm00001eb000110,Zm00001eb000120,Zm00001eb000140,...,Zm00001eb442810,Zm00001eb442820,Zm00001eb442840,Zm00001eb442850,Zm00001eb442870,Zm00001eb442890,Zm00001eb442910,Zm00001eb442960,Zm00001eb442980,Zm00001eb443030
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SRR11933261,11.076204,4.735813,-0.005043,13.982582,2.868655,11.114244,0.365261,0.082908,0.92084,-0.001095,...,0.123565,0.546069,-0.002744,0.000106,-0.015893,0.23373,-0.004292,0.042559,-0.002611,-0.013791
SRR11933272,13.871106,5.647562,0.302875,8.509506,1.083612,2.577858,0.188226,0.082908,1.938516,-0.001095,...,0.079462,0.661707,-0.002744,0.000106,-0.015893,7.1285,-0.004292,0.042559,-0.002611,-0.013791
SRR11933250,8.314438,5.506025,-0.041765,4.222056,1.471207,15.890374,0.056447,0.082908,1.612299,-0.001095,...,0.002318,0.546069,-0.002744,0.000106,-0.015893,0.342332,-0.004292,0.259258,-0.002611,1.036984
SRR11933029,7.64799,4.812859,-0.041765,2.682474,1.097461,19.057821,0.056447,0.082908,1.358487,-0.001095,...,0.002318,0.546069,-0.002744,0.000106,-0.015893,-0.073971,-0.004292,0.042559,-0.002611,1.168272
SRR11933040,9.379871,5.29507,-0.041765,2.157748,2.110509,19.166035,0.168026,0.082908,1.924061,-0.001095,...,0.128498,0.557309,-0.002744,0.000106,-0.015893,10.444725,-0.004292,0.042559,-0.002611,1.976233


In [20]:
# save bpall (untransposed) as a tab-separated file; the Sample index is
# written as the first column
bpall.to_csv("../../data/BPcombat_TPM_forWGCNA_allgenes_allsamp.tsv", sep="\t")

In [21]:
# check the Python/NumPy type of a single value in bpall
type(bpall.iloc[0,0])

numpy.float64