Purpose: Correct the full dataset for BioProject batch effects using pyComBat.<br>
Author: Anna Pardo<br>
Date initiated: March 1, 2024

In [1]:
# import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from combat.pycombat import pycombat

In [2]:
# load TPM matrix
tpm = pd.read_csv("../../data/rawtpm_bptreat_noPEG.tsv",sep="\t",header="infer")
tpm.head()

Unnamed: 0,Sample,BioProject,Treatment,Zm00001eb000010,Zm00001eb000020,Zm00001eb000050,Zm00001eb000060,Zm00001eb000070,Zm00001eb000080,Zm00001eb000100,...,Zm00001eb442810,Zm00001eb442820,Zm00001eb442840,Zm00001eb442850,Zm00001eb442870,Zm00001eb442890,Zm00001eb442910,Zm00001eb442960,Zm00001eb442980,Zm00001eb443030
0,SRR11933261,PRJNA637522,Drought,12.553818,2.321077,0.04252,12.932676,5.253755,11.105837,0.409268,...,0.171184,0.0,0.0,0.0,0.0,0.309501,0.0,0.0,0.0,0.0
1,SRR11933272,PRJNA637522,Drought,16.255838,3.110372,0.405226,7.214039,1.902461,2.346186,0.170305,...,0.108052,0.127878,0.0,0.0,0.0,6.703281,0.0,0.0,0.0,0.0
2,SRR11933250,PRJNA637522,Drought,9.028815,2.984479,0.0,3.092442,2.586555,16.186141,0.0,...,0.0,0.0,0.0,0.0,0.0,0.417565,0.0,0.254123,0.0,1.213349
3,SRR11933029,PRJNA637522,Control,8.20134,2.385748,0.0,1.726808,1.926412,19.600487,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.370075
4,SRR11933040,PRJNA637522,Drought,10.371251,2.799099,0.0,1.280629,3.771234,19.717683,0.143764,...,0.178304,0.012158,0.0,0.0,0.0,9.625225,0.0,0.0,0.0,2.352959


In [3]:
# define a function from an answer in https://stackoverflow.com/questions/39812885/retain-feature-names-after-scikit-feature-selection
def variance_threshold_selector(data):
    selector = VarianceThreshold()
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]

In [4]:
# set Sample as index and drop BioProject and Treatment columns
ttpm = tpm.set_index("Sample").drop(["BioProject","Treatment"],axis=1)
ttpm.head()

Unnamed: 0_level_0,Zm00001eb000010,Zm00001eb000020,Zm00001eb000050,Zm00001eb000060,Zm00001eb000070,Zm00001eb000080,Zm00001eb000100,Zm00001eb000110,Zm00001eb000120,Zm00001eb000140,...,Zm00001eb442810,Zm00001eb442820,Zm00001eb442840,Zm00001eb442850,Zm00001eb442870,Zm00001eb442890,Zm00001eb442910,Zm00001eb442960,Zm00001eb442980,Zm00001eb443030
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SRR11933261,12.553818,2.321077,0.04252,12.932676,5.253755,11.105837,0.409268,0.0,1.122904,0.0,...,0.171184,0.0,0.0,0.0,0.0,0.309501,0.0,0.0,0.0,0.0
SRR11933272,16.255838,3.110372,0.405226,7.214039,1.902461,2.346186,0.170305,0.0,2.280915,0.0,...,0.108052,0.127878,0.0,0.0,0.0,6.703281,0.0,0.0,0.0,0.0
SRR11933250,9.028815,2.984479,0.0,3.092442,2.586555,16.186141,0.0,0.0,1.908468,0.0,...,0.0,0.0,0.0,0.0,0.0,0.417565,0.0,0.254123,0.0,1.213349
SRR11933029,8.20134,2.385748,0.0,1.726808,1.926412,19.600487,0.0,0.0,1.619452,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.370075
SRR11933040,10.371251,2.799099,0.0,1.280629,3.771234,19.717683,0.143764,0.0,2.26439,0.0,...,0.178304,0.012158,0.0,0.0,0.0,9.625225,0.0,0.0,0.0,2.352959


In [5]:
# put through VarianceThreshold() using the variance_threshold_selector() defined above
vttpm = variance_threshold_selector(ttpm)
vttpm.head()

Unnamed: 0_level_0,Zm00001eb000010,Zm00001eb000020,Zm00001eb000050,Zm00001eb000060,Zm00001eb000070,Zm00001eb000080,Zm00001eb000100,Zm00001eb000110,Zm00001eb000120,Zm00001eb000140,...,Zm00001eb442810,Zm00001eb442820,Zm00001eb442840,Zm00001eb442850,Zm00001eb442870,Zm00001eb442890,Zm00001eb442910,Zm00001eb442960,Zm00001eb442980,Zm00001eb443030
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SRR11933261,12.553818,2.321077,0.04252,12.932676,5.253755,11.105837,0.409268,0.0,1.122904,0.0,...,0.171184,0.0,0.0,0.0,0.0,0.309501,0.0,0.0,0.0,0.0
SRR11933272,16.255838,3.110372,0.405226,7.214039,1.902461,2.346186,0.170305,0.0,2.280915,0.0,...,0.108052,0.127878,0.0,0.0,0.0,6.703281,0.0,0.0,0.0,0.0
SRR11933250,9.028815,2.984479,0.0,3.092442,2.586555,16.186141,0.0,0.0,1.908468,0.0,...,0.0,0.0,0.0,0.0,0.0,0.417565,0.0,0.254123,0.0,1.213349
SRR11933029,8.20134,2.385748,0.0,1.726808,1.926412,19.600487,0.0,0.0,1.619452,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.370075
SRR11933040,10.371251,2.799099,0.0,1.280629,3.771234,19.717683,0.143764,0.0,2.26439,0.0,...,0.178304,0.012158,0.0,0.0,0.0,9.625225,0.0,0.0,0.0,2.352959


In [6]:
# log-transform TPM
vttpm_log = vttpm.apply(lambda x: np.log2(x+1))
vttpm_log.head()

Unnamed: 0_level_0,Zm00001eb000010,Zm00001eb000020,Zm00001eb000050,Zm00001eb000060,Zm00001eb000070,Zm00001eb000080,Zm00001eb000100,Zm00001eb000110,Zm00001eb000120,Zm00001eb000140,...,Zm00001eb442810,Zm00001eb442820,Zm00001eb442840,Zm00001eb442850,Zm00001eb442870,Zm00001eb442890,Zm00001eb442910,Zm00001eb442960,Zm00001eb442980,Zm00001eb443030
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SRR11933261,3.760627,1.731651,0.060075,3.8004,2.644723,3.597631,0.494946,0.0,1.086039,0.0,...,0.227968,0.0,0.0,0.0,0.0,0.389017,0.0,0.0,0.0,0.0
SRR11933272,4.109013,2.039269,0.490802,3.038092,1.537277,1.742518,0.226885,0.0,1.714098,0.0,...,0.148026,0.173611,0.0,0.0,0.0,2.945473,0.0,0.0,0.0,0.0
SRR11933250,3.326079,1.994391,0.0,2.032962,1.842599,4.103174,0.0,0.0,1.540259,0.0,...,0.0,0.0,0.0,0.0,0.0,0.503415,0.0,0.326679,0.0,1.146231
SRR11933029,3.201844,1.759475,0.0,1.447213,1.549133,4.364607,0.0,0.0,1.389265,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.244933
SRR11933040,3.507319,1.925657,0.0,1.189432,2.254362,4.372791,0.193789,0.0,1.706813,0.0,...,0.236712,0.017435,0.0,0.0,0.0,3.409421,0.0,0.0,0.0,1.745435


In [2]:
# Load metadata
md = pd.read_csv("../../data/metadata_corrected_14-Feb-2024.csv",sep=",",header="infer")
md.head()

Unnamed: 0.1,Unnamed: 0,BioProject,Sample,sample_name,Replicate_num,Genotype,Technology,Library_layout,Treatment,Duration_hours,...,Developmental_stage,Tissue,Day_length_hours,Day_temp_C,Night_temp_C,Relative humidity (%),Light (umol/m2/s),Growth Env.,Media,Notes
0,0,PRJNA637522,SRR11933261,238_WS2,,238,Illumina HiSeq 2500,PAIRED,Drought,312.0,...,V3,Leaf,16.0,25.0,,,,Greenhouse,,
1,1,PRJNA637522,SRR11933272,238_WS1,,238,Illumina HiSeq 2500,PAIRED,Drought,216.0,...,V3,Leaf,16.0,25.0,,,,Greenhouse,,
2,2,PRJNA637522,SRR11933250,268_WS1,,268,Illumina HiSeq 2500,PAIRED,Drought,216.0,...,V3,Leaf,16.0,25.0,,,,Greenhouse,,
3,3,PRJNA637522,SRR11933029,268_WW,,268,Illumina HiSeq 2500,PAIRED,Control,216.0,...,V3,Leaf,16.0,25.0,,,,Greenhouse,,
4,4,PRJNA637522,SRR11933040,268_WS2,,268,Illumina HiSeq 2500,PAIRED,Drought,312.0,...,V3,Leaf,16.0,25.0,,,,Greenhouse,,


In [8]:
# reset index & merge with metadata
comtpm = vttpm_log.reset_index().rename(columns={"index":"Sample"})
mdcomtpm = comtpm.merge(md[["Sample","BioProject"]],how="inner",on="Sample")
mdcomtpm.set_index("Sample",inplace=True)

In [9]:
# generate a list of dataframes each with a single BioProject
dflist = []
for i in range(len(mdcomtpm["BioProject"].unique())):
    t = mdcomtpm["BioProject"].unique()[i]
    dflist.append(mdcomtpm[mdcomtpm["BioProject"]==t])
dflist[0].head()

Unnamed: 0_level_0,Zm00001eb000010,Zm00001eb000020,Zm00001eb000050,Zm00001eb000060,Zm00001eb000070,Zm00001eb000080,Zm00001eb000100,Zm00001eb000110,Zm00001eb000120,Zm00001eb000140,...,Zm00001eb442820,Zm00001eb442840,Zm00001eb442850,Zm00001eb442870,Zm00001eb442890,Zm00001eb442910,Zm00001eb442960,Zm00001eb442980,Zm00001eb443030,BioProject
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SRR11933261,3.760627,1.731651,0.060075,3.8004,2.644723,3.597631,0.494946,0.0,1.086039,0.0,...,0.0,0.0,0.0,0.0,0.389017,0.0,0.0,0.0,0.0,PRJNA637522
SRR11933272,4.109013,2.039269,0.490802,3.038092,1.537277,1.742518,0.226885,0.0,1.714098,0.0,...,0.173611,0.0,0.0,0.0,2.945473,0.0,0.0,0.0,0.0,PRJNA637522
SRR11933250,3.326079,1.994391,0.0,2.032962,1.842599,4.103174,0.0,0.0,1.540259,0.0,...,0.0,0.0,0.0,0.0,0.503415,0.0,0.326679,0.0,1.146231,PRJNA637522
SRR11933029,3.201844,1.759475,0.0,1.447213,1.549133,4.364607,0.0,0.0,1.389265,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.244933,PRJNA637522
SRR11933040,3.507319,1.925657,0.0,1.189432,2.254362,4.372791,0.193789,0.0,1.706813,0.0,...,0.017435,0.0,0.0,0.0,3.409421,0.0,0.0,0.0,1.745435,PRJNA637522


In [10]:
# for each dataframe in dflist: remove the columns not beginning with Zm, then transpose the dataframe
dl2 = []
for df in dflist:
    filtered_columns = [col for col in df.columns if col.startswith("Zm")]
    df2 = df[filtered_columns]
    dl2.append(df2.transpose())

In [11]:
# generate the batch variable for BioProject
batch = []
for j in range(len(dl2)):
    batch.extend([j for _ in range(len(dl2[j].columns))])

In [12]:
# go back to vttpm_log and re-transpose it
vtpm_log = vttpm_log.transpose()
vtpm_log.head()

Sample,SRR11933261,SRR11933272,SRR11933250,SRR11933029,SRR11933040,SRR11932822,SRR11932811,SRR11933230,SRR11932879,SRR11933475,...,Ms71D3C,Ki3D1C,CML228D1D,CML333D3D,MO18WD3C,B73D3C,NC358D3C,P39D3D,M162WD3D,M162WD1D
Zm00001eb000010,3.760627,4.109013,3.326079,3.201844,3.507319,5.264161,5.354941,4.985365,4.608395,4.019939,...,1.27328,1.547709,1.279532,3.403289,1.155814,1.98776,1.481069,2.402443,2.149165,2.169362
Zm00001eb000020,1.731651,2.039269,1.994391,1.759475,1.925657,4.833336,4.550943,4.6808,3.103594,2.296733,...,0.0,1.485257,0.0,0.0,1.548514,0.64318,0.234395,0.832973,0.464142,0.54975
Zm00001eb000050,0.060075,0.490802,0.0,0.0,0.0,0.0,0.0,0.0,0.383775,0.416658,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb000060,3.8004,3.038092,2.032962,1.447213,1.189432,4.931234,4.53283,4.534223,3.975189,3.898004,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb000070,2.644723,1.537277,1.842599,1.549133,2.254362,3.001007,3.102715,2.64952,1.668005,1.491079,...,0.0,0.53787,0.0,1.01319,0.0,0.0,0.0,0.0,1.411628,0.80619


In [13]:
# run pyComBat for BioProject
bpcor_logtpm = pycombat(vtpm_log,batch)

Found 39 batches.
Adjusting for 0 covariate(s) or covariate level(s).
Standardizing Data across genes.
Fitting L/S model and finding priors.
Finding parametric adjustments.


  np.absolute(d_new-d_old)/d_old))  # maximum difference between new and old estimate


Adjusting the Data


In [14]:
bplt = bpcor_logtpm.transpose().reset_index()
bplt.head()

Unnamed: 0,Sample,Zm00001eb000010,Zm00001eb000020,Zm00001eb000050,Zm00001eb000060,Zm00001eb000070,Zm00001eb000080,Zm00001eb000100,Zm00001eb000110,Zm00001eb000120,...,Zm00001eb442810,Zm00001eb442820,Zm00001eb442840,Zm00001eb442850,Zm00001eb442870,Zm00001eb442890,Zm00001eb442910,Zm00001eb442960,Zm00001eb442980,Zm00001eb443030
0,SRR11933261,3.594095,2.519998,-0.007294,3.905214,1.951832,3.598633,0.449177,0.114911,0.941737,...,0.168084,0.628605,-0.003964,0.000153,-0.023113,0.303027,-0.006205,0.060129,-0.003772,-0.020035
1,SRR11933272,3.89444,2.732825,0.381698,3.24937,1.059086,1.839096,0.24881,0.114911,1.555088,...,0.110313,0.732666,-0.003964,0.000153,-0.023113,3.022989,-0.006205,0.060129,-0.003772,-0.020035
2,SRR11933250,3.219469,2.701776,-0.061548,2.384618,1.305216,4.078129,0.07922,0.114911,1.38532,...,0.00334,0.628605,-0.003964,0.000153,-0.023113,0.424741,-0.006205,0.332574,-0.003772,1.026435
3,SRR11933029,3.112365,2.539248,-0.061548,1.880675,1.068644,4.326093,0.07922,0.114911,1.237862,...,0.00334,0.628605,-0.003964,0.000153,-0.023113,-0.110871,-0.006205,0.060129,-0.003772,1.116546
4,SRR11933040,3.375717,2.654222,-0.061548,1.658896,1.637151,4.333856,0.224072,0.114911,1.547974,...,0.174403,0.639055,-0.003964,0.000153,-0.023113,3.516611,-0.006205,0.060129,-0.003772,1.573487


In [15]:
# save bplt
bplt.to_csv("./BioProject_corrected_logTPM_all.tsv",sep="\t",header=True,index=False)

New section: try running pyComBat before log transformation (but after variance thresholding). Does this make a difference in the PCA?

In [6]:
vttpm.reset_index().head()

Unnamed: 0,Sample,Zm00001eb000010,Zm00001eb000020,Zm00001eb000050,Zm00001eb000060,Zm00001eb000070,Zm00001eb000080,Zm00001eb000100,Zm00001eb000110,Zm00001eb000120,...,Zm00001eb442810,Zm00001eb442820,Zm00001eb442840,Zm00001eb442850,Zm00001eb442870,Zm00001eb442890,Zm00001eb442910,Zm00001eb442960,Zm00001eb442980,Zm00001eb443030
0,SRR11933261,12.553818,2.321077,0.04252,12.932676,5.253755,11.105837,0.409268,0.0,1.122904,...,0.171184,0.0,0.0,0.0,0.0,0.309501,0.0,0.0,0.0,0.0
1,SRR11933272,16.255838,3.110372,0.405226,7.214039,1.902461,2.346186,0.170305,0.0,2.280915,...,0.108052,0.127878,0.0,0.0,0.0,6.703281,0.0,0.0,0.0,0.0
2,SRR11933250,9.028815,2.984479,0.0,3.092442,2.586555,16.186141,0.0,0.0,1.908468,...,0.0,0.0,0.0,0.0,0.0,0.417565,0.0,0.254123,0.0,1.213349
3,SRR11933029,8.20134,2.385748,0.0,1.726808,1.926412,19.600487,0.0,0.0,1.619452,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.370075
4,SRR11933040,10.371251,2.799099,0.0,1.280629,3.771234,19.717683,0.143764,0.0,2.26439,...,0.178304,0.012158,0.0,0.0,0.0,9.625225,0.0,0.0,0.0,2.352959


In [10]:
# reset index & merge with metadata
comtpm2 = vttpm.reset_index()
mdcomtpm2 = comtpm2.merge(md[["Sample","BioProject"]],how="inner",on="Sample")
mdcomtpm2.set_index("Sample",inplace=True)

In [11]:
# generate a list of dataframes each with a single BioProject
dflist = []
for i in range(len(mdcomtpm2["BioProject"].unique())):
    t = mdcomtpm2["BioProject"].unique()[i]
    dflist.append(mdcomtpm2[mdcomtpm2["BioProject"]==t])
dflist[0].head()

Unnamed: 0_level_0,Zm00001eb000010,Zm00001eb000020,Zm00001eb000050,Zm00001eb000060,Zm00001eb000070,Zm00001eb000080,Zm00001eb000100,Zm00001eb000110,Zm00001eb000120,Zm00001eb000140,...,Zm00001eb442820,Zm00001eb442840,Zm00001eb442850,Zm00001eb442870,Zm00001eb442890,Zm00001eb442910,Zm00001eb442960,Zm00001eb442980,Zm00001eb443030,BioProject
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SRR11933261,12.553818,2.321077,0.04252,12.932676,5.253755,11.105837,0.409268,0.0,1.122904,0.0,...,0.0,0.0,0.0,0.0,0.309501,0.0,0.0,0.0,0.0,PRJNA637522
SRR11933272,16.255838,3.110372,0.405226,7.214039,1.902461,2.346186,0.170305,0.0,2.280915,0.0,...,0.127878,0.0,0.0,0.0,6.703281,0.0,0.0,0.0,0.0,PRJNA637522
SRR11933250,9.028815,2.984479,0.0,3.092442,2.586555,16.186141,0.0,0.0,1.908468,0.0,...,0.0,0.0,0.0,0.0,0.417565,0.0,0.254123,0.0,1.213349,PRJNA637522
SRR11933029,8.20134,2.385748,0.0,1.726808,1.926412,19.600487,0.0,0.0,1.619452,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.370075,PRJNA637522
SRR11933040,10.371251,2.799099,0.0,1.280629,3.771234,19.717683,0.143764,0.0,2.26439,0.0,...,0.012158,0.0,0.0,0.0,9.625225,0.0,0.0,0.0,2.352959,PRJNA637522


In [12]:
# for each dataframe in dflist: remove the columns not beginning with Zm, then transpose the dataframe
dl2 = []
for df in dflist:
    filtered_columns = [col for col in df.columns if col.startswith("Zm")]
    df2 = df[filtered_columns]
    dl2.append(df2.transpose())

In [13]:
# generate the batch variable for BioProject
batch = []
for j in range(len(dl2)):
    batch.extend([j for _ in range(len(dl2[j].columns))])

In [14]:
# go back to vttpm and re-transpose it
vtpm = vttpm.transpose()
vtpm.head()

Sample,SRR11933261,SRR11933272,SRR11933250,SRR11933029,SRR11933040,SRR11932822,SRR11932811,SRR11933230,SRR11932879,SRR11933475,...,Ms71D3C,Ki3D1C,CML228D1D,CML333D3D,MO18WD3C,B73D3C,NC358D3C,P39D3D,M162WD3D,M162WD1D
Zm00001eb000010,12.553818,16.255838,9.028815,8.20134,10.371251,37.430009,39.925873,30.677016,23.393003,15.222661,...,1.417104,1.923525,1.427602,9.580153,1.2281,2.966207,1.791556,4.286976,3.435711,3.498243
Zm00001eb000020,2.321077,3.110372,2.984479,2.385748,2.799099,27.508819,22.44068,24.648455,7.595576,3.913437,...,0.0,1.799671,0.0,0.0,1.925157,0.561768,0.176413,0.781353,0.379497,0.463832
Zm00001eb000050,0.04252,0.405226,0.0,0.0,0.0,0.0,0.0,0.0,0.304751,0.334832,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb000060,12.932676,7.214039,3.092442,1.726808,1.280629,29.510498,22.148225,22.170584,14.727189,13.907885,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb000070,5.253755,1.902461,2.586555,1.926412,3.771234,7.005587,7.590336,5.274585,2.177748,1.810991,...,0.0,0.451827,0.0,1.018369,0.0,0.0,0.0,0.0,1.660372,0.748587


In [15]:
# run pyComBat for BioProject
bpcor_tpm = pycombat(vtpm,batch)

Found 39 batches.
Adjusting for 0 covariate(s) or covariate level(s).
Standardizing Data across genes.
Fitting L/S model and finding priors.
Finding parametric adjustments.


  np.absolute(d_new-d_old)/d_old))  # maximum difference between new and old estimate


Adjusting the Data


In [16]:
bpcor_tpm.head()

Sample,SRR11933261,SRR11933272,SRR11933250,SRR11933029,SRR11933040,SRR11932822,SRR11932811,SRR11933230,SRR11932879,SRR11933475,...,Ms71D3C,Ki3D1C,CML228D1D,CML333D3D,MO18WD3C,B73D3C,NC358D3C,P39D3D,M162WD3D,M162WD1D
Zm00001eb000010,11.893507,15.369406,8.583813,7.806881,9.844253,35.250248,37.593663,28.909731,22.070629,14.399336,...,8.134132,8.654233,8.144914,16.517693,7.940023,9.725082,8.518699,11.081529,10.207269,10.27149
Zm00001eb000020,48.043564,48.466473,48.399019,48.078215,48.299691,61.539335,58.82379,60.006732,50.869678,48.896761,...,57.173127,58.173511,57.173127,57.173127,58.243265,57.485397,57.27119,57.607458,57.384078,57.430957
Zm00001eb000050,-0.195001,0.297977,-0.252793,-0.252793,-0.252793,-0.252793,-0.252793,-0.252793,0.161414,0.2023,...,0.333643,0.333643,0.333643,0.333643,0.333643,0.333643,0.333643,0.333643,0.333643,0.333643
Zm00001eb000060,17.979038,13.825805,10.832442,9.840633,9.51659,30.018893,24.671948,24.688187,19.282326,18.687296,...,20.554886,20.554886,20.554886,20.554886,20.554886,20.554886,20.554886,20.554886,20.554886,20.554886
Zm00001eb000070,3.645409,0.855161,1.42473,0.875102,2.41108,5.103964,5.590819,3.662752,1.084362,0.779004,...,0.953879,2.558492,0.953879,4.570502,0.953879,0.953879,0.953879,0.953879,6.850504,3.612402


In [17]:
bpt = bpcor_tpm.transpose().reset_index()
bpt.head()

Unnamed: 0,Sample,Zm00001eb000010,Zm00001eb000020,Zm00001eb000050,Zm00001eb000060,Zm00001eb000070,Zm00001eb000080,Zm00001eb000100,Zm00001eb000110,Zm00001eb000120,...,Zm00001eb442810,Zm00001eb442820,Zm00001eb442840,Zm00001eb442850,Zm00001eb442870,Zm00001eb442890,Zm00001eb442910,Zm00001eb442960,Zm00001eb442980,Zm00001eb443030
0,SRR11933261,11.893507,48.043564,-0.195001,17.979038,3.645409,19.80385,0.347005,0.034227,1.007776,...,0.113725,1.592809,-0.003521,6e-05,-0.031864,-0.643666,-0.049156,0.083511,-0.003332,-0.005863
1,SRR11933272,15.369406,48.466473,0.297977,13.825805,0.855161,12.780552,0.185579,0.034227,2.347078,...,0.074353,1.666817,-0.003521,6e-05,-0.031864,7.956798,-0.049156,0.083511,-0.003332,-0.005863
2,SRR11933250,8.583813,48.399019,-0.252793,10.832442,1.42473,23.877127,0.070533,0.034227,1.916323,...,0.006965,1.592809,-0.003521,6e-05,-0.031864,-0.498306,-0.049156,0.241908,-0.003332,1.133667
3,SRR11933029,7.806881,48.078215,-0.252793,9.840633,0.875102,26.614675,0.070533,0.034227,1.582061,...,0.006965,1.592809,-0.003521,6e-05,-0.031864,-1.059985,-0.049156,0.083511,-0.003332,1.280858
4,SRR11933040,9.844253,48.299691,-0.252793,9.51659,2.41108,26.70864,0.167649,0.034227,2.327966,...,0.118166,1.599846,-0.003521,6e-05,-0.031864,11.887192,-0.049156,0.083511,-0.003332,2.203945


In [18]:
# save bpt, just in case
bpt.to_csv("./BioProject_corrected_rawTPM_all.tsv",sep="\t",header=True,index=False)

In [3]:
# run the PCA for both datasets
## re-load bplt
bplt = pd.read_csv("./BioProject_corrected_logTPM_all.tsv",sep="\t",header="infer")

In [20]:
# log transform bpt
bpt = bpt.set_index("Sample")
bpt_log = bpt.apply(lambda x: np.log2(x+1))

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [22]:
bpt.head()

Unnamed: 0_level_0,Zm00001eb000010,Zm00001eb000020,Zm00001eb000050,Zm00001eb000060,Zm00001eb000070,Zm00001eb000080,Zm00001eb000100,Zm00001eb000110,Zm00001eb000120,Zm00001eb000140,...,Zm00001eb442810,Zm00001eb442820,Zm00001eb442840,Zm00001eb442850,Zm00001eb442870,Zm00001eb442890,Zm00001eb442910,Zm00001eb442960,Zm00001eb442980,Zm00001eb443030
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SRR11933261,11.893507,48.043564,-0.195001,17.979038,3.645409,19.80385,0.347005,0.034227,1.007776,-0.011901,...,0.113725,1.592809,-0.003521,6e-05,-0.031864,-0.643666,-0.049156,0.083511,-0.003332,-0.005863
SRR11933272,15.369406,48.466473,0.297977,13.825805,0.855161,12.780552,0.185579,0.034227,2.347078,-0.011901,...,0.074353,1.666817,-0.003521,6e-05,-0.031864,7.956798,-0.049156,0.083511,-0.003332,-0.005863
SRR11933250,8.583813,48.399019,-0.252793,10.832442,1.42473,23.877127,0.070533,0.034227,1.916323,-0.011901,...,0.006965,1.592809,-0.003521,6e-05,-0.031864,-0.498306,-0.049156,0.241908,-0.003332,1.133667
SRR11933029,7.806881,48.078215,-0.252793,9.840633,0.875102,26.614675,0.070533,0.034227,1.582061,-0.011901,...,0.006965,1.592809,-0.003521,6e-05,-0.031864,-1.059985,-0.049156,0.083511,-0.003332,1.280858
SRR11933040,9.844253,48.299691,-0.252793,9.51659,2.41108,26.70864,0.167649,0.034227,2.327966,-0.011901,...,0.118166,1.599846,-0.003521,6e-05,-0.031864,11.887192,-0.049156,0.083511,-0.003332,2.203945


In [23]:
bpt_log.head()

Unnamed: 0_level_0,Zm00001eb000010,Zm00001eb000020,Zm00001eb000050,Zm00001eb000060,Zm00001eb000070,Zm00001eb000080,Zm00001eb000100,Zm00001eb000110,Zm00001eb000120,Zm00001eb000140,...,Zm00001eb442810,Zm00001eb442820,Zm00001eb442840,Zm00001eb442850,Zm00001eb442870,Zm00001eb442890,Zm00001eb442910,Zm00001eb442960,Zm00001eb442980,Zm00001eb443030
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SRR11933261,3.688573,5.615992,-0.312941,4.246335,2.215806,4.378779,0.429755,0.048553,1.005599,-0.017273,...,0.155393,1.374516,-0.005088,8.7e-05,-0.046719,-1.488698,-0.072719,0.115713,-0.004815,-0.008484
SRR11933272,4.03293,5.628379,0.376264,3.890039,0.891544,3.784562,0.245591,0.048553,1.742902,-0.017273,...,0.103468,1.415119,-0.005088,8.7e-05,-0.046719,3.162983,-0.072719,0.115713,-0.004815,-0.008484
SRR11933250,3.2606,5.62641,-0.42042,3.564676,1.277824,4.636748,0.098329,0.048553,1.544151,-0.017273,...,0.010014,1.374516,-0.005088,8.7e-05,-0.046719,-0.99512,-0.072719,0.312558,-0.004815,1.093335
SRR11933029,3.138631,5.617011,-0.42042,3.438377,0.906969,4.787363,0.098329,0.048553,1.368523,-0.017273,...,0.010014,1.374516,-0.005088,8.7e-05,-0.046719,,-0.072719,0.115713,-0.004815,1.189577
SRR11933040,3.438859,5.623507,-0.42042,3.394595,1.770228,4.792264,0.223607,0.048553,1.734641,-0.017273,...,0.161134,1.378426,-0.005088,8.7e-05,-0.046719,3.687866,-0.072719,0.115713,-0.004815,1.679849


In [21]:
# run PCA for bpt_log
pca=PCA(n_components=2)
real_PCs = pca.fit_transform(bpt_log)
real_PCs_df = pd.DataFrame(data = real_PCs, columns = ['PC1','PC2'])

ValueError: Input X contains NaN.
PCA does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

Moral of story: Use `bplt` instead of `bpt` - the latter has negative values some of which introduce NaNs when log transformed.

In [4]:
# add treatment & BioProject cols to bplt to make it ready for RF input
bplt_md = bplt.merge(md[["Sample","BioProject","Treatment"]])
bplt_md.head()

Unnamed: 0,Sample,Zm00001eb000010,Zm00001eb000020,Zm00001eb000050,Zm00001eb000060,Zm00001eb000070,Zm00001eb000080,Zm00001eb000100,Zm00001eb000110,Zm00001eb000120,...,Zm00001eb442840,Zm00001eb442850,Zm00001eb442870,Zm00001eb442890,Zm00001eb442910,Zm00001eb442960,Zm00001eb442980,Zm00001eb443030,BioProject,Treatment
0,SRR11933261,3.594095,2.519998,-0.007294,3.905214,1.951832,3.598633,0.449177,0.114911,0.941737,...,-0.003964,0.000153,-0.023113,0.303027,-0.006205,0.060129,-0.003772,-0.020035,PRJNA637522,Drought
1,SRR11933272,3.89444,2.732825,0.381698,3.24937,1.059086,1.839096,0.24881,0.114911,1.555088,...,-0.003964,0.000153,-0.023113,3.022989,-0.006205,0.060129,-0.003772,-0.020035,PRJNA637522,Drought
2,SRR11933250,3.219469,2.701776,-0.061548,2.384618,1.305216,4.078129,0.07922,0.114911,1.38532,...,-0.003964,0.000153,-0.023113,0.424741,-0.006205,0.332574,-0.003772,1.026435,PRJNA637522,Drought
3,SRR11933029,3.112365,2.539248,-0.061548,1.880675,1.068644,4.326093,0.07922,0.114911,1.237862,...,-0.003964,0.000153,-0.023113,-0.110871,-0.006205,0.060129,-0.003772,1.116546,PRJNA637522,Control
4,SRR11933040,3.375717,2.654222,-0.061548,1.658896,1.637151,4.333856,0.224072,0.114911,1.547974,...,-0.003964,0.000153,-0.023113,3.516611,-0.006205,0.060129,-0.003772,1.573487,PRJNA637522,Drought


In [5]:
# save updated dataframe
bplt_md.to_csv("../../data/BPcombat_logTPM_forRF_1-Mar-2024.tsv",sep="\t",header=True,index=False)

In [6]:
# check that the photosynthetic tissues data are correctly set up as well
psyn = pd.read_csv("../../data/psyn_tissues_tpm_combatBioProject.tsv",sep="\t",header="infer")
psyn.head()

Unnamed: 0,Sample,Zm00001eb000010,Zm00001eb000020,Zm00001eb000050,Zm00001eb000060,Zm00001eb000070,Zm00001eb000080,Zm00001eb000100,Zm00001eb000110,Zm00001eb000120,...,Zm00001eb442810,Zm00001eb442820,Zm00001eb442840,Zm00001eb442850,Zm00001eb442870,Zm00001eb442890,Zm00001eb442910,Zm00001eb442960,Zm00001eb442980,Zm00001eb443030
0,SRR11933261,3.676968,1.425305,0.019366,3.454238,2.46477,3.181359,0.480547,-0.043836,1.14804,...,0.197235,-0.007364,0.004322,-6.9e-05,-0.025771,0.197944,-0.006917,0.030309,-0.000506,0.127112
1,SRR11933272,4.041835,1.692175,0.546391,2.778114,1.452964,1.410374,0.260699,-0.043836,1.765632,...,0.130296,0.118151,0.004322,-6.9e-05,-0.025771,2.373048,-0.006917,0.030309,-0.000506,0.127112
2,SRR11933250,3.221863,1.653241,-0.05414,1.886622,1.731918,3.663976,0.074623,-0.043836,1.59469,...,0.006349,-0.007364,0.004322,-6.9e-05,-0.025771,0.295277,-0.006917,0.303918,-0.000506,1.162604
3,SRR11933029,3.09175,1.449443,-0.05414,1.367097,1.463797,3.913553,0.074623,-0.043836,1.446212,...,0.006349,-0.007364,0.004322,-6.9e-05,-0.025771,-0.133042,-0.006917,0.030309,-0.000506,1.25177
4,SRR11933040,3.411677,1.593612,-0.05414,1.138459,2.108122,3.921366,0.233557,-0.043836,1.758469,...,0.204556,0.00524,0.004322,-6.9e-05,-0.025771,2.767788,-0.006917,0.030309,-0.000506,1.703918


In [7]:
psyn = psyn.merge(md[["Sample","Treatment","BioProject"]])

In [8]:
psyn.to_csv("../../data/BPcombat_logTPM_psyn_tissues_forRF_1-Mar-2024.tsv",sep="\t",header=True,index=False)