Purpose: Run PCA on the TPM values of the genes in each module generated by WGCNA. The PCA output will feed into a random forest, with PCA serving as the dimensionality reduction method.<br>
Author: Anna Pardo<br>
Date initiated: July 26, 2023

In [4]:
# import modules
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold

In [2]:
# Read the tab-separated TPM table: one row per sample, with the Sample,
# BioProject and Treatment columns followed by one column per gene ID.
tpm = pd.read_csv(
    "../../data/rawtpm_bptreat_noPEG.tsv",
    sep="\t",
)
tpm.head()

Unnamed: 0,Sample,BioProject,Treatment,Zm00001eb000010,Zm00001eb000020,Zm00001eb000050,Zm00001eb000060,Zm00001eb000070,Zm00001eb000080,Zm00001eb000100,...,Zm00001eb442810,Zm00001eb442820,Zm00001eb442840,Zm00001eb442850,Zm00001eb442870,Zm00001eb442890,Zm00001eb442910,Zm00001eb442960,Zm00001eb442980,Zm00001eb443030
0,SRR11933261,PRJNA637522,Drought,12.553818,2.321077,0.04252,12.932676,5.253755,11.105837,0.409268,...,0.171184,0.0,0.0,0.0,0.0,0.309501,0.0,0.0,0.0,0.0
1,SRR11933272,PRJNA637522,Drought,16.255838,3.110372,0.405226,7.214039,1.902461,2.346186,0.170305,...,0.108052,0.127878,0.0,0.0,0.0,6.703281,0.0,0.0,0.0,0.0
2,SRR11933250,PRJNA637522,Drought,9.028815,2.984479,0.0,3.092442,2.586555,16.186141,0.0,...,0.0,0.0,0.0,0.0,0.0,0.417565,0.0,0.254123,0.0,1.213349
3,SRR11933029,PRJNA637522,Control,8.20134,2.385748,0.0,1.726808,1.926412,19.600487,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.370075
4,SRR11933040,PRJNA637522,Drought,10.371251,2.799099,0.0,1.280629,3.771234,19.717683,0.143764,...,0.178304,0.012158,0.0,0.0,0.0,9.625225,0.0,0.0,0.0,2.352959


In [5]:
# set directory name
# Folder containing the WGCNA outputs; the per-module gene lists are the
# files in it whose names end in "_genes.csv".
directory = "../../data/WGCNA_output/outputs_from_hpcc/"

In [9]:
# load lists of genes from WGCNA modules into a dictionary
# keys:   module name (the part of the filename before the first "_")
# values: list of gene IDs taken from column "x" of that module's file
modgenes = {}
# os.listdir() returns entries in arbitrary order; sort them so the dictionary
# (and everything built from it later) has the same order on every run.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith("_genes.csv"):
        continue
    modname = filename.split("_")[0]
    # os.path.join works whether or not `directory` ends with a path separator.
    # The file is parsed with a tab separator, as before.
    module_df = pd.read_csv(os.path.join(directory, filename), sep="\t", header="infer")
    modgenes[modname] = module_df["x"].tolist()

In [10]:
# need to make a dictionary of TPM dataframes
## transpose tpm dataframe, dropping BioProject and Treatment for now and keeping Sample as index
# `tpm` is overwritten here, so guard the re-indexing: on a second run of this
# cell "Sample" is already the index and set_index("Sample") would raise a KeyError.
if tpm.index.name != "Sample":
    tpm = tpm.set_index("Sample").drop(columns=["BioProject", "Treatment"])
# after the transpose: rows are genes, columns are samples
ttpm = tpm.transpose()
ttpm.head()

Sample,SRR11933261,SRR11933272,SRR11933250,SRR11933029,SRR11933040,SRR11932822,SRR11932811,SRR11933230,SRR11932879,SRR11933475,...,Ms71D3C,Ki3D1C,CML228D1D,CML333D3D,MO18WD3C,B73D3C,NC358D3C,P39D3D,M162WD3D,M162WD1D
Zm00001eb000010,12.553818,16.255838,9.028815,8.20134,10.371251,37.430009,39.925873,30.677016,23.393003,15.222661,...,1.417104,1.923525,1.427602,9.580153,1.2281,2.966207,1.791556,4.286976,3.435711,3.498243
Zm00001eb000020,2.321077,3.110372,2.984479,2.385748,2.799099,27.508819,22.44068,24.648455,7.595576,3.913437,...,0.0,1.799671,0.0,0.0,1.925157,0.561768,0.176413,0.781353,0.379497,0.463832
Zm00001eb000050,0.04252,0.405226,0.0,0.0,0.0,0.0,0.0,0.0,0.304751,0.334832,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb000060,12.932676,7.214039,3.092442,1.726808,1.280629,29.510498,22.148225,22.170584,14.727189,13.907885,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb000070,5.253755,1.902461,2.586555,1.926412,3.771234,7.005587,7.590336,5.274585,2.177748,1.810991,...,0.0,0.451827,0.0,1.018369,0.0,0.0,0.0,0.0,1.660372,0.748587


In [11]:
# pop out the GeneID column
# The transposed frame carries the gene IDs in an unnamed index: name that index
# "GeneID" and move it into a regular column.
# The guard keeps the cell re-runnable. Without it, a second reset_index() would
# add an integer "index" column that is then renamed to a duplicate "GeneID",
# and ttpm["GeneID"] would silently become a two-column DataFrame.
if "GeneID" not in ttpm.columns:
    ttpm = ttpm.rename_axis(index="GeneID").reset_index()
ttpm.head()

Sample,GeneID,SRR11933261,SRR11933272,SRR11933250,SRR11933029,SRR11933040,SRR11932822,SRR11932811,SRR11933230,SRR11932879,...,Ms71D3C,Ki3D1C,CML228D1D,CML333D3D,MO18WD3C,B73D3C,NC358D3C,P39D3D,M162WD3D,M162WD1D
0,Zm00001eb000010,12.553818,16.255838,9.028815,8.20134,10.371251,37.430009,39.925873,30.677016,23.393003,...,1.417104,1.923525,1.427602,9.580153,1.2281,2.966207,1.791556,4.286976,3.435711,3.498243
1,Zm00001eb000020,2.321077,3.110372,2.984479,2.385748,2.799099,27.508819,22.44068,24.648455,7.595576,...,0.0,1.799671,0.0,0.0,1.925157,0.561768,0.176413,0.781353,0.379497,0.463832
2,Zm00001eb000050,0.04252,0.405226,0.0,0.0,0.0,0.0,0.0,0.0,0.304751,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Zm00001eb000060,12.932676,7.214039,3.092442,1.726808,1.280629,29.510498,22.148225,22.170584,14.727189,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Zm00001eb000070,5.253755,1.902461,2.586555,1.926412,3.771234,7.005587,7.590336,5.274585,2.177748,...,0.0,0.451827,0.0,1.018369,0.0,0.0,0.0,0.0,1.660372,0.748587


In [15]:
# build {module name: rows of ttpm whose GeneID belongs to that module}
# The "grey" module is left out (presumably WGCNA's bin for unassigned genes -- confirm).
modtpm = {
    module: ttpm[ttpm["GeneID"].isin(genes)]
    for module, genes in modgenes.items()
    if module != "grey"
}