In [48]:
import pandas as pd
import qnorm
import numpy as np
from sklearn.preprocessing import QuantileTransformer

#### gene expression data for tumor samples

In [39]:
# Cohort label; it is spliced into the path of the TPM table that is loaded below
# ("../gene_exp/<cancer_type>_gene_tpm.tsv").
cancer_type = "CRC"

In [None]:
# Load the expression table for the selected cohort and key the rows by gene id.
# `set_index` moves "gene_id" out of the columns in one step, so the cell stays
# idempotent and no in-place drop is needed.
gene_exp = pd.read_csv("../gene_exp/" + cancer_type + "_gene_tpm.tsv", sep = "\t")
gene_exp = gene_exp.set_index("gene_id")

# Build the sample annotation table from the column names.
# "gene_id" is already gone from the columns at this point, so slicing with
# `columns[1:]` would silently discard the first sample. Instead, keep every
# column whose name splits into at least six "-"-separated fields (the parsing
# below reads fields 0-2, 3 and 5); anything shorter is treated as a
# non-sample annotation column and skipped.
sample_ids = [col for col in gene_exp.columns if len(str(col).split("-")) >= 6]
barcode_fields = [sample_id.split("-") for sample_id in sample_ids]
samples = pd.DataFrame({
    "sample_id": sample_ids,
    # field 3 minus its last character (e.g. "01A" -> "01")
    "sample_type": [fields[3][:-1] for fields in barcode_fields],
    # first three fields identify the patient
    "patient": ["-".join(fields[:3]) for fields in barcode_fields],
    "plate": [fields[5] for fields in barcode_fields],
})

# select only tumor samples (sample_type code "01"), then keep one sample per
# patient: the one whose plate id sorts last as a string ("most recent plate").
tumor_samples = (
    samples.loc[samples["sample_type"] == "01"]
    .sort_values(by = "plate", ascending = False)
    .drop_duplicates(subset = "patient", keep = "first")
)

# Expression matrix restricted to the chosen tumor samples, relabelled by
# patient, keeping only genes whose median across those samples is above zero.
gene_tumor = gene_exp[tumor_samples["sample_id"]]
gene_tumor.columns = tumor_samples["patient"]
gene_tumor = gene_tumor.loc[gene_tumor.median(axis=1) > 0]
gene_tumor.shape

In [69]:
# Persist the filtered tumor matrix. The file name is derived from `cancer_type`
# ("CRC" -> "crc_tumor.tsv"), so switching cohorts cannot overwrite or mislabel
# the CRC output.
gene_tumor.to_csv(cancer_type.lower() + "_tumor.tsv", sep = "\t")

### gene expression changes for paired samples

In [41]:
# QTL result rows that pass the adj_p < 0.05 filter.
pairs = (
    pd.read_csv("./output/crc_gene_qtl.csv")
    .query("adj_p < 0.05")
)

# Lookup table used to translate the "gene" values of `pairs` (matched against
# its "gene_name" column) into "gene_id" values.
id_name = pd.read_csv("../gene_exp/gene_id_name.tsv", sep = "\t")

# Join the two tables (pandas' default inner join) and collect each matched
# gene_id once.
gene_list = (
    pairs
    .merge(id_name, left_on = "gene", right_on = "gene_name")
    .loc[:, "gene_id"]
    .unique()
)

In [42]:
# Samples with sample_type code "11" (presumably the matched non-tumor tissue --
# confirm against the barcode code table), reduced to one sample per patient by
# keeping the plate id that sorts last.
solid_samples = (
    samples[samples["sample_type"].eq("11")]
    .sort_values("plate", ascending = False)
    .drop_duplicates("patient", keep = "first")
)

# Patients that have both a "11" sample and a tumor sample (sorted, unique).
paired = np.intersect1d(solid_samples["patient"], tumor_samples["patient"])

In [43]:
# Look up, for every paired patient, the sample id chosen in each table.
# Selecting through `paired` (a sorted array of patient ids) puts both matrices
# in the SAME patient order; filtering each table with `isin` would keep each
# table's own plate-sorted order, so column k of the two matrices could belong
# to different patients.
solid_ids = solid_samples.set_index("patient").loc[paired, "sample_id"].tolist()
tumor_ids = tumor_samples.set_index("patient").loc[paired, "sample_id"].tolist()

# Rows: genes from `gene_list`; columns: one sample per paired patient.
solid_gene = gene_exp.loc[gene_list, solid_ids]
tumor_gene = gene_exp.loc[gene_list, tumor_ids]

# Relabel columns by patient id (identical to the first three barcode fields,
# which is how the "patient" column was derived).
solid_gene.columns = list(paired)
tumor_gene.columns = list(paired)

In [44]:
# Sanity check: both matrices are built from the same gene list and the same
# set of paired patients, so the two shapes are expected to match.
solid_gene.shape, tumor_gene.shape

((100, 21), (100, 21))