In [1]:
import os
from mutperiodpy.helper_scripts.UsefulFileSystemFunctions import getExternalDataDirectory as getMutperiodExternalDataDirectory
import pandas as pd

# Resolve every data directory used in this notebook relative to the shared hg19 root.
mutperiodHg19Directory = os.path.join(getMutperiodExternalDataDirectory(), "hg19")

# Each name on the left is paired, in order, with the sub-directory name on the right.
(
    A549GeneExpressionDirectory,
    GM12878GeneExpressionDirectory,
    TSSBaseDirectory,
    A549HighExpressionTSSDirectory,
    A549LowExpressionTSSDirectory,
    GM12878HighExpressionTSSDirectory,
    GM12878LowExpressionTSSDirectory,
) = (
    os.path.join(mutperiodHg19Directory, subdirectoryName)
    for subdirectoryName in (
        "A549_gene_expression",
        "GM12878_gene_expression",
        "hg19_protein_coding_genes_TSSs",
        "hg19_protein_coding_genes_TSSs_A549_high_expression",
        "hg19_protein_coding_genes_TSSs_A549_low_expression",
        "hg19_protein_coding_genes_TSSs_GM12878_high_expression",
        "hg19_protein_coding_genes_TSSs_GM12878_low_expression",
    )
)

### A549 Data

In [2]:
# Read in raw data.
# The file is read without a header so that, after dropping the first six columns and
# transposing, the label row and the value row become the two columns named below.
A549ExpressionData = pd.read_csv(os.path.join(A549GeneExpressionDirectory, "A549_expression_DepMap_raw.csv"), header = None).drop(columns=range(0,6)).T
A549ExpressionData.columns = ["Gene_Name", "Expression"]

# Because the label row was read as data, the expression values come back as text.
# Convert them to numbers so the sort below is numeric rather than lexicographic
# (as text, "9.5" would rank above "10.2" and the quartiles would be wrong).
A549ExpressionData["Expression"] = pd.to_numeric(A549ExpressionData["Expression"])

# Split the "NAME (ID)" labels into a gene name and the parenthesized ID.
# NOTE(review): the ID is presumably the Entrez gene ID used in DepMap column labels - confirm.
A549ExpressionData.insert(1,"Gene_ID",A549ExpressionData["Gene_Name"].apply(lambda x: x.split()[1][1:-1]))
A549ExpressionData["Gene_Name"] = A549ExpressionData["Gene_Name"].apply(lambda x: x.split()[0])

# Order genes from highest to lowest expression.
A549ExpressionData = A549ExpressionData.sort_values("Expression", ascending=False).reset_index(drop = True)

# Write the new expression data to new files, including one for the top and bottom quartiles
A549ExpressionData.to_csv(os.path.join(A549GeneExpressionDirectory, "A549_expression_DepMap.tsv"), sep = '\t', index = False)
A549QuartileCutoff = len(A549ExpressionData)//4
A549HighExpressionData = A549ExpressionData[:A549QuartileCutoff]
A549HighExpressionData.to_csv(os.path.join(A549GeneExpressionDirectory, "A549_expression_DepMap_top_quartile.tsv"),
                               sep = '\t', index = False)
A549LowExpressionData = A549ExpressionData[-A549QuartileCutoff:]
A549LowExpressionData.to_csv(os.path.join(A549GeneExpressionDirectory, "A549_expression_DepMap_bottom_quartile.tsv"),
                              sep = '\t', index = False)

In [3]:
# Subset base TSSs: keep only the rows whose column 4 value appears among the
# high- or low-expression gene names.
# NOTE(review): assumes column 4 of the bed file holds gene names comparable to "Gene_Name" - confirm.
baseTSSs = pd.read_table(os.path.join(TSSBaseDirectory, "hg19_protein_coding_genes_TSSs.bed"), header = None)
A549HighExpressionTSSs = baseTSSs[baseTSSs[4].isin(A549HighExpressionData["Gene_Name"])].reset_index(drop = True)
A549lowExpressionTSSs = baseTSSs[baseTSSs[4].isin(A549LowExpressionData["Gene_Name"])].reset_index(drop = True)

# Make sure the output directories exist; nothing earlier in the notebook creates them,
# so writing would otherwise fail on a fresh data directory.
os.makedirs(A549HighExpressionTSSDirectory, exist_ok = True)
os.makedirs(A549LowExpressionTSSDirectory, exist_ok = True)

# Write new TSSs (headerless and tab-separated, matching how the base file was read)
A549HighExpressionTSSs.to_csv(os.path.join(A549HighExpressionTSSDirectory, "hg19_protein_coding_genes_TSSs_A549_high_expression.bed"),
                         sep = '\t', index = False, header = False)
A549lowExpressionTSSs.to_csv(os.path.join(A549LowExpressionTSSDirectory, "hg19_protein_coding_genes_TSSs_A549_low_expression.bed"),
                         sep = '\t', index = False, header = False)

### GM12878 Data

In [4]:
# Load the RPKM table, keep only the gene IDs and the E116 column (used here as the
# GM12878 expression values), and order genes from highest to lowest expression.
GM12878ExpressionData = (
    pd.read_table(os.path.join(GM12878GeneExpressionDirectory, "57epigenomes.RPKM.pc"), index_col=False)
    .loc[:, ["gene_id", "E116"]]
    .rename(columns={"gene_id": "Gene_ID", "E116": "Expression"})
    .sort_values("Expression", ascending=False)
    .reset_index(drop=True)
)

# Save the full ranked table.
GM12878ExpressionData.to_csv(
    os.path.join(GM12878GeneExpressionDirectory, "GM12878_expression.tsv"),
    sep='\t', index=False,
)

# A quarter of the genes (rounded down) defines the size of each extreme group.
GM12878QuartileCutoff = len(GM12878ExpressionData)//4

# Top quartile: the first rows of the descending-sorted table.
GM12878HighExpressionData = GM12878ExpressionData.iloc[:GM12878QuartileCutoff]
GM12878HighExpressionData.to_csv(
    os.path.join(GM12878GeneExpressionDirectory, "GM12878_expression_top_quartile.tsv"),
    sep='\t', index=False,
)

# Bottom quartile: the last rows of the descending-sorted table.
GM12878LowExpressionData = GM12878ExpressionData.iloc[-GM12878QuartileCutoff:]
GM12878LowExpressionData.to_csv(
    os.path.join(GM12878GeneExpressionDirectory, "GM12878_expression_bottom_quartile.tsv"),
    sep='\t', index=False,
)

In [None]:
# Subset base TSSs: match on column 3 with everything after the first '.' removed,
# so the values can be compared against the "Gene_ID" column of the expression tables.
# NOTE(review): assumes column 3 holds versioned gene IDs (e.g. "ID.version") - confirm.
baseTSSs = pd.read_table(os.path.join(TSSBaseDirectory, "hg19_protein_coding_genes_TSSs.bed"), header = None)
baseTSSGeneIDs = baseTSSs[3].str.split('.').str[0]  # computed once and reused for both subsets
GM12878HighExpressionTSSs = baseTSSs[baseTSSGeneIDs.isin(GM12878HighExpressionData["Gene_ID"])].reset_index(drop = True)
GM12878LowExpressionTSSs = baseTSSs[baseTSSGeneIDs.isin(GM12878LowExpressionData["Gene_ID"])].reset_index(drop = True)

# Make sure the output directories exist; nothing earlier in the notebook creates them,
# so writing would otherwise fail on a fresh data directory.
os.makedirs(GM12878HighExpressionTSSDirectory, exist_ok = True)
os.makedirs(GM12878LowExpressionTSSDirectory, exist_ok = True)

# Write new TSSs (headerless and tab-separated, matching how the base file was read)
GM12878HighExpressionTSSs.to_csv(os.path.join(GM12878HighExpressionTSSDirectory, "hg19_protein_coding_genes_TSSs_GM12878_high_expression.bed"),
                         sep = '\t', index = False, header = False)
GM12878LowExpressionTSSs.to_csv(os.path.join(GM12878LowExpressionTSSDirectory, "hg19_protein_coding_genes_TSSs_GM12878_low_expression.bed"),
                         sep = '\t', index = False, header = False)