## Preprocess the DNA methylation (epigenomic) dataset

In [14]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.utils import compute_class_weight
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import os

We first need the path to our folder containing case-organized data and the destination for storing the processed epigenomic data.

In [15]:
ORGANIZED_BY_CASE_PATH = "/users/anair27/data/TCGA_Data/project_LUAD/data_by_cases"
DESTINATION_DATA_PATH = "/users/anair27/data/TCGA_Data/project_LUAD/data_processed/PRCSD_epigenomic_data.csv"

We use the following function to read in DNA methylation data. This function should be adapted to the format of DNA methylation data used for a project. 

In [16]:
def read_methylation(filepath, case_id):
    return pd.read_csv(filepath, sep='\t', header = None).set_index(0).rename(columns={1:case_id})

In [17]:
cases = os.listdir(ORGANIZED_BY_CASE_PATH)
cases[0:10]

['TCGA-35-4122',
 'TCGA-75-6203',
 'TCGA-75-5146',
 'TCGA-78-8648',
 'TCGA-55-A4DG',
 'TCGA-MP-A4SY',
 'TCGA-67-3771',
 'TCGA-44-A479',
 'TCGA-78-7156',
 'TCGA-55-7724']

Loop through every case filepath and search for epigenomic data. Apply the read CSV function to each epigenomic data found. After all the epigenomic files are read, we can concatenate them to create a matrix where rows are cases, columns are genomic regions, and values are the respective methylation values.

In [18]:
epigenomic_data = []
l = len(cases)
for i, case in enumerate(cases):
    #print(f"Case {i}/{l}")
    contents_gene_meth = os.listdir(os.path.join(ORGANIZED_BY_CASE_PATH, case, "dna_methylation"))
    if len(contents_gene_meth) == 0:
        print(f"{case} has no methylation data")
    else:
        filename = contents_gene_meth[0]
        path = os.path.join(ORGANIZED_BY_CASE_PATH, case, "dna_methylation", filename)
        epigenomic_data.append(read_methylation(path, case))       

TCGA-05-4245 has no methylation data
TCGA-44-2664 has no methylation data
TCGA-67-3776 has no methylation data
TCGA-67-3770 has no methylation data


The data looks like the following. As shown, the dataset should be transposed after concatenation.

In [19]:
epigenomic_data[0]

Unnamed: 0_level_0,TCGA-35-4122
0,Unnamed: 1_level_1
cg22501393,0.036237
cg18895155,0.017973
cg27126442,0.503393
cg15264255,0.128218
cg18464559,0.023375
...,...
cg10265786,0.009094
cg08096038,0.864527
cg05535113,0.204837
cg26848248,0.022390


In [20]:
all_epigenomic = pd.concat(epigenomic_data, axis = 1)
all_epigenomic.head()

Unnamed: 0_level_0,TCGA-35-4122,TCGA-75-6203,TCGA-75-5146,TCGA-78-8648,TCGA-55-A4DG,TCGA-MP-A4SY,TCGA-67-3771,TCGA-44-A479,TCGA-78-7156,TCGA-55-7724,...,TCGA-55-8090,TCGA-73-4670,TCGA-73-4668,TCGA-05-4422,TCGA-86-7714,TCGA-64-5775,TCGA-05-4418,TCGA-62-8398,TCGA-55-8097,TCGA-S2-AA1A
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cg22501393,0.036237,0.072509,0.082956,0.066536,0.087807,0.084434,0.032928,0.066551,0.072734,0.085435,...,0.073007,0.042333,0.032782,0.028549,0.082535,0.082374,0.034011,0.096798,0.072706,0.078151
cg18895155,0.017973,0.016707,0.013216,0.020478,0.025055,0.027104,0.020942,0.018722,0.016225,0.016588,...,0.0168,0.014032,0.017088,0.017525,0.022279,0.015395,0.020096,0.017866,0.018051,0.020128
cg27126442,0.503393,0.572085,0.61412,0.80644,0.463099,0.504607,0.35383,0.74384,0.231782,0.610359,...,0.341039,0.228794,0.468894,0.273344,0.341466,0.766534,0.312059,0.445671,0.385491,0.635967
cg15264255,0.128218,0.053483,0.053742,0.046997,0.054819,0.056879,0.100819,0.049452,0.048707,0.05375,...,0.056473,0.113974,0.129406,0.092835,0.053205,0.035275,0.153293,0.046682,0.038005,0.053797
cg18464559,0.023375,0.024562,0.026393,0.026298,0.031556,0.02678,0.022435,0.023429,0.031727,0.027241,...,0.02593,0.019299,0.016195,0.019415,0.033429,0.023159,0.017379,0.030446,0.030631,0.032406


We remove any features with NAs, transpose the matrix so cases are rows and features are columns, then finally reset the index to the case id.

In [21]:
all_epigenomic = all_epigenomic.dropna()
all_epigenomic = all_epigenomic.transpose()
all_epigenomic = all_epigenomic.reset_index().rename(columns={"index": "case_id"})
all_epigenomic

Unnamed: 0,case_id,cg22501393,cg18895155,cg27126442,cg15264255,cg18464559,cg20379125,cg12790134,cg07697569,cg13613532,...,cg06453691,cg14474880,cg11500797,cg04995095,cg19815376,cg23207527,cg20880234,cg10265786,cg26848248,cg09906309
0,TCGA-35-4122,0.036237,0.017973,0.503393,0.128218,0.023375,0.025185,0.012751,0.018539,0.031869,...,0.762482,0.049937,0.315507,0.377128,0.454613,0.834547,0.021432,0.009094,0.022390,0.059245
1,TCGA-75-6203,0.072509,0.016707,0.572085,0.053483,0.024562,0.040282,0.055873,0.016879,0.045806,...,0.946361,0.051249,0.262323,0.744453,0.406976,0.760980,0.043046,0.089766,0.065425,0.036929
2,TCGA-75-5146,0.082956,0.013216,0.614120,0.053742,0.026393,0.062964,0.066770,0.013661,0.041391,...,0.934297,0.045764,0.709449,0.369011,0.372617,0.770362,0.038415,0.038355,0.045358,0.028891
3,TCGA-78-8648,0.066536,0.020478,0.806440,0.046997,0.026298,0.046908,0.036235,0.018256,0.045744,...,0.942902,0.071553,0.228389,0.686497,0.408400,0.705605,0.043201,0.057632,0.037235,0.040959
4,TCGA-55-A4DG,0.087807,0.025055,0.463099,0.054819,0.031556,0.075073,0.049878,0.021182,0.047862,...,0.914922,0.048047,0.636668,0.423101,0.576572,0.810681,0.056493,0.062269,0.046892,0.045805
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,TCGA-64-5775,0.082374,0.015395,0.766534,0.035275,0.023159,0.069725,0.084924,0.016146,0.043354,...,0.844081,0.048832,0.595574,0.211882,0.468313,0.921748,0.043156,0.055920,0.078488,0.051482
513,TCGA-05-4418,0.034011,0.020096,0.312059,0.153293,0.017379,0.019883,0.016586,0.018790,0.026453,...,0.527585,0.031139,0.355488,0.235219,0.412189,0.863683,0.021754,0.008900,0.022660,0.050890
514,TCGA-62-8398,0.096798,0.017866,0.445671,0.046682,0.030446,0.042386,0.066264,0.020619,0.039939,...,0.930574,0.056880,0.407235,0.318863,0.577499,0.845899,0.043807,0.074081,0.106032,0.037916
515,TCGA-55-8097,0.072706,0.018051,0.385491,0.038005,0.030631,0.034586,0.080744,0.015566,0.040711,...,0.891794,0.043313,0.399839,0.590361,0.370045,0.847985,0.040767,0.056788,0.050266,0.041651


Save the processed epigenetic data.

In [22]:
all_epigenomic.to_csv(DESTINATION_DATA_PATH)

---