## Preprocess the gene expression RNA-seq (transcriptomic) dataset

In [25]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.utils import compute_class_weight
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import os

We first need the path to our folder containing case-organized data and the destination for storing the processed transcriptomic data.

In [26]:
ORGANIZED_BY_CASE_PATH = "/users/anair27/data/TCGA_Data/project_LUAD/data_by_cases"
DESTINATION_DATA_PATH = "/users/anair27/data/TCGA_Data/project_LUAD/data_processed/PRCSD_transcriptomic_data.csv"

We use the following function to read in RNA-seq data. This function should be adapted to the format of gene expression data used for a project. We isolate RNA-seq data derived in "fragments per kilobase of exon per million mapped fragments" (FPKM). Only protein-coding genes are included for our analysis.

In [27]:
def read_gene_expression(filepath, case_id):
    arr = []
    with open(filepath) as f:
        lines = f.readlines()
        for l in lines:
            arr.append(l.upper().split())
    matrix = pd.DataFrame(arr)[1:]
    matrix.columns = matrix.iloc[0]
    matrix = matrix[matrix["GENE_TYPE"] == "PROTEIN_CODING"]
    matrix = matrix[['GENE_ID', 'FPKM_UNSTRANDED']].set_index('GENE_ID').transpose()
    return matrix.rename(columns={'GENE_ID': 'CASE_ID'},index={'FPKM_UNSTRANDED': case_id}).reset_index().rename(columns={1:'CASE_ID'})
    

In [28]:
cases = os.listdir(ORGANIZED_BY_CASE_PATH)
cases[0:10]

['TCGA-35-4122',
 'TCGA-75-6203',
 'TCGA-75-5146',
 'TCGA-78-8648',
 'TCGA-55-A4DG',
 'TCGA-MP-A4SY',
 'TCGA-67-3771',
 'TCGA-44-A479',
 'TCGA-78-7156',
 'TCGA-55-7724']

Loop through every case filepath and search for transcriptomic data. Apply the read CSV function to each transcriptomic data found. After all the transcriptomic files are read, we can concatenate them to create a matrix where rows are cases, columns are genes, and values are the respective expression values.

In [29]:
gene_exp_data = []
for case in cases:
    contents_gene_exp = os.listdir(os.path.join(ORGANIZED_BY_CASE_PATH, case, "gene_expression"))
    if len(contents_gene_exp) == 0:
        print(f"{case} has no gene expression data")
    else:
        filename = contents_gene_exp[0]
        path = os.path.join(ORGANIZED_BY_CASE_PATH, case, "gene_expression", filename)
        gene_exp_data.append(read_gene_expression(path, case))     

TCGA-05-4245 has no gene expression data
TCGA-44-2664 has no gene expression data
TCGA-67-3776 has no gene expression data
TCGA-44-A47F has no gene expression data
TCGA-MP-A4T2 has no gene expression data
TCGA-55-8615 has no gene expression data


In [30]:
all_gene_exp = pd.concat(gene_exp_data, axis = 0)
all_gene_exp.head()

GENE_ID,CASE_ID,ENSG00000000003.15,ENSG00000000005.6,ENSG00000000419.13,ENSG00000000457.14,ENSG00000000460.17,ENSG00000000938.13,ENSG00000000971.16,ENSG00000001036.14,ENSG00000001084.13,...,ENSG00000288649.1,ENSG00000288654.1,ENSG00000288656.1,ENSG00000288658.1,ENSG00000288660.1,ENSG00000288661.1,ENSG00000288669.1,ENSG00000288671.1,ENSG00000288674.1,ENSG00000288675.1
0,TCGA-35-4122,46.9281,0.4808,57.9971,1.5523,3.6818,9.8975,6.2302,25.281,7.0157,...,0.0,0.0,0.0,0.7707,0.0,0.0,0.0,0.0,0.0061,0.1877
0,TCGA-75-6203,11.7451,0.0,18.2141,1.8852,0.5896,25.3843,9.8348,17.5786,1.7791,...,0.0,0.0,0.0077,0.0302,0.0,0.0,0.0,0.0,0.0279,0.2131
0,TCGA-75-5146,20.3682,0.0225,31.1076,3.2412,0.962,3.9068,11.0951,22.6927,4.2451,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0104,0.0988
0,TCGA-78-8648,5.6365,0.0,19.6206,1.4938,0.8152,26.8137,18.2827,20.7673,3.5553,...,0.0,0.0,0.0,0.2541,0.0,0.0,0.0,0.0,0.0107,0.2448
0,TCGA-55-A4DG,4.6527,0.0,30.7542,5.0779,1.0228,3.8962,6.0109,12.613,11.5452,...,0.0,0.0,0.0,0.2708,0.1105,0.0,0.0,0.0,0.0362,0.2391


In [31]:
all_gene_exp = all_gene_exp.rename(columns={"CASE_ID":"case_id"})

In [32]:
all_gene_exp.to_csv(DESTINATION_DATA_PATH)

----