## Preprocess the copy-number variation (CNV) dataset

In [36]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.utils import compute_class_weight
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import os
import seaborn as sns

We first need the path to our folder containing case-organized data and the destination for storing the processed CNV data.

In [37]:
# Folder whose entries are listed to enumerate the cases; each case's CNV files
# are looked up under <case>/cnv inside it.
ORGANIZED_BY_CASE_PATH = "/users/anair27/data/TCGA_Data/project_LUAD/data_by_cases"
# CSV file the processed CNV matrix is written to at the end of the notebook.
DESTINATION_DATA_PATH = "/users/anair27/data/TCGA_Data/project_LUAD/data_processed/PRCSD_cnv_data.csv"

We use the following function to read in CNV data. This function should be adapted to the format of CNV data used for a project. 

In [38]:
def read_cnv(filepath, case_id):
    """Read one case's CNV file into a single-row, wide DataFrame.

    The file is parsed as whitespace-delimited text whose first non-blank line
    is the header. Every line is upper-cased while reading, so the header must
    contain ``GENE_ID`` and ``COPY_NUMBER`` after upper-casing.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CNV file for one case.
    case_id : str
        Identifier stored in the ``CASE_ID`` column of the returned row.

    Returns
    -------
    pandas.DataFrame
        A single row (index ``0``) with a ``CASE_ID`` column followed by one
        column per gene ID; the column axis is named ``GENE_ID``. Copy numbers
        are kept as the strings read from the file, and genes without a copy
        number hold the string ``"-1"``.

    Raises
    ------
    ValueError
        If the file contains no non-blank lines.
    KeyError
        If the header lacks ``GENE_ID`` or ``COPY_NUMBER``.
    """
    with open(filepath) as f:
        # Blank lines carry no record; skipping them avoids phantom all-missing rows.
        rows = [line.upper().split() for line in f if line.strip()]
    if not rows:
        raise ValueError(f"CNV file {filepath!r} contains no data")

    # Build the frame from every row (header included) so that short rows --
    # records whose trailing copy-number fields are blank -- are padded with
    # missing values rather than rejected for a column-count mismatch.
    table = pd.DataFrame(rows)
    # Promote the header row to column labels, then drop that row from the data.
    table.columns = table.iloc[0].tolist()
    table = table.iloc[1:]

    # Assign the filled result instead of using fillna(inplace=True) on a column
    # selection: the in-place form acts on a temporary and is not guaranteed to
    # update the frame (it never does under pandas Copy-on-Write).
    copy_numbers = table.set_index("GENE_ID")["COPY_NUMBER"].fillna("-1")

    # Pivot to one row whose columns are gene IDs, then label it with the case.
    wide = copy_numbers.to_frame().T
    wide.insert(0, "CASE_ID", case_id)
    return wide.reset_index(drop=True)

In [39]:
# Every entry of the case-organized folder is treated as a case ID.
cases = os.listdir(path=ORGANIZED_BY_CASE_PATH)
# Preview the first ten entries.
cases[:10]

['TCGA-35-4122',
 'TCGA-75-6203',
 'TCGA-75-5146',
 'TCGA-78-8648',
 'TCGA-55-A4DG',
 'TCGA-MP-A4SY',
 'TCGA-67-3771',
 'TCGA-44-A479',
 'TCGA-78-7156',
 'TCGA-55-7724']

Loop through every case folder and look for CNV data, applying the `read_cnv` function to each CNV file found. After all the CNV files are read, we concatenate them to create a matrix where rows are cases, columns are genes (genomic regions), and values are the respective copy numbers.

In [40]:
# Collect one single-row CNV frame per case; cases without a CNV file are
# reported and skipped.
_cnv_data = []
cases_missing_cnv = []
for case in cases:
    cnv_dir = os.path.join(ORGANIZED_BY_CASE_PATH, case, "cnv")
    # A case entry may lack the "cnv" subfolder altogether; treat that the same
    # as an empty one instead of letting os.listdir raise. Sorting makes the
    # chosen file independent of the arbitrary order os.listdir returns.
    cnv_files = sorted(os.listdir(cnv_dir)) if os.path.isdir(cnv_dir) else []
    if not cnv_files:
        cases_missing_cnv.append(case)
        print(f"{case} has no CNV expression data")
        continue
    # Only the first file (in sorted order) is read for each case.
    _cnv_data.append(read_cnv(os.path.join(cnv_dir, cnv_files[0]), case))

TCGA-05-4245 has no CNV expression data
TCGA-71-8520 has no CNV expression data
TCGA-44-2664 has no CNV expression data
TCGA-50-5946 has no CNV expression data
TCGA-55-7816 has no CNV expression data
TCGA-50-5930 has no CNV expression data
TCGA-67-3776 has no CNV expression data
TCGA-MP-A4T9 has no CNV expression data
TCGA-55-7227 has no CNV expression data
TCGA-69-7979 has no CNV expression data
TCGA-38-4631 has no CNV expression data
TCGA-78-7159 has no CNV expression data
TCGA-MP-A4TA has no CNV expression data
TCGA-55-6984 has no CNV expression data
TCGA-MP-A4TC has no CNV expression data


In [44]:
# Stack the per-case single-row frames vertically: one row per case.
all_cnv_data = pd.concat(objs=_cnv_data, axis=0)
all_cnv_data

GENE_ID,CASE_ID,ENSG00000223972.5,ENSG00000227232.5,ENSG00000278267.1,ENSG00000243485.5,ENSG00000284332.1,ENSG00000237613.2,ENSG00000268020.3,ENSG00000240361.2,ENSG00000186092.6,...,ENSG00000237801.6_PAR_Y,ENSG00000237040.6_PAR_Y,ENSG00000124333.16_PAR_Y,ENSG00000228410.6_PAR_Y,ENSG00000223484.7_PAR_Y,ENSG00000124334.17_PAR_Y,ENSG00000270726.6_PAR_Y,ENSG00000185203.12_PAR_Y,ENSG00000182484.15_PAR_Y,ENSG00000227159.8_PAR_Y
0,TCGA-35-4122,-1,-1,-1,-1,-1,-1,-1,3,3,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
0,TCGA-75-6203,-1,-1,-1,-1,-1,-1,-1,2,2,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
0,TCGA-75-5146,-1,-1,-1,-1,-1,-1,-1,2,2,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
0,TCGA-78-8648,-1,-1,-1,-1,-1,-1,-1,2,2,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
0,TCGA-55-A4DG,-1,-1,-1,-1,-1,-1,-1,4,4,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,TCGA-64-5775,-1,-1,-1,-1,-1,-1,-1,3,3,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
0,TCGA-05-4418,-1,-1,-1,-1,-1,-1,-1,2,2,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
0,TCGA-62-8398,-1,-1,-1,-1,-1,-1,-1,2,2,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
0,TCGA-55-8097,-1,-1,-1,-1,-1,-1,-1,2,2,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


We apply some formatting to make the case ID the index of the dataframe.

In [45]:
# Move the case ID into the row index, under the lower-case name "case_id".
all_cnv_data = all_cnv_data.set_index("CASE_ID").rename_axis(index="case_id")
all_cnv_data.head()

GENE_ID,ENSG00000223972.5,ENSG00000227232.5,ENSG00000278267.1,ENSG00000243485.5,ENSG00000284332.1,ENSG00000237613.2,ENSG00000268020.3,ENSG00000240361.2,ENSG00000186092.6,ENSG00000238009.6,...,ENSG00000237801.6_PAR_Y,ENSG00000237040.6_PAR_Y,ENSG00000124333.16_PAR_Y,ENSG00000228410.6_PAR_Y,ENSG00000223484.7_PAR_Y,ENSG00000124334.17_PAR_Y,ENSG00000270726.6_PAR_Y,ENSG00000185203.12_PAR_Y,ENSG00000182484.15_PAR_Y,ENSG00000227159.8_PAR_Y
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-35-4122,-1,-1,-1,-1,-1,-1,-1,3,3,3,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
TCGA-75-6203,-1,-1,-1,-1,-1,-1,-1,2,2,2,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
TCGA-75-5146,-1,-1,-1,-1,-1,-1,-1,2,2,2,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
TCGA-78-8648,-1,-1,-1,-1,-1,-1,-1,2,2,2,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
TCGA-55-A4DG,-1,-1,-1,-1,-1,-1,-1,4,4,4,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


For some preliminary feature reduction, we drop any column that has only one unique value or that contains a missing value (encoded as `-1`).

In [46]:
# Flag every column that is constant across cases or that contains the
# missing-value marker '-1'.
to_drop = []
for col in all_cnv_data.columns:
    distinct_values = all_cnv_data[col].unique()
    is_constant = len(distinct_values) == 1
    if not (is_constant or '-1' in distinct_values):
        continue
    to_drop.append(col)

# Number of flagged columns, reported alongside the total column count.
i = len(to_drop)
print(f"{i} columns in data will be dropped, out of {len(all_cnv_data.columns)}")

2302 columns in data will be dropped, out of 60623


In [47]:
# Remove the flagged columns and preview what is left.
all_cnv_data = all_cnv_data.drop(labels=to_drop, axis="columns")
all_cnv_data.head()

GENE_ID,ENSG00000240361.2,ENSG00000186092.6,ENSG00000238009.6,ENSG00000239945.1,ENSG00000233750.3,ENSG00000268903.1,ENSG00000269981.1,ENSG00000239906.1,ENSG00000241860.7,ENSG00000222623.1,...,ENSG00000229238.3,ENSG00000252948.1,ENSG00000233843.1,ENSG00000188399.5,ENSG00000277146.1,ENSG00000215506.5,ENSG00000224240.1,ENSG00000227629.1,ENSG00000237917.1,ENSG00000231514.1
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-35-4122,3,3,3,3,3,3,3,3,3,3,...,2,2,2,2,2,2,2,2,2,2
TCGA-75-6203,2,2,2,2,2,2,2,2,2,2,...,0,0,0,0,0,0,0,0,0,0
TCGA-75-5146,2,2,2,2,2,2,2,2,2,2,...,3,3,3,3,3,3,3,3,3,3
TCGA-78-8648,2,2,2,2,2,2,2,2,2,2,...,0,0,0,0,0,0,0,0,0,0
TCGA-55-A4DG,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4


Save the processed CNV data.

In [48]:
# The former rename of a 'CASE_ID' column was removed: no such column exists at
# this point, so it had no effect on the saved file.
destination_dir = os.path.dirname(DESTINATION_DATA_PATH)
if destination_dir:
    # to_csv does not create missing parent folders, so make sure they exist.
    os.makedirs(destination_dir, exist_ok=True)
# The row index is written as the first CSV column (to_csv default).
all_cnv_data.to_csv(DESTINATION_DATA_PATH)

---