<div style="text-align: right"> [CMSC 636] Drake Wong </div>

# Extract Data
#### Step 1: Import Packages

In [1]:
import h5py
import numpy as np
from pathlib import Path
import pandas as pd
import json

#### Step 2: Select 10% of well-represented data 
Note: Depending on the platform and study, some entries in the HDF5 file are missing (stored as 0 or NaN). The following code treats NaN as 0 and keeps the 2,300 genes (roughly 10% of the 20,499 available) that have the fewest zero entries across the entire dataset. The selection is based only on completeness; it does not give preference to a gene's expression value.

In [2]:

# Location of the HDF5 expression store (machine-specific absolute path).
file = Path(r'C:\Research_Data\CMSC 636 Deep Learning\expr_data.hdf5')

# Read the three tables into memory; the store is closed as soon as the
# `with` block exits, before any of the post-processing below.
with pd.HDFStore(file, 'r') as store:
    print(store.info())
    rpkm = store['rpkm']
    alignment_metadata  = store['alignment_metadata']
    experiment_metadata = store['experiment_metadata']

# Treat missing expression values as zeros so both count as "absent".
rpkm = rpkm.fillna(0)

# Rank columns by how many zero entries they contain (fewest first) and keep
# the 2300 most complete ones.
zero_counts = (rpkm == 0).sum(axis=0)
columns_by_completeness = zero_counts.sort_values(ascending=True).index
rpkm = rpkm[columns_by_completeness].iloc[:, 0:2300]

<class 'pandas.io.pytables.HDFStore'>
File path: C:\Research_Data\CMSC 636 Deep Learning\expr_data.hdf5
/alignment_metadata             frame        (shape->[136118,5])    
/experiment_metadata            frame        (shape->[136118,12])   
/rpkm                           frame        (shape->[108814,20499])


#### Step 3: Extract gene expression values for well-represented tissue types

In [3]:
# Flag every sample whose cell_type label matches the pattern; rows with a
# missing label (NaN) count as non-matches.
# NOTE(review): str.contains treats the pattern as a regular expression by
# default, so "CD4+" means "CD" followed by one or more "4" and matches any
# label containing "CD4" (not only the literal text "CD4+"). Pass regex=False
# if the literal plus sign is intended; left unchanged here so the selection
# stays the same.
cd4_mask = experiment_metadata.cell_type.str.contains(pat="CD4+", na=False)
print(cd4_mask.sum())

# experiment_metadata has more rows than rpkm, so the mask is aligned to
# rpkm's index explicitly instead of relying on pandas to reindex the boolean
# key implicitly (which emits a warning). Samples without a metadata row are
# treated as non-matches.
CD4 = rpkm.loc[cd4_mask.reindex(rpkm.index, fill_value=False)]

# Optional sanity checks:
# CD4.head()
# CD4.isnull().sum().sort_values()                       # confirm no NaN remain
# (CD4 == 0).sum(axis=0).sort_values(ascending=True)     # zero entries per column
CD4.to_csv("CD4.csv", na_rep="NA")

3476


  This is separate from the ipykernel package so we can avoid doing imports until


In [4]:
# Flag every sample whose cell_type label contains "CD8"; rows with a missing
# label (NaN) count as non-matches.
cd8_mask = experiment_metadata.cell_type.str.contains(pat="CD8", na=False)
print(cd8_mask.sum())

# experiment_metadata has more rows than rpkm, so the mask is aligned to
# rpkm's index explicitly instead of relying on pandas to reindex the boolean
# key implicitly (which emits a warning). Samples without a metadata row are
# treated as non-matches.
CD8 = rpkm.loc[cd8_mask.reindex(rpkm.index, fill_value=False)]
CD8.to_csv("CD8.csv", na_rep="NA")

1499


  This is separate from the ipykernel package so we can avoid doing imports until


In [5]:
# Flag every sample whose cell_type label contains "Haematopoietic"; rows with
# a missing label (NaN) count as non-matches.
haematopoietic_mask = experiment_metadata.cell_type.str.contains(pat="Haematopoietic", na=False)
print(haematopoietic_mask.sum())

# experiment_metadata has more rows than rpkm, so the mask is aligned to
# rpkm's index explicitly instead of relying on pandas to reindex the boolean
# key implicitly (which emits a warning). Samples without a metadata row are
# treated as non-matches.
Haematopoietic = rpkm.loc[haematopoietic_mask.reindex(rpkm.index, fill_value=False)]
Haematopoietic.to_csv("Haematopoietic.csv", na_rep="NA")

1226


  This is separate from the ipykernel package so we can avoid doing imports until


In [6]:
# Flag every sample whose cell_type label contains "BMDC"; rows with a missing
# label (NaN) count as non-matches.
bmdc_mask = experiment_metadata.cell_type.str.contains(pat="BMDC", na=False)
print(bmdc_mask.sum())

# experiment_metadata has more rows than rpkm, so the mask is aligned to
# rpkm's index explicitly instead of relying on pandas to reindex the boolean
# key implicitly (which emits a warning). Samples without a metadata row are
# treated as non-matches.
BMDC = rpkm.loc[bmdc_mask.reindex(rpkm.index, fill_value=False)]
BMDC.to_csv("BMDC.csv", na_rep="NA")

4647


  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
# Flag every sample whose cell_type label contains "effector"; rows with a
# missing label (NaN) count as non-matches.
# NOTE(review): the pattern is "effector" although the result is named
# CD4memory (name and file name kept because later cells use them). Labels
# that contain both "CD4" and "effector" would presumably also be picked up by
# the CD4 selection; verify whether that overlap is intended.
effector_mask = experiment_metadata.cell_type.str.contains(pat="effector", na=False)
print(effector_mask.sum())

# experiment_metadata has more rows than rpkm, so the mask is aligned to
# rpkm's index explicitly instead of relying on pandas to reindex the boolean
# key implicitly (which emits a warning). Samples without a metadata row are
# treated as non-matches.
CD4memory = rpkm.loc[effector_mask.reindex(rpkm.index, fill_value=False)]
CD4memory.to_csv("CD4memory.csv", na_rep="NA")

2261


  This is separate from the ipykernel package so we can avoid doing imports until


In [8]:
# Flag every sample whose series identifier contains "GSE99251"; rows with a
# missing identifier (NaN) count as non-matches.
olfactory_mask = experiment_metadata.series.str.contains(pat="GSE99251", na=False)
print(olfactory_mask.sum())

# experiment_metadata has more rows than rpkm, so the mask is aligned to
# rpkm's index explicitly instead of relying on pandas to reindex the boolean
# key implicitly (which emits a warning). Samples without a metadata row are
# treated as non-matches.
Olfactory = rpkm.loc[olfactory_mask.reindex(rpkm.index, fill_value=False)]
Olfactory.to_csv("Olfactory.csv", na_rep="NA")

1338


  This is separate from the ipykernel package so we can avoid doing imports until


In [9]:
# Flag every sample whose series identifier contains "GSE99235"; rows with a
# missing identifier (NaN) count as non-matches.
lung_mask = experiment_metadata.series.str.contains(pat="GSE99235", na=False)
print(lung_mask.sum())

# experiment_metadata has more rows than rpkm, so the mask is aligned to
# rpkm's index explicitly instead of relying on pandas to reindex the boolean
# key implicitly (which emits a warning). Samples without a metadata row are
# treated as non-matches.
Lung = rpkm.loc[lung_mask.reindex(rpkm.index, fill_value=False)]
Lung.to_csv("Lung.csv", na_rep="NA")

1503


  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
# Flag every sample whose series identifier contains "GSE98131"; rows with a
# missing identifier (NaN) count as non-matches.
mammery_mask = experiment_metadata.series.str.contains(pat="GSE98131", na=False)
print(mammery_mask.sum())

# experiment_metadata has more rows than rpkm, so the mask is aligned to
# rpkm's index explicitly instead of relying on pandas to reindex the boolean
# key implicitly (which emits a warning). Samples without a metadata row are
# treated as non-matches.
Mammery = rpkm.loc[mammery_mask.reindex(rpkm.index, fill_value=False)]
Mammery.to_csv("Mammery.csv", na_rep="NA")

1293


  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
# Flag every sample whose series identifier contains "GSE109796"; rows with a
# missing identifier (NaN) count as non-matches.
brain_mask = experiment_metadata.series.str.contains(pat="GSE109796", na=False)
print(brain_mask.sum())

# experiment_metadata has more rows than rpkm, so the mask is aligned to
# rpkm's index explicitly instead of relying on pandas to reindex the boolean
# key implicitly (which emits a warning). Samples without a metadata row are
# treated as non-matches.
Brain = rpkm.loc[brain_mask.reindex(rpkm.index, fill_value=False)]
Brain.to_csv("Brain.csv", na_rep="NA")

2545


  This is separate from the ipykernel package so we can avoid doing imports until


In [12]:
# Flag every sample whose series identifier contains "GSE92332"; rows with a
# missing identifier (NaN) count as non-matches.
intestinal_mask = experiment_metadata.series.str.contains(pat="GSE92332", na=False)
print(intestinal_mask.sum())

# experiment_metadata has more rows than rpkm, so the mask is aligned to
# rpkm's index explicitly instead of relying on pandas to reindex the boolean
# key implicitly (which emits a warning). Samples without a metadata row are
# treated as non-matches.
intestinal = rpkm.loc[intestinal_mask.reindex(rpkm.index, fill_value=False)]
intestinal.to_csv("intestinal.csv", na_rep="NA")

1891


  This is separate from the ipykernel package so we can avoid doing imports until


#### Concatenate well-represented expression values into single table, which will be used by WGCNA

In [13]:
# Stack the per-group expression tables row-wise, in this fixed order, into a
# single table and write it to disk.
expression_tables = [
    CD4,
    CD8,
    Haematopoietic,
    BMDC,
    CD4memory,
    Olfactory,
    Lung,
    Mammery,
    Brain,
    intestinal,
]
concat_df = pd.concat(expression_tables)

n_samples = concat_df.shape[0]
print(f"The concated dataframe contains {n_samples} samples")
concat_df.to_csv("concated_data.csv", na_rep="NA")

The concated dataframe contains 21679 samples


In [14]:
# Build a reduced table from at most the first 100 rows of each per-group
# expression table, stacked row-wise, and write it to disk.
subsets = (
    CD4,
    CD8,
    Haematopoietic,
    BMDC,
    CD4memory,
    Olfactory,
    Lung,
    Mammery,
    Brain,
    intestinal,
)
concat_df_small = pd.concat([subset.iloc[0:100, :] for subset in subsets])

n_small_samples = concat_df_small.shape[0]
print(f"The concated dataframe contains {n_small_samples} samples")
concat_df_small.to_csv("concated_small_data.csv", na_rep="NA")

The concated dataframe contains 910 samples
