This data loader:
1. Takes in raw gene expression counts
2. Selects only genes relevant to our project
3. Normalizes gene expression levels to the total expression of each cell
4. Splits data into training and testing sets
5. Saves the processed data into separate .csv files.

Input and output data are loaded and processed in separate passes so that both never have to be held in memory at the same time.

In [1]:
import pandas as pd

In [2]:
# File locations (CHANGE THESE AS NEEDED).
# HOME must end with a trailing slash: every path below is built by
# appending a relative path directly to it.
HOME = '/home/jupyter-dylan/'

# gene lists, one gene name per line
INPUT_GENE_FILE = f'{HOME}gene labels/input_genes.txt'
OUTPUT_GENE_FILE = f'{HOME}gene labels/output_genes-1.txt'

# cell IDs belonging to each split, one ID per line
TRAIN_CELLS_FILE = f'{HOME}new_data/train_cells.txt'
TEST_CELLS_FILE = f'{HOME}new_data/test_cells.txt'

# raw expression counts
RAW_DATA_FILE = f'{HOME}data/GSE115746_cells_exon_counts.csv'

# cached per-cell expression totals
SUMS_FILE = f'{HOME}gene labels/cell_expr_sums.csv'

# destinations for the processed splits
SAVE_INPUT_TRAIN = f'{HOME}new_data/input_train.csv'
SAVE_INPUT_TEST = f'{HOME}new_data/input_test.csv'
SAVE_OUTPUT_TRAIN = f'{HOME}new_data/output_train-1.csv'
SAVE_OUTPUT_TEST = f'{HOME}new_data/output_test-1.csv'

In [3]:
# Per-cell total expression (counts summed over every gene in the raw file).
# These totals are the denominators used when normalizing expression levels.
#
# The totals were computed once, chunk by chunk, with the snippet below and
# cached in SUMS_FILE. Uncomment it to regenerate the cache.
#
# from functools import reduce
# data_load = pd.read_csv(RAW_DATA_FILE, index_col=0, chunksize=10000)
# sums_list = [chunk.sum(axis=0) for chunk in data_load]
# sums = reduce(lambda x, y: x.add(y), sums_list)
# sums.to_csv(SUMS_FILE)

# The first CSV column (cell IDs) becomes the index; the totals are read
# into a single column labelled '0'.
sums = pd.read_csv(SUMS_FILE, index_col=0, header=0)

In [4]:
# sanity check: display the per-cell expression totals read from SUMS_FILE
sums

Unnamed: 0,0
F2S4_150422_002_A01,1204630
F2S4_150422_002_B01,1259862
F2S4_150422_002_C01,1128827
F2S4_150422_002_D01,879874
F2S4_150422_002_E01,1117094
...,...
F1S4_180124_317_D01,1022536
F1S4_180124_317_E01,724679
F1S4_180124_317_F01,980851
F1S4_180124_317_G01,1029766


In [5]:
# Read the cell IDs assigned to each split (one ID per line).
# str.strip drops the trailing newline and any surrounding whitespace.
with open(TRAIN_CELLS_FILE) as train_file:
    train_cells = list(map(str.strip, train_file))

with open(TEST_CELLS_FILE) as test_file:
    test_cells = list(map(str.strip, test_file))

In [6]:
# sanity check: no cell may appear in both splits (expect an empty set)
set(train_cells) & set(test_cells)

set()

# Load input data

In [7]:
# Read the input gene names (one per line).
# str.strip removes the '\n' at the end of each line.
with open(INPUT_GENE_FILE) as gene_file:
    input_genes = list(map(str.strip, gene_file))

In [8]:
# sanity check: display the input gene names read from INPUT_GENE_FILE
input_genes

['Zfp182',
 'En1',
 'Tead3',
 'Pou6f2',
 'Nfix',
 'Thra',
 'Efemp1',
 'Neurog3',
 'Rasal3',
 'Smad5',
 'Crebbp',
 'Hif1a',
 'Suz12',
 'Onecut1',
 'Ubp1',
 'Kat8',
 'Bhlhe41',
 'Hoxb5',
 'Deaf1',
 'Zfp763',
 'Klf10',
 'Nfat5',
 'Mzf1',
 'Zkscan1',
 'Foxj3',
 'Itgb3bp',
 'Zfp709',
 'Nfxl1',
 'Asxl2',
 'Emx2',
 'Tfap2c',
 'Gm3854',
 'Cxxc1',
 'Insr',
 'Pou3f2',
 'Spib',
 'Tbx3',
 'Rax',
 'Taf1',
 'Csrnp2',
 'Zfp319',
 'Mnx1',
 'Zfp110',
 'Foxe3',
 'Trp73',
 'Mta1',
 'Arid3a',
 'Tmpo',
 'Ppara',
 'Pcgf6',
 'Zfp59',
 'Shh',
 'Trp53inp1',
 'Foxp1',
 'Plag1',
 'Gsc',
 'Sbno2',
 'Zfp280b',
 'Pou5f1',
 'Ets1',
 'Myog',
 'Cds1',
 'Tcf7',
 'Dlx1',
 'Barx2',
 'Stat6',
 'Zfp280c',
 'Prdm15',
 'Zgpat',
 'Pitx2',
 'Sox5',
 'Foxb1',
 'Ntn3',
 'Bax',
 'Wiz',
 'Emx1',
 'Dmrtc2',
 'Lpin1',
 'Znfx1',
 'Mterf1a',
 'Zfp82',
 'Gm14325',
 'Mga',
 'Creb3l2',
 'Lhx6',
 'Eaf2',
 'Nkx6-3',
 'Lyl1',
 'Mbtd1',
 'Neurod6',
 'Atxn1l',
 'Dlx2',
 'E2f7',
 'Spic',
 'Hif3a',
 'Prdm13',
 'Bhlha15',
 'Hinfp',
 'Ajuba',
 'Z

In [9]:
# 1. load input data
# 2. only keep genes of interest to conserve memory
#
# The raw file (rows = genes, columns = cells) is streamed in chunks of 5000
# rows, and each chunk is filtered before concatenation, so the full count
# matrix is never held in memory at once.
#
# Index.isin builds the row mask in one vectorized call. The previous
# per-row Python lambda scanned the whole gene list for every row.
data_load = pd.read_csv(RAW_DATA_FILE, index_col=0, chunksize=5000)
input_data = pd.concat(
    [chunk.loc[chunk.index.isin(input_genes)] for chunk in data_load]
)

In [10]:
# sanity check: preview the first rows of the selected raw counts
# (rows are genes, columns are cells)
input_data.head()

Unnamed: 0,F2S4_150422_002_A01,F2S4_150422_002_B01,F2S4_150422_002_C01,F2S4_150422_002_D01,F2S4_150422_002_E01,F2S4_150422_002_F01,F2S4_150422_002_G01,F2S4_150422_002_H01,F2S4_150427_001_A01,F2S4_150427_001_B01,...,F1S4_180124_316_G01,F1S4_180124_316_H01,F1S4_180124_317_A01,F1S4_180124_317_B01,F1S4_180124_317_C01,F1S4_180124_317_D01,F1S4_180124_317_E01,F1S4_180124_317_F01,F1S4_180124_317_G01,F1S4_180124_317_H01
2210018M11Rik,101,25,118,49,104,97,61,191,57,156,...,48,56,67,55,3,57,34,64,35,20
2610008E11Rik,86,151,6,0,14,8,62,176,64,6,...,35,0,67,18,0,0,16,0,19,11
2810021J22Rik,32,13,0,30,0,2,0,15,2,1,...,1,0,48,2,0,0,0,63,0,0
9130023H24Rik,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
9430076C15Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# the gene list is only needed for the row selection above; release it
del input_genes

In [12]:
# 3. normalize input data
# Divide every cell's column by that cell's total expression. The totals
# Series is aligned to the frame's column labels (cell IDs).
input_data_norm = input_data.div(sums['0'], axis='columns')

In [13]:
# sanity check: preview the first rows of the normalized input data
input_data_norm.head()

Unnamed: 0,F2S4_150422_002_A01,F2S4_150422_002_B01,F2S4_150422_002_C01,F2S4_150422_002_D01,F2S4_150422_002_E01,F2S4_150422_002_F01,F2S4_150422_002_G01,F2S4_150422_002_H01,F2S4_150427_001_A01,F2S4_150427_001_B01,...,F1S4_180124_316_G01,F1S4_180124_316_H01,F1S4_180124_317_A01,F1S4_180124_317_B01,F1S4_180124_317_C01,F1S4_180124_317_D01,F1S4_180124_317_E01,F1S4_180124_317_F01,F1S4_180124_317_G01,F1S4_180124_317_H01
2210018M11Rik,8.4e-05,2e-05,0.000105,5.6e-05,9.3e-05,8.2e-05,6e-05,0.000164,5.2e-05,8.036018e-05,...,6.6e-05,5.7e-05,6.5e-05,4.7e-05,5e-06,5.6e-05,4.7e-05,6.5e-05,3.39883e-05,2.1e-05
2610008E11Rik,7.1e-05,0.00012,5e-06,0.0,1.3e-05,7e-06,6.1e-05,0.000151,5.8e-05,3.090776e-06,...,4.8e-05,0.0,6.5e-05,1.5e-05,0.0,0.0,2.2e-05,0.0,1.845079e-05,1.1e-05
2810021J22Rik,2.7e-05,1e-05,0.0,3.4e-05,0.0,2e-06,0.0,1.3e-05,2e-06,5.151293e-07,...,1e-06,0.0,4.6e-05,2e-06,0.0,0.0,0.0,6.4e-05,0.0,0.0
9130023H24Rik,0.0,0.0,0.0,0.0,2e-06,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1e-06,9.710944e-07,0.0
9430076C15Rik,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Flip the matrix so that each row is one example (cell) and each column is
# one feature (gene), which is closer to the layout a model expects.
input_data_norm = input_data_norm.T

In [15]:
# 4. input test/train split
# Rows are cells after the transpose, so select them by cell ID.
# Index.isin builds each boolean row mask in one vectorized call, instead of
# scanning the whole cell list once per row, and keeps the original row order.
input_train = input_data_norm.loc[input_data_norm.index.isin(train_cells)]
input_test = input_data_norm.loc[input_data_norm.index.isin(test_cells)]

In [16]:
# sanity check: preview the first rows of the input test split
# (rows are cells, columns are genes)
input_test.head()

Unnamed: 0,2210018M11Rik,2610008E11Rik,2810021J22Rik,9130023H24Rik,9430076C15Rik,A430033K04Rik,A530054K11Rik,AA987161,AI987944,AU041133,...,Zscan12,Zscan2,Zscan20,Zscan21,Zscan22,Zscan26,Zscan4c,Zscan4d,Zscan4f,Zxdc
F2S4_150422_002_B01,2e-05,0.00012,1e-05,0.0,0.0,7.937377e-07,0.0,4e-06,0.0,9.524853e-06,...,7.937377e-07,0.0,0.0,1.3e-05,0.0,6e-06,0.0,0.0,0.0,2e-06
F2S4_150422_002_D01,5.6e-05,0.0,3.4e-05,0.0,0.0,0.0,1.4e-05,0.0,6e-05,9.092211e-06,...,6.591853e-05,0.0,0.0,8.1e-05,0.0,2.4e-05,0.0,0.0,0.0,1e-06
F2S4_150422_002_H01,0.000164,0.000151,1.3e-05,0.0,0.0,4.289577e-05,0.0,3.3e-05,5.8e-05,8.579155e-07,...,0.0,0.0,0.0,5e-06,4.8e-05,4e-06,0.0,0.0,0.0,0.0
F2S4_150428_001_A01,0.000128,2.7e-05,0.0,0.0,0.0,1.081749e-06,0.0,0.0,0.0,0.0,...,3.028898e-05,0.0,0.0,9e-05,5e-05,4.3e-05,0.0,0.0,0.0,0.0
F2S4_150430_003_E01,5.7e-05,1.6e-05,1e-06,0.0,0.0,1.457974e-05,6.7e-05,0.0,1e-05,0.0,...,1.943965e-05,0.0,0.0,1.5e-05,1.5e-05,0.0,0.0,0.0,0.0,5e-06


In [17]:
# 5. save the input train/test dataframes as new .csv files so later runs can
#    load the processed splits directly instead of repeating the steps above
input_train.to_csv(SAVE_INPUT_TRAIN)
input_test.to_csv(SAVE_INPUT_TEST)

In [18]:
# Release every input-side dataframe before the output data is loaded.
# input_data_norm (the full normalized matrix) is included: it is not used
# again, and leaving it bound would keep it in memory during the output pass.
del input_train, input_test, input_data, input_data_norm

# Load output data

In [19]:
# Read the output gene names (one per line).
# str.strip removes the '\n' at the end of each line.
with open(OUTPUT_GENE_FILE) as gene_file:
    output_genes = list(map(str.strip, gene_file))

In [20]:
# sanity check: display the output gene names read from OUTPUT_GENE_FILE
output_genes

['App',
 'Apoe',
 'Gusb',
 'Lamp5',
 'Mbp',
 'Pvalb',
 'S100b',
 'Slc30a3',
 'Snca',
 'Mapt']

In [None]:
# 1. load output data
# 2. only keep genes of interest to conserve memory
#
# The raw file is streamed in chunks of 5000 rows, and each chunk is filtered
# before concatenation, so the full count matrix is never held in memory at
# once.
#
# Index.isin builds the row mask in one vectorized call. The previous
# per-row Python lambda scanned the whole gene list for every row.
data_load = pd.read_csv(RAW_DATA_FILE, index_col=0, chunksize=5000)
output_data = pd.concat(
    [chunk.loc[chunk.index.isin(output_genes)] for chunk in data_load]
)

In [None]:
# sanity check: preview the first rows of the selected output-gene counts
output_data.head()

In [None]:
# the gene list is only needed for the row selection above; release it
del output_genes

In [None]:
# 3. normalize output data
# Divide every cell's column by that cell's total expression. The totals
# Series is aligned to the frame's column labels.
output_data_norm = output_data.div(sums['0'], axis='columns')

In [None]:
# sanity check: preview the first rows of the normalized output data
output_data_norm.head()

In [None]:
# Flip the matrix so that each row is one example (cell) and each column is
# one feature (gene), which is closer to the layout a model expects.
output_data_norm = output_data_norm.T

In [None]:
# 4. output test/train split
# Rows are cells after the transpose, so select them by cell ID.
# Index.isin builds each boolean row mask in one vectorized call, instead of
# scanning the whole cell list once per row, and keeps the original row order.
output_train = output_data_norm.loc[output_data_norm.index.isin(train_cells)]
output_test = output_data_norm.loc[output_data_norm.index.isin(test_cells)]

In [None]:
# sanity check: preview the first rows of the output test split
output_test.head()

In [None]:
# 5. save the output train/test dataframes as new .csv files so later runs
#    can load the processed splits directly instead of repeating the steps above
output_train.to_csv(SAVE_OUTPUT_TRAIN)
output_test.to_csv(SAVE_OUTPUT_TEST)

In [None]:
# Release every output-side dataframe now that the splits are saved.
# output_data_norm (the full normalized matrix) is included: it is not used
# again, and leaving it bound would keep it in memory.
del output_train, output_test, output_data, output_data_norm