This data loader:
1. Takes in raw gene expression counts
2. Selects only genes relevant to our project
3. Normalizes gene expression levels to the total expression of each cell
4. Saves the processed data into separate .csv files.

Input and output data are loaded and processed separately to avoid running out of memory; this notebook handles the output genes.

In [1]:
import pandas as pd

In [3]:
# File locations (CHANGE THESE AS NEEDED).
# Every path below is built by prefixing HOME, so HOME must end with '/'.
HOME = '/home/jupyter-dylan/'

# text file listing the output gene names, one per line
OUTPUT_GENE_FILE = f'{HOME}gene labels/output_genes-2.txt'

# raw expression counts read by the cells below
RAW_DATA_FILE = f'{HOME}data/GSE115746_cells_exon_counts.csv'

# SUMS_FILE = '{}gene labels/cell_expr_sums.csv'.format(HOME)

# destination of the processed output table
SAVE_OUTPUT = f'{HOME}new_data/output-2.csv'

In [4]:
# Total (summed) expression per column of the raw file; used later as the
# normalization denominator. The file is streamed in 10000-row chunks, each
# chunk is collapsed to its column sums, and the partial sums are added up.

from functools import reduce
data_load = pd.read_csv(RAW_DATA_FILE, index_col=0, chunksize=10000)
sums_list = list(map(lambda part: part.sum(axis=0), data_load))
sums = reduce(lambda running, part: running.add(part), sums_list)
# sums.to_csv(SUMS_FILE)

# sums = pd.read_csv(SUMS_FILE)
# sums = pd.read_csv(SUMS_FILE, header=0, index_col=0)

In [14]:
# sanity check: display the accumulated per-column sums
sums

F2S4_150422_002_A01    1204630
F2S4_150422_002_B01    1259862
F2S4_150422_002_C01    1128827
F2S4_150422_002_D01     879874
F2S4_150422_002_E01    1117094
                        ...   
F1S4_180124_317_D01    1022536
F1S4_180124_317_E01     724679
F1S4_180124_317_F01     980851
F1S4_180124_317_G01    1029766
F1S4_180124_317_H01     965509
Length: 23178, dtype: int64

# Load output data

In [8]:
# read the output gene names, one entry per line of OUTPUT_GENE_FILE
with open(OUTPUT_GENE_FILE) as f:
    # str.strip removes the trailing '\n' (and any surrounding whitespace)
    output_genes = list(map(str.strip, f))

In [9]:
# sanity check: number of gene names read from OUTPUT_GENE_FILE
len(output_genes)

500

In [10]:
# 1. load output data
# 2. only keep genes of interest from each chunk to conserve memory
#
# The raw file is streamed 5000 rows at a time; the row index (first CSV
# column) holds the gene names. Index.isin builds the row mask with a
# vectorized, hash-based membership test instead of calling a Python lambda
# per row that scans the `output_genes` list linearly.
data_load = pd.read_csv(RAW_DATA_FILE, index_col=0, chunksize=5000)
output_data = pd.concat([chunk.loc[chunk.index.isin(output_genes)] for chunk in data_load])

In [11]:
# sanity check: preview the first rows of the filtered (not yet normalized) data
output_data.head()

Unnamed: 0,F2S4_150422_002_A01,F2S4_150422_002_B01,F2S4_150422_002_C01,F2S4_150422_002_D01,F2S4_150422_002_E01,F2S4_150422_002_F01,F2S4_150422_002_G01,F2S4_150422_002_H01,F2S4_150427_001_A01,F2S4_150427_001_B01,...,F1S4_180124_316_G01,F1S4_180124_316_H01,F1S4_180124_317_A01,F1S4_180124_317_B01,F1S4_180124_317_C01,F1S4_180124_317_D01,F1S4_180124_317_E01,F1S4_180124_317_F01,F1S4_180124_317_G01,F1S4_180124_317_H01
1700001J03Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2510039O18Rik,5,0,0,0,0,8,0,1,0,0,...,0,0,0,34,13,0,0,0,5,21
2810408A11Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,17
3110021N24Rik,3,2,10,0,1,0,5,3,0,3,...,14,4,13,0,4,4,11,4,9,2
4933412E24Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# the gene list is not used again after the rows have been filtered
del output_genes

In [15]:
# 3. normalize output data
# `sums` is aligned against the column labels, so every column is divided by
# its own total.
output_data_norm = output_data.div(other=sums, axis='columns')

In [16]:
# sanity check: preview the first rows after dividing by the per-column sums
output_data_norm.head()

Unnamed: 0,F2S4_150422_002_A01,F2S4_150422_002_B01,F2S4_150422_002_C01,F2S4_150422_002_D01,F2S4_150422_002_E01,F2S4_150422_002_F01,F2S4_150422_002_G01,F2S4_150422_002_H01,F2S4_150427_001_A01,F2S4_150427_001_B01,...,F1S4_180124_316_G01,F1S4_180124_316_H01,F1S4_180124_317_A01,F1S4_180124_317_B01,F1S4_180124_317_C01,F1S4_180124_317_D01,F1S4_180124_317_E01,F1S4_180124_317_F01,F1S4_180124_317_G01,F1S4_180124_317_H01
1700001J03Rik,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2510039O18Rik,4e-06,0.0,0.0,0.0,0.0,7e-06,0.0,8.579155e-07,0.0,0.0,...,0.0,0.0,0.0,2.9e-05,2.1e-05,0.0,0.0,0.0,5e-06,2.2e-05
2810408A11Rik,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.8e-05
3110021N24Rik,2e-06,2e-06,9e-06,0.0,8.951798e-07,0.0,5e-06,2.573746e-06,0.0,2e-06,...,1.9e-05,4e-06,1.3e-05,0.0,6e-06,4e-06,1.5e-05,4e-06,9e-06,2e-06
4933412E24Rik,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Flip the table so that each row is an example and each column is a
# feature (gene); this layout is (more) ready to feed into a model.
output_data_norm = output_data_norm.T

In [19]:
# 5. save the normalized, transposed output dataframe as a new .csv file
#    (SAVE_OUTPUT) to save time when loading
output_data_norm.to_csv(SAVE_OUTPUT)

In [None]:
# Free the raw (un-normalized) frame now that the normalized copy is saved.
# `output_train` / `output_test` are never defined in the visible notebook;
# listing them in the `del` raised NameError before `output_data` was freed.
del output_data