# Data Preprocessing

In [2]:
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import gzip
import cPickle as cpkl

In [3]:
# Parse the gzipped, tab-separated UMI table.
# The first line holds the quoted column labels; every following line holds a
# quoted gene label followed by integer counts.
genes = []
counts = []
with gzip.open('data/singlecelldata_HSCi_umitab.txt.gz','r') as f:
    # Header line: strip the surrounding whitespace, then the quotes around each label.
    header = next(f)
    cells = [name.replace('\"','') for name in header.strip().split('\t')]
    for line in f:
        fields = line.strip().split('\t')
        # First field is the gene label, the remaining fields are its counts.
        genes.append(fields[0].replace('\"',''))
        counts.append([int(value) for value in fields[1:]])

# Marker genes grouped by lineage. A gene may appear under more than one
# lineage (e.g. 'Mpl' is listed for both 'Erythroid_Meg' and 'HSC').
diff_genes = {
    'Erythroid_Meg': [
        'Klf1', 'Gata1', 'Mpl', 'Epor', 'Vwf', 'Zfpm1', 'Fhl1', 'Gpr64',
        'Sdpr', 'Gypa', 'Tfrc', 'Hbb-b1', 'Hbb-y',
    ],
    'Myeloid': [
        'Gfi1', 'Sfpi1', 'Cebpb', 'Cebpa', 'Mpo', 'Csf2rb', 'Csf1r', 'Gfi1b',
        'Hk3', 'Csf2ra', 'Csf3r', 'Sp1', 'Fcgr3', 'Fcgr4', 'Cxcr1',
    ],
    'Lymphoid': [
        'Tcf3', 'Ikzf1', 'Notch1', 'Flt3', 'Il7r', 'Ebf1', 'E2a', 'Pax5',
        'Dntt', 'Btg2', 'Tcf7', 'Rag1', 'Rag2', 'Ccr7', 'Ptprc', 'Ly6a',
        'Ly6d', 'Cd3', 'Ikzf3', 'Cd52', 'Blnk',
    ],
    'HSC': [
        'Slamf1', 'Itga2b', 'Kit', 'Ly6a', 'Bmi1', 'Gata2', 'Hlf', 'Meis1',
        'Mpl', 'Mcl1', 'Gfi1', 'Gfi1b', 'Hoxb5',
    ],
}

# Union of all marker genes (despite the name, this is a set, not a list).
# The lineages are visited in a fixed order so the set is populated in the
# same sequence on every run.
diff_genes_list = set(
    gene
    for lineage in ('Erythroid_Meg', 'Myeloid', 'Lymphoid', 'HSC')
    for gene in diff_genes[lineage]
)

# Stack the per-gene count rows into a single array: axis 0 runs parallel to
# `genes`, axis 1 follows the count columns of the input table.
# NOTE(review): assumes every row has the same number of fields, otherwise this
# is not a regular 2-D integer array — confirm against the data file.
x = np.asarray(counts)


In [4]:
# Column labels of the count table, taken from its header line with the quotes
# removed. Only the first line is read (instead of decompressing the whole file
# with readlines()), and the handle is closed by the `with` block rather than
# being left open.
with gzip.open('data/singlecelldata_HSCi_umitab.txt.gz','r') as f:
    cell_names = f.readline().strip().replace("\"", "").split('\t')

In [5]:
# Report the size of the parsed table: number of gene rows, and number of
# count columns in the first row.
print("no. of genes: %i." % len(genes))
print("no. of rna sequences: %i." % len(counts[0]))

no. of genes: 27297.
no. of rna sequences: 1536.


In [6]:
# Locate each marker gene in the count table.
#
# A marker is considered found when a table row whose label (the text before
# the first ';', whitespace-stripped) equals the marker name has a non-zero
# total count. Markers that are absent *or* have a zero total are reported as
# "not found".
#
# The original scan ended its match branch with `continue`, which is a no-op at
# the end of a loop body; a label occurring on several rows would therefore have
# been appended several times. Only the first matching row is used here, which
# is what an early exit from the scan would have produced.

# Total count per row of x (one entry per element of `genes`).
xcounts = x.sum(axis=1)

# Single pass over the table: symbol -> index of its first row with counts > 0.
expressed_row = {}
for i, g in enumerate(genes):
    g = g.split(';')[0].strip()
    if xcounts[i] > 0 and g not in expressed_row:
        expressed_row[g] = i

diff_genes_id = []
diff_genes_found = []
# NOTE: diff_genes_list is a set, so the order of the selected genes (and hence
# of the columns built from them later) follows the set's iteration order.
for gd in diff_genes_list:
    if gd in expressed_row:
        diff_genes_id.append(expressed_row[gd])
        diff_genes_found.append(gd)
    else:
        print("gene %s not found." % gd)

diff_genes_id = np.asarray(diff_genes_id,dtype='int64')

gene Il7r not found.
gene Ly6d not found.
gene Rag2 not found.
gene E2a not found.
gene Cd3 not found.
gene Pax5 not found.
gene Cxcr1 not found.
gene Ccr7 not found.
gene Cebpb not found.
gene Ikzf3 not found.


In [7]:
# Read the cluster assignment file: one comma-separated record per line, with
# the cell name in the first field and an integer cluster label in the second.
with open('data/initial_cluster_assignments_MAP.csv','r') as f:
    records = [line.split(',') for line in f]

# Parallel lists: cells_cluster[k] is assigned to cluster[k].
cells_cluster = [record[0] for record in records]
cluster = [int(record[1]) for record in records]

In [8]:
# Align the cluster assignments with the columns of the count table.
#
# idxs            -- positions in cell_names that have a cluster assignment
# cluster_ordered -- the cluster label of each of those cells, in the same order
#
# Cells without an assignment are skipped silently. When a name occurs more
# than once in the cluster file, its first occurrence is used.

# Name -> first position in the cluster file. Built once, so each lookup below
# is O(1) instead of re-scanning (and re-converting) cells_cluster per cell.
first_cluster_pos = {}
for pos, name in enumerate(cells_cluster):
    first_cluster_pos.setdefault(name, pos)

idxs = []
cluster_ordered = []
for i, cn in enumerate(cell_names):
    if cn in first_cluster_pos:
        idxs.append(i)
        cluster_ordered.append(cluster[first_cluster_pos[cn]])
idxs = np.asarray(idxs, dtype='int64')

In [10]:
# Build the training matrix: keep only the marker-gene rows and the cells that
# have a cluster assignment, then transpose so that rows are cells and columns
# are genes.
x_train = x[diff_genes_id, :]
x_train = x_train[:, idxs].T
# Shift the cluster labels down by one.
# NOTE(review): presumably the labels in the file are 1-based — confirm.
t_train = np.array(cluster_ordered, dtype='int64') - 1

# NOTE: this rebinds `genes` from the full table labels to the selected marker
# genes. Re-running the gene lookup after this point would index the wrong list.
genes = diff_genes_found
genes_id = diff_genes_id

print(x_train.shape)
print(t_train.shape)
print(len(genes))
print(len(genes_id))


# HIGHEST_PROTOCOL is a binary pickle format, so the file must be opened in
# binary mode ('wb'); text mode can corrupt the stream on platforms that
# translate newlines.
with open('preprocessed_data.cpkl','wb') as f:
    cpkl.dump({'x_train':x_train,'t_train':t_train,'genes':genes,'genes_id':genes_id},f,protocol=cpkl.HIGHEST_PROTOCOL)

(1430, 48)
(1430,)
48
48


In [15]:
# Expand the count matrix into two parallel token lists: a count of c at
# (row n, column v) contributes c entries of v+1 to `words` and c entries of
# n+1 to `sampleid` (both ids are 1-based).
words = []
sampleid = []
N,V = x_train.shape
print("%d %d" % (N, V))
for sample_no, row in enumerate(x_train, 1):
    for word_no, cw in enumerate(row, 1):
        if cw > 0:
            words.extend([word_no] * cw)
            sampleid.extend([sample_no] * cw)
            

1430 48


In [16]:
# Save the preprocessed data together with the token lists. This overwrites the
# file of the same name written earlier in the notebook.
# HIGHEST_PROTOCOL is a binary pickle format, so the file is opened in binary
# mode ('wb') rather than text mode.
with open('preprocessed_data.cpkl','wb') as f:
    cpkl.dump({'x_train':x_train,'t_train':t_train,'genes':genes,'genes_id':genes_id, 'words':words,'sampleid':sampleid},f,protocol=cpkl.HIGHEST_PROTOCOL)

In [3]:
# Summarise the sampler output for the two priors and store the summaries.
#
# Pickle streams are binary data: the files are opened with 'rb' / 'wb' instead
# of text mode, which can corrupt the stream on platforms that translate
# newlines.
# NOTE: unpickling can execute arbitrary code; only load files produced by a
# trusted run of the sampler.
with open('samples_uniform_prior.cpkl','rb') as f:
    samples_uni = cpkl.load(f)['samples']

with open('samples_infor_prior.cpkl','rb') as f:
    samples_infor = cpkl.load(f)['samples']

# Average 'theta' and 'phi' over their first axis.
# NOTE(review): presumably the first axis indexes the posterior draws — confirm.
theta_uni = samples_uni['theta'].mean(axis=0)
phi_uni = samples_uni['phi'].mean(axis=0)

theta_infor = samples_infor['theta'].mean(axis=0)
phi_infor = samples_infor['phi'].mean(axis=0)


with open('stan_results.cpkl','wb') as f:
    cpkl.dump({'theta_uni':theta_uni,'phi_uni':phi_uni,'theta_infor':theta_infor,'phi_infor':phi_infor},f,protocol=cpkl.HIGHEST_PROTOCOL)

