In [1]:
"""
quick notebook to subset allen data ISH data for URP course 2019

Shaina Lu
June 2019"""

'\nquick notebook to subset allen data ISH data for URP course 2019\n\nShaina Lu\nJune 2019'

In [32]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy as sp
from scipy import stats
from sklearn.linear_model import LinearRegression

In [3]:
#input locations (absolute paths; adjust before running on another machine)
#ALLEN_FILT_PATH is opened with pd.read_hdf, ONTOLOGY_PATH with pd.read_csv
ALLEN_FILT_PATH = "/data/slu/allen_adult_mouse_ISH/allen_adultmouse_filt_v5_avgdup.h5"
ONTOLOGY_PATH = "/data/slu/allen_adult_mouse_ISH/ontologyABA.csv"

In [4]:
#load the four tables stored in the HDF5 file, read-only, one per key (in this order)
metabrain, voxbrain, propontvox, geneIDName = (
    pd.read_hdf(ALLEN_FILT_PATH, key=hdf_key, mode='r')
    for hdf_key in ('metabrain', 'avgvoxbrain', 'propontology', 'geneIDName')
)

In [11]:
#read in ontology file, drop the columns at positions 5 and 6, then replace every missing value with -1
ontology = (
    pd.read_csv(ONTOLOGY_PATH)
    .pipe(lambda table: table.drop([table.columns[5], table.columns[6]], axis=1))
    .fillna(-1)  #make root's parent -1 (note: fills NaN in all remaining columns, not only the parent id)
)

In [8]:
def splitdata(data, testratio, seed=42):
    """Shuffle the rows of `data` and split them into a train part and a test part.

    The shuffle uses a private, seeded random generator, so every call with the
    same `seed` on inputs of the same length selects the same row positions.
    This keeps the train/test membership stable between runs (so the ML algorithm
    never sees the whole dataset) and lets several equally long tables be split
    in the same way. The global numpy random state is not modified.

    Parameters
    ----------
    data : object supporting len() and .iloc (e.g. pandas DataFrame or Series)
        Rows to split.
    testratio : float
        Fraction of rows, between 0 and 1, to place in the test part. The test
        size is int(len(data) * testratio), i.e. rounded down.
    seed : int, optional
        Seed for the shuffle. The default, 42, reproduces the split obtained
        with np.random.seed(42) followed by np.random.permutation.

    Returns
    -------
    (train, test) : tuple
        Two positional selections of `data` in shuffled order; train comes first.

    Raises
    ------
    ValueError
        If `testratio` is not within [0, 1].
    """
    if not 0 <= testratio <= 1:
        raise ValueError("testratio must be between 0 and 1, got %r" % (testratio,))
    #private generator instead of np.random.seed: same permutation, but no hidden reset of the global RNG
    rng = np.random.RandomState(seed)
    shuffindices = rng.permutation(len(data))
    testsize = int(len(data) * testratio)
    testindices = shuffindices[:testsize]
    trainindices = shuffindices[testsize:]
    return data.iloc[trainindices], data.iloc[testindices]

In [9]:
#since the function is seeded, all these will be split in the same way
#(the shuffle depends only on the row count, so this holds when the three tables have the same length -- TODO confirm)
voxtrain, voxtest = splitdata(voxbrain, 0.5)  #hold aside 50% data as test set
metatrain, metatest = splitdata(metabrain, 0.5)
ontvoxtrain, ontvoxtest = splitdata(propontvox, 0.5)

In [17]:
#area of interest; kept as a string because it is used as a column label of ontvoxtrain/ontvoxtest,
#and converted to int only for the comparison against ontology.id
currarea = "549" #thalamus
#acronym of the matching ontology row; .item() raises unless exactly one row matches
curracro = ontology.loc[ontology.id == int(currarea), "acronym"].item()

In [19]:
#labels for the area of interest: the currarea column of the train and test halves of propontvox
ytrain = ontvoxtrain[currarea]
ytest = ontvoxtest[currarea]

In [20]:
#get differentially expressed genes

In [21]:
#iterating a DataFrame yields its column labels; each one is tested as a gene in the loops below
cols = list(voxtrain)

In [22]:
#one-sided Mann-Whitney, Ha: areaofinterest < not areaofinterest
#for every column of voxtrain, compare rows labelled 1 for currarea against rows labelled 0
u2 = []       #U statistic of each gene that could be tested
pvals2 = []   #matching p-values
genes2 = []   #matching column labels
errors2 = []  #column labels for which the test raised ValueError
#the row masks do not depend on the gene, so build them once instead of on every iteration
inarea2 = ontvoxtrain[currarea] == 1
outarea2 = ontvoxtrain[currarea] == 0
for gene in cols:
    try:
        curr_u, curr_pval = sp.stats.mannwhitneyu(
            voxtrain.loc[inarea2, gene],
            voxtrain.loc[outarea2, gene],
            alternative='less')
    except ValueError:   #some genes raise the error that "all numbers are identical in mwu"
        #only this expected failure is skipped; anything else (e.g. KeyboardInterrupt) propagates
        errors2.append(gene)
    else:
        u2.append(curr_u)
        pvals2.append(curr_pval)
        genes2.append(gene)

In [23]:
#one-sided Mann-Whitney, Ha: areaofinterest > not areaofinterest
#for every column of voxtrain, compare rows labelled 1 for currarea against rows labelled 0
u3 = []       #U statistic of each gene that could be tested
pvals3 = []   #matching p-values
genes3 = []   #matching column labels
errors3 = []  #column labels for which the test raised ValueError
#the row masks do not depend on the gene, so build them once instead of on every iteration
inarea3 = ontvoxtrain[currarea] == 1
outarea3 = ontvoxtrain[currarea] == 0
for gene in cols:
    try:
        curr_u, curr_pval = sp.stats.mannwhitneyu(
            voxtrain.loc[inarea3, gene],
            voxtrain.loc[outarea3, gene],
            alternative='greater')
    except ValueError:   #some genes raise the error that "all numbers are identical in mwu"
        #only this expected failure is skipped; anything else (e.g. KeyboardInterrupt) propagates
        errors3.append(gene)
    else:
        u3.append(curr_u)
        pvals3.append(curr_pval)
        genes3.append(gene)

In [24]:
#sort both lists of genes by pvals
#(ascending; the (pval, gene) tuples compare on p-value first and on the gene label for ties)
s2 = list(zip(pvals2, genes2))
s2.sort()
pvals2_sort, genes_sortbypvals2 = (list(column) for column in zip(*s2))
s3 = list(zip(pvals3, genes3))
s3.sort()
pvals3_sort, genes_sortbypvals3 = (list(column) for column in zip(*s3))

In [25]:
#get top siggenes: the numgenes genes with the smallest p-values in the 'greater' ranking
#(the 'less' ranking, genes_sortbypvals2, is not included)
numgenes = 100
topDE = genes_sortbypvals3[:numgenes]

In [26]:
#get dataframes of only the topDE genes
#(all rows of the test and train halves of voxbrain, columns restricted to topDE in ranked order)
topDEtest = voxtest.loc[:,topDE]
topDEtrain = voxtrain.loc[:,topDE]

In [27]:
#same column subset applied to the full, unsplit voxbrain table
topDEbrain = voxbrain.loc[:, topDE]

In [29]:
#sanity check: (rows, columns) of the full table after keeping only the topDE columns
topDEbrain.shape

(62527, 100)

In [30]:
#sanity check: (rows, columns) of the training half after keeping only the topDE columns
topDEtrain.shape

(31264, 100)

In [38]:
#fit a linear regression with default settings: topDE gene columns as features, ytrain as target
reg = LinearRegression().fit(topDEtrain, ytrain)

In [39]:
#R^2 of the fit on the same training data it was fitted to; the held-out topDEtest/ytest are not scored here
reg.score(topDEtrain,ytrain)

0.7630913510862172

In [42]:
#export the training features (topDE columns of the training half) as CSV; hard-coded absolute output path
topDEtrain.to_csv("/home/slu/urpcourse19/data/lecture7/ABAISHsubset.csv")

In [43]:
#export the matching training labels (currarea column) as CSV; hard-coded absolute output path
ytrain.to_csv("/home/slu/urpcourse19/data/lecture7/ABAISHsubset_labels.csv")