In [None]:
%matplotlib inline
import os
import math 
import numpy as np
import pandas as pd 
import seaborn as sns
import random
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from scipy.stats import pearsonr, spearmanr, gaussian_kde
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import scale, StandardScaler
from statsmodels.sandbox.stats.multicomp import fdrcorrection0
from matplotlib.colors import rgb2hex
import warnings
# ---- Plotting and display defaults -------------------------------------
sns.set_style('white')
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_columns', 999)
# Every warning category is silenced from this point on.
warnings.filterwarnings('ignore')

# ---- Input / output locations (absolute paths, machine specific) --------
ahba_dir     = '/Users/kanaaax/Google Drive/TS-EUROTRAIN/RESULTS_QSMv3/dataframes/AHBA/'
gsea_dir     = '/Users/kanaaax/Desktop/GSEA'
permute_dir  = '/Users/kanaaax/Google Drive/TS-EUROTRAIN/RESULTS_QSMv3/dataframes/permutations'
save_fig_dir = '/Users/kanaaax/Google Drive/TS-EUROTRAIN/Papers/2016_QSM_paper/Figures_python_v4'

# ---- AHBA wells lying outside the striatal masks ------------------------
# All candidate ids are commented out, so the list is empty and no well
# is excluded downstream.
wells = [
    # 2371,       # AHBA claustrum, right
    # 2379,       # AHBA claustrum, left
    # 159226045,  # AHBA striatum -- out of mask
    # 160091500   # AHBA septal nuclei, left
]

# ---- Housekeeping labels ------------------------------------------------
# Not referenced by any cell visible in this part of the notebook.
drop_strings = [
    'coords_native', 'donor_names', 'struct_id', 'struct_name', 'top_struct',
    'Mean', 'Median', 'PC1', 'PC2', 'PC3',
]


In [None]:
# Load the PLS gene-weight tables (no header row, so columns are labelled 0, 1, 2, ...).
pls1 = pd.read_csv('/Users/kanaaax/Desktop/PLS1_geneWeights2.csv', header=None)
pls2 = pd.read_csv('/Users/kanaaax/Desktop/PLS2_geneWeights2.csv', header=None)

# Plot the score column only. Column 2 is the value the later selection cell
# compares against percentile cut-offs, while column 0 is used as an identifier;
# passing the whole frame would mix identifiers into the histogram.
# NOTE(review): assumes column 2 holds the per-gene PLS score -- confirm against the CSV.
boot = pls1[2]

g = sns.distplot(boot, color='b', hist=1, bins=30)

# 5th / 95th percentiles of the plotted scores. These definitions were commented
# out while the print below still used them, which raised a NameError.
q1 = np.percentile(boot, 5)
q2 = np.percentile(boot, 95)
#plt.axvline(q1, color='k', linestyle='-.', linewidth=1)
#plt.axvline(q2, color='k', linestyle='-.', linewidth=1)

# Single-argument print: same output under Python 2 and Python 3.
print('%s %s' % (q1, q2))



PLS analysis is based on extracting the common information between the two data blocks by finding a correlation matrix and linear combinations of variables in both data blocks that have maximum covariance with one another. 

PLS analysis first calculates a mean-centered matrix using matrices X and Y. Then, singular value decomposition (SVD) is applied on the mean-centered matrix. The outcome of PLS analysis is a set of latent variables that are in fact linear combinations of initial variables of the two data blocks that maximally covary with the corresponding contrasts. More specifically, each latent variable consists of a set of singular values that describe the effect size, as well as a set of singular vectors, or weights, that define the contribution of each initial variable to the latent variables.

Finally, the statistical significance of a latent variable is defined by a p-value calculated from a permutation test. In addition, bootstrapping is used to assess the reliability of each original variable (e.g. a source at a time point) that contributes to the latent variable. For this purpose, a bootstrap ratio is calculated for each original variable: the ratio of its weight to the standard error of that weight estimated from bootstrapping. Therefore, the larger the magnitude of a bootstrap ratio, the larger the weight (i.e. contribution to the latent variable) and the smaller the standard error (i.e. higher stability) (McIntosh and Lobaugh, 2004, Mišić et al., 2016). A bootstrap ratio can be interpreted as a z-score if the bootstrap distribution is approximately normal (Efron and Tibshirani, 1986).

This section explained PLS analysis in general terms. However, this tutorial assumes that users are already familiar with the basics of PLS analysis. If PLS is new to you, or if you want to read more about PLS and its applications in detail, please refer to the articles listed in the “References” section.







### Cortical patterning of abnormal morphometric similarity in psychosis is associated with brain expression of schizophrenia-related genes. - Proc Natl Acad Sci 2019: 201820754. Morgan SE, Seidlitz J, Whitaker KJ, Romero-Garcia R, Clifton NE, Scarpazza C, et al. 


Transcriptomic Analysis. 

We used the AHBA transcriptomic dataset with gene expression measurements in six post mortem adult brains (36) (human.brain-map.org) ages 24–57 y. Each tissue sample was assigned to an anatomical structure using the AHBA MRI data for each donor (37). Samples were pooled between bilaterally homologous cortical areas. Regional expression levels for each gene were compiled to form a 308 × 20,647 regional transcription matrix (37). Since the AHBA only includes data for the right hemisphere for two subjects, in our analyses relating gene expression to MRI data, we only consider intrahemispheric left hemisphere edges (38). 

We used PLS to relate the regional morphometric similarity case–control differences (t statistics from the 152 cortical regions in the left hemisphere calculated from intrahemispheric edges only) to the post mortem gene expression measurements for all 20,647 genes. PLS uses the gene expression measurements (the predictor variables) to predict the regional morphometric similarity case–control t statistics from all three datasets (the response variables). The first PLS component (PLS1) is the linear combination of the weighted gene expression scores that have a cortical expression map that is most strongly correlated with the map of case–control morphometric similarity differences. The statistical significance of the variance explained by PLS1 was tested by permuting the response variables 1,000 times. The error in estimating each gene’s PLS1 weight was assessed by bootstrapping (resampling with replacement of the 308 cortical regions), and the ratio of the weight of each gene to its bootstrap SE was used to calculate the Z scores and, hence, rank the genes according to their contribution to PLS1 (6). 

We constructed PPI networks from the genes with PLS1 weights Z>3 and Z<−3 (all FDR<0.05) using STRING version 10.5 (14). Our key results were robust to changing these thresholds to Z>4 and Z<−4 (all FDR<0.01) (SI Appendix, section S8.3). We used DAVID (39, 40) to calculate enrichments of KEGG pathways and GO enrichments of biological processes for genes with Z>3 or Z<−3 using a background gene list of 15,745 brain-expressed genes (SI Appendix, section S8.3) (38).

We used a resampling procedure to test for enrichment of PLS-derived gene sets by genes previously associated with schizophrenia by transcriptional data (15). The median rank of each risk gene set in the PLS gene list was compared with the median rank of 10,000 randomly selected brain-expressed gene sets (3).

In [2]:
# ------------------------------------------------------------------
# QSM stat maps: one table read from the QSM_TSTATS folder under
# ahba_dir, with the first CSV column used as the row index.
# ------------------------------------------------------------------
MNI = pd.read_csv(
    os.path.join(ahba_dir, 'QSM_TSTATS/MNI_NIFTI_VALUES_permute_10K_OCT2.csv'),
    index_col=0,
)

In [3]:
# ------------------------------------------------------------------
# AHBA expression values: the re-annotated table, indexed by its
# first CSV column. An earlier variant of this cell read
# 'AHBA_20737.csv' from the same folder instead.
# ------------------------------------------------------------------
AHBA_REANNOT = pd.read_csv(
    os.path.join(ahba_dir, 'ahba_data', 'AHBA_reannot.csv'),
    index_col=0,
)

In [2]:
# Column of MNI holding the QSM t-statistic map analysed in this cell.
nucleus = 'STR3_MOTOR_tstat_CP_1mm'

# QSM t-stat dataframe: excluded wells removed, rows with missing values
# dropped, sorted by well index.
chi = pd.DataFrame(MNI[nucleus].drop(wells, axis=0)).dropna().sort_index()

# AHBA expression dataframe restricted to the wells present in `chi`.
# NOTE(review): the trailing 13 columns of AHBA_REANNOT are treated as
# non-gene columns -- confirm against the CSV layout.
genes = AHBA_REANNOT.columns[:-13]
ahba  = AHBA_REANNOT.loc[AHBA_REANNOT.index.isin(chi.index)].sort_index()
ahba  = ahba.loc[:, ahba.columns.isin(genes)]

# Background subset: keep only the genes listed in the background gene file.
# `isin` builds one hash lookup instead of scanning the numpy array once per
# column, as the previous `i not in background_genes` test did.
background_genes = pd.read_csv(os.path.join(ahba_dir,'background_gene_list.txt'), sep='\t').GeneSymbol.values
AHBA_BACK = ahba.loc[:, ahba.columns.isin(background_genes)]

# Keep wells that have a non-missing value in the selected map. This uses
# `nucleus` rather than a hard-coded column name, so changing `nucleus`
# above can no longer leave this filter pointing at a different map.
AHBA_BACK = AHBA_BACK.loc[AHBA_BACK.index.isin(MNI[nucleus].dropna().index)]


NameError: name 'MNI' is not defined

In [3]:
# Report (rows, columns) of the expression frame and of the QSM t-stat frame.
for frame in (ahba, chi):
    print(frame.shape)

NameError: name 'ahba' is not defined

In [6]:
# Load the PLS gene-weight tables (no header row, so columns are labelled 0, 1, 2, ...).
pls1 = pd.read_csv('/Users/kanaaax/Desktop/PLS1_geneWeights2.csv', header=None)
pls2 = pd.read_csv('/Users/kanaaax/Desktop/PLS2_geneWeights2.csv', header=None)

# Plot the score column only. Column 2 is the value the selection cell below
# compares against percentile cut-offs, while column 0 is used as an identifier;
# passing the whole frame would mix identifiers into the histogram.
# NOTE(review): assumes column 2 holds the per-gene PLS score -- confirm against the CSV.
boot = pls1[2]

# Normalised histogram only: the KDE curve is hidden by giving it zero line width.
g = sns.distplot(boot, color='b', hist=1, bins = 30, norm_hist=1, kde_kws={'linewidth': 0},
                hist_kws={"histtype": "barstacked", "linewidth": 0.3,
                          "alpha": 1, "color": "#FA966B"})

# 5th / 95th percentiles of the plotted scores. These definitions were commented
# out while the print below still used them, which raised a NameError.
q1 = np.percentile(boot, 5)
q2 = np.percentile(boot, 95)
#plt.axvline(q1, color='k', linestyle='-.', linewidth=1)
#plt.axvline(q2, color='k', linestyle='-.', linewidth=1)

# Single-argument print: same output under Python 2 and Python 3.
print('%s %s' % (q1, q2))
#histtype : {'bar', 'barstacked', 'step', 'stepfilled'}, optional


In [None]:
# Select the column-0 identifiers of the PLS1 rows whose column-2 value lies in
# the lower or upper 5% tail of that same column.
#
# Fixes relative to the previous version:
#   * the upper cut-off was taken from pls2 although the rows being filtered
#     come from pls1;
#   * both cut-offs were computed over the whole frame (identifier column
#     included) instead of over the column actually being compared;
#   * the percentiles were recomputed for every row.
# NOTE(review): assumes column 2 is the per-gene score and column 0 the gene
# identifier -- confirm against the CSV.
pls1_scores = pls1[2]
lower_cut = np.percentile(pls1_scores, 5)
upper_cut = np.percentile(pls1_scores, 95)

z5  = pls1.loc[pls1_scores < lower_cut, 0].tolist()
z95 = pls1.loc[pls1_scores > upper_cut, 0].tolist()

print(len(z5))
print(len(z95))

In [None]:
# Print every entry of the upper-tail list, one per line.
for entry in z95:
    print(entry)

In [None]:
# Build the PLS1 ranked list: column 0 becomes the index and column 1 is removed,
# then write it tab-separated.
# NOTE(review): to_csv writes a header row by default -- confirm the consumer of
# the .rnk file accepts it.
pls1rnk = (
    pls1
    .set_index(0)
    .drop(1, axis=1)
)
pls1rnk.to_csv('/Users/kanaaax/Desktop/pls1rnk.rnk', sep='\t')

In [None]:
# Build the PLS2 ranked list: column 0 becomes the index and column 1 is removed,
# then write it tab-separated.
# NOTE(review): to_csv writes a header row by default -- confirm the consumer of
# the .rnk file accepts it.
pls2rnk = (
    pls2
    .set_index(0)
    .drop(1, axis=1)
)
pls2rnk.to_csv('/Users/kanaaax/Desktop/pls2rnk.rnk', sep='\t')