# A simple demo of the sparseRRR package

In [None]:
## Environment setup (run once in a shell, not inside this notebook):
## conda create -n rrr glmnet_py scanpy jupyterlab pandas scikit-learn openpyxl xlrd

In [None]:
##%matplotlib notebook
%run ../notebooks_trials_py/paper_00_config.ipynb
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import pickle
import sparseRRR

In [None]:
import glmnet_python

In [None]:
def preprocess(data,normalize='cpm'):
    """Build standardized expression (X) and electrophysiology (Y) matrices.

    Parameters
    ----------
    data : dict
        Must provide:
        - 'counts': 2-D cells x genes array (``np.matrix`` or ``np.ndarray``),
        - 'mostVariableGenes': column selector (index array or boolean mask)
          applied to 'counts',
        - 'ephys': 2-D array with one row per cell.
    normalize : str, default 'cpm'
        'cpm' multiplies the per-cell proportions by 1e6, 'median' multiplies
        them by the median per-cell total count. Any other value leaves the
        proportions unscaled.

    Returns
    -------
    tuple
        ``(X, Y)`` where X is the log2(x + 1)-transformed, column-wise
        z-scored expression of the selected genes (always an ``ndarray``) and
        Y is the column-wise z-scored 'ephys' array.

    Notes
    -----
    Columns with zero standard deviation produce NaN/inf after the division;
    callers are expected to filter such columns beforehand.
    """
    # Per-cell library size as an explicit column vector. np.matrix keeps the
    # reduced axis, ndarray drops it; reshaping makes the row-wise division
    # broadcast correctly for both input types.
    depth = np.asarray(np.sum(data['counts'], axis=1)).reshape(-1, 1)

    X = np.array(data['counts'][:, data['mostVariableGenes']] / depth)
    if normalize=='cpm':
        X *= 1e+6
    elif normalize=='median':
        X *= np.median(depth)
    X = np.log2(X + 1)
    X = X - np.mean(X, axis=0)
    X = X / np.std(X, axis=0)

    Y = data['ephys']
    Y = Y - np.mean(Y, axis=0)
    Y = Y / np.std(Y, axis=0)

    return (X,Y)

In [None]:
def format_scala2020():
    """Load 'data/scala2020.pickle' and attach a color per cell type.

    Returns
    -------
    dict
        The unpickled dictionary with 'counts' densified via ``.todense()``
        (presumably a scipy sparse matrix on disk -- TODO confirm) and a new
        'colors' entry mapping every distinct value of ``data['ttype']`` to a
        hex color string.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only use this
    # with trusted data files.
    with open('data/scala2020.pickle', 'rb') as fh:
        data = pickle.load(fh)
    data["counts"]=data["counts"].todense()
    cl=sorted(set(data['ttype']))
    # 'deep' is a seaborn palette name rather than a Matplotlib colormap, so
    # it is resolved through seaborn (colors repeat once the palette is
    # exhausted).
    palette = sns.color_palette('deep', len(cl))
    data['colors']=dict(zip(cl, [mpl.colors.rgb2hex(c) for c in palette]))
    return data

In [None]:
def format_scala2019():
    """Load 'data/scala2019.pickle', label cells by layer and attach colors.

    Returns
    -------
    dict
        The unpickled dictionary with 'counts' densified via ``.todense()``,
        'ttype' set to ``data['layers']`` and a new 'colors' entry mapping
        every distinct 'ttype' value to a hex color string.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only use this
    # with trusted data files.
    with open('data/scala2019.pickle', 'rb') as fh:
        data = pickle.load(fh)
    data["counts"]=data["counts"].todense()
    # Cells are grouped by layer; data['regions'] is the alternative grouping
    # that was left disabled.
    data['ttype']=data['layers']
    cl=sorted(set(data['ttype']))
    # plt.get_cmap(name, lut) replaces matplotlib.cm.get_cmap, which newer
    # Matplotlib releases no longer provide.
    cmap = plt.get_cmap('tab20', len(cl))
    data['colors']=dict(zip(cl, [mpl.colors.rgb2hex(c) for c in cmap.colors]))
    return data

In [None]:
def format_cadwell():
    """Load 'data/cadwell2016.pickle', label cells by cell type and attach colors.

    Returns
    -------
    dict
        The unpickled dictionary with 'counts' densified via ``.todense()``,
        'ttype' set to ``data['cellTypes']`` and a new 'colors' entry mapping
        every distinct 'ttype' value to a hex color string.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only use this
    # with trusted data files.
    with open('data/cadwell2016.pickle', 'rb') as fh:
        data = pickle.load(fh)
    data["counts"]=data["counts"].todense()
    data['ttype']=data['cellTypes']
    cl=sorted(set(data['ttype']))
    # plt.get_cmap(name, lut) replaces matplotlib.cm.get_cmap, which newer
    # Matplotlib releases no longer provide.
    cmap = plt.get_cmap('tab20', len(cl))
    data['colors']=dict(zip(cl, [mpl.colors.rgb2hex(c) for c in cmap.colors]))
    return data

In [None]:
def format_gouwen():
    """Load 'data/gouwens2020.pickle' and attach a color per cell type.

    Returns
    -------
    dict
        The unpickled dictionary with 'counts' densified via ``.todense()``
        and cast to float64, plus a new 'colors' entry mapping every distinct
        value of ``data['ttype']`` to a hex color string.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only use this
    # with trusted data files.
    with open('data/gouwens2020.pickle', 'rb') as fh:
        data = pickle.load(fh)
    # NOTE(review): an earlier, disabled variant applied np.expm1 and cast to
    # int here, which suggests the stored values may be log-transformed --
    # confirm before treating them as raw counts.
    data["counts"]=data["counts"].todense().astype(np.float64)
    cl=sorted(set(data['ttype']))
    # plt.get_cmap(name, lut) replaces matplotlib.cm.get_cmap, which newer
    # Matplotlib releases no longer provide.
    cmap = plt.get_cmap('tab20', len(cl))
    data['colors']=dict(zip(cl, [mpl.colors.rgb2hex(c) for c in cmap.colors]))
    return data

In [None]:
def format_fuzik():
    """Load 'data/fuzik2016.pickle', label cells by cluster and attach colors.

    Returns
    -------
    dict
        The unpickled dictionary with 'counts' densified via ``.todense()``,
        'ttype' set to ``data['cluster']`` and a new 'colors' entry mapping
        every distinct 'ttype' value to a hex color string.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only use this
    # with trusted data files.
    with open('data/fuzik2016.pickle', 'rb') as fh:
        data = pickle.load(fh)
    data["counts"]=data["counts"].todense()
    data['ttype']=data['cluster']
    cl=sorted(set(data['ttype']))
    # plt.get_cmap(name, lut) replaces matplotlib.cm.get_cmap, which newer
    # Matplotlib releases no longer provide.
    cmap = plt.get_cmap('tab20', len(cl))
    data['colors']=dict(zip(cl, [mpl.colors.rgb2hex(c) for c in cmap.colors]))
    return data

In [None]:
## The next cell assumes you already have preprocessed AnnData objects (saved as .h5ad files) for your RNA-seq data and the matching electrophysiology features.

In [None]:
# Electrophysiology feature names used to subset the ephys table
# (one name per line, order preserved).
ephy_reduced_features = [
    'IV_resistance',
    'IV_baseline',
    'IV_first_spike_amplitude',
    'IV_first_spike_threshold',
    'IV_first_spike_thr2ahp',
    'IV_first_spike_delay',
    'IV_first_spike_half_width',
    'IV_first_spike_max_fall_slope',
    'IV_first_spike_max_rise_slope',
    'IV_max_nb_spikes',
    'IV_rebound_spikes_(-50.0 pA)',
    'IV_rheobase',
    'IV_sagratio_I=-80.0 pA',
    'IV_tc',
    'IV_rheo_nb_spikes',
    'SYN_evt_overallfreq',
    'SYN_evt_wtc_mean',
    'IV_first_spike_interval',
    'IV_max_duration',
    'IV_gain',
    'IV_sfa_freq_log',
    'IV_sfa_peak_log',
    'RAMP_iorect_40_over_120',
]

# Alternative, smaller feature set. Compared with the list above it leaves out
# the spike amplitude/delay/slope/interval, max_nb_spikes, max_duration, gain,
# sfa and RAMP features, and adds IV_max_freq, IV_mAHP_min and
# SYN_evt_ampl_mean.
ephy_reduced_features2 = [
    'IV_resistance',
    'IV_baseline',
    'IV_first_spike_threshold',
    'IV_max_freq',
    'IV_mAHP_min',
    'IV_first_spike_thr2ahp',
    'IV_first_spike_half_width',
    'IV_rheobase',
    'IV_sagratio_I=-80.0 pA',
    'IV_tc',
    'SYN_evt_ampl_mean',
]
import anndata
def format_pseq(colorby='groups'):
    """Assemble the paired RNA / electrophysiology dictionary from .h5ad files.

    Reads "rna.h5ad" (relative to the working directory) and
    f"{mtxdir}/h5ad/pain/pain_ephy.h5ad". ``mtxdir`` and
    ``ephy_reduced_features`` are notebook-level globals (``mtxdir`` is
    presumably defined by the config notebook executed via %run -- TODO
    confirm).

    Parameters
    ----------
    colorby : str, default 'groups'
        Column of ``rna.obs`` used as the cell label ('ttype') and to build
        the color map.

    Returns
    -------
    dict
        Keys: 'counts' (``np.asmatrix(rna.X)``), 'cells', 'genes',
        'mostVariableGenes' (``rna.var["hvg"]``), 'ephys', 'ephysNames',
        'ttype' and 'colors' (label -> hex color string).
    """
    rna=anndata.read_h5ad("rna.h5ad")
    ephy=anndata.read_h5ad(f"{mtxdir}/h5ad/pain/pain_ephy.h5ad")
    # Put the ephys rows in the same cell order as the RNA object and keep
    # only the reduced feature set.
    ephy=ephy[rna.obs.index.to_list(),ephy_reduced_features]
    # Discard cells with at least one missing ephys feature, in both objects.
    notna=ephy.to_df().dropna(axis=0).index.to_list()
    ephy=ephy[notna,:].copy()
    rna=rna[notna,:].copy()
    # Drop genes that are constant across the remaining cells.
    rna=rna[:,np.std(rna.X,axis=0)>0].copy()

    cl=sorted(set(rna.obs[colorby]))
    # plt.get_cmap(name, lut) replaces matplotlib.cm.get_cmap, which newer
    # Matplotlib releases no longer provide.
    cmap = plt.get_cmap('tab10', len(cl))
    colors=dict(zip(cl, [mpl.colors.rgb2hex(c) for c in cmap.colors]))
    data={
        'counts':np.asmatrix(rna.X),
        'cells':ephy.obs_names.to_numpy(),
        'genes':rna.var_names.to_numpy(),
        'mostVariableGenes':rna.var["hvg"],
        'ephys':ephy.X,
        'ephysNames':ephy.var_names.to_numpy(),
        'ttype':rna.obs[colorby].to_numpy(),
        'colors': colors
           }
    return data

In [None]:
# Load the paired RNA / ephys dataset with the default cell grouping.
data = format_pseq()

In [None]:
# Sanity check: no gene column may be constant across cells, otherwise the
# z-scoring in preprocess() would divide by zero.
assert not np.any(np.std(data["counts"], axis=0) == 0)

In [None]:
# Standardize both modalities and force float64 for the downstream solver.
X, Y = (arr.astype(np.float64) for arr in preprocess(data))
# Names of the genes that make up the columns of X.
genes = data['genes'][data['mostVariableGenes']]

print('Shape of X:', X.shape, '\nShape of Y:', Y.shape)

In [None]:
%time 
#w,v = sparseRRR.relaxed_elastic_rrr(X, Y, rank=2, lambdau=.4, alpha=1)
#w,v = sparseRRR.relaxed_elastic_rrr(X, Y, rank=2, lambdau=.4, alpha=0.2)
w,v = sparseRRR.relaxed_elastic_rrr(X, Y, rank=2, lambdau=0.4, alpha=0.75)

print('\nGenes selected: {}'.format(np.sum(w[:,0]!=0)))
print(', '.join(genes[w[:,0]!=0]))
#%matplotlib inline

sparseRRR.bibiplot(X, Y, w, v, 
                   titles = ['RNA expression', 'Electrophysiology'],
                   cellTypes = data['ttype'], 
                   cellTypeColors = data['colors'], 
                   YdimsNames = data['ephysNames'], 
                   XdimsNames = genes)

In [None]:
# Hyper-parameter grid for cross-validation: a fine lambda grid from 0.2 to 1.0
# followed by the integers 2..9, crossed with four alpha values.
lambdas = np.hstack([np.arange(.2, 1.01, .1), np.arange(2, 10)])
alphas = np.array([0.25, 0.5, 0.75, 1.0])

cvResults = sparseRRR.elastic_rrr_cv(X, Y, rank=4, alphas=alphas, lambdas=lambdas)

sparseRRR.plot_cv_results(*cvResults, alphas)

In [None]:
##%time bootCounts = sparseRRR.elastic_rrr_bootstrap(X, Y, lambdau=.4, alpha=1)
%time bootCounts = sparseRRR.elastic_rrr_bootstrap(X, Y, lambdau=0.2, alpha=1.0)

args = np.argsort(bootCounts)[::-1]
print('')
for i in range(20):
    print('{:10} {:.2f}'.format(genes[args[i]], bootCounts[args[i]]))

In [None]:
# A quick and dirty way to estimate the dimensionality:
# calls sparseRRR.dimensionality on the standardized X and Y and times the call.
# NOTE(review): what the call prints or returns is defined inside sparseRRR -- confirm there.

%time sparseRRR.dimensionality(X, Y)

In [None]:
# Extended listing: the 40 genes with the highest bootstrap values, using
# `args`, `genes` and `bootCounts` from the bootstrap cell. Slicing instead of
# range(40) avoids an IndexError when fewer than 40 genes are available.
for idx in args[:40]:
    print('{:10} {:.2f}'.format(genes[idx], bootCounts[idx]))