# Elastic nets on the Paul data set

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

In [3]:
import sklearn.linear_model as sklm

In [4]:
import scanpy.api as sc

sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=80, color_map='viridis')  # low dpi (dots per inch) yields small inline figures
sc.logging.print_versions()

scanpy==1.3.2+25.g8ac9c03 anndata==0.6.11 numpy==1.14.6 scipy==1.1.0 pandas==0.23.4 scikit-learn==0.20.0 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1 


### Load data

In [5]:
# Load the paul15 data set through scanpy's built-in dataset loader.
# The cluster annotation used below lives in adata.obs['paul15_clusters'].
adata = sc.datasets.paul15()



... storing 'paul15_clusters' as categorical


In [6]:
# Ordered list of cluster names; position in this list is the integer class label.
lookup = list(adata.obs['paul15_clusters'].cat.categories)
# Integer label per cell.  The categorical codes are exactly the index of each
# cell's cluster within `lookup`, so this replaces the per-cell
# `lookup.index(...)` search (and the integer `Series[i]` lookups, which rely on
# positional fallback indexing).  Cast to the platform int so the dtype matches
# an array built from Python ints.
# NOTE(review): a cell with a missing cluster would get code -1 here rather than
# raising; paul15 is assumed to have no missing cluster labels.
yVec = adata.obs['paul15_clusters'].cat.codes.values.astype(int)

In [7]:
# Load the precomputed cross-validation folds from disk.  Each row of `folds`
# is later used as an index array into the cells (see `mask[folds[0]] = True`).
folds = np.load("paul15-scviFolds.npz")['folds']

In [8]:
folds.shape

(5, 546)

### Create the elastic net objects

This list of alpha values was found through manual testing with elastic nets.  With it, training should run fairly quickly and rarely fail to converge.

In [9]:
# Regularization grid: 100 evenly spaced alphas from 0.05 up to and including 5.0
# (linspace includes the endpoint by default).
alphaList = np.linspace(start=0.05, stop=100 * 0.05, num=100)

In [11]:
# Cross-validated elastic net: 5-fold CV over the alpha grid above and a set of
# L1/L2 mixing ratios (l1_ratio=1 is the pure lasso).
en = sklm.ElasticNetCV(
    alphas=alphaList,
    l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
    cv=5,
    max_iter=10000,
    n_jobs=2,
)

### Test the elastic nets on cluster 0 in fold 0

It takes about 2 minutes to train this one model.  Since one model is trained per cluster, this means about an hour per fold.

In [12]:
# Boolean mask over cells: True for the cells listed in fold 0, False elsewhere.
# The model is trained on the complement (~mask).
mask = np.full(adata.X.shape[0], False, dtype=bool)
np.put(mask, folds[0], True)

In [13]:
%%time
# Fit on the cells outside fold 0, with a 0/1 target: 1 for cells in cluster 0.
en.fit(adata.X[~mask], (yVec[~mask] == 0).astype(int))

CPU times: user 2min 32s, sys: 1min 4s, total: 3min 36s
Wall time: 1min 49s


ElasticNetCV(alphas=array([0.05, 0.1 , ..., 4.95, 5.  ]), copy_X=True, cv=5,
       eps=0.001, fit_intercept=True,
       l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1], max_iter=10000,
       n_alphas=100, n_jobs=2, normalize=False, positive=False,
       precompute='auto', random_state=None, selection='cyclic',
       tol=0.0001, verbose=0)

The selected markers:

In [21]:
# Indices of the features whose fitted coefficient is nonzero.
np.flatnonzero(en.coef_)

array([ 552,  553, 1212, 1351, 1421, 1422, 2025, 2384])

See the selected markers in order of decreasing absolute coefficient:

In [15]:
# Feature indices sorted by |coefficient|, largest first (ascending argsort, reversed).
tempMarks = np.argsort(np.abs(en.coef_))[::-1]

In [16]:
tempMarks

array([1421, 2025, 1212, ..., 2294, 2293,    0])

In [35]:
# Collect the selected markers (nonzero coefficients) and their coefficients,
# in the order given by `tempMarks` (largest |coefficient| first).
currMarks = []
currCoefs = []

# Walk the ranked indices and stop at the first zero coefficient.  Iterating
# over `tempMarks` directly (instead of an unbounded `while` on a counter)
# cannot run past the end of the array when every coefficient is nonzero.
for featureIdx in tempMarks:
    if en.coef_[featureIdx] == 0:
        break
    currMarks.append(featureIdx)
    currCoefs.append(en.coef_[featureIdx])

In [36]:
currMarks

[1421, 2025, 1212, 553, 552, 1351, 1422, 2384]

In [37]:
# Sanity check: every feature with a nonzero coefficient must appear in currMarks.
# Prints nothing when the two selections agree.
for mark in np.flatnonzero(en.coef_):
    if mark in currMarks:
        continue
    print("ERROR: Marker {} was not duplicated".format(mark))

## Train the enets on a fold

Set `foldN` to the index of the fold you want to hold out.  Some training and convergence information is included below (from the time that I ran the jobs).

In [9]:
%%time
# Train one cross-validated elastic net per cluster (one-vs-rest) on all cells
# outside fold `foldN`, and record for each cluster the selected markers and
# their coefficients, ordered by decreasing |coefficient|.
alphaList = np.linspace(0.05, 100*0.05, num=100, endpoint=True)

foldN = 4
mask = np.zeros(adata.X.shape[0],dtype=bool)
mask[folds[foldN]] = True  # True = cell belongs to the held-out fold

# The training rows are the same for every cluster, so slice them once.
trainX = adata.X[~mask]
trainY = yVec[~mask]

allMarks = []  # allMarks[k]: marker indices selected for the k-th cluster
allCoefs = []  # allCoefs[k]: matching coefficients, same order as allMarks[k]

for clust in np.unique(yVec):
    print("Working on cluster {}".format(clust))
    en = sklm.ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], cv=5, max_iter=10000, n_jobs=2, alphas=alphaList)

    # 0/1 target: 1 for cells in the current cluster.
    en.fit(trainX, (trainY==clust)*1)

    # Feature indices sorted by |coefficient|, largest first.
    tempMarks = np.flipud(np.argsort(np.abs(en.coef_)))

    currMarks = []
    currCoefs = []

    # Stop at the first zero coefficient.  Iterating over tempMarks directly
    # cannot index past the end when every coefficient is nonzero.
    for featureIdx in tempMarks:
        if en.coef_[featureIdx] == 0:
            break
        currMarks.append(featureIdx)
        currCoefs.append(en.coef_[featureIdx])

    # Explicit index dtype keeps an empty selection usable as an index array.
    allMarks.append(np.array(currMarks, dtype=np.intp))
    # Store this cluster's coefficients (previously np.array(allCoefs) was
    # appended here, which nested the accumulator into itself).
    allCoefs.append(np.array(currCoefs))

Working on cluster 0
Working on cluster 1




Working on cluster 2




Working on cluster 3
Working on cluster 4




Working on cluster 5




Working on cluster 6
Working on cluster 7
Working on cluster 8
Working on cluster 9
Working on cluster 10
Working on cluster 11
Working on cluster 12




Working on cluster 13




Working on cluster 14
Working on cluster 15
Working on cluster 16
Working on cluster 17
Working on cluster 18
CPU times: user 1h 23min 27s, sys: 35min 17s, total: 1h 58min 45s
Wall time: 47min 2s


Some timing and convergence information:
* Fold 0 took 52 min 10s on 3 cores.  We had 18 cases where the convergence failed.
* Fold 1 took 1hr 46 min on 2 cores (wall time 54min 16s).  We had 12 cases where the convergence failed.
* Fold 2 took 1hr 53 min (wall time 1hr 10min).  We had 19 cases where the convergence failed.
* Fold 3 took 2hr 13 min (wall time 1hr 9min).  We had 23 cases where the convergence failed.
* Fold 4 took 1hr 23 min (wall time 47min).  We had 20 cases where the convergence failed.

In [19]:
# Number of markers selected for each cluster.
list(map(len, allMarks))

[12,
 233,
 185,
 67,
 43,
 118,
 276,
 111,
 110,
 86,
 27,
 96,
 169,
 99,
 55,
 78,
 9,
 1,
 22]

Save the data

In [10]:
# The per-cluster marker arrays have different lengths, so pack them into an
# explicit 1-D object array before saving.  Passing the ragged list straight to
# np.savez relies on implicit object-array creation, which newer NumPy rejects.
# The array is still stored under the default key 'arr_0'.
marksToSave = np.empty(len(allMarks), dtype=object)
for clustIdx, clustMarks in enumerate(allMarks):
    marksToSave[clustIdx] = clustMarks
np.savez("paul15-nets-fold{}-marks".format(foldN), marksToSave)

In [11]:
# The per-cluster coefficient arrays have different lengths, so pack them into
# an explicit 1-D object array before saving.  Passing the ragged list straight
# to np.savez relies on implicit object-array creation, which newer NumPy
# rejects.  The array is still stored under the default key 'arr_0'.
coefsToSave = np.empty(len(allCoefs), dtype=object)
for clustIdx, clustCoefs in enumerate(allCoefs):
    coefsToSave[clustIdx] = clustCoefs
np.savez("paul15-nets-fold{}-coefs".format(foldN), coefsToSave)

In [42]:
# Read back the saved markers for fold 0.  The file holds an object array (one
# variable-length index array per cluster), which NumPy only loads when
# allow_pickle=True is passed explicitly (the default became False in newer
# releases).  Only do this for files you produced yourself: unpickling
# untrusted data can execute arbitrary code.
stuff = np.load("paul15-nets-fold0-marks.npz", allow_pickle=True)['arr_0']

In [44]:
list(stuff)

[array([1421, 2025, 1212,  553,  552, 1351, 1422, 2384]),
 array([1735, 1121,  814, 1978, 2384, 1421, 1422, 3199, 3283, 3261,  532,
        2388,  483, 1072, 1597,  582,  769,  782, 1212, 1500, 2912, 2129,
        2563, 1058,  778,   58,  709, 2856, 2044, 1215, 2102, 1020, 1012,
        3248, 2910, 1213,  561,  689,  881, 1032, 2796, 3226,  439,  494,
        2105, 2940, 1502, 1039, 2054, 1860,  392,  469, 1066, 2025,  398,
         887, 2640,  111, 1888, 2624, 1400, 1670,  362,  797, 2478,  600,
        2634, 3428, 2436,  472,  631,  148, 2407, 1351, 2639, 3374, 2949,
        2361,  553,  271, 1783, 2677, 3152, 2930,  209,  446, 1656,  994,
        2263,  883,  766,   18,  604, 2636,  833, 1340, 1454,  196,  653,
        2068,  534,  788,  248, 3230,  552, 2093, 1800,  413, 1720, 3250,
         396, 3000, 2435,  309, 1028, 2294,  596, 2872, 1973, 3380,   62,
        2958, 2807, 1429,  998, 2628, 2190,  285, 1081,  170,  549, 2742,
         729,  302, 1040, 2645, 2425, 1476, 2556,  454