rename fscLVM to slalom

bioFAM · Oct 24, 2017 · 974e968 · 974e968
1 parent b085a5e
commit 974e968
Show file tree

Hide file tree

Showing 21 changed files with 120 additions and 119 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -33,7 +33,7 @@ script:
 - python setup.py sdist
 - pip install dist/`ls dist | grep -i -E '\.(gz)$' | head -1`
 - pushd /
-- python -c "import fscLVM; print(fscLVM.__version__)"
+- python -c "import slalom; print(slalom.__version__)"
 - popd
 notifications:
   email: false
@@ -47,4 +47,4 @@ deploy:
     secure: buXXT5VTU6j0ArBCXLvkLAfEP/qIppuc4SRXQTSf6nkxok5fwmVmvP9CSJQrpgpJG6soL6rZhcowACjboN/YZBvlYLR9KoBu371dqL1AvIw2XFRNUa7EDOCdQpYlt3NviPcVnqF5ZEIfULxTGDlKnhRBU1olZCM/JJCexGcT4NSPGuZLXjMsjFd3uR8WqgvpCfJPXZzSaOqVIslnkIb0ZvTB9Z5sYMDe8xgOXnO5C1+aMeyIQnthxmXPXEQUtntg8YCUenuPfyuUf2gNXrx1W8uqIsmX3ivhR2N5i+s5JniBiTqenfVUrvgDiOolRF7vPA8jU+6ZCZCy0hTdPI2QuAChkb0Uw9CdcPZyL1hS61A3KxJ3AKRAkqCiBlbSH2v8gqjcVGtGL4CfbS9Im0BoUoGvhWqLREzk5Mjnw6Ed2w8QhmwU2cTEY8qVsmZpz3hci1YIHVHEW6W0moV7rj3uW/dfceTbWtA4+yh5IOOu0CIVkgjKHL+3QBSDynqL+EuJQhafn7tI+ow9R3msEVYMway7d1XcJAabIv/0SVQu97NpHYd7KoD2kk5HdyvJPsIu4ugUi0zcVe5EnxG7qVqEAdvcG6MyMRkRLXq/8vjqg7BDVlZeORc/jkTZFsc/6qH/eYX3G/MoIJTDv609SM4mb8pyVbt0hAeJrQqPE8SyNvM=
   on:
     distributions: sdist bdist_wheel
-    repo: PMBio/f-scLVM
+    repo: bioFAM/slalom
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,4 +1,4 @@
 include LICENSE
 include README.md
 include setup.cfg
-recursive-include fscLVM *.py
+recursive-include slalom *.py
diff --git a/doc/Makefile b/doc/Makefile
@@ -91,9 +91,9 @@ qthelp:
 	@echo
 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
-	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/fscLVM.qhcp"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/slalom.qhcp"
 	@echo "To view the help file:"
-	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/fscLVM.qhc"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/slalom.qhc"
 
 .PHONY: applehelp
 applehelp:
@@ -110,8 +110,8 @@ devhelp:
 	@echo
 	@echo "Build finished."
 	@echo "To view the help file:"
-	@echo "# mkdir -p $$HOME/.local/share/devhelp/fscLVM"
-	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/fscLVM"
+	@echo "# mkdir -p $$HOME/.local/share/devhelp/slalom"
+	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/slalom"
 	@echo "# devhelp"
 
 .PHONY: epub

diff --git a/doc/conf.py b/doc/conf.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# fscLVM documentation build configuration file, created by
+# slalom documentation build configuration file, created by
 # sphinx-quickstart on Tue Nov  1 14:32:47 2016.
 #
 # This file is execfile()d with the current directory set to its
@@ -20,7 +20,7 @@
 import sys
 sys.path.insert(0, os.path.abspath('py/'))
 
-from fscLVM import __version__
+from slalom import __version__
 
 # -- General configuration ------------------------------------------------
 
@@ -60,7 +60,7 @@
 master_doc = 'index'
 
 # General information about the project.
-project = u'fscLVM'
+project = u'slalom'
 copyright = u'2016-2017, Florian Buettner and Oliver Stegle'
 author = u'Florian Buettner'
 
@@ -146,7 +146,7 @@
 # The name for this set of Sphinx documents.
 # "<project> v<release> documentation" by default.
 #
-# html_title = u'fscLVM v1.0.0'
+# html_title = u'slalom v1.0.0'
 
 # A shorter title for the navigation bar.  Default is the same as html_title.
 #
@@ -246,7 +246,7 @@
 # html_search_scorer = 'scorer.js'
 
 # Output file base name for HTML help builder.
-htmlhelp_basename = 'fscLVMdoc'
+htmlhelp_basename = 'slalomdoc'
 
 # -- Options for LaTeX output ---------------------------------------------
 
@@ -272,7 +272,7 @@
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-    (master_doc, 'fscLVM.tex', u'fscLVM Documentation',
+    (master_doc, 'slalom.tex', u'slalom Documentation',
      u'Florian Buettner', 'manual'),
 ]
 
@@ -314,7 +314,7 @@
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
 man_pages = [
-    (master_doc, 'fsclvm', u'fscLVM Documentation',
+    (master_doc, 'slalom', u'slalom Documentation',
      [author], 1)
 ]
 
@@ -329,8 +329,8 @@
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-    (master_doc, 'fscLVM', u'fscLVM Documentation',
-     author, 'fscLVM', 'One line description of project.',
+    (master_doc, 'slalom', u'slalom Documentation',
+     author, 'slalom', 'One line description of project.',
      'Miscellaneous'),
 ]
 

diff --git a/doc/index.rst b/doc/index.rst
@@ -1,9 +1,9 @@
-fscLVM documentation
+slalom documentation
 ======================
 
-f-scLVM is a scalable modelling framework for single-cell RNA-seq data that can be used to dissect and model single-cell transcriptome heterogeneity, thereby allowing to identify biological drivers of cell-to-cell variability and model confounding factors.
+slalom is a scalable modelling framework for single-cell RNA-seq data that can be used to dissect and model single-cell transcriptome heterogeneity, thereby allowing to identify biological drivers of cell-to-cell variability and model confounding factors.
 
-Software by Florian Buettner and Oliver Stegle. f-scLVM is explained in detail in the accompanying publication [1].
+Software by Florian Buettner and Oliver Stegle. slalom is explained in detail in the accompanying publication [1].
 
 
 [1] Buettner, F.,Pratanwanich, N., Marioni, J., Stegle, O. Scalable latent-factor models applied to single-cell RNA-seq data separate biological drivers from confounding effects. Submitted
@@ -12,11 +12,11 @@ Software by Florian Buettner and Oliver Stegle. f-scLVM is explained in detail i
 Installation
 ----------
 
-f-scLVM requires Python 2.7 or newer with
+slalom requires Python 2.7 or newer with
 
 * scipy, h5py, numpy, matplotlib, scikit-learn, re
 
-f-scLVM can be installed via pip with `pip install fscLVM`.
+slalom can be installed via pip with `pip install slalom`.
 For best results, we recommend the Anaconda python distribution (https://anaconda.org).
 
 Quickstart
@@ -25,18 +25,18 @@ Quickstart
 ******
 Input  
 ******
-f-scLVM requires two input files, a gene expression file and an annotation file. The gene expression file is a text file containing the normalised, log-transformed gene expression matrix, with every row corresponding to a cell. Column names should be gene identifiers matching those in the annotation file (i.e. if gene symbols are used in the annotation file, column names should also be gene symbols). Row names are optional and can eg be a known covariate which is used for plotting.
+slalom requires two input files, a gene expression file and an annotation file. The gene expression file is a text file containing the normalised, log-transformed gene expression matrix, with every row corresponding to a cell. Column names should be gene identifiers matching those in the annotation file (i.e. if gene symbols are used in the annotation file, column names should also be gene symbols). Row names are optional and can, e.g., be a known covariate which is used for plotting.
 The annotation file is a text file with every row containing the name of a gene set, followed by the gene identifiers annotated to that gene set. We recommend using annotations such as those published in the REACTOME database or the Molecular signature database (MSigDB) and  provide an annotation file containing annotations from the RECTOME database. The license of MSigDB does not permit redistribution of the raw annotation files. To use MSigDB annotations, please register at http://software.broadinstitute.org/gsea/msigdb, download the hallmark gene sets (gene symbols) and place the file in data folder.
 Both text files can then be loaded using the ``load_text`` function.
 
-NB: f-scLVM works best on a subset of highly variable genes; these can be identified using a variance filter based on a mean-variance trend fitted using spike-in transcripts or endogenous genes. A step-by-step workflow on low-level processing of scRNA-seq data (including gene filtering) can be found here: https://f1000research.com/articles/5-2122/v2
+NB: slalom works best on a subset of highly variable genes; these can be identified using a variance filter based on a mean-variance trend fitted using spike-in transcripts or endogenous genes. A step-by-step workflow on low-level processing of scRNA-seq data (including gene filtering) can be found here: https://f1000research.com/articles/5-2122/v2
 
 
 ********************************
 Model initialisation and fitting
 ********************************
 
-An f-scLVM model can be initialised using the ``initFA`` convenience function. Arguments can be used to specify options, incuding number of unannotated factors (`nHidden`), minimum number of genes in a pathway (`minGenes`), whether to use the fast option by pruning genes (`pruneGenes`), noise model (`noise`) and the data directory (`data_dir`). Once a model is initialised, it can be fit using the ``train`` method.
+A slalom model can be initialised using the ``initFA`` convenience function. Arguments can be used to specify options, including number of unannotated factors (`nHidden`), minimum number of genes in a pathway (`minGenes`), whether to use the fast option by pruning genes (`pruneGenes`), noise model (`noise`) and the data directory (`data_dir`). Once a model is initialised, it can be fit using the ``train`` method.
 
 ********************************
 Diagnostics, plotting and saving.
@@ -47,12 +47,12 @@ The ``printDiagnostics`` function can be used to print diagnositcs based on the
 Tutorial
 --------
 
-All steps required to run f-scLVM are illustrated in a jupyter notebook that can be viewed `interactively <http://nbviewer.jupyter.org/github/pmbio/f-scLVM/blob/master/ipynb/f-scLVM.ipynb>`_. 
+All steps required to run slalom are illustrated in a jupyter notebook that can be viewed `interactively <http://nbviewer.jupyter.org/github/pmbio/slalom/blob/master/ipynb/slalom.ipynb>`_. 
 
 
-The factorial single-cell latent variable model (f-scLVM)
+The factorial single-cell latent variable model (slalom)
 ---------------------------------------------------------
-A detailed statistical description of the f-scLVM model can be found in teh accompanyin publicaiton [1]. Here, a brief summary is given. f-scLVM is based on a variant of matrix factorization, decomposing the observed gene matrix into a sum of sum of contributions from  A annotated factors, whose inference is guided by pathway gene sets, and H additional unannotated factors:
+A detailed statistical description of the slalom model can be found in the accompanying publication [1]. Here, a brief summary is given. slalom is based on a variant of matrix factorization, decomposing the observed gene matrix into a sum of contributions from A annotated factors, whose inference is guided by pathway gene sets, and H additional unannotated factors:
 
 .. math::
 
@@ -65,31 +65,31 @@ For the statistical derivation in the accompanying publication and the implement
 
     \mathbf{Y} = \mathbf{X}\mathbf{W}^T +\mathbf{\psi} .
 
-We employ two levels of regularization on the parts of the weight matrix :math:`\mathbf{W}` corresponding to annotated factors. First, gene sets are used to guide a spike-and-slab prior on the rows of :math:`\mathbf{W}` thereby confining the inferred weights to the set of genes annotated in the pathway database. To this end :math:`\mathbf{W}` is modelled as elementwise product of a Bernoulli random variable :math:`\mathbf{Z}`, indicating whether a gene is active for a given factor and a Gaussian random variable :math:`\widetilde{\mathbf{W}}`, quantifying the corresponding effect size (for details see [1]). A second level of regularization is then used to achieve sparseness on the level of factors, allowing the model to deactivate factors that are not needed to explain variation in the data; this is achieved using an automatic relevance determination (ARD) prior (i.e. factor-specific Gamma prior on the precision of the weights). The inverse of this ARD prior (:math:`1/\alpha_k`) can be interpreted as a measure of the regulatory impact of  factor :math:`k` and corresponds to the expected variance explained by this factor, for the subset of genes with a regulatory effect. It is therefore also referred to as relevance parameter.  The fscLVM software implements an efficent deterministic approximate Bayesian inference scheme based on variational methods, allowing for the inference of :math:`\mathbf{X}`, :math:`\mathbf{Z}`, :math:`\widetilde{\mathbf{W}}`, :math:`\mathbf{\alpha}`, :math:`\mathbf{\psi}` and other parameters.  
+We employ two levels of regularization on the parts of the weight matrix :math:`\mathbf{W}` corresponding to annotated factors. First, gene sets are used to guide a spike-and-slab prior on the rows of :math:`\mathbf{W}` thereby confining the inferred weights to the set of genes annotated in the pathway database. To this end :math:`\mathbf{W}` is modelled as elementwise product of a Bernoulli random variable :math:`\mathbf{Z}`, indicating whether a gene is active for a given factor and a Gaussian random variable :math:`\widetilde{\mathbf{W}}`, quantifying the corresponding effect size (for details see [1]). A second level of regularization is then used to achieve sparseness on the level of factors, allowing the model to deactivate factors that are not needed to explain variation in the data; this is achieved using an automatic relevance determination (ARD) prior (i.e. factor-specific Gamma prior on the precision of the weights). The inverse of this ARD prior (:math:`1/\alpha_k`) can be interpreted as a measure of the regulatory impact of  factor :math:`k` and corresponds to the expected variance explained by this factor, for the subset of genes with a regulatory effect. It is therefore also referred to as relevance parameter.  The slalom software implements an efficient deterministic approximate Bayesian inference scheme based on variational methods, allowing for the inference of :math:`\mathbf{X}`, :math:`\mathbf{Z}`, :math:`\widetilde{\mathbf{W}}`, :math:`\mathbf{\alpha}`, :math:`\mathbf{\psi}` and other parameters.  
 
 Loading data and model initialisation
 -------------------------------------
 
-.. autofunction:: fscLVM.load_txt
-.. autofunction:: fscLVM.load_hdf5
-.. autofunction:: fscLVM.initFA
-.. autofunction:: fscLVM.preTrain
+.. autofunction:: slalom.load_txt
+.. autofunction:: slalom.load_hdf5
+.. autofunction:: slalom.initFA
+.. autofunction:: slalom.preTrain
 
 
 
 Model fitting
 -------------------------------------
-.. autoclass:: fscLVM.core.CSparseFA
+.. autoclass:: slalom.core.CSparseFA
     :members:
 
 
 Plotting and saving results
 ----------------------------
-.. autofunction:: fscLVM.plotTerms
-.. autofunction:: fscLVM.plotFactors
-.. autofunction:: fscLVM.plotRelevance
-.. autofunction:: fscLVM.saveFA
-.. autofunction:: fscLVM.dumpFA
+.. autofunction:: slalom.plotTerms
+.. autofunction:: slalom.plotFactors
+.. autofunction:: slalom.plotRelevance
+.. autofunction:: slalom.saveFA
+.. autofunction:: slalom.dumpFA
 
 Contents:
 

diff --git a/doc/make.bat b/doc/make.bat
@@ -129,9 +129,9 @@ if "%1" == "qthelp" (
 	echo.
 	echo.Build finished; now you can run "qcollectiongenerator" with the ^
 .qhcp project file in %BUILDDIR%/qthelp, like this:
-	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\fscLVM.qhcp
+	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\slalom.qhcp
 	echo.To view the help file:
-	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\fscLVM.ghc
+	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\slalom.qhc
 	goto end
 )
 

diff --git a/ipynb/f-scLVM.ipynb b/ipynb/f-scLVM.ipynb
@@ -4,14 +4,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# fscLVM tutorial "
+    "# slalom tutorial "
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "In this notebook we illustrate how f-scLVM can be used to identify biological drivers on the mESC cell cycle staged dataset."
+    "In this notebook we illustrate how slalom can be used to identify biological drivers on the mESC cell cycle staged dataset."
    ]
   },
   {
@@ -38,9 +38,9 @@
    ],
    "source": [
     "import os\n",
-    "import fscLVM\n",
+    "import slalom\n",
     "import pdb\n",
-    "from fscLVM import plotFactors, plotRelevance, plotLoadings, saveFA, dumpFA\n",
+    "from slalom import plotFactors, plotRelevance, plotLoadings, saveFA, dumpFA\n",
     "%pylab inline\n",
     "\n",
     "#specify where the hdf5 file is\n",
@@ -53,7 +53,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "f-scLVM expects an expression file, typically with log transformed gene expression values  as well as a gene set annotation. These data can be provided as single hdf5 file, which can be generated using separate R scripts (in the R folder). Alternatively, the expression matrix and the annotation can be loaded as text files in python. \n",
+    "slalom expects an expression file, typically with log transformed gene expression values  as well as a gene set annotation. These data can be provided as single hdf5 file, which can be generated using separate R scripts (in the R folder). Alternatively, the expression matrix and the annotation can be loaded as text files in python. \n",
     "\n",
     "\n",
     "\n",
@@ -94,11 +94,11 @@
     "####\n",
     "#Option 1: load a pre-defined hdf5 file\n",
     "#We provide an (optional) hdf file with the required data - this was generated using\n",
-    "#the R script write_fscLVM.R in the R folder\n",
+    "#the R script write_slalom.R in the R folder\n",
     "if 0:\n",
     "    annoDB = 'MSigDB'\n",
     "    dFile = os.path.join(data_dir,'Buettneretal2015.hdf5')\n",
-    "    data = fscLVM.load_hdf5(dFile, anno=annoDB)\n",
+    "    data = slalom.load_hdf5(dFile, anno=annoDB)\n",
     "    \n",
     "####\n",
     "\n",
@@ -116,12 +116,12 @@
     "\n",
     "    #dataFile: csv file with log expresison values\n",
     "    dataFile = os.path.join(data_dir,'Buettneretal.csv.gz') # note that the first column (row names) contains the cell cycle stage in numeric form\n",
-    "    data = fscLVM.utils.load_txt(dataFile=dataFile,annoFiles=annoFile,annoDBs=annoDB)\n",
+    "    data = slalom.utils.load_txt(dataFile=dataFile,annoFiles=annoFile,annoDBs=annoDB)\n",
     "####\n",
     "\n",
     "###alternatively the data can be loaded from the provided hdf5 file\n",
     "#dFile = 'Buettneretal2015.hdf5'\n",
-    "#data = fscLVM.load_hdf5(dFile, data_dir=data_dir)\n",
+    "#data = slalom.load_hdf5(dFile, data_dir=data_dir)\n",
     "\n",
     "\n",
     "#print statistics for the loaded dataset\n",
@@ -135,7 +135,7 @@
     "collapsed": false
    },
    "source": [
-    "# Initializing the f-scLVM model"
+    "# Initializing the slalom model"
    ]
   },
   {
@@ -149,9 +149,9 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/Users/flo/projects/Auto_Bionf/scLVM2/fscLVM/core.py:392: RuntimeWarning: divide by zero encountered in true_divide\n",
+      "/Users/flo/projects/Auto_Bionf/scLVM2/slalom/core.py:392: RuntimeWarning: divide by zero encountered in true_divide\n",
       "  logPi = SP.log(self.Pi[:,m]/(1-self.Pi[:,m]))\n",
-      "/Users/flo/projects/Auto_Bionf/scLVM2/fscLVM/core.py:394: RuntimeWarning: divide by zero encountered in true_divide\n",
+      "/Users/flo/projects/Auto_Bionf/scLVM2/slalom/core.py:394: RuntimeWarning: divide by zero encountered in true_divide\n",
       "  logPi = SP.log(self.Pi[:,m]/(1-self.Pi[:,m]))\n"
      ]
     }
@@ -168,7 +168,7 @@
     "gene_ids = data['genes']\n",
     "\n",
     "#initialize FA instance, here using a Gaussian noise model and fitting 3 dense hidden factors\n",
-    "FA = fscLVM.initFA(Y, terms,I, gene_ids=gene_ids, noise='gauss', nHidden=3, minGenes=15)"
+    "FA = slalom.initFA(Y, terms,I, gene_ids=gene_ids, noise='gauss', nHidden=3, minGenes=15)"
    ]
   },
   {