\documentclass{beamer}
\mode<presentation>
{
\usetheme{classic}
\setbeamercovered{transparent}
}
\usepackage[english]{babel}
\usepackage[latin1]{inputenc}
\usepackage{times}
\usepackage[T1]{fontenc}
\usepackage{url}
\title[CLS]{Back to the Future: Life beyond R, by going back to Lisp}
\subtitle{Using History to Design better data analysis systems}
\author[]
{A.J.~(Tony)~Rossini}
\institute[Novartis Pharma AG and University of Washington]
{
Quantitative Safety and Epidemiology\\
Novartis Pharma AG \\
Basel
\and
Department of Biomedical and Health Informatics\\
University of Washington}
\date[StatComp 2014] % (optional, should be abbreviation of conference name)
{Reisensburg, 2014}
\subject{Statistical Computing Environments}
% Delete this, if you do not want the table of contents to pop up at
% the beginning of each subsection:
\AtBeginSubsection[]
{
\begin{frame}<beamer>{Outline}
\tableofcontents[currentsection,currentsubsection]
\end{frame}
}
\begin{document}
\begin{frame}
\titlepage
\end{frame}
\begin{frame}[fragile]{Intro to Lisp notation}
\begin{verbatim}
;; This is a comment
#|
and so is this
|#
'(a list of things to become data)
(list a list of things to become data)
(what-I-execute with-one-thing with-two-thing)
;; that is:
(my-fcn-name input1
             input2) ; and to auto-gen input1:
(my-fcn-name (my-fcn-name input3 input4)
             input2)
\end{verbatim}
\end{frame}
\begin{frame}[fragile]{Consider XML}
\begin{verbatim}
<car brand="honda" engine="4cyl">accord</car>
\end{verbatim}
becomes
\begin{verbatim}
; data follows keywords...
(car :brand 'honda :engine "4cyl" 'accord)
\end{verbatim}
\end{frame}
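\begin{frame}[fragile]{The s-expression is itself data}
  A minimal sketch (the variable name is made up) of treating the same
  form as an ordinary list, so the ``attributes'' can be pulled back
  out with standard list functions.  Inside a quoted form, the inner
  quotes are unnecessary:
\begin{verbatim}
(defparameter *car-form*
  '(car :brand honda :engine "4cyl" accord))

(first  *car-form*)   ; => CAR
(second *car-form*)   ; => :BRAND
(third  *car-form*)   ; => HONDA
\end{verbatim}
\end{frame}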
\section{Common Lisp Statistics}
\label{sec:CLS}
\subsection{Why Common Lisp Statistics}
\label{sec:why}
\begin{frame}{What is CLS?}
\begin{itemize}
\item A collection of Common Lisp packages
\item a component-based structure for statistical computing, allowing
  for small and specific configurations.
\item a means to drive philosophically customized data analysis, the
incorporation of computable statistical research, and the
enforcement of a structure to allow simple comparisons between
methodologies.
\item This is a ``customization'' of a sophisticated programming
  language through packages to support statistical computing, not an
  independent language. ``\`A la carte'', not ``Menu''.
\end{itemize}
\end{frame}
\begin{frame}{Current Functionality}
\begin{itemize}
\item basic dataframes (similar to R), implemented in 2 different
  approaches; a subsetting API is under development.
\item Basic regression (similar to XLispStat)
\item matrix storage both in foreign and lisp-centric storage
through lisp-matrix
\item LAPACK (a small but increasing percentage implemented); works
  across both matrix storage types.
\item static graphics (X11), including preliminary grid functionality
  based on Cairo; generation of PNG files from graphics windows
  (currently broken due to a third-party upgrade)
\item CSV file support
\item Common Lisp!
\end{itemize}
\end{frame}
\begin{frame}[fragile]{Computational Environment Supported}
\begin{itemize}
\item works on Linux, with recent SBCL versions
\item Definitely works on bleeding edge Debian (unstable).
\item Has worked, for weak definitions of ``work'', on 4 different
  people's computers (not quite, but it sort of requires a
  \verb+/home/tony/+!)
\end{itemize}
\end{frame}
\begin{frame}{Goals}{short term}
\begin{itemize}
\item Better integration of data structures with statistical routines
(auto-handling with dataframes, rather than manual parsing).
\item dataframe to model-matrix tools (leveraging old XlispStat GEE
package)
\end{itemize}
\end{frame}
\begin{frame}{Goals}{Medium/Long Term}
\begin{itemize}
\item Support for other Common Lisps
\item Cleaner front-end API to matrices and numerical algorithms
\item a constraint system for developing different statistical
  algorithms, to support programming of interactive GUIs and graphics
\item LispStat semi-compatibility (the object system works, the GUI is
  still to do)
\item Integrated invisible parallelization when more efficient
(multicore, threading, and user-space systems)
\end{itemize}
\end{frame}
\subsection{Why Lisp}
\begin{frame}{Why use Common Lisp?}
\begin{itemize}
\item Parens provide clear delineation of a \textbf{Complete
Thought} (functional programming with side effects).
\item Lisp-2 (separate function and variable namespaces, so one symbol
  can name both a function and a variable)
\item ANSI standard (built by committee, but the committee was
smart)
\item Many implementations
\item Most implementations are interactive \textbf{compiled}
languages (few are interpreted and byte-compiled).
\item The Original \emph{Programming with Data} Language
(\emph{Programs are Data} and \emph{Data are Executable} apply).
\item advanced, powerful, first-class macros (macros functionally
  rewrite code, allowing for structural clarity and complete
  destruction of syntax, should that be reasonable); a small sketch
  follows on the next slide
\end{itemize}
\end{frame}
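\begin{frame}[fragile]{A small macro sketch}
  Not part of CLS; just a minimal, self-contained illustration of the
  last point, that macros rewrite code before compilation
  (\texttt{with-timing} is a made-up name):
\begin{verbatim}
(defmacro with-timing (&body body)
  "Run BODY, reporting elapsed internal run time."
  ;; a production macro would GENSYM the START binding
  `(let ((start (get-internal-run-time)))
     (prog1 (progn ,@body)
       (format t "~&elapsed: ~a~%"
               (- (get-internal-run-time) start)))))

;; (with-timing (my-long-computation))
;; expands into the LET/PROG1 form above before compilation.
\end{verbatim}
\end{frame}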
\begin{frame}{Available Common Lisp Packages}
(They are packages, and they are called packages, not libraries. Some
people can rejoice!)
\begin{itemize}
\item infrastructure \emph{enhancements}: infix-notation, data
structures, control and flow structures
\item numerics, graphics, GUIs,
\item primitive R to CL compiler (which could also be considered an
object-code compiler for R); 3 interfaces which embed R within CL.
\item Web 2.0 support and TeX-like reporting facilities for PDF
output.
\end{itemize}
See \url{http://www.common-lisp.net/} and
\url{http://www.cliki.org/}. CLS sources can be found on
\url{http://github.com/blindglobe/}
\end{frame}
\section{CLS Works?}
\label{sec:work}
\subsection{Status}
\begin{frame}{Is it Vaporware? Not quite}
The following is possible with the help of the open-source Common Lisp
community, which provided most of the packages, tools, and glue
(Tamas Papp, Raymond Toy, Mark Hoemmen, and many, many others).
Most of the underlying code was written by others and ``composed''
by me.
\end{frame}
\subsection{Graphics}
\label{sec:work:graphics}
\begin{frame}{Silly Visualization Example}
\includegraphics[width=3in,height=3in]{./test1.eps} %png
\end{frame}
\begin{frame}[fragile]{How?}
\begin{verbatim}
(defparameter *frame2*
  (as-frame (create-xlib-image-context 200 200)
            :background-color +white+))

(bind ((#2A((f1 f2) (f3 f4))
        (split-frame *frame2*
                     (percent 50)
                     (percent 50))))
  (defparameter *f1* f1)  ; lower left
  (defparameter *f2* f2)  ; lower right    f3 f4
  (defparameter *f3* f3)  ; top left       f1 f2
  (defparameter *f4* f4)) ; top right
\end{verbatim}
\end{frame}
\begin{frame}[fragile]{Functions to Plot}
\begin{verbatim}
(plot-function *f1* #'sin
               (interval-of 0 2)
               :x-title "x" :y-title "sin(x)")
(plot-function *f2* #'cos (interval-of 0 2)
               :x-title "x" :y-title "cos(x)")
(plot-function *f3* #'tan (interval-of 0 2)
               :x-title "x" :y-title "tan(x)")
\end{verbatim}
\end{frame}
\begin{frame}[fragile]{Things to Plot}
\small{
\begin{verbatim}
(let* ((n 500)
       (xs (num-sequence
            :from 0 :to 10 :length n))
       (ys (map 'vector
                #'(lambda (x) (+ x 8 (random 4.0)))
                xs))
       (weights
        (replicate #'(lambda () (1+ (random 10)))
                   n 'fixnum))
       (da (plot-simple *f4*
                        (interval-of 0 10)
                        (interval-of 10 20)
                        :x-title "x" :y-title "y")))
  (draw-symbols da xs ys :weights weights))
\end{verbatim}
}
\end{frame}
\begin{frame}[fragile]{Copying existing graphics}
And we generated the figure on the first page by:
\begin{verbatim}
(xlib-image-context-to-png
(context *f1*)
"/home/tony/test1.png")
\end{verbatim}
\end{frame}
\subsection{Statistical Models}
\label{sec:work:statmod}
\begin{frame}[fragile]{Linear Regression}
\small{
\begin{verbatim}
;; Worse than LispStat, wrapping LAPACK's dgelsy:
(defparameter *result1*
  (lm (list->vector-like iron)
      (list->vector-like absorbtion)))

*result1* =>
((#<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
    -11.504913191235342
    0.23525771181009483>
  2)
 #<LA-SIMPLE-MATRIX-DOUBLE 2 x 2
   9.730392177126686e-6 -0.001513787114206932
   -0.001513787114206932 0.30357851215706255>
 13 2)
\end{verbatim}
}
\end{frame}
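\begin{frame}[fragile]{Poking at the result}
  A hedged sketch, reading positions straight off the printed structure
  on the previous slide (not a documented accessor API; the names are
  made up):
\begin{verbatim}
(defparameter *coefs*
  (first (first *result1*)))   ; the 2 x 1 vector shown above
(defparameter *some-matrix*
  (second *result1*))          ; the 2 x 2 matrix shown above
\end{verbatim}
\end{frame}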
\subsection{Data Manip/Mgmt}
\label{sec:work:data}
\begin{frame}[fragile]{DataFrames}
\small{
\begin{verbatim}
(defparameter *my-df-1*
  (make-instance 'dataframe-array
    :storage #2A((1 2 3 4 5) (10 20 30 40 50))
    :doc "This is a boring dataframe-array"
    :case-labels (list "x" "y")
    :var-labels (list "a" "b" "c" "d" "e")))

(xref *my-df-1* 0 0)              ; API change in progress
(setf (xref *my-df-1* 0 0) -1d0)
\end{verbatim}
}
\end{frame}
\begin{frame}[fragile]{Numerical Matrices}
\small{
\begin{verbatim}
(defparameter *mat-1*
  (make-matrix 3 3
    :initial-contents #2A((2d0 3d0 -4d0)
                          (3d0 2d0 -4d0)
                          (4d0 4d0 -5d0))))

(xref *mat-1* 2 0)                ; => 4d0 ; API change
(setf (xref *mat-1* 2 0) -4d0)

(defparameter *xv*
  (make-vector 4 :type :row
    :initial-contents '((1d0 3d0 2d0 4d0))))
\end{verbatim}
}
\end{frame}
\begin{frame}[fragile]{Macros make the above tolerable}
\begin{verbatim}
(defparameter *xv*
  (make-vector 4 :type :row
    :initial-contents '((1d0 3d0 2d0 4d0))))

;; can use DEFMACRO for the following syntax:
(make-row-vector *xv* '((1d0 3d0 2d0 4d0)))
;; or reader macros for the following:
#mrv(*xv* '((1d0 3d0 2d0 4d0)))
\end{verbatim}
\end{frame}
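\begin{frame}[fragile]{Macros make the above tolerable}{a possible expansion}
  A hedged sketch of how such a \texttt{make-row-vector} macro might be
  written; this is a hypothetical definition for illustration, not a
  CLS export:
\begin{verbatim}
(defmacro make-row-vector (name contents)
  "Define NAME as a row vector holding CONTENTS."
  `(defparameter ,name
     (make-vector (length (first ,contents))
                  :type :row
                  :initial-contents ,contents)))

;; (make-row-vector *xv* '((1d0 3d0 2d0 4d0)))
;; expands to the DEFPARAMETER / MAKE-VECTOR form
;; on the previous slide.
\end{verbatim}
\end{frame}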
\section{Discussion}
\subsection{Conclusions and Outlook}
\begin{frame}{Conclusion}
This slowly developing research program aims at a statistical
computing system that enables sophisticated statistical research,
can be readily transferred to applications, and is supportable.
Related numerical/statistical projects:
\begin{itemize}
\item Incanter : R/LispStat/Omegahat-like system for Clojure (Lisp
on the JVM)
\item FEMLisp : system/workshop for finite-element analysis modeling
using Lisp
\item matlisp/LispLab : LAPACK-based numerical linear algebra packages
\item GSLL : GNU Scientific Library, Lisp interface.
\item RCL, RCLG, CLSR (embedding R within Common Lisp)
\end{itemize}
\end{frame}
\begin{frame}{Why not use R?}
\begin{itemize}
\item the R programming language is incomplete and under constant
  redefinition; Common Lisp has been standardized for many years, with
  many implementations
\item Application delivery can be tough
\item Without parens, Common Lisp could be R (interactive, or batch,
or through ``compiled applications'').
\item R is the Microsoft of statistical computing.
\item many ``warts'' that R has cannot be fixed, due to sizeable user
  populations and heavy-weight vested interests.
\item Evolutionary development of R requires strawmen (other
  systems) to compete against.
\end{itemize}
\end{frame}
\begin{frame}{What can you do to follow up?}
\begin{itemize}
\item Learn Common Lisp, by trying it out and reading:
  \begin{itemize}
  \item Introduction to Common Lisp: Paul Graham's ANSI Common
    Lisp, an enjoyable book with a boring title, and the best intro to
    S4 classes around.
  \item Practical Common Lisp, by Peter Seibel
  \end{itemize}
\item Get the packages from \url{http://github.com/blindglobe}
\item Subscribe to the mailing list
\end{itemize}
The next stage of reproducible research will require computable
statistics (code that explains itself and can be parsed to generate
knowledge about its claims; ``XML's promise'').
\end{frame}
\begin{frame}{Thank you for listening!}
\end{frame}
\section{What ought to be coming}
\subsection{Theory}
\begin{frame}[fragile]{Example 1: Theory\ldots}
\label{example1}
Let $f(x;\theta)$ describe the likelihood of XX under the following
assumptions.
\begin{enumerate}
\item assumption-1
\item assumption-2
\end{enumerate}
Then if we use the following algorithm:
\begin{enumerate}
\item step-1
\item step-2
\end{enumerate}
then $\hat{\theta}$ should be $N(0,\hat\sigma^2)$ with the following
characteristics\ldots
\end{frame}
\begin{frame}
\frametitle{Can we compute, using this description?}
Given the information at hand:
\begin{itemize}
\item we ought to have a framework for initial coding for the
actual simulations (test-first!)
\item the implementation is somewhat clear
\item We should ask: what theorems have similar assumptions?
\item We should ask: what theorems have similar conclusions but
different assumptions?
\end{itemize}
\end{frame}
\begin{frame}[fragile]{Realizing Theory}
\small{
\begin{verbatim}
(define-theorem my-proposed-theorem
  (:theorem-type '(distribution-properties
                   frequentist likelihood))
  (:assumes '(assumption-1 assumption-2))
  (:likelihood-form
   (defun likelihood (data theta gamma)
     (exponential-family theta gamma)))
  (:compute-by
   '(progn
      (compute-start-values thetahat gammahat)
      (until (convergence)
        (setf convergence
              (or (step-1 thetahat)
                  (step-2 gammahat))))))
  (:claim (equal-distr '(thetahat gammahat) 'normal)))
\end{verbatim}
}
\end{frame}
\begin{frame}[fragile]{It would be nice to have}
\begin{verbatim}
(theorem-veracity 'my-proposed-theorem)
\end{verbatim}
returning some indication of how well the computable claims were met,
modulo what proportion of those claims could actually be tested,
\begin{itemize}
\item and have it run some illustrative simulations which suggest
  settings that might be problematic in practice, as well as settings
  for which there are no problems,
\item and work through some of the logic, based on related claims with
  identical assumptions, to confirm some of the results.
\end{itemize}
\end{frame}
\begin{frame}[fragile]{and why not...?}
\begin{verbatim}
(when (> (theorem-veracity
          'my-proposed-theorem)
         0.8)
  (make-draft-paper 'my-proposed-theorem
                    :style :JASA
                    :output-formats
                    '(LaTeX MSWord)))
\end{verbatim}
\end{frame}
\begin{frame}{Comments}
\begin{itemize}
\item Of course the general problem is very difficult, but one must
start somewhere.
\item Starting place: basic statistical proofs of concept (in
  progress): t-test, linear regression (LS-based, Normal-Normal
  Bayesian)
\item Areas targeted for the medium-term future: resampling methods
  and similar algorithms.
\end{itemize}
\end{frame}
\subsection{Practice and Application}
\begin{frame}
\frametitle{Example 2: Practice\ldots}
\label{example2}
The dataset comes from a series of clinical trials, some with active
control and others using placebo control. We model the primary
endpoint, ``relief'', as a binary random variable. There is a
random trial effect on relief, as well as on severity, due to
differences in recruitment and inclusion/exclusion criteria across 2
different trial networks.
\end{frame}
\begin{frame}
\frametitle{Can we compute, using this description?}
\begin{itemize}
\item With a real description of this kind, it is clear what some of
  the potential models for this dataset might be
\item It should be clear how to start thinking of a data dictionary
for this problem.
\end{itemize}
\end{frame}
\begin{frame}[fragile]{Can we compute?}
\begin{verbatim}
(dataset-metadata paper-1
  :context 'clinical-trial 'randomized
           'active-ctrl 'placebo-ctrl 'metaanalysis
  :variables '((relief :model-type dependent
                       :distr binary)
               (trial :model-type independent
                      :distr categorical)
               (disease-severity))
  :metadata '(incl-crit-net1 excl-crit-net1
              incl-crit-net2 excl-crit-net2
              recr-rate-net1 recr-rate-net2))

(propose-analysis paper-1)
; => (list 'tables '(logistic-regression))
\end{verbatim}
\end{frame}
\subsection{Round-trip translation}
\begin{frame}{Example 3: The Round-trip\ldots}
\label{example3}
The first examples describe ``ideas $\rightarrow$ code''.
Consider the last time you read someone else's implementation of a
statistical procedure (e.g.\ R package code). When you read the
code, could you see:
\begin{itemize}
\item the assumptions used?
\item the algorithm implemented?
\item practical guidance for when you might select the algorithm
over others?
\item practical guidance for when you might select the
implementation over others?
\end{itemize}
These are usually components of any reasonable journal article.
\textit{(Q: have you actually read an R package that wasn't yours?)}
\end{frame}
\end{document}