From b0dc9018b1c0d02103ba06c976cce8df9351dfd4 Mon Sep 17 00:00:00 2001
From: Madhu Akula <madhu.akula@hotmail.com>
Date: Tue, 6 Sep 2016 14:45:56 +0530
Subject: [PATCH] added the scripts from data-science-toolbox

---
 scripts/Rio          | 135 +++++++++++++++++++++++++++++++++++++++++++
 scripts/Rio-mds      |   2 +
 scripts/Rio-pca      |   2 +
 scripts/Rio-scatter  |  17 ++++++
 scripts/arff2csv     |   2 +
 scripts/body         |  15 +++++
 scripts/cols         |  34 +++++++++++
 scripts/csv2arff     |   2 +
 scripts/csv2vw       | 134 ++++++++++++++++++++++++++++++++++++++++++
 scripts/drake        |   2 +
 scripts/dseq         |  17 ++++++
 scripts/dumbplot     |  83 ++++++++++++++++++++++++++
 scripts/explain      |  25 ++++++++
 scripts/header       |  92 +++++++++++++++++++++++++++++
 scripts/pbc          |  10 ++++
 scripts/sample       |  94 ++++++++++++++++++++++++++++++
 scripts/scrape       |  70 ++++++++++++++++++++++
 scripts/servewd      |   3 +
 scripts/unpack       |  44 ++++++++++++++
 scripts/weka         |  65 +++++++++++++++++++++
 scripts/weka-cluster |  11 ++++
 21 files changed, 859 insertions(+)
 create mode 100755 scripts/Rio
 create mode 100755 scripts/Rio-mds
 create mode 100755 scripts/Rio-pca
 create mode 100755 scripts/Rio-scatter
 create mode 100755 scripts/arff2csv
 create mode 100755 scripts/body
 create mode 100755 scripts/cols
 create mode 100755 scripts/csv2arff
 create mode 100755 scripts/csv2vw
 create mode 100755 scripts/drake
 create mode 100755 scripts/dseq
 create mode 100755 scripts/dumbplot
 create mode 100755 scripts/explain
 create mode 100755 scripts/header
 create mode 100755 scripts/pbc
 create mode 100755 scripts/sample
 create mode 100755 scripts/scrape
 create mode 100755 scripts/servewd
 create mode 100755 scripts/unpack
 create mode 100755 scripts/weka
 create mode 100755 scripts/weka-cluster

diff --git a/scripts/Rio b/scripts/Rio
new file mode 100755
index 0000000..badf4bd
--- /dev/null
+++ b/scripts/Rio
@@ -0,0 +1,135 @@
+#!/bin/bash
+# Rio: Load CSV from stdin into R as a data.frame, execute given commands, and get the output as CSV or PNG on stdout
+#
+# Example usage:
+# $ seq 100 | Rio -nf sum (same as Rio -ne 'sum(df)')
+#
+# $ curl -s 'https://raw.githubusercontent.com/pydata/pandas/master/pandas/tests/data/iris.csv' > iris.csv
+# $ < iris.csv Rio -e 'df$SepalLength^2'
+# $ < iris.csv Rio -f summary
+# $ < iris.csv Rio -se 'sqldf("select Name from df where df.SepalLength > 7")'
+# $ < iris.csv Rio -ge 'g+geom_point(aes(x=SepalLength,y=SepalWidth,colour=Name))' > iris.png
+#
+# Dependency: R (with optionally the R packages ggplot2, dplyr, tidyr, and sqldf)
+#
+# Author: http://jeroenjanssens.com
+# usage: print the option summary for Rio to stdout.
+usage() {
+cat << EOF
+Rio: Load CSV from stdin into R as a data.frame, execute given commands, and get the output as CSV on stdout
+
+usage: Rio OPTIONS
+
+OPTIONS:
+   -d      Delimiter
+   -e      Commands to execute
+   -f      Single command to execute on data.frame
+   -h      Show this message
+   -g      Import ggplot2
+   -n      CSV has no header
+   -r      Import dplyr and tidyr
+   -s      Import sqldf
+   -b      Use same settings as used for book Data Science at the Command Line
+   -v      Verbose
+
+EOF
+}
+
+finish() {
+	# Remove the temp files plus the bare placeholders mktemp created before
+	# the .png/.err suffixes were appended to their names.
+	rm -f -- "$IN" "$OUT" "${OUT%.png}" "${ERR%.err}"
+	# The error file is deleted only when it is empty.
+	if [[ ! -s "$ERR" ]]; then
+		rm -f -- "$ERR"
+	fi
+	rm -f Rplots.pdf
+}
+
+trap finish EXIT
+# callR: read $IN into df, run $REQUIRES and $SCRIPT, then write .Last.value to $OUT (data.frame/matrix as CSV, vector one item per line, ggplot via ggsave, anything else via print).
+callR() {
+	Rscript --vanilla -e "options(scipen=999);df<-read.csv('${IN}',header=${HEADER},sep='${DELIMITER}',stringsAsFactors=F);${REQUIRES}${SCRIPT}last<-.Last.value;if(is.matrix(last)){last<-as.data.frame(last)};if(is.data.frame(last)){write.table(last,'${OUT}',sep=',',quote=T,qmethod='double',row.names=F,col.names=${HEADER});}else if(is.vector(last)){cat(last,sep='\\\n', file='${OUT}')}else if(exists('is.ggplot')&&is.ggplot(last)){ggsave('${OUT}',last,dpi=${RIO_DPI-72},units='cm',width=20,height=15);}else{sink('${OUT}');print(last);}"
+}
+
+SCRIPT=
+REQUIRES=
+DELIMITER=","
+HEADER="T"
+VERBOSE=false
+
+# OSX `mktemp' requires a temp file template, but Linux `mktemp' has it as optional.
+# This explicitly uses a template, which works for both.  If $TMPDIR isn't set as
+# an environment variable, fall back to /tmp (assumes it exists).
+: "${TMPDIR:=/tmp}"
+# Strip a trailing slash so the template never contains '//'; quote for paths with spaces.
+TMP_BASE="${TMPDIR%/}"
+IN=$(mktemp "${TMP_BASE}/Rio-XXXXXXXX")
+OUT=$(mktemp "${TMP_BASE}/Rio-XXXXXXXX").png
+ERR=$(mktemp "${TMP_BASE}/Rio-XXXXXXXX").err
+
+while getopts "d:hgnprsve:f:b" OPTION
+do
+	case $OPTION in
+		b)
+			RIO_DPI=300
+			;;
+		d)
+			DELIMITER=$OPTARG
+			;;
+		e)
+			SCRIPT=$OPTARG
+			# Make sure the commands end in ';' (callR appends more R code right after them).
+			if [[ ! "$SCRIPT" =~ \;[[:space:]]*$ ]]; then
+				SCRIPT="${SCRIPT};"
+			fi
+			;;
+		f)
+			SCRIPT="${OPTARG}(df);"
+			;;
+		h)
+			usage
+			exit 1
+			;;
+		g)
+			REQUIRES="${REQUIRES}require(ggplot2);g<-ggplot(df);"
+			;;
+		n)
+			HEADER="F"
+			;;
+		r)
+			REQUIRES="${REQUIRES}require(dplyr);require(tidyr);"
+			;;
+		s)
+			REQUIRES="${REQUIRES}require(sqldf);"
+			;;
+		v)
+			VERBOSE=true
+			;;
+		?)
+			usage
+			exit
+			;;
+	esac
+done
+
+cat > "$IN"
+
+if $VERBOSE; then
+	callR
+else
+	callR > "$ERR" 2>&1
+fi
+# No output file, or a bare NULL result: show what R reported instead.
+if [[ ! -f "$OUT" ]]; then
+	cat "$ERR"
+else
+	# tr drops NUL bytes (PNG output) that bash would otherwise warn about in $(...).
+	RESULT="$(tr -d '\0' < "$OUT")"
+	if [ "$RESULT" == "NULL" ]; then
+		cat "$ERR"
+	else
+		cat "$OUT"
+	fi
+fi
+
diff --git a/scripts/Rio-mds b/scripts/Rio-mds
new file mode 100755
index 0000000..6ab1a10
--- /dev/null
+++ b/scripts/Rio-mds
@@ -0,0 +1,2 @@
+#!/bin/bash
+Rio -e 'n<-sapply(df,is.numeric);fit<-cmdscale(dist(df[n]),eig=TRUE,k=2);points<-as.data.frame(fit$points);cbind(points,df[!n])' # distances use the numeric columns only
diff --git a/scripts/Rio-pca b/scripts/Rio-pca
new file mode 100755
index 0000000..68f1c4d
--- /dev/null
+++ b/scripts/Rio-pca
@@ -0,0 +1,2 @@
+#!/bin/bash
+Rio -e 'num<-sapply(df,is.numeric);pcs<-as.data.frame(prcomp(df[num],scale.=TRUE)$x);cbind(pcs,df[!num])' # scaled PCA scores of the numeric columns, other columns appended
diff --git a/scripts/Rio-scatter b/scripts/Rio-scatter
new file mode 100755
index 0000000..5ad40fc
--- /dev/null
+++ b/scripts/Rio-scatter
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+# Rio-scatter: create scatter plot from CSV
+# 
+# Default colour is 1 (blue)
+#
+# Example usage:
+# curl 'https://raw.githubusercontent.com/pydata/pandas/master/pandas/tests/data/iris.csv' | Rio-scatter SepalLength SepalWidth Name | display
+#
+# Dependency: Rio
+#
+# Author: Jeroen Janssens (http://jeroenjanssens.com)
+# X and Y are mandatory column names; abort with a usage hint instead of handing an empty aes() to R.
+X="${1:?usage: Rio-scatter X Y [COLOR]}"
+Y="${2:?usage: Rio-scatter X Y [COLOR]}"
+COLOR="${3:-1}"
+Rio -ge "g+geom_point(aes(x=${X},y=${Y},color=${COLOR}))"
diff --git a/scripts/arff2csv b/scripts/arff2csv
new file mode 100755
index 0000000..32f00e6
--- /dev/null
+++ b/scripts/arff2csv
@@ -0,0 +1,2 @@
+#!/usr/bin/env bash
+exec weka core.converters.CSVSaver -i /dev/stdin # hand stdin to Weka's CSVSaver
diff --git a/scripts/body b/scripts/body
new file mode 100755
index 0000000..7b106a9
--- /dev/null
+++ b/scripts/body
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+#
+# body: apply expression to all but the first line.
+# Use multiple times in case the header spans more than one line.
+# One argument is eval'ed as a shell expression; several are run as-is, quoting preserved.
+#
+# Example usage:
+# $ seq 10 | header -a 'values' | body sort -nr
+# $ seq 10 | header -a 'multi\nline\nheader' | body body body sort -nr
+#
+# From: http://unix.stackexchange.com/a/11859
+# See also: header (https://github.com/jeroenjanssens/command-line-tools-for-data-science)
+IFS= read -r header
+printf '%s\n' "$header"
+if (( $# == 1 )); then eval "$1"; else "$@"; fi
diff --git a/scripts/cols b/scripts/cols
new file mode 100755
index 0000000..6f68153
--- /dev/null
+++ b/scripts/cols
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+# cols: apply a command to a subset of the columns and merge back with the remaining columns.
+#
+# Assumes that the input data is comma-delimited and that it has a header.
+# Depends on csvcut, which is part of csvkit: http://csvkit.readthedocs.org
+# 
+# Example usage 1: reverse sort column 'a'
+# $ echo 'a,b\n1,2\n3,4\n5,6' | cols -c a body sort -nr
+#
+# Example usage 2: apply PCA (using tapkee) to all numerical features (-C selects all but the specified columns) of the Iris data set:
+# $ < iris.csv cols -C species body tapkee --method pca | header -r x,y,species
+# 
+# See also: header and body (https://github.com/jeroenjanssens/command-line-tools-for-data-science)
+#
+# Author: http://jeroenjanssens.com
+
+ARG="$1"
+ARG_INV="$(tr cC Cc <<< "${ARG}")"
+shift
+COLSPEC="$1"  # not COLUMNS: bash may overwrite that variable with the terminal width
+shift
+EXPR="$*"
+
+finish() {
+	rm -f -- "$OTHER_COLUMNS"
+}
+trap finish EXIT
+
+# Buffer stdin completely before cutting: reading the file while `tee` was still
+# writing it could hand the second csvcut a truncated copy of the input.
+OTHER_COLUMNS=$(mktemp "${TMPDIR:-/tmp}/cols-XXXXXXXX")
+cat > "$OTHER_COLUMNS"
+
+csvcut "$ARG" "$COLSPEC" "$OTHER_COLUMNS" | eval "${EXPR}" | paste -d, - <(csvcut "${ARG_INV}" "$COLSPEC" "$OTHER_COLUMNS")
diff --git a/scripts/csv2arff b/scripts/csv2arff
new file mode 100755
index 0000000..ac3f365
--- /dev/null
+++ b/scripts/csv2arff
@@ -0,0 +1,2 @@
+#!/usr/bin/env bash
+exec weka core.converters.CSVLoader /dev/stdin # hand stdin to Weka's CSVLoader
diff --git a/scripts/csv2vw b/scripts/csv2vw
new file mode 100755
index 0000000..e929b76
--- /dev/null
+++ b/scripts/csv2vw
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+
+import argparse
+import csv
+from sys import stdin, stdout, stderr, exit
+import itertools
+
+
+def main():
+    """Parse options, then write each CSV row to stdout as a Vowpal Wabbit example line."""
+    parser = argparse.ArgumentParser(
+        epilog="""If both --classes and --auto-relabel are omitted,
+        label values are left as-is. By default, features with value 0 are not
+        printed. This can be overridden with --null""",
+        usage="""%(prog)s [OPTION]... [FILE]
+
+Convert CSV to Vowpal Wabbit input format.
+
+  Examples:
+
+  # Leave label values as is:
+  $ csv2vw spam.csv --label target
+
+  # Relabel values 'ham' to 0 and 'spam' to 1:
+  $ csv2vw spam.csv --label target --classes ham,spam
+
+  # Relabel values 'ham' to -1 and 'spam' to +1 (needed for logistic loss):
+  $ csv2vw spam.csv --label target --classes ham,spam --minus-plus-one
+
+  # Relabel first label value to 0, second to 1, and ignore the rest:
+  $ csv2vw iris.csv -lspecies --auto-relabel --ignore-extra-classes
+
+  # Relabel first label value to 1, second to 2, and so on:
+  $ <iris.csv csv2vw -lspecies --multiclass --auto-relabel
+
+  # Relabel 'versicolor' to 1, 'virginica' to 2, and 'setosa' to 3
+  $ <iris.csv csv2vw -lspecies --multiclass -cversicolor,virginica,setosa""")
+
+    parser.add_argument("file", nargs="?", type=argparse.FileType("r"),
+                        default=stdin,
+                        help="""Input CSV file. If omitted,
+                        read from standard input.""",
+                        metavar="FILE")
+    parser.add_argument("-d", "--delimiter",
+                        help="""Delimiting character of the input CSV file
+                        (default: ,).""",
+                        default=",")
+    parser.add_argument("-l", "--label",
+                        help="""Name of column that contains the class
+                        labels.""")
+    parser.add_argument("-c", "--classes",
+                        help="""Ordered, comma-separated list of possible
+                        class labels to relabel. If not specifying all possible
+                        class labels, use --auto-relabel.""",
+                        nargs="?")
+    parser.add_argument("-n", "--null",
+                        help="""Comma-separated list of null values (default:
+                        '0').""",
+                        nargs="?", default="0")
+    parser.add_argument("-a", "--auto-relabel",
+                        help="""Automatically relabel class labels in the order
+                        in which they appear in the CSV file.""",
+                        action="store_true")
+    parser.add_argument("-m", "--multiclass",
+                        help="""Indicates more than two classes; will start
+                        counting at 1 instead of 0.""",
+                        action="store_true")
+    parser.add_argument("-+", "--minus-plus-one",
+                        help="""Instead of relabeling to integers, relabel to
+                        '-1' and '+1'. Needed when using VW with logistic or
+                        hinge loss.""", action="store_true")
+    parser.add_argument("-i", "--ignore-extra-classes",
+                        help="""If there are more than two classes found, when
+                        not using --multiclass, include the example with no
+                        label instead of skipping it.""",
+                        action="store_true")
+    parser.add_argument("-t", "--tag", help="Name of column that contains the tags.")
+
+    args = parser.parse_args()
+
+    auto_relabel = args.auto_relabel
+    label_column = args.label
+    tag_column = args.tag
+    # A bare --null (no value) yields None: treat it as "no null values", i.e. print every feature.
+    null_values = args.null.split(",") if args.null is not None else []
+    multiclass = args.multiclass
+    minus_plus_one = args.minus_plus_one
+    if minus_plus_one:
+        new_classes = iter(["-1", "+1"])
+    elif multiclass:
+        new_classes = (str(i) for i in itertools.count(1))
+    elif args.classes or auto_relabel:
+        new_classes = iter(["0", "1"])
+    else:
+        new_classes = None
+
+    if args.classes:
+        old_classes = args.classes.split(",")
+        relabel = dict(zip(old_classes, new_classes))
+    else:
+        relabel = dict()
+
+    reader = csv.DictReader(args.file, delimiter=args.delimiter)
+    try:
+        for row in reader:
+            label = row.pop(label_column, "")
+            tag = row.pop(tag_column, "")
+
+            if auto_relabel or new_classes:
+                if auto_relabel:
+                    if label not in relabel:
+                        try:
+                            relabel[label] = next(new_classes)
+                        except StopIteration:
+                            if args.ignore_extra_classes:
+                                relabel[label] = ""
+                            else:
+                                stderr.write("Found too many different classes;"
+                                             " skipping example. Use "
+                                             "--multiclass or "
+                                             "--ignore-extra-classes.\n")
+                                continue
+                label = relabel[label]
+
+            # Drop cells DictReader could not pair up (None key or None value).
+            cells = sorted((k, v) for k, v in row.items() if k is not None and v is not None)
+            features = " ".join([k + ":" + v for k, v in cells if v not in null_values])
+            stdout.write(label + " " + tag + "| " + features + "\n")
+            stdout.flush()
+    except (IOError, KeyboardInterrupt, BrokenPipeError):
+        stderr.close()
+
+if __name__ == "__main__":
+    exit(main())
diff --git a/scripts/drake b/scripts/drake
new file mode 100755
index 0000000..4179c90
--- /dev/null
+++ b/scripts/drake
@@ -0,0 +1,2 @@
+#!/bin/bash
+drip -cp "${DRAKEPATH}/drake.jar" drake.core "$@" # quoted so a DRAKEPATH with spaces stays one argument
diff --git a/scripts/dseq b/scripts/dseq
new file mode 100755
index 0000000..a25941a
--- /dev/null
+++ b/scripts/dseq
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+# dseq: generate sequence of dates relative to today.
+#
+# Usage: dseq LAST
+#    or: dseq FIRST LAST
+#    or: dseq FIRST INCREMENT LAST
+#
+# Example usage:
+# $ dseq 1       # tomorrow
+# $ dseq 0 0     # today
+# $ dseq 7       # next 7 days
+# $ dseq -2 0    # day before yesterday till today
+# $ dseq 1 7 365 # tomorrow and then every week for a year
+#
+# Author: Jeroen Janssens
+# The arguments go to seq unchanged; each number N is printed as "N day" and fed to date --file, which prints it as YYYY-MM-DD.
+seq -f "%g day" "$@" | date --file - +%F
diff --git a/scripts/dumbplot b/scripts/dumbplot
new file mode 100755
index 0000000..caf1437
--- /dev/null
+++ b/scripts/dumbplot
@@ -0,0 +1,83 @@
+#!/bin/bash
+# dumbplot: Output plot on the terminal given list of X,Y coordinates.
+#   Can either be a scatter plot or a bar (or bar-like) plot that
+#   assumes that the y-coordinate is numeric and just displays the
+#   x-coordinate data as-is, and in the order that it is fed to the
+#   scripts.
+#
+# Dependency: gnuplot
+#
+# Author: http://jeroenjanssens.com
+# usage: print the option summary and examples for dumbplot to stdout.
+usage () {
+cat << EOF
+dumbplot: Use gnuplot to quickly get ascii plot of x-y data.
+
+usage: dumbplot OPTIONS
+
+OPTIONS:
+  -a      As-is.  Use the x-coord (first coord) as categorical data for plotting.
+  -b      Boxplot.  Use vertical boxes rather than just a marker.  Autosets -a switch.
+  -w      Terminal width do use.  Defaults to actual terminal width.
+  -v      Terminal height to use.  Defaults to terminal height / 2.
+  -h      Show this message
+
+Example usage:
+    $ paste -d, <(echo 1 2 4 8 9 | tr ' ' '\n') <(echo 1 2 4 2 1 | tr ' ' '\n') | dumbplot
+    $ paste -d, <(echo a b c d e | tr ' ' '\n') <(echo 1 2 4 10 6 | tr ' ' '\n') | dumbplot -a
+    $ paste -d, <(echo a b c d e | tr ' ' '\n') <(echo 1 2 4 10 6 | tr ' ' '\n') | dumbplot -b
+    $ paste -d, <(echo a b c d e | tr ' ' '\n') <(echo 1 2 4 10 6 | tr ' ' '\n') | dumbplot -b -w 80 -v 22
+
+EOF
+}
+
+
+
+
+ASIS=
+BARCHART=
+# Default to the terminal size; fall back to 80x24 if tput cannot report one.
+PWIDTH=$(tput cols) || PWIDTH=80
+PHEIGHT=$(tput lines) || PHEIGHT=24
+PHEIGHT=$(( PHEIGHT / 2 ))
+
+while getopts "abw:v:h" OPTION
+do
+	case $OPTION in
+		a)
+			ASIS=1
+			;;
+		b)
+			ASIS=1
+			BARCHART=1
+			;;
+		w)
+			PWIDTH=$OPTARG
+			;;
+		v)
+			PHEIGHT=$OPTARG
+			;;
+		h)
+			usage
+			exit 1
+			;;
+	esac
+done
+
+
+
+
+## Decision logic to execute right gnuplot command
+if [[ -n "$ASIS" ]]; then
+    if [[ -n "$BARCHART" ]]; then
+        # Categorical data, uses boxes in plot
+        nl -s, -f nl | gnuplot -e "set term dumb size $PWIDTH,$PHEIGHT" -e 'set datafile separator ","; set nokey;  plot "-" using 1:3:xtic(2) with boxes'
+    else
+        # Categorical data, do NOT use boxes
+        nl -s, -f nl | gnuplot -e "set term dumb size $PWIDTH,$PHEIGHT" -e 'set datafile separator ","; set nokey;  plot "-" using 1:3:xtic(2)'
+    fi
+else
+    # Scatterplot of data
+    gnuplot -e "set term dumb size $PWIDTH,$PHEIGHT" -e 'set datafile separator ","; set nokey;  plot "-"'
+fi
+
diff --git a/scripts/explain b/scripts/explain
new file mode 100755
index 0000000..1bb1be2
--- /dev/null
+++ b/scripts/explain
@@ -0,0 +1,25 @@
+#!/bin/bash
+# explain: Command-line wrapper for explainshell.com
+# 
+# Example usage: explain tar xzvf 
+#
+# Dependency: scrape (from: https://github.com/jeroenjanssens/data-science-toolbox)
+#
+# Author: http://jeroenjanssens.com
+
+
+COMMAND="$*"
+URL="http://explainshell.com/explain"
+SYSTEM=$(uname)
+# GNU sed (Linux) spells extended regexes -r, BSD sed (Darwin) spells them -E.
+case "$SYSTEM" in
+  Linux)  SED_ERE=-r ;;
+  Darwin) SED_ERE=-E ;;
+  *)      exit 0 ;;  # any other system was silently a no-op before; keep that
+esac
+# -G plus --data-urlencode sends the command percent-encoded, so spaces, '&' or '|' no longer break the URL.
+curl -s -G --data-urlencode "cmd=${COMMAND}" "${URL}" | scrape -e 'span.dropdown > a, pre' | sed "${SED_ERE}" -e 's/<(\/?)[^>]*>//g'
+
+
+
+
diff --git a/scripts/header b/scripts/header
new file mode 100755
index 0000000..add268e
--- /dev/null
+++ b/scripts/header
@@ -0,0 +1,92 @@
+#!/usr/bin/env bash
+# header: add, replace, and delete header lines.
+# 
+# Example usage:
+# $ seq 10 | header -a 'values'
+# $ seq 10 | header -a 'VALUES' | header -e 'tr "[:upper:]" "[:lower:]"'
+# $ seq 10 | header -a 'values' | header -d
+# $ seq 10 | header -a 'multi\nline' | header -n 2 -e "paste -sd_"
+#
+# See also: body (https://github.com/jeroenjanssens/command-line-tools-for-data-science)
+#
+# Author: http://jeroenjanssens.com
+# usage: print the option summary and examples for header to stdout.
+usage () {
+cat << EOF
+header: add, replace, and delete header lines.
+
+usage: header OPTIONS
+
+OPTIONS:
+  -n      Number of lines to consider as header [default: 1]
+  -a      Add header
+  -r      Replace header
+  -e      Apply expression to header
+  -d      Delete header
+  -h      Show this message
+
+Example usage:
+  $ seq 10 | header -a 'values'
+  $ seq 10 | header -a 'VALUES' | header -e 'tr "[:upper:]" "[:lower:]"'
+  $ seq 10 | header -a 'values' | header -d
+  $ seq 10 | header -a 'multi\nline' | header -n 2 -e "paste -sd_"
+
+See also: body
+EOF
+}
+
+get_header () {
+	for ((i = 0; i < NUMROWS; i++)); do
+		IFS= read -r LINE
+		OLDHEADER+="${LINE}"$'\n'  # real newline, so the old header can be printed verbatim later
+	done
+}
+
+print_header () {
+	echo -ne "$1"  # -e on purpose: expands \n etc. in header text given on the command line
+}
+
+print_body () {
+	cat
+}
+
+OLDHEADER=
+NUMROWS=1
+
+while getopts "dn:ha:r:e:" OPTION; do
+	case $OPTION in
+		n)
+			NUMROWS=$OPTARG
+			;;
+		a)
+			print_header "$OPTARG\n"
+			print_body
+			exit 0
+			;;
+		d)
+			get_header
+			print_body
+			exit 0
+			;;
+		r)
+			get_header
+			print_header "$OPTARG\n"
+			print_body
+			exit 0
+			;;
+		e)
+			get_header
+			# printf '%s' keeps backslashes and whitespace of the existing header intact.
+			printf '%s\n' "$(printf '%s' "$OLDHEADER" | eval "$OPTARG")"
+			print_body
+			exit 0
+			;;
+		h)
+			usage
+			exit 1
+			;;
+	esac
+done
+
+get_header
+printf '%s' "${OLDHEADER}"
diff --git a/scripts/pbc b/scripts/pbc
new file mode 100755
index 0000000..6aaee21
--- /dev/null
+++ b/scripts/pbc
@@ -0,0 +1,10 @@
+#!/bin/bash
+# pbc: parallel bc. First column of input CSV is mapped to {1}, second to {2}, and so forth.
+#
+# Example usage: paste -d, <(seq 100) <(seq 100 -1 1) | ./pbc 'sqrt({1}*{2})'
+#
+# Dependency: GNU parallel
+#
+# Author: http://jeroenjanssens.com
+# NOTE(review): $1 is pasted inside single quotes, so an expression containing ' would break the generated command.
+parallel -C, -k -j100% "echo '$1' | bc -l"
diff --git a/scripts/sample b/scripts/sample
new file mode 100755
index 0000000..50c72c3
--- /dev/null
+++ b/scripts/sample
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+# sample: Output lines from stdin to stdout with a given probability,
+# for a given duration, and with a given delay between lines.
+# 
+# Example usage: seq 100 | sample -r 20% -d 1000
+#
+# Dependency: Python 2.7 or later (uses argparse)
+# 
+# Author: http://jeroenjanssens.com
+
+import os
+import argparse
+
+from random import random
+from time import time, sleep
+from sys import stdin, stdout
+from datetime import datetime, timedelta
+
+def total_seconds(delta):
+    return delta.days * 86400 + delta.seconds + delta.microseconds / 1e6  # keep the sub-second part
+
+def main():
+    """Copy lines from FILE/stdin to stdout, sampled by rate, limited by duration, spaced by delay."""
+    parser = argparse.ArgumentParser(description=(
+    "Output lines from stdin to stdout with a given probability "
+    "for a given duration, and with a given delay between lines."))
+    # Text mode: stdout.write() below needs str, and the stdin default is a text stream too.
+    parser.add_argument('file', nargs='?', type=argparse.FileType('r'),
+        default=stdin, help="File", metavar="FILE")
+    parser.add_argument('-W', '--weeks', type=float, default=0,
+        help="Duration of sampling in weeks")
+    parser.add_argument('-D', '--days', type=float, default=0,
+        help="Duration of sampling in days")
+    parser.add_argument('-H', '--hours', type=float, default=0,
+        help="Duration of sampling in hours")
+    parser.add_argument('-m', '--minutes', type=float, default=0,
+        help="Duration of sampling in minutes")
+    parser.add_argument('-s', '--seconds', type=float, default=0,
+        help="Duration of sampling in seconds")
+    parser.add_argument('-t', '--milliseconds', type=float, default=0,
+        help="Duration of sampling in milliseconds")
+    parser.add_argument('-u', '--microseconds', type=float, default=0,
+        help="Duration of sampling in microseconds")
+    parser.add_argument('-r', '--rate', default='100%',
+        help="Rate between 0 and 1 using either 0.33, 33%%, 1/3 notation.")
+    parser.add_argument('-d', '--delay', default=0, type=int,
+        help="Time in milliseconds between each line of output")
+    args = parser.parse_args()
+    invalid_rate_msg = ("Invalid rate. Please specify a rate between 0"
+                        " and 1 using either 0.33, 33%, 1/3 notation.")
+
+    try:
+        delay = float(args.delay) / 1000.0
+    except ValueError:
+        parser.error("Invalid delay. Please specify a delay in ms.")
+
+    try:
+        duration = total_seconds(timedelta(weeks=args.weeks, days=args.days,
+        hours=args.hours, minutes=args.minutes, seconds=args.seconds,
+        milliseconds=args.milliseconds, microseconds=args.microseconds))
+    except (TypeError, ValueError, OverflowError):
+        parser.error("Invalid duration.")
+
+    try:
+        if '%' in args.rate:
+            rate = float(args.rate[:-1]) / 100.0
+        elif '/' in args.rate:
+            a, b = map(float, args.rate.split('/')[:2])
+            rate = a / (1.0*b)
+        else:
+            rate = float(args.rate)
+    except (ValueError, ZeroDivisionError):
+        parser.error(invalid_rate_msg)
+
+    if rate <= 0 or rate > 1:
+        parser.error(invalid_rate_msg)
+
+    start = time()
+    try:
+        while True:
+            line = args.file.readline()
+            if not line:
+                return
+            if random() <= rate:
+                stdout.write(line)
+                stdout.flush()
+                now = time()
+                if duration and (now-start) > duration:
+                    return
+                sleep(delay)
+    except (IOError, KeyboardInterrupt):  # closed pipe or Ctrl-C: stop quietly
+        pass
+
+if __name__ == "__main__":
+    exit(main())
diff --git a/scripts/scrape b/scripts/scrape
new file mode 100755
index 0000000..60de259
--- /dev/null
+++ b/scripts/scrape
@@ -0,0 +1,70 @@
+#!/usr/bin/env python2
+# -*- coding: utf-8 -*-
+# scrape: Extract HTML elements using an XPath query or CSS3 selector.
+#
+# Example usage:
+# $ curl 'http://en.wikipedia.org/wiki/List_of_sovereign_states' -s \
+# | scrape -be 'table.wikitable > tr > td > b > a'
+#
+# Dependencies: lxml and optionally cssselect
+#
+# Author: http://jeroenjanssens.com
+
+import sys
+import argparse
+from lxml import etree
+
+
+def main():
+    """Read HTML, select elements with an XPath query or CSS selector, write them to stdout."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('html', nargs='?', type=argparse.FileType('rb'),
+                        default=sys.stdin, help="HTML", metavar="HTML")
+    parser.add_argument('-a', '--argument', default="",
+                        help="argument to extract from tag")
+    parser.add_argument('-b', '--body', action='store_true', default=False,
+                        help="Enclose output with HTML and BODY tags")
+    parser.add_argument('-e', '--expression', default='*',
+                        help="XPath query or CSS3 selector")
+    parser.add_argument('-r', '--rawinput', action='store_true', default=False,
+                        help="Do not parse HTML before feeding etree (useful "
+                        "for escaping CData)")
+    args = parser.parse_args()
+    args.expression = args.expression.decode('utf-8')
+
+    # Anything that does not start with '//' is treated as a CSS selector and translated to XPath.
+    if not args.expression.startswith('//'):
+        from cssselect import GenericTranslator, SelectorError
+        try:
+            expression = GenericTranslator().css_to_xpath(args.expression)
+        except SelectorError:
+            parser.error('Invalid CSS selector')
+    else:
+        expression = args.expression
+    html_parser = etree.HTMLParser(encoding='utf-8', recover=True,
+                                   strip_cdata=True)
+    if args.rawinput:
+        document = etree.fromstring(args.html.read())
+    else:
+        document = etree.parse(args.html, html_parser)
+
+    if args.body:
+        sys.stdout.write("<!DOCTYPE html>\n<html>\n<body>\n")
+
+    for e in document.xpath(expression):
+        try:
+            if not args.argument:
+                text = etree.tostring(e)
+            else:
+                text = e.get(args.argument)
+            if text is not None:
+                sys.stdout.write(text.encode('utf-8') + "\n")
+            sys.stdout.flush()
+        except IOError:
+            pass
+
+    if args.body:
+        sys.stdout.write("</body>\n</html>\n")
+
+if __name__ == "__main__":
+    exit(main())
diff --git a/scripts/servewd b/scripts/servewd
new file mode 100755
index 0000000..4e7e324
--- /dev/null
+++ b/scripts/servewd
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+# servewd: start Python's SimpleHTTPServer; all arguments are passed through unchanged.
+python -m SimpleHTTPServer "$@"
diff --git a/scripts/unpack b/scripts/unpack
new file mode 100755
index 0000000..20b8b40
--- /dev/null
+++ b/scripts/unpack
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# unpack: Extract common file formats
+ 
+# Dependencies: unrar, unzip, p7zip-full
+
+# Author: Patrick Brisbin
+# From: http://linuxtidbits.wordpress.com/2009/08/04/week-of-bash-scripts-extract/
+
+# Display usage if no parameters given
+if [[ -z "$*" ]]; then
+	echo " ${0##*/} <archive> - extract common file formats)"
+	exit
+fi
+archive=$1  # existence and extension checks use the first argument; the tools below still get "$@"
+# Required program(s)
+req_progs=(7z unrar unzip)
+for p in "${req_progs[@]}"; do
+	hash "$p" 2>/dev/null || \
+	{ echo >&2 " Required program \"$p\" not installed."; exit 1; }
+done
+ 
+# Test if file exists
+if [ ! -f "$archive" ]; then
+	echo "File $archive doesn't exist"
+	exit 1
+fi
+ 
+# Extract file by using extension as reference
+case "$archive" in
+	*.7z ) 7z x "$@" ;;
+	*.tar.bz2 ) tar xvjf "$@" ;;
+	*.bz2 ) bunzip2 "$@" ;;
+	*.deb ) ar vx "$@" ;;
+	*.tar.gz ) tar xvf "$@" ;;
+	*.gz ) gunzip "$@" ;;
+	*.tar ) tar xvf "$@" ;;
+	*.tbz2 ) tar xvjf "$@" ;;
+	*.tar.xz ) tar xvf "$@" ;;
+	*.tgz ) tar xvzf "$@" ;;
+	*.rar ) unrar x "$@" ;;
+	*.zip ) unzip "$@" ;;
+	*.Z ) uncompress "$@" ;;
+	* ) echo " Unsupported file format"; exit 1 ;;
+esac
diff --git a/scripts/weka b/scripts/weka
new file mode 100755
index 0000000..1f31d35
--- /dev/null
+++ b/scripts/weka
@@ -0,0 +1,65 @@
+#!/bin/bash
+# weka: run Weka from the command-line
+#
+# Weka can be obtained from http://www.cs.waikato.ac.nz/ml/weka/downloading.html
+# Make sure that WEKAPATH is set to the full path that contains weka.jar in your .bashrc or .zshrc
+# The snippets below enable tab completion in Bash and Zsh, respectively.
+# 
+# Author: Jeroen Janssens (http://jeroenjanssens.com)
+#
+# See csv2arff and arff2csv for two examples
+# "weka.$@" puts the weka. package prefix on the first argument (the class name) only.
+java -Xmx1024M -cp "${WEKAPATH}/weka.jar" "weka.$@"
+
+#########################################################
+# Tab completion for Bash                               #
+#########################################################
+# 
+# export WEKAPATH="/home/joe/bin/"
+#
+# weka-classes () {
+# 	unzip -l $WEKAPATH/weka.jar |
+# 	sed -rne 's/.*(weka)\/([^g])([^$]*)\.class$/\2\3/p' |
+# 	tr '/' '.'
+# }
+# 
+# weka-folders () {
+# 	unzip -l $WEKAPATH/weka.jar |
+# 	sed -rne 's/.*(weka)\/([^g])([^$]*)\/$/\2\3\./p' |
+# 	tr '/' '.'
+# }
+# 
+# _completeweka() {
+#   local curw=${COMP_WORDS[COMP_CWORD]}
+#   local wordlist=$(weka-folders; weka-classes)
+#   COMPREPLY=($(compgen -W '${wordlist[@]}' -- "$curw"))
+#   return 0
+# }
+# 
+# complete -o nospace -F _completeweka weka
+#
+#########################################################
+# Tab completion for Zsh                                #
+#########################################################
+#
+# export WEKAJAR="/home/joe/bin/weka.jar"
+# 
+# weka-classes () {
+# 	unzip -l $WEKAJAR |
+# 	sed -rne 's/.*(weka)\/([^g])([^$]*)\.class$/\2\3/p' |
+# 	tr '/' '.'
+# }
+# 
+# weka-folders () {
+# 	unzip -l $WEKAJAR |
+# 	sed -rne 's/.*(weka)\/([^g])([^$]*)\/$/\2\3\./p' |
+# 	tr '/' '.'
+# }
+# 
+# function _completeweka {
+# 	reply=($(weka-folders; weka-classes))
+# }
+# 
+# compctl -K _completeweka weka
+#
+#########################################################
diff --git a/scripts/weka-cluster b/scripts/weka-cluster
new file mode 100755
index 0000000..a056e98
--- /dev/null
+++ b/scripts/weka-cluster
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+ALGO="$*"
+IN=$(mktemp --tmpdir weka-cluster-XXXXXXXX).arff
+
+finish () {
+	rm -f -- "$IN" "${IN%.arff}"  # also remove the placeholder file mktemp itself created
+}
+trap finish EXIT
+
+csv2arff > "$IN"
+weka filters.unsupervised.attribute.AddCluster -W "weka.${ALGO}" -i "$IN" -o /dev/stdout | arff2csv