added the scripts from data-science-toolbox
madhuakula committed Sep 6, 2016
1 parent 1d916f7 commit b0dc901
Showing 21 changed files with 859 additions and 0 deletions.
135 changes: 135 additions & 0 deletions scripts/Rio
@@ -0,0 +1,135 @@
#!/bin/bash
# Rio: Load CSV from stdin into R as a data.frame, execute given commands, and get the output as CSV or PNG on stdout
#
# Example usage:
# $ seq 100 | Rio -nf sum (same as Rio -ne 'sum(df)')
#
# $ curl -s 'https://raw.githubusercontent.com/pydata/pandas/master/pandas/tests/data/iris.csv' > iris.csv
# $ < iris.csv Rio -e 'df$SepalLength^2'
# $ < iris.csv Rio -f summary
# $ < iris.csv Rio -se 'sqldf("select Name from df where df.SepalLength > 7")'
# $ < iris.csv Rio -ge 'g+geom_point(aes(x=SepalLength,y=SepalWidth,colour=Name))' > iris.png
#
# Dependency: R (optionally with the R packages ggplot2, dplyr, tidyr, and sqldf)
#
# Author: http://jeroenjanssens.com

usage() {
cat << EOF
Rio: Load CSV from stdin into R as a data.frame, execute given commands, and get the output as CSV on stdout
usage: Rio OPTIONS
OPTIONS:
-d Delimiter
-e Commands to execute
-f Single command to execute on data.frame
-h Show this message
-g Import ggplot2
-n CSV has no header
-r Import dplyr and tidyr
-s Import sqldf
-b Use same settings as used for book Data Science at the Command Line
-v Verbose
EOF
}

finish() {
rm -f $IN $OUT ${OUT%.png} ${ERR%.err}

## Removes error file if error file is empty.
if [[ ! -s $ERR ]]; then
rm -f $ERR
fi

rm -f Rplots.pdf
}

trap finish EXIT

callR() {
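# Run the user's R commands against the temporary CSV and write the result to
# $OUT: read $IN into the data.frame 'df', evaluate $REQUIRES (package imports)
# and $SCRIPT, then write .Last.value as CSV (data.frame or matrix), as plain
# lines (vector), via ggsave (ggplot object), or via print() (anything else).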
Rscript --vanilla -e "options(scipen=999);df<-read.csv('${IN}',header=${HEADER},sep='${DELIMITER}',stringsAsFactors=F);${REQUIRES}${SCRIPT}last<-.Last.value;if(is.matrix(last)){last<-as.data.frame(last)};if(is.data.frame(last)){write.table(last,'${OUT}',sep=',',quote=T,qmethod='double',row.names=F,col.names=${HEADER});}else if(is.vector(last)){cat(last,sep='\\\n', file='${OUT}')}else if(exists('is.ggplot')&&is.ggplot(last)){ggsave('${OUT}',last,dpi=${RIO_DPI-72},units='cm',width=20,height=15);}else{sink('${OUT}');print(last);}"
}

SCRIPT=
REQUIRES=
DELIMITER=","
HEADER="T"
VERBOSE=false

# OS X `mktemp' requires a temp file template, whereas for Linux `mktemp' the
# template is optional. This explicitly uses a template, which works for both.
# $TMPDIR is set below in case it isn't already defined as an environment
# variable; this assumes /tmp exists.
if [ -z "$TMPDIR" ]; then
TMPDIR=/tmp/
fi
IN=$(mktemp ${TMPDIR}/Rio-XXXXXXXX)
OUT=$(mktemp ${TMPDIR}/Rio-XXXXXXXX).png
ERR=$(mktemp ${TMPDIR}/Rio-XXXXXXXX).err

while getopts "d:hgnprsve:f:b" OPTION
do
case $OPTION in
b)
RIO_DPI=300
;;
d)
DELIMITER=$OPTARG
;;
e)
SCRIPT=$OPTARG
if ! echo $SCRIPT | grep -qe "; *$"
then
SCRIPT="${SCRIPT};"
fi
;;
f)
SCRIPT="${OPTARG}(df);"
;;
h)
usage
exit 1
;;
g)
REQUIRES="${REQUIRES}require(ggplot2);g<-ggplot(df);"
;;
n)
HEADER="F"
;;
r)
REQUIRES="${REQUIRES}require(dplyr);require(tidyr);"
;;
s)
REQUIRES="${REQUIRES}require(sqldf);"
;;
v)
VERBOSE=true
;;
?)
usage
exit
;;
esac
done

cat /dev/stdin > $IN

if $VERBOSE
then
callR
else
callR > $ERR 2>&1
fi

if [[ ! -f $OUT ]]; then
cat $ERR
else
RESULT="$(cat $OUT)"
if [ "$RESULT" == "NULL" ]; then
cat $ERR
else
cat $OUT
fi
fi

2 changes: 2 additions & 0 deletions scripts/Rio-mds
@@ -0,0 +1,2 @@
#!/bin/bash
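# Rio-mds: classical multidimensional scaling down to two dimensions.
# cmdscale is applied to the pairwise distances between rows, and the resulting
# coordinates are combined with the non-numeric columns in the output.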
Rio -e 'n<-sapply(df,is.numeric);fit<-cmdscale(dist(df),eig=TRUE,k=2);points<-as.data.frame(fit$points);cbind(points,df[!n])'
2 changes: 2 additions & 0 deletions scripts/Rio-pca
@@ -0,0 +1,2 @@
#!/bin/bash
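# Rio-pca: principal component analysis via prcomp (with scaling) on the
# numeric columns; the component scores are combined with the non-numeric
# columns in the output.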
Rio -e 'n<-sapply(df,is.numeric);cbind(as.data.frame(prcomp(df[n],scale=T)$x),df[!n])'
17 changes: 17 additions & 0 deletions scripts/Rio-scatter
@@ -0,0 +1,17 @@
#!/usr/bin/env bash

# Rio-scatter: create scatter plot from CSV
#
# Default colour is 1 (blue)
#
# Example usage:
# curl 'https://raw.githubusercontent.com/pydata/pandas/master/pandas/tests/data/iris.csv' | Rio-scatter SepalLength SepalWidth Name | display
#
# Dependency: Rio
#
# Author: Jeroen Janssens (http://jeroenjanssens.com)

X="$1"
Y="$2"
COLOR="${3:-1}"
Rio -ge "g+geom_point(aes(x=${X},y=${Y},color=${COLOR}))"
2 changes: 2 additions & 0 deletions scripts/arff2csv
@@ -0,0 +1,2 @@
#!/usr/bin/env bash
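# arff2csv: convert ARFF on stdin to CSV on stdout using Weka's CSVSaver.
# Assumes a 'weka' wrapper script that puts the Weka classes on the classpath.
#
# Example usage (iris.arff is a hypothetical input file):
# $ < iris.arff arff2csv > iris.csv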
weka core.converters.CSVSaver -i /dev/stdin
15 changes: 15 additions & 0 deletions scripts/body
@@ -0,0 +1,15 @@
#!/usr/bin/env bash
#
# body: apply expression to all but the first line.
# Use multiple times in case the header spans more than one line.
#
# Example usage:
# $ seq 10 | header -a 'values' | body sort -nr
# $ seq 10 | header -a 'multi\nline\nheader' | body body body sort -nr
#
# From: http://unix.stackexchange.com/a/11859
#
# See also: header (https://github.com/jeroenjanssens/command-line-tools-for-data-science)
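# Read the header line verbatim (IFS= prevents trimming, -r keeps backslashes),
# print it unchanged, and then run the given command on the remaining lines.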
IFS= read -r header
printf '%s\n' "$header"
eval "$@"
34 changes: 34 additions & 0 deletions scripts/cols
@@ -0,0 +1,34 @@
#!/usr/bin/env bash
# cols: apply a command to a subset of the columns and merge back with the remaining columns.
#
# Assumes that the input data is comma-delimited and that it has a header.
# Depends on csvcut, which is part of csvkit: http://csvkit.readthedocs.org
#
# Example usage 1: reverse sort column 'a'
# $ echo -e 'a,b\n1,2\n3,4\n5,6' | cols -c a body sort -nr
#
# Example usage 2: apply PCA (using tapkee) to all numerical features (-C selects all but the specified columns) of the Iris data set:
# $ < iris.csv cols -C species body tapkee --method pca | header -r x,y,species
#
# See also: header and body (https://github.com/jeroenjanssens/command-line-tools-for-data-science)
#
# Author: http://jeroenjanssens.com

ARG="$1"
ARG_INV="$(tr cC Cc <<< ${ARG})"
shift
COLUMNS="$1"
shift
EXPR="$@"

finish() {
rm -f $OTHER_COLUMNS
}
trap finish EXIT

if [ -z "$TMPDIR" ]; then
TMPDIR=/tmp
fi
OTHER_COLUMNS=$(mktemp ${TMPDIR}/cols-XXXXXXXX)

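# Save a full copy of the input with tee, run EXPR on the selected columns
# only, and paste the result back next to the remaining columns (obtained by
# running csvcut with the inverted -c/-C flag on the saved copy).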
tee $OTHER_COLUMNS | csvcut $ARG "$COLUMNS" | eval ${EXPR} | paste -d, - <(csvcut ${ARG_INV} "$COLUMNS" $OTHER_COLUMNS)
2 changes: 2 additions & 0 deletions scripts/csv2arff
@@ -0,0 +1,2 @@
#!/usr/bin/env bash
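# csv2arff: convert CSV on stdin to ARFF on stdout using Weka's CSVLoader.
# Assumes the same 'weka' wrapper script as arff2csv.
#
# Example usage (iris.csv is a hypothetical input file):
# $ < iris.csv csv2arff > iris.arff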
weka core.converters.CSVLoader /dev/stdin
134 changes: 134 additions & 0 deletions scripts/csv2vw
@@ -0,0 +1,134 @@
#!/usr/bin/env python3

import argparse
import csv
from sys import stdin, stdout, stderr, exit
import itertools


def main():
parser = argparse.ArgumentParser(
epilog="""If both --classes and --auto-relabel are omitted,
label values are left as-is. By default, features with value 0 are not
printed. This can be overridden with --null""",
usage="""%(prog)s [OPTION]... [FILE]
Convert CSV to Vowpal Wabbit input format.
Examples:
# Leave label values as is:
$ csv2vw spam.csv --label target
# Relabel values 'ham' to 0 and 'spam' to 1:
$ csv2vw spam.csv --label target --classes ham,spam
# Relabel values 'ham' to -1 and 'spam' to +1 (needed for logistic loss):
$ csv2vw spam.csv --label target --classes ham,spam --minus-plus-one
# Relabel first label value to 0, second to 1, and ignore the rest:
$ csv2vw iris.csv -lspecies --auto-relabel --ignore-extra-classes
# Relabel first label value to 1, second to 2, and so on:
$ <iris.csv csv2vw -lspecies --multiclass --auto-relabel
# Relabel 'versicolor' to 1, 'virginica' to 2, and 'setosa' to 3
$ <iris.csv csv2vw -lspecies --multiclass -cversicolor,virginica,setosa""")

parser.add_argument("file", nargs="?", type=argparse.FileType("r"),
default=stdin,
help="""Input CSV file. If omitted,
read from standard input.""",
metavar="FILE")
parser.add_argument("-d", "--delimiter",
help="""Delimiting character of the input CSV file
(default: ,).""",
default=",")
parser.add_argument("-l", "--label",
help="""Name of column that contains the class
labels.""")
parser.add_argument("-c", "--classes",
help="""Ordered, comma-separated list of possible
class labels to relabel. If not specifying all possible
class labels, use --auto-relabel.""",
nargs="?")
parser.add_argument("-n", "--null",
help="""Comma-separated list of null values (default:
'0').""",
nargs="?", default="0")
parser.add_argument("-a", "--auto-relabel",
help="""Automatically relabel class labels in the order
in which they appear in the CSV file.""",
action="store_true")
parser.add_argument("-m", "--multiclass",
help="""Indicates more than two classes; will start
counting at 1 instead of 0.""",
action="store_true")
parser.add_argument("-+", "--minus-plus-one",
help="""Instead of relabeling to integers, relabel to
'-1' and '+1'. Needed when using VW with logistic or
hinge loss.""", action="store_true")
parser.add_argument("-i", "--ignore-extra-classes",
help="""If there are more than two classes found, when
not using --multiclass, include the example with no
label instead of skipping it.""",
action="store_true")
parser.add_argument("-t", "--tag",
help="""Name of column that contains the tags.""")

args = parser.parse_args()

auto_relabel = args.auto_relabel
label_column = args.label
tag_column = args.tag
null_values = args.null.split(",")
multiclass = args.multiclass
minus_plus_one = args.minus_plus_one

if minus_plus_one:
new_classes = iter(["-1", "+1"])
elif multiclass:
new_classes = (str(i) for i in itertools.count(1))
elif args.classes or auto_relabel:
new_classes = iter(["0", "1"])
else:
new_classes = None

if args.classes:
old_classes = args.classes.split(",")
relabel = dict(zip(old_classes, new_classes))
else:
relabel = dict()

reader = csv.DictReader(args.file, delimiter=args.delimiter)
try:
for row in reader:
label = row.pop(label_column, "")
tag = row.pop(tag_column, "")

if auto_relabel or new_classes:
if auto_relabel:
if label not in relabel:
try:
relabel[label] = next(new_classes)
except StopIteration:
if args.ignore_extra_classes:
relabel[label] = ""
else:
stderr.write("Found too many different classes;"
" skipping example. Use "
"--multiclass or "
"--ignore-extra-classes.\n")
continue
label = relabel[label]

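            # Build one example in Vowpal Wabbit input format:
            #   <label> <tag>| <name>:<value> <name>:<value> ...
            # Features whose value is listed in --null (default "0") are omitted.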
features = " ".join([k + ":" + v for k, v in sorted(row.items())
if v not in null_values])
line = label + " " + tag + "| " + features + "\n"
stdout.write(line)
stdout.flush()
except (IOError, KeyboardInterrupt, BrokenPipeError):
stderr.close()

if __name__ == "__main__":
exit(main())
2 changes: 2 additions & 0 deletions scripts/drake
@@ -0,0 +1,2 @@
#!/bin/bash
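# drake: run Drake (a make-like tool for data workflows) through drip, a JVM
# launcher that reuses a warm JVM to cut startup time. Assumes DRAKEPATH points
# to the directory containing drake.jar.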
drip -cp ${DRAKEPATH}/drake.jar drake.core "$@"
17 changes: 17 additions & 0 deletions scripts/dseq
@@ -0,0 +1,17 @@
#!/usr/bin/env bash
# dseq: generate sequence of dates relative to today.
#
# Usage: dseq LAST
# or: dseq FIRST LAST
# or: dseq FIRST INCREMENT LAST
#
# Example usage:
# $ dseq 1 # tomorrow
# $ dseq 0 0 # today
# $ dseq 7 # next 7 days
# $ dseq -2 0 # day before yesterday till today
# $ dseq 1 7 365 # tomorrow and then every week for a year
#
# Author: Jeroen Janssens

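# Each input number becomes a relative date string such as "7 day", which
# GNU date (--file reads one date specification per line) resolves against
# today and prints as YYYY-MM-DD (+%F).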
seq -f "%g day" "$@" | date --file - +%F
