added the scripts from data-science-toolbox
madhuakula committed Sep 6, 2016
1 parent 1d916f7 commit b0dc901
Showing 21 changed files with 859 additions and 0 deletions.
135 changes: 135 additions & 0 deletions scripts/Rio
@@ -0,0 +1,135 @@
#!/bin/bash
# Rio: Load CSV from stdin into R as a data.frame, execute given commands, and get the output as CSV or PNG on stdout
#
# Example usage:
# $ seq 100 | Rio -nf sum (same as Rio -ne 'sum(df)')
#
# $ curl -s 'https://raw.githubusercontent.com/pydata/pandas/master/pandas/tests/data/iris.csv' > iris.csv
# $ < iris.csv Rio -e 'df$SepalLength^2'
# $ < iris.csv Rio -f summary
# $ < iris.csv Rio -se 'sqldf("select Name from df where df.SepalLength > 7")'
# $ < iris.csv Rio -ge 'g+geom_point(aes(x=SepalLength,y=SepalWidth,colour=Name))' > iris.png
#
# Dependency: R (optionally with the R packages ggplot2, dplyr, tidyr, and sqldf)
#
# Author: http://jeroenjanssens.com

usage() {
cat << EOF
Rio: Load CSV from stdin into R as a data.frame, execute given commands, and get the output as CSV on stdout
usage: Rio OPTIONS
OPTIONS:
-d Delimiter
-e Commands to execute
-f Single command to execute on data.frame
-h Show this message
-g Import ggplot2
-n CSV has no header
-r Import dplyr and tidyr
-s Import sqldf
-b Use same settings as used for book Data Science at the Command Line
-v Verbose
EOF
}

finish() {
rm -f $IN $OUT ${OUT%.png} ${ERR%.err}

## Removes error file if error file is empty.
if [[ ! -s $ERR ]]; then
rm -f $ERR
fi

rm -f Rplots.pdf
}

trap finish EXIT

callR() {
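# Run the user's R commands against the temporary CSV and write the result to
# $OUT: read $IN into the data.frame 'df', evaluate $REQUIRES (package imports)
# and $SCRIPT, then write .Last.value as CSV (data.frame or matrix), as plain
# lines (vector), via ggsave (ggplot object), or via print() (anything else).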
Rscript --vanilla -e "options(scipen=999);df<-read.csv('${IN}',header=${HEADER},sep='${DELIMITER}',stringsAsFactors=F);${REQUIRES}${SCRIPT}last<-.Last.value;if(is.matrix(last)){last<-as.data.frame(last)};if(is.data.frame(last)){write.table(last,'${OUT}',sep=',',quote=T,qmethod='double',row.names=F,col.names=${HEADER});}else if(is.vector(last)){cat(last,sep='\\\n', file='${OUT}')}else if(exists('is.ggplot')&&is.ggplot(last)){ggsave('${OUT}',last,dpi=${RIO_DPI-72},units='cm',width=20,height=15);}else{sink('${OUT}');print(last);}"
}

SCRIPT=
REQUIRES=
DELIMITER=","
HEADER="T"
VERBOSE=false

# OS X `mktemp' requires a temp file template, whereas for Linux `mktemp' the
# template is optional. This explicitly uses a template, which works for both.
# $TMPDIR is set below in case it isn't already defined as an environment
# variable; this assumes /tmp exists.
if [ -z "$TMPDIR" ]; then
TMPDIR=/tmp/
fi
IN=$(mktemp ${TMPDIR}/Rio-XXXXXXXX)
OUT=$(mktemp ${TMPDIR}/Rio-XXXXXXXX).png
ERR=$(mktemp ${TMPDIR}/Rio-XXXXXXXX).err

while getopts "d:hgnprsve:f:b" OPTION
do
case $OPTION in
b)
RIO_DPI=300
;;
d)
DELIMITER=$OPTARG
;;
e)
SCRIPT=$OPTARG
if ! echo $SCRIPT | grep -qe "; *$"
then
SCRIPT="${SCRIPT};"
fi
;;
f)
SCRIPT="${OPTARG}(df);"
;;
h)
usage
exit 1
;;
g)
REQUIRES="${REQUIRES}require(ggplot2);g<-ggplot(df);"
;;
n)
HEADER="F"
;;
r)
REQUIRES="${REQUIRES}require(dplyr);require(tidyr);"
;;
s)
REQUIRES="${REQUIRES}require(sqldf);"
;;
v)
VERBOSE=true
;;
?)
usage
exit
;;
esac
done

cat /dev/stdin > $IN

if $VERBOSE
then
callR
else
callR > $ERR 2>&1
fi

if [[ ! -f $OUT ]]; then
cat $ERR
else
RESULT="$(cat $OUT)"
if [ "$RESULT" == "NULL" ]; then
cat $ERR
else
cat $OUT
fi
fi

2 changes: 2 additions & 0 deletions scripts/Rio-mds
@@ -0,0 +1,2 @@
#!/bin/bash
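# Rio-mds: classical multidimensional scaling down to two dimensions.
# cmdscale is applied to the pairwise distances between rows, and the resulting
# coordinates are combined with the non-numeric columns in the output.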
Rio -e 'n<-sapply(df,is.numeric);fit<-cmdscale(dist(df),eig=TRUE,k=2);points<-as.data.frame(fit$points);cbind(points,df[!n])'
2 changes: 2 additions & 0 deletions scripts/Rio-pca
@@ -0,0 +1,2 @@
#!/bin/bash
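# Rio-pca: principal component analysis via prcomp (with scaling) on the
# numeric columns; the component scores are combined with the non-numeric
# columns in the output.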
Rio -e 'n<-sapply(df,is.numeric);cbind(as.data.frame(prcomp(df[n],scale=T)$x),df[!n])'
17 changes: 17 additions & 0 deletions scripts/Rio-scatter
@@ -0,0 +1,17 @@
#!/usr/bin/env bash

# Rio-scatter: create scatter plot from CSV
#
# Default colour is 1 (blue)
#
# Example usage:
# curl 'https://raw.githubusercontent.com/pydata/pandas/master/pandas/tests/data/iris.csv' | Rio-scatter SepalLength SepalWidth Name | display
#
# Dependency: Rio
#
# Author: Jeroen Janssens (http://jeroenjanssens.com)

X="$1"
Y="$2"
COLOR="${3:-1}"
Rio -ge "g+geom_point(aes(x=${X},y=${Y},color=${COLOR}))"
2 changes: 2 additions & 0 deletions scripts/arff2csv
@@ -0,0 +1,2 @@
#!/usr/bin/env bash
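# arff2csv: convert ARFF on stdin to CSV on stdout using Weka's CSVSaver.
# Assumes a 'weka' wrapper script that puts the Weka classes on the classpath.
#
# Example usage (iris.arff is a hypothetical input file):
# $ < iris.arff arff2csv > iris.csv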
weka core.converters.CSVSaver -i /dev/stdin
15 changes: 15 additions & 0 deletions scripts/body
@@ -0,0 +1,15 @@
#!/usr/bin/env bash
#
# body: apply expression to all but the first line.
# Use multiple times in case the header spans more than one line.
#
# Example usage:
# $ seq 10 | header -a 'values' | body sort -nr
# $ seq 10 | header -a 'multi\nline\nheader' | body body body sort -nr
#
# From: http://unix.stackexchange.com/a/11859
#
# See also: header (https://github.com/jeroenjanssens/command-line-tools-for-data-science)
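# Read the header line verbatim (IFS= prevents trimming, -r keeps backslashes),
# print it unchanged, and then run the given command on the remaining lines.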
IFS= read -r header
printf '%s\n' "$header"
eval "$@"
34 changes: 34 additions & 0 deletions scripts/cols
@@ -0,0 +1,34 @@
#!/usr/bin/env bash
# cols: apply a command to a subset of the columns and merge back with the remaining columns.
#
# Assumes that the input data is comma-delimited and that it has a header.
# Depends on csvcut, which is part of csvkit: http://csvkit.readthedocs.org
#
# Example usage 1: reverse sort column 'a'
# $ echo -e 'a,b\n1,2\n3,4\n5,6' | cols -c a body sort -nr
#
# Example usage 2: apply PCA (using tapkee) to all numerical features (-C selects all but the specified columns) of the Iris data set:
# $ < iris.csv cols -C species body tapkee --method pca | header -r x,y,species
#
# See also: header and body (https://github.com/jeroenjanssens/command-line-tools-for-data-science)
#
# Author: http://jeroenjanssens.com

ARG="$1"
ARG_INV="$(tr cC Cc <<< ${ARG})"
shift
COLUMNS="$1"
shift
EXPR="$@"

finish() {
rm -f $OTHER_COLUMNS
}
trap finish EXIT

if [ -z "$TMPDIR" ]; then
TMPDIR=/tmp
fi
OTHER_COLUMNS=$(mktemp ${TMPDIR}/cols-XXXXXXXX)

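# Save a full copy of the input with tee, run EXPR on the selected columns
# only, and paste the result back next to the remaining columns (obtained by
# running csvcut with the inverted -c/-C flag on the saved copy).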
tee $OTHER_COLUMNS | csvcut $ARG "$COLUMNS" | eval ${EXPR} | paste -d, - <(csvcut ${ARG_INV} "$COLUMNS" $OTHER_COLUMNS)
2 changes: 2 additions & 0 deletions scripts/csv2arff
@@ -0,0 +1,2 @@
#!/usr/bin/env bash
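# csv2arff: convert CSV on stdin to ARFF on stdout using Weka's CSVLoader.
# Assumes the same 'weka' wrapper script as arff2csv.
#
# Example usage (iris.csv is a hypothetical input file):
# $ < iris.csv csv2arff > iris.arff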
weka core.converters.CSVLoader /dev/stdin
134 changes: 134 additions & 0 deletions scripts/csv2vw
@@ -0,0 +1,134 @@
#!/usr/bin/env python3

import argparse
import csv
from sys import stdin, stdout, stderr, exit
import itertools


def main():
parser = argparse.ArgumentParser(
epilog="""If both --classes and --auto-relabel are omitted,
label values are left as-is. By default, features with value 0 are not
printed. This can be overridden with --null""",
usage="""%(prog)s [OPTION]... [FILE]
Convert CSV to Vowpal Wabbit input format.
Examples:
# Leave label values as is:
$ csv2vw spam.csv --label target
# Relabel values 'ham' to 0 and 'spam' to 1:
$ csv2vw spam.csv --label target --classes ham,spam
# Relabel values 'ham' to -1 and 'spam' to +1 (needed for logistic loss):
$ csv2vw spam.csv --label target --classes ham,spam --minus-plus-one
# Relabel first label value to 0, second to 1, and ignore the rest:
$ csv2vw iris.csv -lspecies --auto-relabel --ignore-extra-classes
# Relabel first label value to 1, second to 2, and so on:
$ <iris.csv csv2vw -lspecies --multiclass --auto-relabel
# Relabel 'versicolor' to 1, 'virginica' to 2, and 'setosa' to 3
$ <iris.csv csv2vw -lspecies --multiclass -cversicolor,virginica,setosa""")

parser.add_argument("file", nargs="?", type=argparse.FileType("r"),
default=stdin,
help="""Input CSV file. If omitted,
read from standard input.""",
metavar="FILE")
parser.add_argument("-d", "--delimiter",
help="""Delimiting character of the input CSV file
(default: ,).""",
default=",")
parser.add_argument("-l", "--label",
help="""Name of column that contains the class
labels.""")
parser.add_argument("-c", "--classes",
help="""Ordered, comma-separated list of possible
class labels to relabel. If not specifying all possible
class labels, use --auto-relabel.""",
nargs="?")
parser.add_argument("-n", "--null",
help="""Comma-separated list of null values (default:
'0').""",
nargs="?", default="0")
parser.add_argument("-a", "--auto-relabel",
help="""Automatically relabel class labels in the order
in which they appear in the CSV file.""",
action="store_true")
parser.add_argument("-m", "--multiclass",
help="""Indicates more than two classes; will start
counting at 1 instead of 0.""",
action="store_true")
parser.add_argument("-+", "--minus-plus-one",
help="""Instead of relabeling to integers, relabel to
'-1' and '+1'. Needed when using VW with logistic or
hinge loss.""", action="store_true")
parser.add_argument("-i", "--ignore-extra-classes",
help="""If there are more than two classes found, when
not using --multiclass, include the example with no
label instead of skipping it.""",
action="store_true")
parser.add_argument("-t", "--tag",
help="""Name of column that contains the tags.""")

args = parser.parse_args()

auto_relabel = args.auto_relabel
label_column = args.label
tag_column = args.tag
null_values = args.null.split(",")
multiclass = args.multiclass
minus_plus_one = args.minus_plus_one

if minus_plus_one:
new_classes = iter(["-1", "+1"])
elif multiclass:
new_classes = (str(i) for i in itertools.count(1))
elif args.classes or auto_relabel:
new_classes = iter(["0", "1"])
else:
new_classes = None

if args.classes:
old_classes = args.classes.split(",")
relabel = dict(zip(old_classes, new_classes))
else:
relabel = dict()

reader = csv.DictReader(args.file, delimiter=args.delimiter)
try:
for row in reader:
label = row.pop(label_column, "")
tag = row.pop(tag_column, "")

if auto_relabel or new_classes:
if auto_relabel:
if label not in relabel:
try:
relabel[label] = next(new_classes)
except StopIteration:
if args.ignore_extra_classes:
relabel[label] = ""
else:
stderr.write("Found too many different classes;"
" skipping example. Use "
"--multiclass or "
"--ignore-extra-classes.\n")
continue
label = relabel[label]

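            # Build one example in Vowpal Wabbit input format:
            #   <label> <tag>| <name>:<value> <name>:<value> ...
            # Features whose value is listed in --null (default "0") are omitted.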
features = " ".join([k + ":" + v for k, v in sorted(row.items())
if v not in null_values])
line = label + " " + tag + "| " + features + "\n"
stdout.write(line)
stdout.flush()
except (IOError, KeyboardInterrupt, BrokenPipeError):
stderr.close()

if __name__ == "__main__":
exit(main())
2 changes: 2 additions & 0 deletions scripts/drake
@@ -0,0 +1,2 @@
#!/bin/bash
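# drake: run Drake (a make-like tool for data workflows) through drip, a JVM
# launcher that reuses a warm JVM to cut startup time. Assumes DRAKEPATH points
# to the directory containing drake.jar.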
drip -cp ${DRAKEPATH}/drake.jar drake.core "$@"
17 changes: 17 additions & 0 deletions scripts/dseq
@@ -0,0 +1,17 @@
#!/usr/bin/env bash
# dseq: generate sequence of dates relative to today.
#
# Usage: dseq LAST
# or: dseq FIRST LAST
# or: dseq FIRST INCREMENT LAST
#
# Example usage:
# $ dseq 1 # tomorrow
# $ dseq 0 0 # today
# $ dseq 7 # next 7 days
# $ dseq -2 0 # day before yesterday till today
# $ dseq 1 7 365 # tomorrow and then every week for a year
#
# Author: Jeroen Janssens

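# Each input number becomes a relative date string such as "7 day", which
# GNU date (--file reads one date specification per line) resolves against
# today and prints as YYYY-MM-DD (+%F).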
seq -f "%g day" "$@" | date --file - +%F
