Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
added the scripts from data-science-toolbox
- Loading branch information
1 parent
1d916f7
commit b0dc901
Showing
21 changed files
with
859 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
#!/bin/bash | ||
# Rio: Load CSV from stdin into R as a data.frame, execute given commands, and get the output as CSV or PNG on stdout | ||
# | ||
# Example usage: | ||
# $ seq 100 | Rio -nf sum (same as Rio -ne 'sum(df)') | ||
# | ||
# $ curl -s 'https://raw.githubusercontent.com/pydata/pandas/master/pandas/tests/data/iris.csv' > iris.csv | ||
# $ < iris.csv Rio -e 'df$SepalLength^2' | ||
# $ < iris.csv Rio -f summary | ||
# $ < iris.csv Rio -se 'sqldf("select Name from df where df.SepalLength > 7")' | ||
# $ < iris.csv Rio -ge 'g+geom_point(aes(x=SepalLength,y=SepalWidth,colour=Name))' > iris.png | ||
# | ||
# Dependency: R (with optionally the R packages ggplot2, dplyr, tidyr, and sqldf) | ||
# | ||
# Author: http://jeroenjanssens.com | ||
|
||
# usage: print the help text (one-line description, synopsis and option list) to stdout. | ||
# The text is a fixed here-document; it contains no variable expansions. | ||
usage() { | ||
cat << EOF | ||
Rio: Load CSV from stdin into R as a data.frame, execute given commands, and get the output as CSV on stdout | ||
usage: Rio OPTIONS | ||
OPTIONS: | ||
-d Delimiter | ||
-e Commands to execute | ||
-f Single command to execute on data.frame | ||
-h Show this message | ||
-g Import ggplot2 | ||
-n CSV has no header | ||
-r Import dplyr and tidyr | ||
-s Import sqldf | ||
-b Use same settings as used for book Data Science at the Command Line | ||
-v Verbose | ||
EOF | ||
} | ||
|
||
# finish: EXIT-trap handler that removes the temporary files used by Rio.
#
# Deletes the input copy ($IN), the output file ($OUT) and the two files whose
# names are $OUT and $ERR without their ".png"/".err" suffix. The error log
# ($ERR) itself is deleted only when it is empty, so a non-empty log is left
# on disk. Finally removes Rplots.pdf from the working directory (presumably
# a by-product of R's default graphics device).
finish() {
    # Every expansion is quoted so that a temp directory containing spaces or
    # glob characters cannot make rm operate on unrelated paths.
    rm -f -- "$IN" "$OUT" "${OUT%.png}" "${ERR%.err}"

    ## Removes error file if error file is empty.
    if [[ ! -s "$ERR" ]]; then
        rm -f -- "$ERR"
    fi

    rm -f Rplots.pdf
}
|
||
trap finish EXIT | ||
|
||
# callR: run the generated one-line R program with Rscript --vanilla. | ||
# The program reads the file ${IN} into the data.frame `df` (header and separator | ||
# taken from ${HEADER} and ${DELIMITER}), evaluates ${REQUIRES} followed by ${SCRIPT}, | ||
# and writes the last value to ${OUT} depending on its type: | ||
#   - matrix (converted to data.frame) or data.frame: CSV via write.table | ||
#   - vector: written with cat() using the (escaped) newline separator | ||
#   - ggplot object: saved with ggsave, dpi taken from $RIO_DPI (72 when unset) | ||
#   - anything else: print() output captured with sink | ||
# NOTE(review): the shell variables are interpolated into the R source unescaped, | ||
# so quotes inside them change the program that R sees. | ||
callR() { | ||
Rscript --vanilla -e "options(scipen=999);df<-read.csv('${IN}',header=${HEADER},sep='${DELIMITER}',stringsAsFactors=F);${REQUIRES}${SCRIPT}last<-.Last.value;if(is.matrix(last)){last<-as.data.frame(last)};if(is.data.frame(last)){write.table(last,'${OUT}',sep=',',quote=T,qmethod='double',row.names=F,col.names=${HEADER});}else if(is.vector(last)){cat(last,sep='\\\n', file='${OUT}')}else if(exists('is.ggplot')&&is.ggplot(last)){ggsave('${OUT}',last,dpi=${RIO_DPI-72},units='cm',width=20,height=15);}else{sink('${OUT}');print(last);}" | ||
} | ||
|
||
# Defaults; each may be overridden by the command-line options parsed below.
SCRIPT=
REQUIRES=
DELIMITER=","
HEADER="T"
VERBOSE=false

# OSX `mktemp' requires a temp file template, but Linux `mktemp' has it as optional.
# This explicitly uses a template, which works for both. TMPDIR falls back to
# /tmp in case it isn't set as an environment variable.
if [ -z "$TMPDIR" ]; then
    TMPDIR=/tmp
fi

# The templates are quoted so that a TMPDIR containing whitespace still works.
# mktemp creates the bare Rio-XXXXXXXX file; OUT and ERR are that name with a
# suffix appended, so the suffixed files only come into existence when written.
IN=$(mktemp "${TMPDIR}/Rio-XXXXXXXX")
OUT=$(mktemp "${TMPDIR}/Rio-XXXXXXXX").png
ERR=$(mktemp "${TMPDIR}/Rio-XXXXXXXX").err
|
||
# Parse the command-line options into the variables that callR interpolates
# into the R program (SCRIPT, REQUIRES, DELIMITER, HEADER, RIO_DPI) and VERBOSE.

# Matches a script whose very last character, ignoring trailing spaces, is ';'.
SEMICOLON_AT_END='; *$'

while getopts "d:hgnprsve:f:b" OPTION
do
    case $OPTION in
        b)
            RIO_DPI=300
            ;;
        d)
            DELIMITER=$OPTARG
            ;;
        e)
            SCRIPT=$OPTARG
            # Terminate the script with ';' because more R code is appended
            # directly after it. The test is done in the shell on the quoted
            # value: piping an unquoted $SCRIPT through echo would glob-expand
            # characters such as '*', and grep would accept a ';' at the end
            # of any line rather than at the end of the whole script.
            if [[ ! $SCRIPT =~ $SEMICOLON_AT_END ]]
            then
                SCRIPT="${SCRIPT};"
            fi
            ;;
        f)
            SCRIPT="${OPTARG}(df);"
            ;;
        h)
            usage
            exit 1
            ;;
        g)
            REQUIRES="${REQUIRES}require(ggplot2);g<-ggplot(df);"
            ;;
        n)
            HEADER="F"
            ;;
        r)
            REQUIRES="${REQUIRES}require(dplyr);require(tidyr);"
            ;;
        s)
            REQUIRES="${REQUIRES}require(sqldf);"
            ;;
        v)
            VERBOSE=true
            ;;
        ?)
            # Any option without a branch above, including the '?' that
            # getopts reports for an invalid option: show the help and fail
            # with a non-zero status.
            usage
            exit 1
            ;;
    esac
done
|
||
# Copy stdin to a file so that R can read it by path.
cat /dev/stdin > "$IN"

if $VERBOSE
then
    # Verbose: R's own output goes straight to the caller's stdout/stderr.
    callR
else
    # Quiet: keep everything R prints in the error log.
    callR > "$ERR" 2>&1
fi

# Emit the result, or R's log when there is no usable result: the output file
# is missing, or it holds just "NULL". Only the first bytes are inspected (NUL
# bytes dropped) so that a binary PNG is never loaded into a shell variable.
# The log is shown only if it exists, which is not the case in verbose mode.
if [[ -f "$OUT" && "$(head -c 16 "$OUT" | tr -d '\0')" != "NULL" ]]; then
    cat "$OUT"
elif [[ -f "$ERR" ]]; then
    cat "$ERR"
fi
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
#!/bin/bash
# Rio-mds: classical multidimensional scaling (cmdscale, k=2) of the CSV on stdin.
# Outputs the two coordinates followed by the non-numeric columns.
# Distances are computed on the numeric columns only (df[n]); handing the whole
# data.frame to dist() would coerce text columns to NA and distort the distances.
Rio -e 'n<-sapply(df,is.numeric);fit<-cmdscale(dist(df[n]),eig=TRUE,k=2);points<-as.data.frame(fit$points);cbind(points,df[!n])'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
#!/bin/bash | ||
# Rio-pca: run prcomp (scale=T) on the numeric columns of the data.frame that Rio | ||
# loads, and output the component scores ($x) followed by the non-numeric columns. | ||
Rio -e 'n<-sapply(df,is.numeric);cbind(as.data.frame(prcomp(df[n],scale=T)$x),df[!n])' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
#!/usr/bin/env bash | ||
|
||
# Rio-scatter: create scatter plot from CSV | ||
# | ||
# Default colour is 1 (blue) | ||
# | ||
# Example usage: | ||
# curl 'https://raw.githubusercontent.com/pydata/pandas/master/pandas/tests/data/iris.csv' | Rio-scatter SepalLength SepalWidth Name | display | ||
# | ||
# Dependency: Rio | ||
# | ||
# Author: Jeroen Janssens (http://jeroenjanssens.com) | ||
|
||
# X and Y name the columns to plot and are required; COLOR is optional.
# Without this check the missing names would be pasted into the R expression
# as empty strings and only surface as an obscure R error.
if [ "$#" -lt 2 ]; then
    echo "usage: Rio-scatter X Y [COLOR]" >&2
    exit 1
fi

X="$1"
Y="$2"
COLOR="${3:-1}"
Rio -ge "g+geom_point(aes(x=${X},y=${Y},color=${COLOR}))"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
#!/usr/bin/env bash | ||
# Runs the Weka class core.converters.CSVSaver with standard input as its input file (-i). | ||
# NOTE(review): presumably converts ARFF on stdin to CSV on stdout, and relies on a | ||
# `weka` launcher being on PATH -- confirm. | ||
weka core.converters.CSVSaver -i /dev/stdin |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
#!/usr/bin/env bash | ||
# | ||
# body: apply expression to all but the first line. | ||
# Use multiple times in case the header spans more than one line. | ||
# | ||
# Example usage: | ||
# $ seq 10 | header -a 'values' | body sort -nr | ||
# $ seq 10 | header -a 'multi\nline\nheader' | body body body sort -nr | ||
# | ||
# From: http://unix.stackexchange.com/a/11859 | ||
# | ||
# See also: header (https://github.com/jeroenjanssens/command-line-tools-for-data-science) | ||
# Pass the first input line through untouched. `read` returns non-zero at EOF,
# so the -n test still accepts a final line that lacks a trailing newline,
# while completely empty input no longer produces a spurious blank header.
if IFS= read -r header || [ -n "$header" ]; then
    printf '%s\n' "$header"
fi
# The remaining stdin is consumed by the given command. eval is kept so that
# the arguments may form a shell expression; quoting "$@" avoids an extra
# round of word splitting and globbing before eval parses the text.
eval "$@"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
#!/usr/bin/env bash | ||
# cols: apply a command to a subset of the columns and merge back with the remaining columns. | ||
# | ||
# Assumes that the input data is comma-delimited and that it has a header. | ||
# Depends on csvcut, which is part of csvkit: http://csvkit.readthedocs.org | ||
# | ||
# Example usage 1: reverse sort column 'a' | ||
# $ echo -e 'a,b\n1,2\n3,4\n5,6' | cols -c a body sort -nr | ||
# | ||
# Example usage 2: apply PCA (using tapkee) to all numerical features (-C selects all but the specified columns) of the Iris data set: | ||
# $ < iris.csv cols -C species body tapkee --method pca | header -r x,y,species | ||
# | ||
# See also: header and body (https://github.com/jeroenjanssens/command-line-tools-for-data-science) | ||
# | ||
# Author: http://jeroenjanssens.com | ||
|
||
# First argument: the csvcut selection flag (-c or -C). ARG_INV is the opposite
# flag and selects the remaining columns.
ARG="$1"
ARG_INV="$(tr cC Cc <<< "${ARG}")"
shift
# Second argument: the column list. It is deliberately not stored in COLUMNS,
# because bash uses that variable for the terminal width and may overwrite it.
SELECTED="$1"
shift
# Everything else is the command to apply to the selected columns.
EXPR="$*"

# Remove the temporary copy of the input on exit.
finish() {
    rm -f -- "$OTHER_COLUMNS"
}
trap finish EXIT

if [ -z "$TMPDIR" ]; then
    TMPDIR=/tmp
fi
OTHER_COLUMNS=$(mktemp "${TMPDIR}/cols-XXXXXXXX")

# Store all of stdin before starting the pipeline. Both csvcut invocations then
# read a complete file; reading it while it was still being written could
# truncate the columns that are pasted back.
cat > "$OTHER_COLUMNS"

csvcut "$ARG" "$SELECTED" "$OTHER_COLUMNS" | eval "${EXPR}" | paste -d, - <(csvcut "${ARG_INV}" "$SELECTED" "$OTHER_COLUMNS")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
#!/usr/bin/env bash | ||
# Runs the Weka class core.converters.CSVLoader on standard input (/dev/stdin). | ||
# NOTE(review): presumably converts CSV on stdin to ARFF on stdout, and relies on a | ||
# `weka` launcher being on PATH -- confirm. | ||
weka core.converters.CSVLoader /dev/stdin |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import argparse | ||
import csv | ||
from sys import stdin, stdout, stderr, exit | ||
import itertools | ||
|
||
|
||
def main():
    """Convert CSV rows to Vowpal Wabbit input lines on standard output.

    Reads the CSV file named on the command line (standard input when
    omitted) and writes one line per row in the form
    ``<label> <tag>| <name>:<value> ...``. Features are sorted by column
    name, and features whose value is one of the configured null values are
    left out.

    Labels are rewritten only when ``--classes`` and/or ``--auto-relabel`` is
    given; ``--multiclass`` and ``--minus-plus-one`` merely choose which new
    labels are handed out. A label that cannot be mapped causes the row to be
    skipped with a message on standard error, or to be written with an empty
    label when ``--ignore-extra-classes`` is set.

    Returns None, so ``exit(main())`` terminates with status 0.
    """
    # Look the standard streams up at call time instead of using the objects
    # bound when the module was imported, so that redirection is honoured.
    import sys

    parser = argparse.ArgumentParser(
        epilog="""If both --classes and --auto-relabel are omitted,
        label values are left as-is. By default, features with value 0 are not
        printed. This can be overridden with --null""",
        usage="""%(prog)s [OPTION]... [FILE]
Convert CSV to Vowpal Wabbit input format.
Examples:
# Leave label values as is:
$ csv2vw spam.csv --label target
# Relabel values 'ham' to 0 and 'spam' to 1:
$ csv2vw spam.csv --label target --classes ham,spam
# Relabel values 'ham' to -1 and 'spam' to +1 (needed for logistic loss):
$ csv2vw spam.csv --label target --classes ham,spam --minus-plus-one
# Relabel first label value to 0, second to 1, and ignore the rest:
$ csv2vw iris.csv -lspecies --auto-relabel --ignore-extra-classes
# Relabel first label value to 1, second to 2, and so on:
$ <iris.csv csv2vw -lspecies --multiclass --auto-relabel
# Relabel 'versicolor' to 1, 'virginica' to 2, and 'setosa' to 3
$ <iris.csv csv2vw -lspecies --multiclass -cversicolor,virginica,setosa""")

    parser.add_argument("file", nargs="?", type=argparse.FileType("r"),
                        default=stdin,
                        help="""Input CSV file. If omitted,
                        read from standard input.""",
                        metavar="FILE")
    parser.add_argument("-d", "--delimiter",
                        help="""Delimiting character of the input CSV file
                        (default: ,).""",
                        default=",")
    parser.add_argument("-l", "--label",
                        help="""Name of column that contains the class
                        labels.""")
    parser.add_argument("-c", "--classes",
                        help="""Ordered, comma-separated list of possible
                        class labels to relabel. If not specifying all possible
                        class labels, use --auto-relabel.""",
                        nargs="?")
    parser.add_argument("-n", "--null",
                        help="""Comma-separated list of null values (default:
                        '0').""",
                        nargs="?", default="0")
    parser.add_argument("-a", "--auto-relabel",
                        help="""Automatically relabel class labels in the order
                        in which they appear in the CSV file.""",
                        action="store_true")
    parser.add_argument("-m", "--multiclass",
                        help="""Indicates more than two classes; will start
                        counting at 1 instead of 0.""",
                        action="store_true")
    parser.add_argument("-+", "--minus-plus-one",
                        help="""Instead of relabeling to integers, relabel to
                        '-1' and '+1'. Needed when using VW with logistic or
                        hinge loss.""", action="store_true")
    parser.add_argument("-i", "--ignore-extra-classes",
                        help="""If there are more than two classes found, when
                        not using --multiclass, include the example with no
                        label instead of skipping it.""",
                        action="store_true")
    parser.add_argument("-t", "--tag",
                        help="""Name of column that contains the tags.""")

    args = parser.parse_args()

    auto_relabel = args.auto_relabel
    label_column = args.label
    tag_column = args.tag
    ignore_extra = args.ignore_extra_classes
    # "--null" without a value is parsed as None: treat it as "no null values"
    # so that every feature is printed.
    null_values = set(args.null.split(",")) if args.null is not None else set()

    # Source of replacement labels, consumed in order.
    if args.minus_plus_one:
        new_classes = iter(["-1", "+1"])
    elif args.multiclass:
        new_classes = (str(i) for i in itertools.count(1))
    else:
        new_classes = iter(["0", "1"])

    # As the epilog states, labels are left as-is unless --classes or
    # --auto-relabel was given, whatever the other flags are.
    relabeling = bool(args.classes) or auto_relabel
    relabel = {}
    if args.classes:
        relabel = dict(zip(args.classes.split(","), new_classes))

    reader = csv.DictReader(args.file, delimiter=args.delimiter)
    out = sys.stdout
    try:
        for row in reader:
            # A column that was not requested contributes an empty string;
            # so does a requested column that is missing from a short row.
            label = row.pop(label_column, None) if label_column is not None else None
            tag = row.pop(tag_column, None) if tag_column is not None else None
            label = label or ""
            tag = tag or ""

            if relabeling:
                if label not in relabel:
                    # Only --auto-relabel may hand out labels for values
                    # that were not listed in --classes.
                    replacement = next(new_classes, None) if auto_relabel else None
                    if replacement is not None:
                        relabel[label] = replacement
                    elif ignore_extra:
                        relabel[label] = ""
                    elif auto_relabel:
                        sys.stderr.write("Found too many different classes;"
                                         " skipping example. Use "
                                         "--multiclass or "
                                         "--ignore-extra-classes.\n")
                        continue
                    else:
                        sys.stderr.write("Found class label not listed in "
                                         "--classes; skipping example. Use "
                                         "--auto-relabel or "
                                         "--ignore-extra-classes.\n")
                        continue
                label = relabel[label]

            # DictReader stores surplus fields under the key None and pads
            # short rows with the value None; neither is a usable feature.
            pairs = sorted((name, value) for name, value in row.items()
                           if name is not None and value is not None)
            features = " ".join(name + ":" + value for name, value in pairs
                                if value not in null_values)
            out.write(label + " " + tag + "| " + features + "\n")
            # Flushed per row so that output reaches a downstream consumer
            # without waiting for the buffer to fill.
            out.flush()
    except (IOError, KeyboardInterrupt, BrokenPipeError):
        # Best effort on interruption or a closed pipe: stop quietly.
        # NOTE(review): closing stderr presumably silences the interpreter's
        # shutdown message -- confirm.
        sys.stderr.close()
|
||
if __name__ == "__main__": | ||
exit(main()) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
#!/bin/bash
# Start drake.core through drip, using drake.jar from $DRAKEPATH as the class
# path, and pass all arguments through unchanged. The class path is quoted so
# that a DRAKEPATH containing spaces is not split into several words.
drip -cp "${DRAKEPATH}/drake.jar" drake.core "$@"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
#!/usr/bin/env bash | ||
# dseq: generate sequence of dates relative to today. | ||
# | ||
# Usage: dseq LAST | ||
# or: dseq FIRST LAST | ||
# or: dseq FIRST INCREMENT LAST | ||
# | ||
# Example usage: | ||
# $ dseq 1 # tomorrow | ||
# $ dseq 0 0 # today | ||
# $ dseq 7 # next 7 days | ||
# $ dseq -2 0 # day before yesterday till today | ||
# $ dseq 1 7 365 # tomorrow and then every week for a year | ||
# | ||
# Author: Jeroen Janssens | ||
|
||
# seq turns its arguments into lines such as "1 day"; date reads those lines from | ||
# stdin (--file -, a GNU date option) and prints each one in %F (YYYY-MM-DD) format. | ||
seq -f "%g day" "$@" | date --file - +%F |
Oops, something went wrong.