From b0dc9018b1c0d02103ba06c976cce8df9351dfd4 Mon Sep 17 00:00:00 2001
From: Madhu Akula
Date: Tue, 6 Sep 2016 14:45:56 +0530
Subject: [PATCH] added the scripts from data-science-toolbox

---
 scripts/Rio          | 135 +++++++++++++++++++++++++++++++++++++++++++
 scripts/Rio-mds      |   2 +
 scripts/Rio-pca      |   2 +
 scripts/Rio-scatter  |  17 ++++++
 scripts/arff2csv     |   2 +
 scripts/body         |  15 +++++
 scripts/cols         |  34 +++++++++++
 scripts/csv2arff     |   2 +
 scripts/csv2vw       | 134 ++++++++++++++++++++++++++++++++++++++++++
 scripts/drake        |   2 +
 scripts/dseq         |  17 ++++++
 scripts/dumbplot     |  83 ++++++++++++++++++++++++++
 scripts/explain      |  25 ++++++++
 scripts/header       |  92 +++++++++++++++++++++++++++++
 scripts/pbc          |  10 ++++
 scripts/sample       |  94 ++++++++++++++++++++++++++++++
 scripts/scrape       |  70 ++++++++++++++++++++++
 scripts/servewd      |   3 +
 scripts/unpack       |  44 ++++++++++++++
 scripts/weka         |  65 +++++++++++++++++++++
 scripts/weka-cluster |  11 ++++
 21 files changed, 859 insertions(+)
 create mode 100755 scripts/Rio
 create mode 100755 scripts/Rio-mds
 create mode 100755 scripts/Rio-pca
 create mode 100755 scripts/Rio-scatter
 create mode 100755 scripts/arff2csv
 create mode 100755 scripts/body
 create mode 100755 scripts/cols
 create mode 100755 scripts/csv2arff
 create mode 100755 scripts/csv2vw
 create mode 100755 scripts/drake
 create mode 100755 scripts/dseq
 create mode 100755 scripts/dumbplot
 create mode 100755 scripts/explain
 create mode 100755 scripts/header
 create mode 100755 scripts/pbc
 create mode 100755 scripts/sample
 create mode 100755 scripts/scrape
 create mode 100755 scripts/servewd
 create mode 100755 scripts/unpack
 create mode 100755 scripts/weka
 create mode 100755 scripts/weka-cluster

diff --git a/scripts/Rio b/scripts/Rio
new file mode 100755
index 0000000..badf4bd
--- /dev/null
+++ b/scripts/Rio
@@ -0,0 +1,135 @@
+#!/bin/bash
+# Rio: Load CSV from stdin into R as a data.frame, execute given commands, and get the output as CSV or PNG on stdout
+#
+# Example usage:
+# $ seq 100 | Rio -nf sum (same as Rio -ne 'sum(df)')
+#
+# $ curl -s 'https://raw.githubusercontent.com/pydata/pandas/master/pandas/tests/data/iris.csv' > iris.csv
+# $ < iris.csv Rio -e 'df$SepalLength^2'
+# $ < iris.csv Rio -f summary
+# $ < iris.csv Rio -se 'sqldf("select Name from df where df.SepalLength > 7")'
+# $ < iris.csv Rio -ge 'g+geom_point(aes(x=SepalLength,y=SepalWidth,colour=Name))' > iris.png
+#
+# Dependency: R (with optionally the R packages ggplot2, dplyr, tidyr, and sqldf)
+#
+# Author: http://jeroenjanssens.com
+
+usage() {
+cat << EOF
+Rio: Load CSV from stdin into R as a data.frame, execute given commands, and get the output as CSV on stdout
+
+usage: Rio OPTIONS
+
+OPTIONS:
+  -d      Delimiter
+  -e      Commands to execute
+  -f      Single command to execute on data.frame
+  -h      Show this message
+  -g      Import ggplot2
+  -n      CSV has no header
+  -r      Import dplyr and tidyr
+  -s      Import sqldf
+  -b      Use same settings as used for book Data Science at the Command Line
+  -v      Verbose
+
+EOF
+}
+
+finish() {  # EXIT trap: remove the temporary files created below
+    rm -f "$IN" "$OUT" "${OUT%.png}" "${ERR%.err}"
+
+    ## Removes error file if error file is empty (a non-empty one is left behind).
+    if [[ ! -s $ERR ]]; then
+        rm -f "$ERR"
+    fi
+
+    rm -f Rplots.pdf
+}
+
+trap finish EXIT
+
+callR() {  # run the assembled R program; it writes the value of the last expression to $OUT
+    Rscript --vanilla -e "options(scipen=999);df<-read.csv('${IN}',header=${HEADER},sep='${DELIMITER}',stringsAsFactors=F);${REQUIRES}${SCRIPT}last<-.Last.value;if(is.matrix(last)){last<-as.data.frame(last)};if(is.data.frame(last)){write.table(last,'${OUT}',sep=',',quote=T,qmethod='double',row.names=F,col.names=${HEADER});}else if(is.vector(last)){cat(last,sep='\\\n', file='${OUT}')}else if(exists('is.ggplot')&&is.ggplot(last)){ggsave('${OUT}',last,dpi=${RIO_DPI-72},units='cm',width=20,height=15);}else{sink('${OUT}');print(last);}"
+}
+
+SCRIPT=
+REQUIRES=
+DELIMITER=","
+HEADER="T"
+VERBOSE=false
+
+# OSX `mktemp' requires a temp file template, but Linux `mktemp' has it as optional.
+# This explicitly uses a template, which works for both. The $TMPDIR is in case
+# it isn't set as an environment variable, assumes you have /tmp.
+if [ -z "$TMPDIR" ]; then
+    TMPDIR=/tmp
+fi
+IN=$(mktemp "${TMPDIR%/}/Rio-XXXXXXXX")
+OUT=$(mktemp "${TMPDIR%/}/Rio-XXXXXXXX").png
+ERR=$(mktemp "${TMPDIR%/}/Rio-XXXXXXXX").err
+
+while getopts "d:hgnprsve:f:b" OPTION
+do
+    case $OPTION in
+        b)
+            RIO_DPI=300
+            ;;
+        d)
+            DELIMITER=$OPTARG
+            ;;
+        e)
+            SCRIPT=$OPTARG
+            if ! printf '%s\n' "$SCRIPT" | grep -qe "; *$"
+            then
+                SCRIPT="${SCRIPT};"
+            fi
+            ;;
+        f)
+            SCRIPT="${OPTARG}(df);"
+            ;;
+        h)
+            usage
+            exit 1
+            ;;
+        g)
+            REQUIRES="${REQUIRES}require(ggplot2);g<-ggplot(df);"
+            ;;
+        n)
+            HEADER="F"
+            ;;
+        r)
+            REQUIRES="${REQUIRES}require(dplyr);require(tidyr);"
+            ;;
+        s)
+            REQUIRES="${REQUIRES}require(sqldf);"
+            ;;
+        v)
+            VERBOSE=true
+            ;;
+        *)  # anything else (unknown option, or the unhandled -p) is a usage error
+            usage
+            exit 1
+            ;;
+    esac
+done
+
+cat > "$IN"
+
+if $VERBOSE
+then
+    callR
+else
+    callR > "$ERR" 2>&1
+fi
+
+if [[ ! -f $OUT ]]; then
+    cat "$ERR"
+else
+    RESULT="$(cat "$OUT")"
+    if [ "$RESULT" == "NULL" ]; then
+        cat "$ERR"
+    else
+        cat "$OUT"
+    fi
+fi
+
diff --git a/scripts/Rio-mds b/scripts/Rio-mds
new file mode 100755
index 0000000..6ab1a10
--- /dev/null
+++ b/scripts/Rio-mds
@@ -0,0 +1,2 @@
+#!/bin/bash
+Rio -e 'n<-sapply(df,is.numeric);fit<-cmdscale(dist(df[n]),eig=TRUE,k=2);points<-as.data.frame(fit$points);cbind(points,df[!n])' # 2-D classical MDS of the numeric columns; other columns are appended
diff --git a/scripts/Rio-pca b/scripts/Rio-pca
new file mode 100755
index 0000000..68f1c4d
--- /dev/null
+++ b/scripts/Rio-pca
@@ -0,0 +1,2 @@
+#!/bin/bash
+Rio -e 'n<-sapply(df,is.numeric);cbind(as.data.frame(prcomp(df[n],scale=T)$x),df[!n])' # PCA scores of the numeric columns; other columns are appended
diff --git a/scripts/Rio-scatter b/scripts/Rio-scatter
new file mode 100755
index 0000000..5ad40fc
--- /dev/null
+++ b/scripts/Rio-scatter
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+# Rio-scatter: create scatter plot from CSV
+#
+# Default colour is 1 (blue)
+#
+# Example usage:
+# curl 'https://raw.githubusercontent.com/pydata/pandas/master/pandas/tests/data/iris.csv' | Rio-scatter SepalLength SepalWidth Name | display
+#
+# Dependency: Rio
+#
+# Author: Jeroen Janssens (http://jeroenjanssens.com)
+
+X="$1"
+Y="$2"
+COLOR="${3:-1}"
+Rio -ge "g+geom_point(aes(x=${X},y=${Y},color=${COLOR}))"
diff --git a/scripts/arff2csv b/scripts/arff2csv
new file mode 100755
index 0000000..32f00e6
--- /dev/null
+++ b/scripts/arff2csv
@@ -0,0 +1,2 @@
+#!/usr/bin/env bash
+weka core.converters.CSVSaver -i /dev/stdin # runs Weka's CSVSaver on the data arriving on stdin
diff --git a/scripts/body b/scripts/body
new file mode 100755
index 0000000..7b106a9
--- /dev/null
+++ b/scripts/body
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+#
+# body: apply expression to all but the first line.
+# Use multiple times in case the header spans more than one line.
+#
+# Example usage:
+# $ seq 10 | header -a 'values' | body sort -nr
+# $ seq 10 | header -a 'multi\nline\nheader' | body body body sort -nr
+#
+# From: http://unix.stackexchange.com/a/11859
+#
+# See also: header (https://github.com/jeroenjanssens/command-line-tools-for-data-science)
+IFS= read -r header
+printf '%s\n' "$header"
+eval "$@" # quoted: the command is parsed once by eval, not word-split and globbed beforehand
diff --git a/scripts/cols b/scripts/cols
new file mode 100755
index 0000000..6f68153
--- /dev/null
+++ b/scripts/cols
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+# cols: apply a command to a subset of the columns and merge back with the remaining columns.
+#
+# Assumes that the input data is comma-delimited and that it has a header.
+# Depends on csvcut, which is part of csvkit: http://csvkit.readthedocs.org
+#
+# Example usage 1: reverse sort column 'a'
+# $ echo 'a,b\n1,2\n3,4\n5,6' | cols -c a body sort -nr
+#
+# Example usage 2: apply PCA (using tapkee) to all numerical features (-C selects all but the specified columns) of the Iris data set:
+# $ < iris.csv cols -C species body tapkee --method pca | header -r x,y,species
+#
+# See also: header and body (https://github.com/jeroenjanssens/command-line-tools-for-data-science)
+#
+# Author: http://jeroenjanssens.com
+
+ARG="$1"
+ARG_INV="$(tr cC Cc <<< ${ARG})"
+shift
+COL_NAMES="$1" # not named COLUMNS: bash may overwrite that variable with the terminal width
+shift
+EXPR="$@"
+
+finish() {
+    rm -f "$OTHER_COLUMNS"
+}
+trap finish EXIT
+
+if [ -z "$TMPDIR" ]; then
+    TMPDIR=/tmp
+fi
+OTHER_COLUMNS=$(mktemp "${TMPDIR}/cols-XXXXXXXX")
+# Keep a copy of the input so the columns that were not selected can be pasted back on.
+tee "$OTHER_COLUMNS" | csvcut $ARG "$COL_NAMES" | eval ${EXPR} | paste -d, - <(csvcut ${ARG_INV} "$COL_NAMES" "$OTHER_COLUMNS")
diff --git a/scripts/csv2arff b/scripts/csv2arff
new file mode 100755
index 0000000..ac3f365
--- /dev/null
+++ b/scripts/csv2arff
@@ -0,0 +1,2 @@
+#!/usr/bin/env bash
+weka core.converters.CSVLoader /dev/stdin # runs Weka's CSVLoader on the data arriving on stdin
diff --git a/scripts/csv2vw b/scripts/csv2vw
new file mode 100755
index 0000000..e929b76
--- /dev/null
+++ b/scripts/csv2vw
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+
+import argparse
+import csv
+from sys import stdin, stdout, stderr, exit
+import itertools
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        epilog="""If both --classes and --auto-relabel are omitted,
+    label values are left as-is. By default, features with value 0 are not
+    printed. This can be overridden with --null""",
+        usage="""%(prog)s [OPTION]... [FILE]
+
+Convert CSV to Vowpal Wabbit input format.
+
+    Examples:
+
+    # Leave label values as is:
+    $ csv2vw spam.csv --label target
+
+    # Relabel values 'ham' to 0 and 'spam' to 1:
+    $ csv2vw spam.csv --label target --classes ham,spam
+
+    # Relabel values 'ham' to -1 and 'spam' to +1 (needed for logistic loss):
+    $ csv2vw spam.csv --label target --classes ham,spam --minus-plus-one
+
+    # Relabel first label value to 0, second to 1, and ignore the rest:
+    $ csv2vw iris.csv -lspecies --auto-relabel --ignore-extra-classes
+
+    # Relabel first label value to 1, second to 2, and so on:
+    $ a, pre' | sed -re 's/<(\/?)[^>]*>//g'
+elif [[ "$SYSTEM" == "Darwin" ]]
+then
+    curl -s "${URL}" | scrape -e 'span.dropdown > a, pre' | sed -Ee 's/<(\/?)[^>]*>//g'
+fi
+
+
+
+
diff --git a/scripts/header b/scripts/header
new file mode 100755
index 0000000..add268e
--- /dev/null
+++ b/scripts/header
@@ -0,0 +1,92 @@
+#!/usr/bin/env bash
+# header: add, replace, and delete header lines.
+#
+# Example usage:
+# $ seq 10 | header -a 'values'
+# $ seq 10 | header -a 'VALUES' | header -e 'tr "[:upper:]" "[:lower:]"'
+# $ seq 10 | header -a 'values' | header -d
+# $ seq 10 | header -a 'multi\nline' | header -n 2 -e "paste -sd_"
+#
+# See also: body (https://github.com/jeroenjanssens/command-line-tools-for-data-science)
+#
+# Author: http://jeroenjanssens.com
+
+usage () {
+cat << EOF
+header: add, replace, and delete header lines.
+
+usage: header OPTIONS
+
+OPTIONS:
+  -n      Number of lines to consider as header [default: 1]
+  -a      Add header
+  -r      Replace header
+  -e      Apply expression to header
+  -d      Delete header
+  -h      Show this message
+
+Example usage:
+  $ seq 10 | header -a 'values'
+  $ seq 10 | header -a 'VALUES' | header -e 'tr "[:upper:]" "[:lower:]"'
+  $ seq 10 | header -a 'values' | header -d
+  $ seq 10 | header -a 'multi\nline' | header -n 2 -e "paste -sd_"
+
+See also: body
+EOF
+}
+
+get_header () {  # read NUMROWS lines from stdin into OLDHEADER, each followed by a literal \n
+    for i in $(seq "$NUMROWS"); do
+        IFS= read -r LINE
+        OLDHEADER="${OLDHEADER}${LINE}\n"
+    done
+}
+
+print_header () {  # echo -e expands the \n (and other backslash escapes) in its argument
+    echo -ne "$1"
+}
+
+print_body () {  # copy the rest of stdin through unchanged
+    cat
+}
+
+OLDHEADER=
+NUMROWS=1
+
+while getopts "dn:ha:r:e:" OPTION
+do
+    case $OPTION in
+        n)
+            NUMROWS=$OPTARG
+            ;;
+        a)
+            print_header "$OPTARG\n"
+            print_body
+            exit 0
+            ;;
+        d)
+            get_header
+            print_body
+            exit 0
+            ;;
+        r)
+            get_header
+            print_header "$OPTARG\n"
+            print_body
+            exit 0
+            ;;
+        e)
+            get_header
+            print_header "$(echo -ne "$OLDHEADER" | eval "$OPTARG")\n"
+            print_body
+            exit 0
+            ;;
+        h)
+            usage
+            exit 1
+            ;;
+    esac
+done
+
+get_header
+print_header "${OLDHEADER}"
diff --git a/scripts/pbc b/scripts/pbc
new file mode 100755
index 0000000..6aaee21
--- /dev/null
+++ b/scripts/pbc
@@ -0,0 +1,10 @@
+#!/bin/bash
+# pbc: parallel bc. First column of input CSV is mapped to {1}, second to {2}, and so forth.
+#
+# Example usage: paste -d, <(seq 100) <(seq 100 -1 1) | ./pbc 'sqrt({1}*{2})'
+#
+# Dependency: GNU parallel
+#
+# Author: http://jeroenjanssens.com
+
+parallel -C, -k -j100% "echo '$1' | bc -l"
diff --git a/scripts/sample b/scripts/sample
new file mode 100755
index 0000000..50c72c3
--- /dev/null
+++ b/scripts/sample
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+# sample: Output lines from stdin to stdout with a given probability,
+# for a given duration, and with a given delay between lines.
+#
+# Example usage: seq 100 | sample -r 20% -d 1000
+#
+# Dependency: Python 2.7 or later (argparse, timedelta.total_seconds)
+#
+# Author: http://jeroenjanssens.com
+
+import os
+import argparse
+
+from random import random
+from time import time, sleep
+from sys import stdin, stdout
+from datetime import datetime, timedelta
+
+def total_seconds(delta):
+    return delta.total_seconds()  # float: keeps the sub-second part of the timedelta
+
+def main():  # parse the options, then copy sampled lines from FILE (or stdin) to stdout
+    parser = argparse.ArgumentParser(description=(
+        "Output lines from stdin to stdout with a given probability "
+        "for a given duration, and with a given delay between lines."))
+    parser.add_argument('file', nargs='?', type=argparse.FileType('r'),
+                        default=stdin, help="File", metavar="FILE")
+    parser.add_argument('-W', '--weeks', type=float, default=0,
+                        help="Duration of sampling in weeks")
+    parser.add_argument('-D', '--days', type=float, default=0,
+                        help="Duration of sampling in days")
+    parser.add_argument('-H', '--hours', type=float, default=0,
+                        help="Duration of sampling in hours")
+    parser.add_argument('-m', '--minutes', type=float, default=0,
+                        help="Duration of sampling in minutes")
+    parser.add_argument('-s', '--seconds', type=float, default=0,
+                        help="Duration of sampling in seconds")
+    parser.add_argument('-t', '--milliseconds', type=float, default=0,
+                        help="Duration of sampling in milliseconds")
+    parser.add_argument('-u', '--microseconds', type=float, default=0,
+                        help="Duration of sampling in microseconds")
+    parser.add_argument('-r', '--rate', default='100%',
+                        help="Rate between 0 and 1 using either 0.33, 33%%, 1/3 notation.")
+    parser.add_argument('-d', '--delay', default=0, type=int,
+                        help="Time in milliseconds between each line of output")
+    args = parser.parse_args()
+
+    invalid_rate_msg = ("Invalid rate. Please specify a rate between 0"
+                        " and 1 using either 0.33, 33%, 1/3 notation.")
+
+    try:
+        delay = float(args.delay) / 1000.0
+    except ValueError:
+        parser.error("Invalid delay. Please specify a delay in ms.")
+
+    try:
+        duration = total_seconds(timedelta(weeks=args.weeks, days=args.days,
+            hours=args.hours, minutes=args.minutes, seconds=args.seconds,
+            milliseconds=args.milliseconds, microseconds=args.microseconds))
+    except (TypeError, ValueError, OverflowError):
+        parser.error("Invalid duration.")
+
+    try:
+        if '%' in args.rate:
+            rate = float(args.rate[:-1]) / 100.0
+        elif '/' in args.rate:
+            a, b = map(float, args.rate.split('/')[:2])
+            rate = a / (1.0*b)
+        else:
+            rate = float(args.rate)
+    except (ValueError, ZeroDivisionError):  # e.g. 'abc', '1/x' or '1/0'
+        parser.error(invalid_rate_msg)
+
+    if rate <= 0 or rate > 1:
+        parser.error(invalid_rate_msg)
+
+    start = time()
+    try:  # NOTE(review): original indentation of this loop was lost; nesting below is assumed
+        while True:
+            line = args.file.readline()
+            if not line:
+                return
+            if random() <= rate:
+                stdout.write(line)
+                stdout.flush()
+            now = time()
+            if duration and (now-start) > duration:
+                return
+            sleep(delay)
+    except (KeyboardInterrupt, IOError):  # Ctrl-C or a closed pipe just ends the sampling
+        pass
+
+if __name__ == "__main__":
+    exit(main())
diff --git a/scripts/scrape b/scripts/scrape
new file mode 100755
index 0000000..60de259
--- /dev/null
+++ b/scripts/scrape
@@ -0,0 +1,70 @@
+#!/usr/bin/env python2
+# -*- coding: utf-8 -*-
+# scrape: Extract HTML elements using an XPath query or CSS3 selector.
+#
+# Example usage:
+# $ curl 'http://en.wikipedia.org/wiki/List_of_sovereign_states' -s \
+# | scrape -be 'table.wikitable > tr > td > b > a'
+#
+# Dependencies: lxml and optionally cssselect
+#
+# Author: http://jeroenjanssens.com
+
+import sys
+import argparse
+from lxml import etree
+
+
+def main():  # select elements from the HTML input and print them (or one attribute of each)
+    parser = argparse.ArgumentParser()
+    parser.add_argument('html', nargs='?', type=argparse.FileType('rb'),
+                        default=sys.stdin, help="HTML", metavar="HTML")
+    parser.add_argument('-a', '--argument', default="",
+                        help="argument to extract from tag")
+    parser.add_argument('-b', '--body', action='store_true', default=False,
+                        help="Enclose output with HTML and BODY tags")
+    parser.add_argument('-e', '--expression', default='*',
+                        help="XPath query or CSS3 selector")
+    parser.add_argument('-r', '--rawinput', action='store_true', default=False,
+                        help="Do not parse HTML before feeding etree (useful "
+                             "for escaping CData)")
+    args = parser.parse_args()
+
+    args.expression = args.expression.decode('utf-8')
+
+    if not args.expression.startswith('//'):  # anything not starting with // is treated as CSS
+        from cssselect import GenericTranslator, SelectorError
+        try:
+            expression = GenericTranslator().css_to_xpath(args.expression)
+        except SelectorError:
+            parser.error('Invalid CSS selector')
+    else:
+        expression = args.expression
+
+    html_parser = etree.HTMLParser(encoding='utf-8', recover=True,
+                                   strip_cdata=True)
+    if args.rawinput:
+        document = etree.fromstring(args.html.read())
+    else:
+        document = etree.parse(args.html, html_parser)
+
+    if args.body:
+        sys.stdout.write("<!DOCTYPE html>\n<html>\n<body>\n")
+
+    for e in document.xpath(expression):
+        try:
+            if not args.argument:
+                text = etree.tostring(e)
+            else:
+                text = e.get(args.argument)
+            if text is not None:
+                sys.stdout.write(text.encode('utf-8') + "\n")
+                sys.stdout.flush()
+        except IOError:
+            pass
+
+    if args.body:
+        sys.stdout.write("</body>\n</html>\n")
+
+if __name__ == "__main__":
+    exit(main())
diff --git a/scripts/servewd b/scripts/servewd
new file mode 100755
index 0000000..4e7e324
--- /dev/null
+++ b/scripts/servewd
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+# servewd: serve the working directory over HTTP; arguments are passed on to SimpleHTTPServer.
+python -m SimpleHTTPServer "$@"
diff --git a/scripts/unpack b/scripts/unpack
new file mode 100755
index 0000000..20b8b40
--- /dev/null
+++ b/scripts/unpack
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# unpack: Extract common file formats (only the first argument is processed)
+
+# Dependencies: unrar, unzip, p7zip-full
+
+# Author: Patrick Brisbin
+# From: http://linuxtidbits.wordpress.com/2009/08/04/week-of-bash-scripts-extract/
+
+# Display usage if no parameters given
+if [[ -z "$1" ]]; then
+    echo >&2 " ${0##*/} ARCHIVE - extract common file formats"
+    exit 1
+fi
+
+# Required program(s)
+req_progs=(7z unrar unzip)
+for p in "${req_progs[@]}"; do
+    hash "$p" 2>/dev/null || \
+        { echo >&2 " Required program \"$p\" not installed."; exit 1; }
+done
+
+# Test if file exists
+if [ ! -f "$1" ]; then
+    echo >&2 "File $1 doesn't exist"
+    exit 1
+fi
+
+# Extract file by using extension as reference
+case "$1" in
+    *.7z      ) 7z x "$1" ;;
+    *.tar.bz2 ) tar xvjf "$1" ;;
+    *.bz2     ) bunzip2 "$1" ;;
+    *.deb     ) ar vx "$1" ;;
+    *.tar.gz  ) tar xvf "$1" ;;
+    *.gz      ) gunzip "$1" ;;
+    *.tar     ) tar xvf "$1" ;;
+    *.tbz2    ) tar xvjf "$1" ;;
+    *.tar.xz  ) tar xvf "$1" ;;
+    *.tgz     ) tar xvzf "$1" ;;
+    *.rar     ) unrar x "$1" ;;
+    *.zip     ) unzip "$1" ;;
+    *.Z       ) uncompress "$1" ;;
+    *         ) echo >&2 " Unsupported file format"; exit 1 ;;
+esac
diff --git a/scripts/weka b/scripts/weka
new file mode 100755
index 0000000..1f31d35
--- /dev/null
+++ b/scripts/weka
@@ -0,0 +1,65 @@
+#!/bin/bash
+# weka: run Weka from the command-line
+#
+# Weka can be obtained from http://www.cs.waikato.ac.nz/ml/weka/downloading.html
+# Make sure that WEKAPATH is set to the full path that contains weka.jar in your .bashrc or .zshrc
+# The snippets below enable tab completion in Bash and Zsh, respectively.
+#
+# Author: Jeroen Janssens (http://jeroenjanssens.com)
+#
+# See csv2arff and arff2csv for two examples
+# "weka.$@" prefixes only the first argument (the class name); the others pass through as-is.
+java -Xmx1024M -cp "${WEKAPATH}/weka.jar" "weka.$@"
+
+#########################################################
+#                Tab completion for Bash                #
+#########################################################
+#
+# export WEKAPATH="/home/joe/bin/"
+#
+# weka-classes () {
+#     unzip -l $WEKAPATH/weka.jar |
+#     sed -rne 's/.*(weka)\/([^g])([^$]*)\.class$/\2\3/p' |
+#     tr '/' '.'
+# }
+#
+# weka-folders () {
+#     unzip -l $WEKAPATH/weka.jar |
+#     sed -rne 's/.*(weka)\/([^g])([^$]*)\/$/\2\3\./p' |
+#     tr '/' '.'
+# }
+#
+# _completeweka() {
+#     local curw=${COMP_WORDS[COMP_CWORD]}
+#     local wordlist=$(weka-folders; weka-classes)
+#     COMPREPLY=($(compgen -W '${wordlist[@]}' -- "$curw"))
+#     return 0
+# }
+#
+# complete -o nospace -F _completeweka weka
+#
+#########################################################
+#                Tab completion for Zsh                 #
+#########################################################
+#
+# export WEKAJAR="/home/joe/bin/weka.jar"
+#
+# weka-classes () {
+#     unzip -l $WEKAJAR |
+#     sed -rne 's/.*(weka)\/([^g])([^$]*)\.class$/\2\3/p' |
+#     tr '/' '.'
+# }
+#
+# weka-folders () {
+#     unzip -l $WEKAJAR |
+#     sed -rne 's/.*(weka)\/([^g])([^$]*)\/$/\2\3\./p' |
+#     tr '/' '.'
+# }
+#
+# function _completeweka {
+#     reply=($(weka-folders; weka-classes))
+# }
+#
+# compctl -K _completeweka weka
+#
+#########################################################
diff --git a/scripts/weka-cluster b/scripts/weka-cluster
new file mode 100755
index 0000000..a056e98
--- /dev/null
+++ b/scripts/weka-cluster
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+ALGO="$@" # clusterer class (relative to the weka package) followed by its options
+IN=$(mktemp --tmpdir weka-cluster-XXXXXXXX).arff
+# mktemp creates the file without the .arff suffix, so both names are removed on exit.
+finish () {
+    rm -f "$IN" "${IN%.arff}"
+}
+trap finish EXIT
+
+csv2arff > "$IN"
+weka filters.unsupervised.attribute.AddCluster -W "weka.${ALGO}" -i "$IN" -o /dev/stdout | arff2csv