diff --git a/common/Makefile.common b/common/Makefile.common
index 1c73f0f..4edd2e3 100644
--- a/common/Makefile.common
+++ b/common/Makefile.common
@@ -37,11 +37,13 @@ TODAY := $(shell date "+%Y/%m/%d;%H:%M")
 
 # Compile commands
 
+OMPSSC_ALL_FLAGS=-c -I$(COMMON_DIR) $(OPT_FLAGS) $(OMPSSC_FLAGS) $(APP_FLAGS)
 OMPC_ALL_FLAGS=-c -I$(COMMON_DIR) $(OPT_FLAGS) $(OMPC_FLAGS) $(APP_FLAGS)
 CC_ALL_FLAGS=-c -I$(COMMON_DIR) $(OPT_FLAGS) $(CC_FLAGS) $(APP_FLAGS) 
 
 # Link commands
 
+OMPSSLINK_ALL_FLAGS=$(OPT_FLAGS) $(OMPSSLINK_FLAGS) $(APP_FLAGS)
 OMPLINK_ALL_FLAGS=$(OPT_FLAGS) $(OMPLINK_FLAGS) $(APP_FLAGS)
 CLINK_ALL_FLAGS=$(OPT_FLAGS) $(CC_FLAGS) $(APP_FLAGS)
 
@@ -216,6 +218,137 @@ endif
 
 endif
 
+# OmpSs build rules: active only when VERSION is "ompss" and ENABLE_OMPSS is "yes".
+ifeq ($(VERSION),ompss)
+ifeq ($(ENABLE_OMPSS),yes)
+
+TARGETS = $(BIN_DIR)/$(PROGRAM).$(LABEL).$(SUB)$(VERSION)
+
+ifdef CUTOFF_VERSIONS
+  TARGETS += $(addprefix $(BIN_DIR)/$(PROGRAM).$(LABEL).$(SUB)$(VERSION)-,$(CUTOFF_VERSIONS))
+  # Object lists for each cutoff variant, derived from PROGRAM_OBJS.
+  MANUAL_PROGRAM_OBJS := $(patsubst %.o,%-manual.o,$(PROGRAM_OBJS))
+  IF_PROGRAM_OBJS := $(patsubst %.o,%-if.o,$(PROGRAM_OBJS))
+  FINAL_PROGRAM_OBJS := $(patsubst %.o,%-final.o,$(PROGRAM_OBJS))
+  # Macro definitions added when compiling each cutoff variant.
+  MANUAL_FLAGS = -DMANUAL_CUTOFF
+  IF_FLAGS = -DIF_CUTOFF
+  FINAL_FLAGS = -DFINAL_CUTOFF $(OMPSSC_FINAL_FLAGS)
+endif
+
+ifdef TIED_VERSIONS
+  TIED_TARGETS := $(addsuffix -tied,$(TARGETS))
+  TARGETS += $(TIED_TARGETS)
+  # Tied counterparts of every object list above.
+  TIED_PROGRAM_OBJS := $(patsubst %.o,%-tied.o,$(PROGRAM_OBJS))
+  TIED_MANUAL_PROGRAM_OBJS := $(patsubst %.o,%-tied.o,$(MANUAL_PROGRAM_OBJS))
+  TIED_IF_PROGRAM_OBJS := $(patsubst %.o,%-tied.o,$(IF_PROGRAM_OBJS))
+  TIED_FINAL_PROGRAM_OBJS := $(patsubst %.o,%-tied.o,$(FINAL_PROGRAM_OBJS))
+  # Macro defined when compiling the tied variants.
+  TIED_FLAGS = -DFORCE_TIED_TASKS
+endif
+
+# Build every executable collected in TARGETS.
+all: $(TARGETS)
+# Plain objects. Written as a pattern rule because a suffix rule (.c.o) cannot carry prerequisites.
+%.o: %.c Makefile $(COMMON_DIR)/Makefile.common
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) -o $@ $<
+# Cutoff variants: the same source compiled with IF_FLAGS, FINAL_FLAGS or MANUAL_FLAGS added.
+%-if.o: %.c Makefile $(COMMON_DIR)/Makefile.common 
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(IF_FLAGS) -o $@ $<
+
+%-final.o: %.c Makefile $(COMMON_DIR)/Makefile.common 
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(FINAL_FLAGS) -o $@ $<
+
+%-manual.o: %.c Makefile $(COMMON_DIR)/Makefile.common 
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(MANUAL_FLAGS) -o $@ $<
+
+# The untied clause is removed with sed before the tied variant is compiled.
+# For this to work it must be the first clause of the task directive.
+# Ugly, but there is no easy alternative because the clause lives in a pragma.
+# The stripped copy is named after the target (safe under -j); the compiler's exit status is propagated.
+%-tied.o: %.c Makefile $(COMMON_DIR)/Makefile.common
+	sed -e "s/task \{1,\}untied/task/g" $< > tied-$(@:.o=.c) && \
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(TIED_FLAGS) -o $@ tied-$(@:.o=.c); \
+	status=$$?; rm -f tied-$(@:.o=.c); exit $$status
+# Tied if-clause variant: compile the sed-stripped copy (not the original source) and keep the compiler's status.
+%-if-tied.o: %.c Makefile $(COMMON_DIR)/Makefile.common
+	sed -e "s/task \{1,\}untied/task/g" $< > tied-$(@:.o=.c) && \
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(IF_FLAGS) $(TIED_FLAGS) -o $@ tied-$(@:.o=.c); \
+	status=$$?; rm -f tied-$(@:.o=.c); exit $$status
+# Tied manual-cutoff variant: compile the sed-stripped copy (not the original source) and keep the compiler's status.
+%-manual-tied.o: %.c Makefile $(COMMON_DIR)/Makefile.common
+	sed -e "s/task \{1,\}untied/task/g" $< > tied-$(@:.o=.c) && \
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(MANUAL_FLAGS) $(TIED_FLAGS) -o $@ tied-$(@:.o=.c); \
+	status=$$?; rm -f tied-$(@:.o=.c); exit $$status
+# Tied final-clause variant: compile the sed-stripped copy (not the original source) and keep the compiler's status.
+%-final-tied.o: %.c Makefile $(COMMON_DIR)/Makefile.common
+	sed -e "s/task \{1,\}untied/task/g" $< > tied-$(@:.o=.c) && \
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(FINAL_FLAGS) $(TIED_FLAGS) -o $@ tied-$(@:.o=.c); \
+	status=$$?; rm -f tied-$(@:.o=.c); exit $$status
+# String macros describing the build (date, compiler, linker, message, link flags); passed when compiling bots_main.c.
+INFO_FLAGS_OMPSS=-DCDATE="\"$(TODAY)\"" -DCC="\"$(OMPSSC)\"" -DLD="\"$(OMPSSLINK)\"" -DCMESSAGE="\"$(CMESSAGE)\"" \
+               -DLDFLAGS="\"$(OMPSSLINK_ALL_FLAGS) $(LIBS)\""
+# Driver objects: bots_main.c compiled once per variant; -DCFLAGS records the compile flags of that variant as a string.
+main.o: $(COMMON_DIR)/bots_main.c app-desc.h Makefile $(COMMON_DIR)/Makefile.common
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) -I. -o $@ $< $(INFO_FLAGS_OMPSS) -DCFLAGS="\"$(OMPSSC_ALL_FLAGS) -I.\""
+
+main-if.o: $(COMMON_DIR)/bots_main.c app-desc.h Makefile $(COMMON_DIR)/Makefile.common
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(IF_FLAGS) -I. -o $@ $< $(INFO_FLAGS_OMPSS) -DCFLAGS="\"$(OMPSSC_ALL_FLAGS) $(IF_FLAGS) -I.\""
+
+main-final.o: $(COMMON_DIR)/bots_main.c app-desc.h Makefile $(COMMON_DIR)/Makefile.common
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(FINAL_FLAGS) -I. -o $@ $< $(INFO_FLAGS_OMPSS) -DCFLAGS="\"$(OMPSSC_ALL_FLAGS) $(FINAL_FLAGS) -I.\""
+
+main-manual.o: $(COMMON_DIR)/bots_main.c app-desc.h Makefile $(COMMON_DIR)/Makefile.common
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(MANUAL_FLAGS) -I. -o $@ $< $(INFO_FLAGS_OMPSS) -DCFLAGS="\"$(OMPSSC_ALL_FLAGS) $(MANUAL_FLAGS) -I.\""
+# Tied driver objects: same recipe with TIED_FLAGS added; bots_main.c is compiled directly, without the sed rewrite.
+main-tied.o: $(COMMON_DIR)/bots_main.c app-desc.h Makefile $(COMMON_DIR)/Makefile.common
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(TIED_FLAGS) -I. -o $@ $< $(INFO_FLAGS_OMPSS) -DCFLAGS="\"$(OMPSSC_ALL_FLAGS) $(TIED_FLAGS) -I.\""
+
+main-if-tied.o: $(COMMON_DIR)/bots_main.c app-desc.h Makefile $(COMMON_DIR)/Makefile.common
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(TIED_FLAGS) $(IF_FLAGS) -I. -o $@ $< $(INFO_FLAGS_OMPSS) -DCFLAGS="\"$(OMPSSC_ALL_FLAGS) $(TIED_FLAGS) $(IF_FLAGS) -I.\""
+
+main-manual-tied.o: $(COMMON_DIR)/bots_main.c app-desc.h Makefile $(COMMON_DIR)/Makefile.common
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(TIED_FLAGS) $(MANUAL_FLAGS) -I. -o $@ $< $(INFO_FLAGS_OMPSS) -DCFLAGS="\"$(OMPSSC_ALL_FLAGS) $(TIED_FLAGS) $(MANUAL_FLAGS) -I.\""
+
+main-final-tied.o: $(COMMON_DIR)/bots_main.c app-desc.h Makefile $(COMMON_DIR)/Makefile.common
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(TIED_FLAGS) $(FINAL_FLAGS) -I. -o $@ $< $(INFO_FLAGS_OMPSS) -DCFLAGS="\"$(OMPSSC_ALL_FLAGS) $(TIED_FLAGS) $(FINAL_FLAGS) -I.\""
+# Link rules: one executable per variant, from the matching main object, program objects, LIBS and COMMON_OBJS.
+$(BIN_DIR)/$(PROGRAM).$(LABEL).$(SUB)$(VERSION): main.o $(PROGRAM_OBJS) Makefile $(COMMON_DIR)/Makefile.common $(COMMON_OBJS)
+	$(OMPSSLINK) $(OMPSSLINK_ALL_FLAGS) -o $@ main.o $(PROGRAM_OBJS) $(LIBS) $(COMMON_OBJS)
+
+$(BIN_DIR)/$(PROGRAM).$(LABEL).$(SUB)$(VERSION)-manual: main-manual.o $(MANUAL_PROGRAM_OBJS) Makefile $(COMMON_DIR)/Makefile.common $(COMMON_OBJS)
+	$(OMPSSLINK) $(OMPSSLINK_ALL_FLAGS) -o $@ main-manual.o $(MANUAL_PROGRAM_OBJS) $(LIBS) $(COMMON_OBJS)
+
+$(BIN_DIR)/$(PROGRAM).$(LABEL).$(SUB)$(VERSION)-if_clause: main-if.o $(IF_PROGRAM_OBJS) Makefile $(COMMON_DIR)/Makefile.common $(COMMON_OBJS) 
+	$(OMPSSLINK) $(OMPSSLINK_ALL_FLAGS) -o $@ main-if.o $(IF_PROGRAM_OBJS) $(LIBS) $(COMMON_OBJS)
+
+$(BIN_DIR)/$(PROGRAM).$(LABEL).$(SUB)$(VERSION)-tied: main-tied.o $(TIED_PROGRAM_OBJS) Makefile $(COMMON_DIR)/Makefile.common $(COMMON_OBJS)
+	$(OMPSSLINK) $(OMPSSLINK_ALL_FLAGS) -o $@ main-tied.o $(TIED_PROGRAM_OBJS) $(LIBS) $(COMMON_OBJS)
+
+$(BIN_DIR)/$(PROGRAM).$(LABEL).$(SUB)$(VERSION)-manual-tied: main-manual-tied.o $(TIED_MANUAL_PROGRAM_OBJS) Makefile $(COMMON_DIR)/Makefile.common $(COMMON_OBJS)
+	$(OMPSSLINK) $(OMPSSLINK_ALL_FLAGS) -o $@ main-manual-tied.o $(TIED_MANUAL_PROGRAM_OBJS) $(LIBS) $(COMMON_OBJS)
+
+$(BIN_DIR)/$(PROGRAM).$(LABEL).$(SUB)$(VERSION)-if_clause-tied: main-if-tied.o $(TIED_IF_PROGRAM_OBJS) Makefile $(COMMON_DIR)/Makefile.common $(COMMON_OBJS)
+	$(OMPSSLINK) $(OMPSSLINK_ALL_FLAGS) -o $@ main-if-tied.o $(TIED_IF_PROGRAM_OBJS) $(LIBS) $(COMMON_OBJS)
+
+ifdef USE_FINAL_CLAUSE
+# The -final executables are linked only when USE_FINAL_CLAUSE is defined.
+$(BIN_DIR)/$(PROGRAM).$(LABEL).$(SUB)$(VERSION)-final: main-final.o $(FINAL_PROGRAM_OBJS) Makefile $(COMMON_DIR)/Makefile.common $(COMMON_OBJS) 
+	$(OMPSSLINK) $(OMPSSLINK_ALL_FLAGS) -o $@ main-final.o $(FINAL_PROGRAM_OBJS) $(LIBS) $(COMMON_OBJS)
+
+$(BIN_DIR)/$(PROGRAM).$(LABEL).$(SUB)$(VERSION)-final-tied: main-final-tied.o $(TIED_FINAL_PROGRAM_OBJS) Makefile $(COMMON_DIR)/Makefile.common $(COMMON_OBJS)
+	$(OMPSSLINK) $(OMPSSLINK_ALL_FLAGS) -o $@ main-final-tied.o $(TIED_FINAL_PROGRAM_OBJS) $(LIBS) $(COMMON_OBJS)
+
+else
+# Otherwise the -final targets are declared with no prerequisites and no recipe, so nothing is built for them.
+$(BIN_DIR)/$(PROGRAM).$(LABEL).$(SUB)$(VERSION)-final: 
+$(BIN_DIR)/$(PROGRAM).$(LABEL).$(SUB)$(VERSION)-final-tied: 
+
+endif
+
+endif
+endif
 
 clean:
 	rm -fr *.o
diff --git a/common/ompss-app.h b/common/ompss-app.h
new file mode 100644
index 0000000..3d2190b
--- /dev/null
+++ b/common/ompss-app.h
@@ -0,0 +1,31 @@
+/**********************************************************************************************/
+/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
+/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
+/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
+/*                                                                                            */
+/*  This program is free software; you can redistribute it and/or modify                      */
+/*  it under the terms of the GNU General Public License as published by                      */
+/*  the Free Software Foundation; either version 2 of the License, or                         */
+/*  (at your option) any later version.                                                       */
+/*                                                                                            */
+/*  This program is distributed in the hope that it will be useful,                           */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of                            */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             */
+/*  GNU General Public License for more details.                                              */
+/*                                                                                            */
+/*  You should have received a copy of the GNU General Public License                         */
+/*  along with this program; if not, write to the Free Software                               */
+/*  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            */
+/**********************************************************************************************/
+/* Model header for the OmpSs versions; included from an application's app-desc.h (e.g. ompss/fft). */
+#include <omp.h>
+/* Programming-model tag for this tree (how MODEL is consumed is not visible here). */
+#define MODEL OMPSS
+/* Human-readable model name; FORCE_TIED_TASKS is defined through TIED_FLAGS in common/Makefile.common. */
+#ifdef FORCE_TIED_TASKS
+#define BOTS_MODEL_DESC "OmpSs (using tied tasks)"
+#else
+#define BOTS_MODEL_DESC "OmpSs (using tasks)"
+#endif
+
+
diff --git a/configure b/configure
index 2e5445d..31efa43 100755
--- a/configure
+++ b/configure
@@ -13,21 +13,20 @@ show_help ()
 }
 
 while [ "$#" -gt 0  ]; do
-	case $1 in
-		--debug) debug=yes
-				;;
-		--warnings) warnings=yes
-				;;
-		--compiler) shift; COMPILER=$1
-				;;
-		--help) 
-			show_help
-			exit
-                        ;;
-		*) echo "Unknown option $1 (skipping)"
-			;;
-	esac
-	shift
+   case $1 in   # one option per iteration; the trailing shift drops it
+      --debug) debug=yes
+         ;;
+      --warnings) warnings=yes
+         ;;
+      --compiler) shift; COMPILER=$1   # the compiler name is the next argument
+         ;;
+      --help) show_help   # this clause needs its own ;; or the script fails to parse
+         exit
+         ;;
+      *) echo "Unknown option $1 (skipping)"
+         ;;
+   esac
+   shift
 done
 
 GCC=`gcc -x c -c -fopenmp /dev/null -o /dev/null &>/dev/null  && echo "yes"`
@@ -42,49 +41,48 @@ nc=0
 if [ -z "$COMPILER" ]; then
 
    if [ "$GCC" = "yes" ]; then
-	let nc=nc+1
-	COMPILERS[nc]="gcc"
+      let nc=nc+1
+      COMPILERS[nc]="gcc"
    fi
 
    if [ "$MCC" = "yes" ]; then
-	let nc=nc+1
-	COMPILERS[nc]="mcc"
+      let nc=nc+1
+      COMPILERS[nc]="mcc"
    fi
 
    if [ "$ICC" = "yes" ]; then
-	let nc=nc+1
-	COMPILERS[nc]="icc"
+      let nc=nc+1
+      COMPILERS[nc]="icc"
    fi
 
    if [ "$XLC" = "yes" ]; then
-	let nc=nc+1
-	COMPILERS[nc]="xlc"
+      let nc=nc+1
+      COMPILERS[nc]="xlc"
    fi
 
    if [ "$PGI" = "yes" ]; then
-        let nc=nc+1
-        COMPILERS[nc]="pgi"
+      let nc=nc+1
+      COMPILERS[nc]="pgi"
    fi
 
    if [ "$SUN" = "yes" ]; then
-        let nc=nc+1        
-        COMPILERS[nc]="sunstudio"
+      let nc=nc+1        
+      COMPILERS[nc]="sunstudio"
    fi     
 
    if [ "$nc" -gt "0" ]; then
-	echo "The following compilers are recognized: "
-	n=1
-	for comp in ${COMPILERS[*]}; do
- 	 	echo "  $n. $comp"
-  		let n=n+1
-	done
-	echo -n "Choose one to use:"
-	read 
-
-	COMPILER=${COMPILERS[$REPLY]}
+      echo "The following compilers are recognized: "
+      n=1
+      for comp in ${COMPILERS[*]}; do   # numbered menu; COMPILERS is filled starting at index 1
+         echo "  $n. $comp"
+         let n=n+1
+      done
+      echo -n "Choose one to use:"
+      read    # no variable name given, so bash stores the answer in REPLY
+      COMPILER=${COMPILERS[$REPLY]}   # look up the chosen entry by its menu number
    else
-	echo "No suitable compiler was detected"
-	echo "An empty $OUTPUT will be generated"
+      echo "No suitable compiler was detected"
+      echo "An empty $OUTPUT will be generated"
    fi
 fi
 
@@ -93,125 +91,132 @@ fi
 [ "$warnings" = "yes" ] && WARNINGS=
 
 if [ "$COMPILER" = "gcc" -a "$GCC" = "yes" ]; then
-	CC=gcc
-	CLINK=$CC
-	OMPC="$CC -fopenmp"
-	OMPLINK="$CC -fopenmp"
-	LABEL=gcc
-	[ "$debug" = "yes" ] && DEBUG=$DEBUG -g
-	[ "$warnings" = "yes" ] && WARNINGS="$WARNINGS -Wall -Werror"
-
-	OPT_FLAGS=-O3
-	CC_FLAGS="$DEBUG $WARNINGS"
-	OMPC_FLAGS="$DEBUG $WARNINGS"
-	CLINK_FLAGS=$DEBUG
-	OMPLINK_FLAGS=$DEBUG
+   CC=gcc
+   CLINK=$CC
+   OMPC="$CC -fopenmp"
+   OMPLINK="$CC -fopenmp"
+   LABEL=gcc
+   [ "$debug" = "yes" ] && DEBUG="$DEBUG -g"   # quoted so -g is appended instead of being run as a command
+   [ "$warnings" = "yes" ] && WARNINGS="$WARNINGS -Wall -Werror"
+
+   OPT_FLAGS=-O3
+   CC_FLAGS="$DEBUG $WARNINGS"
+   OMPC_FLAGS="$DEBUG $WARNINGS"
+   CLINK_FLAGS=$DEBUG
+   OMPLINK_FLAGS=$DEBUG
 fi
 
 if [ "$COMPILER" = "icc" -a "$ICC" = "yes" ]; then
-	CC=icc
-	CLINK=$CC
-	OMPC="$CC -openmp"
-	OMPLINK="$CC -openmp"
-	LABEL=icc
-	[ "$debug" = "yes" ] && DEBUG=$DEBUG -g
-	[ "$warnings" = "yes" ] && WARNINGS="$WARNINGS -Wall -Werror"
-
-	OPT_FLAGS=-O2
-	CC_FLAGS="$DEBUG $WARNINGS"
-	OMPC_FLAGS="$DEBUG $WARNINGS"
-	CLINK_FLAGS=$DEBUG
-	OMPLINK_FLAGS=$DEBUG
+   CC=icc
+   CLINK=$CC
+   OMPC="$CC -openmp"
+   OMPLINK="$CC -openmp"
+   LABEL=icc
+   [ "$debug" = "yes" ] && DEBUG="$DEBUG -g"   # quoted so -g is appended instead of being run as a command
+   [ "$warnings" = "yes" ] && WARNINGS="$WARNINGS -Wall -Werror"
+
+   OPT_FLAGS=-O2
+   CC_FLAGS="$DEBUG $WARNINGS"
+   OMPC_FLAGS="$DEBUG $WARNINGS"
+   CLINK_FLAGS=$DEBUG
+   OMPLINK_FLAGS=$DEBUG
 fi
 
 
 if [ "$COMPILER" = "mcc" -a "$MCC" = "yes" ]; then
-	case $(uname -i) in
-		x86_64) extras="-m32"
-                        ;;
-        esac
-
-	CC="mcc $extras"
-	CLINK=$CC
-	OMPC=$CC
-	OMPLINK=$CC
-	LABEL=mcc
-	[ "$debug" = "yes" ] && DEBUG=$DEBUG -g
-	[ "$warnings" = "yes" ] && WARNINGS="$WARNINGS -Wall -Werror"
-
-	OPT_FLAGS=-O3
-	CC_FLAGS="$DEBUG $WARNINGS --no-openmp"
-	OMPC_FLAGS="$DEBUG $WARNINGS"
-	CLINK_FLAGS="$DEBUG --no-openmp"
-	OMPLINK_FLAGS=$DEBUG
-        OMPC_FINAL_FLAGS="--serialize"
-
-	supports_final_clause=yes
+   case $(uname -i) in
+      x86_64) extras="-m32"
+      ;;
+   esac
+
+   CC="mcc $extras"
+   CLINK=$CC
+   OMPC=$CC
+   OMPLINK=$CC
+   LABEL=mcc
+   [ "$debug" = "yes" ] && DEBUG="$DEBUG -g"   # quoted so -g is appended instead of being run as a command
+   [ "$warnings" = "yes" ] && WARNINGS="$WARNINGS -Wall -Werror"
+
+   OPT_FLAGS=-O3
+   CC_FLAGS="$DEBUG $WARNINGS --no-openmp"
+   OMPC_FLAGS="$DEBUG $WARNINGS"
+   CLINK_FLAGS="$DEBUG --no-openmp"
+   OMPLINK_FLAGS=$DEBUG
+   OMPC_FINAL_FLAGS="--serialize"
+   # OmpSs support: ENABLE_OMPSS is written to the generated config and tested by common/Makefile.common.
+   ENABLE_OMPSS=yes
+   # NOTE(review): the OmpSs commands omit $extras, $DEBUG and $WARNINGS, unlike CC/OMPC above - confirm this is intended.
+   OMPSSC=mcc
+   OMPSSLINK=mcc
+   OMPSSC_FLAGS=--ompss
+   OMPSSLINK_FLAGS=--ompss
+
+   supports_final_clause=yes
 fi
 
 if [ "$COMPILER" = "xlc" -a "$XLC" = "yes" ]; then
-	CC=xlc_r
-	CLINK=$CC
-	OMPC="$CC -qsmp=omp"
-	OMPLINK=$OMPC
-	LABEL=xlc
-	[ "$debug" = "yes" ] && DEBUG=$DEBUG -g
-	[ "$warnings" = "yes" ] && WARNINGS="$WARNINGS -qflag=w:w -qhalt=w"
-
-	OPT_FLAGS=-O3
-	CC_FLAGS="$DEBUG $WARNINGS"
-	OMPC_FLAGS="-qthreaded $DEBUG $WARNINGS"
-	CLINK_FLAGS=$DEBUG
-	OMPLINK_FLAGS="-qthreaded $DEBUG"
+   CC=xlc_r
+   CLINK=$CC
+   OMPC="$CC -qsmp=omp"
+   OMPLINK=$OMPC
+   LABEL=xlc
+   [ "$debug" = "yes" ] && DEBUG="$DEBUG -g"   # quoted so -g is appended instead of being run as a command
+   [ "$warnings" = "yes" ] && WARNINGS="$WARNINGS -qflag=w:w -qhalt=w"
+
+   OPT_FLAGS=-O3
+   CC_FLAGS="$DEBUG $WARNINGS"
+   OMPC_FLAGS="-qthreaded $DEBUG $WARNINGS"
+   CLINK_FLAGS=$DEBUG
+   OMPLINK_FLAGS="-qthreaded $DEBUG"
 fi
 
 if [ "$COMPILER" = "pgi" -a "$PGI" = "yes" ]; then
-        CC=pgcc
-        CLINK=$CC
-        OMPC="$CC -mp -Minfo=mp"
-        OMPLINK=$OMPC
-	LABEL=pgi
-        [ "$debug" = "yes" ] && DEBUG="$DEBUG -g"
-	if [ "$warnings" = "yes" ]; then
-		echo "The pgi compiler doesn't support '--warnings' option (skipping it)"
-		echo "Press (Enter) to continue..."
-		read 
-	fi
-
-        OPT_FLAGS=-fast
-        CC_FLAGS="$DEBUG $WARNINGS"
-        OMPC_FLAGS="$DEBUG $WARNINGS"
-        CLINK_FLAGS=$DEBUG
-        OMPLINK_FLAGS=$DEBUG
+   CC=pgcc
+   CLINK=$CC
+   OMPC="$CC -mp -Minfo=mp"
+   OMPLINK=$OMPC
+   LABEL=pgi
+   [ "$debug" = "yes" ] && DEBUG="$DEBUG -g"
+   if [ "$warnings" = "yes" ]; then   # --warnings is only reported here; this block adds no warning flags
+      echo "The pgi compiler doesn't support '--warnings' option (skipping it)"
+      echo "Press (Enter) to continue..."
+      read    # wait for the user to press Enter
+   fi
+
+   OPT_FLAGS=-fast
+   CC_FLAGS="$DEBUG $WARNINGS"
+   OMPC_FLAGS="$DEBUG $WARNINGS"
+   CLINK_FLAGS=$DEBUG
+   OMPLINK_FLAGS=$DEBUG
 fi
 
 if [ "$COMPILER" = "sunstudio" -a "$SUN" = "yes" ]; then
-        CC=cc                         
-        CLINK=$CC                     
-        OMPC="$CC -xopenmp"           
-        OMPLINK=$OMPC                 
-	LABEL=suns
-        [ "$debug" = "yes" ] && DEBUG=$DEBUG -g
-	if [ "$warnings" = "yes" ]; then
-		echo "The sunstudio compiler doesn't support '--warnings' option (skipping it)"
-		echo "Press (Enter) to continue..."
-		read 
-	fi
-
-        OPT_FLAGS=-fast
-        CC_FLAGS="$DEBUG $WARNINGS"
-        OMPC_FLAGS="$DEBUG $WARNINGS"
-        CLINK_FLAGS=$DEBUG
-        OMPLINK_FLAGS=$DEBUG
+   CC=cc
+   CLINK=$CC
+   OMPC="$CC -xopenmp"
+   OMPLINK=$OMPC
+   LABEL=suns
+   [ "$debug" = "yes" ] && DEBUG="$DEBUG -g"   # quoted so -g is appended instead of being run as a command
+   if [ "$warnings" = "yes" ]; then   # --warnings is only reported here; this block adds no warning flags
+      echo "The sunstudio compiler doesn't support '--warnings' option (skipping it)"
+      echo "Press (Enter) to continue..."
+      read
+   fi
+
+   OPT_FLAGS=-fast
+   CC_FLAGS="$DEBUG $WARNINGS"
+   OMPC_FLAGS="$DEBUG $WARNINGS"
+   CLINK_FLAGS=$DEBUG
+   OMPLINK_FLAGS=$DEBUG
 fi
 
 if [ -z "$CC" ]; then
-  echo "Wrong compiler configuration"
-  exit 1
+   echo "Wrong compiler configuration"
+   exit 1
 fi
 
 if [ "$debug" = "yes" ]; then
-	LABEL="$LABEL-debug"
+   LABEL="$LABEL-debug"
 fi
 
 [ -f $OUTPUT ] && replacing_config=true
@@ -223,9 +228,13 @@ cat > $OUTPUT << EOF
 #config name
 LABEL=$LABEL
 
+ENABLE_OMPSS=$ENABLE_OMPSS
+
 #compilers
+OMPSSC=$OMPSSC
 OMPC=$OMPC
 CC=$CC
+OMPSSLINK=$OMPSSLINK
 OMPLINK=$OMPLINK
 CLINK=$CLINK
 
@@ -235,10 +244,13 @@ OPT_FLAGS=$OPT_FLAGS
 
 CC_FLAGS=$CC_FLAGS
 OMPC_FLAGS=$OMPC_FLAGS
+OMPSSC_FLAGS=$OMPSSC_FLAGS
 OMPC_FINAL_FLAGS=$OMPC_FINAL_FLAGS
+OMPSSC_FINAL_FLAGS=$OMPSSC_FINAL_FLAGS
 
 CLINK_FLAGS=$CLINK_FLAGS
 OMPLINK_FLAGS=$OMPLINK_FLAGS
+OMPSSLINK_FLAGS=$OMPSSLINK_FLAGS
 
 EOF
 
@@ -248,18 +260,18 @@ EOF
 echo "make.config generated"
 
 if [ "$replacing_config" ]; then
-	echo "Configuration was changed. Cleaning up"
-	make clean
+   echo "Configuration was changed. Cleaning up"
+   make clean
 fi 
 
 [ -d bin ] || mkdir -p bin
 
 if make -v | grep GNU &> /dev/null; then
-	echo "Run make to compile the benchmarks"
+   echo "Run make to compile the benchmarks"
 elif gmake -v | grep GNU &> /dev/null; then 
-	echo "Run gmake to compile the benchmarks"
+   echo "Run gmake to compile the benchmarks"
 else
-	echo "I didn't find a GNU-compatible make. You'll need it to compile the benchmarks"
+   echo "I didn't find a GNU-compatible make. You'll need it to compile the benchmarks"
 fi
 
 echo "You can further refine your configuration in config/make.config"
diff --git a/ompss/Makefile b/ompss/Makefile
new file mode 100644
index 0000000..cffe95c
--- /dev/null
+++ b/ompss/Makefile
@@ -0,0 +1,40 @@
+##############################################################################################
+#  This program is part of the Barcelona OpenMP Tasks Suite                                  #
+#  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  #
+#  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   #
+#                                                                                            #
+#  This program is free software; you can redistribute it and/or modify                      #
+#  it under the terms of the GNU General Public License as published by                      #
+#  the Free Software Foundation; either version 2 of the License, or                         #
+#  (at your option) any later version.                                                       #
+#                                                                                            #
+#  This program is distributed in the hope that it will be useful,                           #
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of                            #
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             #
+#  GNU General Public License for more details.                                              #
+#                                                                                            #
+#  You should have received a copy of the GNU General Public License                         #
+#  along with this program; if not, write to the Free Software                               #
+#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            #
+##############################################################################################
+# Sub-directories visited by the recursive targets (the commented-out line lists more directories, currently disabled).
+#DIRS=fib alignment nqueens sort strassen sparselu fft floorplan health uts
+DIRS=fib
+# Targets implemented by the generic $(RECURSIVE) rule.
+RECURSIVE=all-recursive clean-recursive dist-clean-recursive
+.PHONY: all clean dist-clean dist $(RECURSIVE)
+all: all-recursive
+clean: clean-recursive
+dist-clean: dist-clean-recursive
+# Run the target (name minus "-recursive") in every directory of DIRS, stopping at the first failure.
+$(RECURSIVE): 
+	@failcom='exit 1';\
+        target=`echo $@ | sed s/-recursive//`; \
+        for subdir in $(DIRS); do \
+                (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$target) \
+          || eval $$failcom; \
+        done;
+# Packaging is not implemented yet; the recipe only prints a placeholder.
+dist: dist-clean
+	echo "TODO"
+
diff --git a/ompss/Makefile.version b/ompss/Makefile.version
new file mode 100644
index 0000000..c06a272
--- /dev/null
+++ b/ompss/Makefile.version
@@ -0,0 +1,21 @@
+##############################################################################################
+#  This program is part of the Barcelona OpenMP Tasks Suite                                  #
+#  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  #
+#  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   #
+#                                                                                            #
+#  This program is free software; you can redistribute it and/or modify                      #
+#  it under the terms of the GNU General Public License as published by                      #
+#  the Free Software Foundation; either version 2 of the License, or                         #
+#  (at your option) any later version.                                                       #
+#                                                                                            #
+#  This program is distributed in the hope that it will be useful,                           #
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of                            #
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             #
+#  GNU General Public License for more details.                                              #
+#                                                                                            #
+#  You should have received a copy of the GNU General Public License                         #
+#  along with this program; if not, write to the Free Software                               #
+#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            #
+##############################################################################################
+# Version tag; common/Makefile.common enables its OmpSs rules when VERSION is "ompss".
+VERSION=ompss
diff --git a/ompss/fft/Makefile b/ompss/fft/Makefile
new file mode 100644
index 0000000..c19750a
--- /dev/null
+++ b/ompss/fft/Makefile
@@ -0,0 +1,35 @@
+##############################################################################################
+#  This program is part of the Barcelona OpenMP Tasks Suite                                  #
+#  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  #
+#  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   #
+#                                                                                            #
+#  This program is free software; you can redistribute it and/or modify                      #
+#  it under the terms of the GNU General Public License as published by                      #
+#  the Free Software Foundation; either version 2 of the License, or                         #
+#  (at your option) any later version.                                                       #
+#                                                                                            #
+#  This program is distributed in the hope that it will be useful,                           #
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of                            #
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             #
+#  GNU General Public License for more details.                                              #
+#                                                                                            #
+#  You should have received a copy of the GNU General Public License                         #
+#  along with this program; if not, write to the Free Software                               #
+#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            #
+##############################################################################################
+
+LIBS = -lm
+#PROGRAM_OBJS=
+# Request the "-tied" variants as well (TIED_VERSIONS is tested with ifdef in common/Makefile.common).
+TIED_VERSIONS = yes
+# Root of the source tree, relative to this directory.
+BASE_DIR = ../../
+
+#
+# Don't change below here 
+#
+
+include ../Makefile.version
+include $(BASE_DIR)/common/Makefile.common
+
+
diff --git a/ompss/fft/app-desc.h b/ompss/fft/app-desc.h
new file mode 100644
index 0000000..071c202
--- /dev/null
+++ b/ompss/fft/app-desc.h
@@ -0,0 +1,56 @@
+/**********************************************************************************************/
+/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
+/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
+/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
+/*                                                                                            */
+/*  This program is free software; you can redistribute it and/or modify                      */
+/*  it under the terms of the GNU General Public License as published by                      */
+/*  the Free Software Foundation; either version 2 of the License, or                         */
+/*  (at your option) any later version.                                                       */
+/*                                                                                            */
+/*  This program is distributed in the hope that it will be useful,                           */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of                            */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             */
+/*  GNU General Public License for more details.                                              */
+/*                                                                                            */
+/*  You should have received a copy of the GNU General Public License                         */
+/*  along with this program; if not, write to the Free Software                               */
+/*  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            */
+/**********************************************************************************************/
+
+#include "ompss-app.h"
+#include "fft.h"
+/* Application name plus what looks like a printf-style format and argument list for its parameters (confirm in the driver). */
+#define BOTS_APP_NAME "FFT"
+#define BOTS_APP_PARAMETERS_DESC "Size=%d"
+#define BOTS_APP_PARAMETERS_LIST ,bots_arg_size
+/* Size argument: enabled for this application, with its default value and description. */
+#define BOTS_APP_USES_ARG_SIZE
+#define BOTS_APP_DEF_ARG_SIZE 32*1024*1024
+#define BOTS_APP_DESC_ARG_SIZE "Matrix Size"
+/* BOTS_APP_INIT declares the buffers and allocates the input; its last backslash continues onto the empty line after it, which must stay empty. */
+#define BOTS_APP_INIT int i;\
+     COMPLEX *in, *out1=NULL, *out2=NULL;\
+     in = malloc(bots_arg_size * sizeof(COMPLEX));\
+
+#define KERNEL_INIT\
+     out1 = malloc(bots_arg_size * sizeof(COMPLEX));\
+     for (i = 0; i < bots_arg_size; ++i) {\
+          c_re(in[i]) = 1.0;\
+          c_im(in[i]) = 1.0;\
+     }
+#define KERNEL_CALL fft(bots_arg_size, in, out1);
+#define KERNEL_FINI 
+/* Sequential run: same input initialisation as KERNEL_INIT, result stored in out2 by fft_seq. */
+#define KERNEL_SEQ_INIT\
+     out2 = malloc(bots_arg_size * sizeof(COMPLEX));\
+     for (i = 0; i < bots_arg_size; ++i) {\
+          c_re(in[i]) = 1.0;\
+          c_im(in[i]) = 1.0;\
+     }
+#define KERNEL_SEQ_CALL fft_seq(bots_arg_size, in, out2);
+#define KERNEL_SEQ_FINI
+/* Check: out1 (from fft) is compared with out2 (from fft_seq) through test_correctness. */
+#define BOTS_APP_CHECK_USES_SEQ_RESULT
+#define KERNEL_CHECK test_correctness(bots_arg_size, out1, out2)
+
diff --git a/ompss/fft/fft.c b/ompss/fft/fft.c
new file mode 100644
index 0000000..341d5ef
--- /dev/null
+++ b/ompss/fft/fft.c
@@ -0,0 +1,4854 @@
+/**********************************************************************************************/
+/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
+/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
+/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
+/*                                                                                            */
+/*  This program is free software; you can redistribute it and/or modify                      */
+/*  it under the terms of the GNU General Public License as published by                      */
+/*  the Free Software Foundation; either version 2 of the License, or                         */
+/*  (at your option) any later version.                                                       */
+/*                                                                                            */
+/*  This program is distributed in the hope that it will be useful,                           */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of                            */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             */
+/*  GNU General Public License for more details.                                              */
+/*                                                                                            */
+/*  You should have received a copy of the GNU General Public License                         */
+/*  along with this program; if not, write to the Free Software                               */
+/*  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            */
+/**********************************************************************************************/
+
+/* 
+ * Original code from the Cilk project 
+ *
+ * Copyright (c) 2000 Massachusetts Institute of Technology
+ * Copyright (c) 2000 Matteo Frigo
+ */
+
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include "bots.h"
+#include "app-desc.h"
+
+/* Definitions and operations for complex numbers */
+
+/*
+ * Fill W for k in [a, b] (inclusive) with W[k] = (cos t, -sin t) and the
+ * mirrored entry W[n - k] = (cos t, sin t), where t = 2 * pi * k / n.
+ */
+void compute_w_coefficients(int n, int a, int b, COMPLEX * W)
+{
+     double step;               /* angle between consecutive k: 2*pi/n */
+     int idx;
+     REAL cosv, sinv;
+
+     if (b - a >= 128) {        /* wide range: two untied child tasks, then join */
+          int mid = (a + b) / 2;
+          #pragma omp task untied
+          compute_w_coefficients(n, a, mid, W);
+          #pragma omp task untied
+          compute_w_coefficients(n, mid + 1, b, W);
+          #pragma omp taskwait
+          return;
+     }
+     step = 2.0 * 3.1415926535897932384626434 / n;
+     for (idx = a; idx <= b; ++idx) {
+          cosv = cos(step * idx);
+          sinv = sin(step * idx);
+          c_re(W[idx]) = c_re(W[n - idx]) = cosv;
+          c_im(W[idx]) = -sinv;
+          c_im(W[n - idx]) = sinv;   /* last store wins when idx == n - idx */
+     }
+}
+void compute_w_coefficients_seq(int n, int a, int b, COMPLEX * W)
+{
+     /* Serially store W[k] = (cos t, -sin t), W[n - k] = (cos t, sin t), t = 2*pi*k/n, k in [a, b]. */
+     double step;
+     int idx;
+     REAL cosv, sinv;
+     if (b - a >= 128) {
+          int mid = (a + b) / 2;
+          compute_w_coefficients_seq(n, a, mid, W);
+          compute_w_coefficients_seq(n, mid + 1, b, W);
+          return;
+     }
+     step = 2.0 * 3.1415926535897932384626434 / n;
+     for (idx = a; idx <= b; ++idx) {
+          cosv = cos(step * idx);
+          sinv = sin(step * idx);
+          c_re(W[idx]) = c_re(W[n - idx]) = cosv;
+          c_im(W[idx]) = -sinv;
+          c_im(W[n - idx]) = sinv;
+     }
+}
+/*
+ * Return a factor of n: 1 when n < 2; 8 for the six listed sizes; otherwise
+ * the largest of 16, 8, 4, 2 that divides n; otherwise the smallest odd
+ * divisor of n, which is n itself when n is prime.
+ */
+int factor(int n)
+{
+     int r;
+
+     if (n < 2) return 1;
+     if (n == 64 || n == 128 || n == 256 || n == 1024 || n == 2048 || n == 4096) return 8;
+     if ((n & 15) == 0) return 16;
+     if ((n & 7) == 0) return 8;
+     if ((n & 3) == 0) return 4;
+     if ((n & 1) == 0) return 2;
+
+     /* n is odd: a composite n has a divisor r with r <= n / r, so stop there */
+     for (r = 3; r <= n / r; r += 2) if (n % r == 0) return r;
+
+     return n;   /* no divisor found: n is prime */
+}
+
+void unshuffle(int a, int b, COMPLEX * in, COMPLEX * out, int r, int m)
+{
+     /*
+      * For each i in [a, b), copy the r consecutive elements starting at
+      * in[i * r] to out[i], out[i + m], out[i + 2*m], ... (stride m).
+      */
+     int row, col;
+     int quads = r & (~0x3);       /* r rounded down to a multiple of four */
+     const COMPLEX *src;
+     COMPLEX *dst;
+
+     if (b - a >= 16) {            /* 16 or more rows: split into two untied tasks */
+          int mid = (a + b) / 2;
+          #pragma omp task untied
+          unshuffle(a, mid, in, out, r, m);
+          #pragma omp task untied
+          unshuffle(mid, b, in, out, r, m);
+          #pragma omp taskwait
+          return;
+     }
+     src = in + a * r;
+     for (row = a; row < b; ++row) {
+          dst = out + row;
+          for (col = 0; col < quads; col += 4) {
+               dst[0] = src[0];
+               dst[m] = src[1];
+               dst[2 * m] = src[2];
+               dst[3 * m] = src[3];
+               dst += 4 * m;
+               src += 4;
+          }
+          for (; col < r; ++col, ++src, dst += m) *dst = *src;   /* elements left after the groups of four */
+     }
+}
+void unshuffle_seq(int a, int b, COMPLEX * in, COMPLEX * out, int r, int m)
+{
+     /*
+      * Serial form: for each i in [a, b), copy the r consecutive elements
+      * starting at in[i * r] to out[i], out[i + m], out[i + 2*m], ...
+      */
+     int row, col;
+     int quads = r & (~0x3);       /* r rounded down to a multiple of four */
+     const COMPLEX *src;
+     COMPLEX *dst;
+
+     if (b - a >= 16) {            /* 16 or more rows: lower half, then upper half */
+          int mid = (a + b) / 2;
+          unshuffle_seq(a, mid, in, out, r, m);
+          unshuffle_seq(mid, b, in, out, r, m);
+          return;
+     }
+     src = in + a * r;
+     for (row = a; row < b; ++row) {
+          dst = out + row;
+          for (col = 0; col < quads; col += 4) {
+               dst[0] = src[0];
+               dst[m] = src[1];
+               dst[2 * m] = src[2];
+               dst[3 * m] = src[3];
+               dst += 4 * m;
+               src += 4;
+          }
+          for (; col < r; ++col, ++src, dst += m) *dst = *src;   /* elements left after the groups of four */
+     }
+}
+void fft_twiddle_gen1(COMPLEX * in, COMPLEX * out, COMPLEX * W,
+                      int r, int m, int nW, int nWdnti, int nWdntm)
+{
+     /*
+      * out[k*m] = sum over j of in[j*m] * W[idx] for k, j in [0, r), the product
+      * taking c_re / c_im as real / imaginary parts.  idx restarts at 0 for each
+      * k, grows by nWdnti + nWdntm*k per term and drops by nW once it exceeds nW.
+      */
+     int row, col, w_idx, w_step;
+     COMPLEX *src, *dst;
+     REAL acc_re, acc_im, x_re, x_im, w_re, w_im;
+     for (row = 0, dst = out; row < r; ++row, dst += m) {
+          w_step = nWdnti + nWdntm * row;
+          acc_re = acc_im = 0.0;
+          for (col = 0, src = in, w_idx = 0; col < r; ++col, src += m) {
+               w_re = c_re(W[w_idx]);
+               w_im = c_im(W[w_idx]);
+               x_re = c_re(*src);
+               x_im = c_im(*src);
+               acc_re += x_re * w_re - x_im * w_im;
+               acc_im += x_re * w_im + x_im * w_re;
+               w_idx += w_step;
+               if (w_idx > nW) w_idx -= nW;
+          }
+          c_re(*dst) = acc_re;
+          c_im(*dst) = acc_im;
+     }
+}
+
+void fft_twiddle_gen(int i, int i1, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int r, int m)
+{
+     /* Run fft_twiddle_gen1 for every index in [i, i1), halving the range into untied tasks. */
+     if (i1 - 1 != i) {
+          int mid = (i + i1) / 2;
+          #pragma omp task untied
+          fft_twiddle_gen(i, mid, in, out, W, nW, nWdn, r, m);
+          #pragma omp task untied
+          fft_twiddle_gen(mid, i1, in, out, W, nW, nWdn, r, m);
+     } else {
+          /* one index left: its fft_twiddle_gen1 call is spawned as a task too */
+          #pragma omp task untied
+          fft_twiddle_gen1(in + i, out + i, W, r, m, nW, nWdn * i, nWdn * m);
+     }
+     /* join whatever was spawned above before returning */
+     #pragma omp taskwait
+}
+void fft_twiddle_gen_seq(int i, int i1, COMPLEX * in, COMPLEX * out, COMPLEX * W,
+                         int nW, int nWdn, int r, int m)
+{
+     /* Serial form: run fft_twiddle_gen1 once per index in [i, i1), lower half first. */
+     int mid;
+
+     if (i1 - 1 == i) {
+          fft_twiddle_gen1(in + i, out + i, W, r, m, nW, nWdn * i, nWdn * m);
+          return;
+     }
+     mid = (i + i1) / 2;
+     fft_twiddle_gen_seq(i, mid, in, out, W, nW, nWdn, r, m);
+     fft_twiddle_gen_seq(mid, i1, in, out, W, nW, nWdn, r, m);
+}
+/* machine-generated code begins here */
+void fft_base_2(COMPLEX * in, COMPLEX * out)
+{
+     /*
+      * out[0] = in[0] + in[1] and out[1] = in[0] - in[1], component-wise.
+      * Both inputs are loaded before the first store to out.
+      */
+     REAL re0 = c_re(in[0]), im0 = c_im(in[0]);
+     REAL re1 = c_re(in[1]), im1 = c_im(in[1]);
+     c_re(out[0]) = (re0 + re1);
+     c_im(out[0]) = (im0 + im1);
+     c_re(out[1]) = (re0 - re1);
+     c_im(out[1]) = (im0 - im1);
+}
+void fft_twiddle_2(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m)
+{
+     /*
+      * For each i in [a, b): t = in[i + m] * W[nWdn * i] (complex product), then
+      * out[i] = in[i] + t and out[i + m] = in[i] - t.  nW is not read here.
+      */
+     int col, w_idx;
+     COMPLEX *src, *dst;
+     REAL w_re, w_im, x_re, x_im, e_re, e_im, o_re, o_im;
+     if ((b - a) >= 128) {      /* wide range: two untied tasks, then join */
+          int mid = (a + b) / 2;
+          #pragma omp task untied
+          fft_twiddle_2(a, mid, in, out, W, nW, nWdn, m);
+          #pragma omp task untied
+          fft_twiddle_2(mid, b, in, out, W, nW, nWdn, m);
+          #pragma omp taskwait
+          return;
+     }
+     for (col = a, w_idx = nWdn * a; col < b; col++, w_idx += nWdn) {
+          src = in + col;
+          dst = out + col;
+          e_re = c_re(src[0]);
+          e_im = c_im(src[0]);
+          w_re = c_re(W[w_idx]);
+          w_im = c_im(W[w_idx]);
+          x_re = c_re(src[m]);
+          x_im = c_im(src[m]);
+          o_re = ((w_re * x_re) - (w_im * x_im));
+          o_im = ((w_im * x_re) + (w_re * x_im));
+          c_re(dst[0]) = (e_re + o_re);
+          c_im(dst[0]) = (e_im + o_im);
+          c_re(dst[m]) = (e_re - o_re);
+          c_im(dst[m]) = (e_im - o_im);
+     }
+}
+void fft_twiddle_2_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m)
+{
+     /*
+      * Serial form.  For each i in [a, b): t = in[i + m] * W[nWdn * i], then
+      * out[i] = in[i] + t and out[i + m] = in[i] - t.  nW is not read here.
+      */
+     int col, w_idx;
+     COMPLEX *src, *dst;
+     REAL w_re, w_im, x_re, x_im, e_re, e_im, o_re, o_im;
+     if ((b - a) >= 128) {      /* wide range: lower half, then upper half */
+          int mid = (a + b) / 2;
+          fft_twiddle_2_seq(a, mid, in, out, W, nW, nWdn, m);
+          fft_twiddle_2_seq(mid, b, in, out, W, nW, nWdn, m);
+          return;
+     }
+     for (col = a, w_idx = nWdn * a; col < b; col++, w_idx += nWdn) {
+          src = in + col;
+          dst = out + col;
+          e_re = c_re(src[0]);
+          e_im = c_im(src[0]);
+          w_re = c_re(W[w_idx]);
+          w_im = c_im(W[w_idx]);
+          x_re = c_re(src[m]);
+          x_im = c_im(src[m]);
+          o_re = ((w_re * x_re) - (w_im * x_im));
+          o_im = ((w_im * x_re) + (w_re * x_im));
+          c_re(dst[0]) = (e_re + o_re);
+          c_im(dst[0]) = (e_im + o_im);
+          c_re(dst[m]) = (e_re - o_re);
+          c_im(dst[m]) = (e_im - o_im);
+     }
+}
+void fft_unshuffle_2(int a, int b, COMPLEX * in, COMPLEX * out, int m)
+{
+     /* For each i in [a, b): out[i] = in[2*i] and out[i + m] = in[2*i + 1]. */
+     int k;
+     const COMPLEX *src;
+     COMPLEX *dst;
+
+     if ((b - a) >= 128) {      /* wide range: two untied tasks, then join */
+          int mid = (a + b) / 2;
+          #pragma omp task untied
+          fft_unshuffle_2(a, mid, in, out, m);
+          #pragma omp task untied
+          fft_unshuffle_2(mid, b, in, out, m);
+          #pragma omp taskwait
+          return;
+     }
+     src = in + a * 2;
+     for (k = a, dst = out + a; k < b; ++k, ++dst, src += 2) {
+          dst[0] = src[0];
+          dst[m] = src[1];
+     }
+}
+void fft_unshuffle_2_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m)
+{
+     /* Serial form.  For each i in [a, b): out[i] = in[2*i], out[i + m] = in[2*i + 1]. */
+     int k;
+     const COMPLEX *src;
+     COMPLEX *dst;
+
+     if ((b - a) >= 128) {      /* wide range: lower half, then upper half */
+          int mid = (a + b) / 2;
+          fft_unshuffle_2_seq(a, mid, in, out, m);
+          fft_unshuffle_2_seq(mid, b, in, out, m);
+          return;
+     }
+     src = in + a * 2;
+     for (k = a, dst = out + a; k < b; ++k, ++dst, src += 2) {
+          dst[0] = src[0];
+          dst[m] = src[1];
+     }
+}
+void fft_base_4(COMPLEX * in, COMPLEX * out)
+{
+     /*
+      * Four-point kernel.  With s = in[0] + in[2], d = in[0] - in[2],
+      * t = in[1] + in[3] and u = in[1] - in[3] (all component-wise):
+      *   out[0] = s + t                   out[2] = s - t
+      *   out[1] = (d.re + u.im, d.im - u.re)
+      *   out[3] = (d.re - u.im, d.im + u.re)
+      * Every input is loaded before the first store to out.
+      */
+     REAL x0_re, x0_im, x2_re, x2_im, s_re, s_im, d_re, d_im;
+     REAL x1_re, x1_im, x3_re, x3_im, t_re, t_im, u_re, u_im;
+     /* even-indexed inputs: in[0] and in[2] */
+     x0_re = c_re(in[0]);
+     x0_im = c_im(in[0]);
+     x2_re = c_re(in[2]);
+     x2_im = c_im(in[2]);
+     s_re = (x0_re + x2_re);
+     s_im = (x0_im + x2_im);
+     d_re = (x0_re - x2_re);
+     d_im = (x0_im - x2_im);
+     /* odd-indexed inputs: in[1] and in[3] */
+     x1_re = c_re(in[1]);
+     x1_im = c_im(in[1]);
+     x3_re = c_re(in[3]);
+     x3_im = c_im(in[3]);
+     t_re = (x1_re + x3_re);
+     t_im = (x1_im + x3_im);
+     u_re = (x1_re - x3_re);
+     u_im = (x1_im - x3_im);
+     c_re(out[0]) = (s_re + t_re);
+     c_im(out[0]) = (s_im + t_im);
+     c_re(out[2]) = (s_re - t_re);
+     c_im(out[2]) = (s_im - t_im);
+     c_re(out[1]) = (d_re + u_im);
+     c_im(out[1]) = (d_im - u_re);
+     c_re(out[3]) = (d_re - u_im);
+     c_im(out[3]) = (d_im + u_re);
+}
+void fft_twiddle_4(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m)
+{ /* For each i in [a, b): scale in[i + k*m] by W[k * nWdn * i] (k = 1..3), combine the four values the way fft_base_4 does, store to out[i + k*m]. nW is not read here. */
+     int l1, i;
+     COMPLEX *jp, *kp;
+     REAL tmpr, tmpi, wr, wi;
+     if ((b - a) < 128) { /* fewer than 128 indices: handle them in this call */
+	  for (i = a, l1 = nWdn * i, kp = out + i; i < b;
+	       i++, l1 += nWdn, kp++) { /* l1 tracks nWdn * i, kp tracks out + i */
+	       jp = in + i;
+	       {
+		    REAL r1_0, i1_0;
+		    REAL r1_1, i1_1;
+		    REAL r1_2, i1_2;
+		    REAL r1_3, i1_3;
+		    { /* inputs 0 and 2; input 2 is first multiplied by W[2 * l1] */
+			 REAL r2_0, i2_0;
+			 REAL r2_2, i2_2;
+			 r2_0 = c_re(jp[0 * m]);
+			 i2_0 = c_im(jp[0 * m]);
+			 wr = c_re(W[2 * l1]);
+			 wi = c_im(W[2 * l1]);
+			 tmpr = c_re(jp[2 * m]);
+			 tmpi = c_im(jp[2 * m]);
+			 r2_2 = ((wr * tmpr) - (wi * tmpi));
+			 i2_2 = ((wi * tmpr) + (wr * tmpi));
+			 r1_0 = (r2_0 + r2_2);
+			 i1_0 = (i2_0 + i2_2);
+			 r1_2 = (r2_0 - r2_2);
+			 i1_2 = (i2_0 - i2_2);
+		    }
+		    { /* inputs 1 and 3, multiplied by W[1 * l1] and W[3 * l1] */
+			 REAL r2_1, i2_1;
+			 REAL r2_3, i2_3;
+			 wr = c_re(W[1 * l1]);
+			 wi = c_im(W[1 * l1]);
+			 tmpr = c_re(jp[1 * m]);
+			 tmpi = c_im(jp[1 * m]);
+			 r2_1 = ((wr * tmpr) - (wi * tmpi));
+			 i2_1 = ((wi * tmpr) + (wr * tmpi));
+			 wr = c_re(W[3 * l1]);
+			 wi = c_im(W[3 * l1]);
+			 tmpr = c_re(jp[3 * m]);
+			 tmpi = c_im(jp[3 * m]);
+			 r2_3 = ((wr * tmpr) - (wi * tmpi));
+			 i2_3 = ((wi * tmpr) + (wr * tmpi));
+			 r1_1 = (r2_1 + r2_3);
+			 i1_1 = (i2_1 + i2_3);
+			 r1_3 = (r2_1 - r2_3);
+			 i1_3 = (i2_1 - i2_3);
+		    }
+		    c_re(kp[0 * m]) = (r1_0 + r1_1); /* all four inputs were read above; stores start here */
+		    c_im(kp[0 * m]) = (i1_0 + i1_1);
+		    c_re(kp[2 * m]) = (r1_0 - r1_1);
+		    c_im(kp[2 * m]) = (i1_0 - i1_1);
+		    c_re(kp[1 * m]) = (r1_2 + i1_3);
+		    c_im(kp[1 * m]) = (i1_2 - r1_3);
+		    c_re(kp[3 * m]) = (r1_2 - i1_3);
+		    c_im(kp[3 * m]) = (i1_2 + r1_3);
+	       }
+	  }
+     } else { /* 128 or more indices: halve [a, b) into two untied tasks and wait for both */
+	  int ab = (a + b) / 2;
+          #pragma omp task untied
+	  fft_twiddle_4(a, ab, in, out, W, nW, nWdn, m);
+          #pragma omp task untied
+	  fft_twiddle_4(ab, b, in, out, W, nW, nWdn, m);
+          #pragma omp taskwait
+     }
+}
+void fft_twiddle_4_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m)
+{ /* Serial form: for each i in [a, b), scale in[i + k*m] by W[k * nWdn * i] (k = 1..3), combine the four values, store to out[i + k*m]. nW is not read here. */
+     int l1, i;
+     COMPLEX *jp, *kp;
+     REAL tmpr, tmpi, wr, wi;
+     if ((b - a) < 128) { /* fewer than 128 indices: handle them in this call */
+	  for (i = a, l1 = nWdn * i, kp = out + i; i < b;
+	       i++, l1 += nWdn, kp++) { /* l1 tracks nWdn * i, kp tracks out + i */
+	       jp = in + i;
+	       {
+		    REAL r1_0, i1_0;
+		    REAL r1_1, i1_1;
+		    REAL r1_2, i1_2;
+		    REAL r1_3, i1_3;
+		    { /* inputs 0 and 2; input 2 is first multiplied by W[2 * l1] */
+			 REAL r2_0, i2_0;
+			 REAL r2_2, i2_2;
+			 r2_0 = c_re(jp[0 * m]);
+			 i2_0 = c_im(jp[0 * m]);
+			 wr = c_re(W[2 * l1]);
+			 wi = c_im(W[2 * l1]);
+			 tmpr = c_re(jp[2 * m]);
+			 tmpi = c_im(jp[2 * m]);
+			 r2_2 = ((wr * tmpr) - (wi * tmpi));
+			 i2_2 = ((wi * tmpr) + (wr * tmpi));
+			 r1_0 = (r2_0 + r2_2);
+			 i1_0 = (i2_0 + i2_2);
+			 r1_2 = (r2_0 - r2_2);
+			 i1_2 = (i2_0 - i2_2);
+		    }
+		    { /* inputs 1 and 3, multiplied by W[1 * l1] and W[3 * l1] */
+			 REAL r2_1, i2_1;
+			 REAL r2_3, i2_3;
+			 wr = c_re(W[1 * l1]);
+			 wi = c_im(W[1 * l1]);
+			 tmpr = c_re(jp[1 * m]);
+			 tmpi = c_im(jp[1 * m]);
+			 r2_1 = ((wr * tmpr) - (wi * tmpi));
+			 i2_1 = ((wi * tmpr) + (wr * tmpi));
+			 wr = c_re(W[3 * l1]);
+			 wi = c_im(W[3 * l1]);
+			 tmpr = c_re(jp[3 * m]);
+			 tmpi = c_im(jp[3 * m]);
+			 r2_3 = ((wr * tmpr) - (wi * tmpi));
+			 i2_3 = ((wi * tmpr) + (wr * tmpi));
+			 r1_1 = (r2_1 + r2_3);
+			 i1_1 = (i2_1 + i2_3);
+			 r1_3 = (r2_1 - r2_3);
+			 i1_3 = (i2_1 - i2_3);
+		    }
+		    c_re(kp[0 * m]) = (r1_0 + r1_1); /* all four inputs were read above; stores start here */
+		    c_im(kp[0 * m]) = (i1_0 + i1_1);
+		    c_re(kp[2 * m]) = (r1_0 - r1_1);
+		    c_im(kp[2 * m]) = (i1_0 - i1_1);
+		    c_re(kp[1 * m]) = (r1_2 + i1_3);
+		    c_im(kp[1 * m]) = (i1_2 - r1_3);
+		    c_re(kp[3 * m]) = (r1_2 - i1_3);
+		    c_im(kp[3 * m]) = (i1_2 + r1_3);
+	       }
+	  }
+     } else { /* 128 or more indices: recurse on [a, ab), then on [ab, b) */
+	  int ab = (a + b) / 2;
+	  fft_twiddle_4_seq(a, ab, in, out, W, nW, nWdn, m);
+	  fft_twiddle_4_seq(ab, b, in, out, W, nW, nWdn, m);
+     }
+}
+void fft_unshuffle_4(int a, int b, COMPLEX * in, COMPLEX * out, int m)
+{
+     /* For each i in [a, b): out[i + k*m] = in[4*i + k] for k = 0, 1, 2, 3. */
+     int row;
+     const COMPLEX *src;
+     COMPLEX *lo, *hi;
+
+     if ((b - a) >= 128) {      /* wide range: two untied tasks, then join */
+          int mid = (a + b) / 2;
+          #pragma omp task untied
+          fft_unshuffle_4(a, mid, in, out, m);
+          #pragma omp task untied
+          fft_unshuffle_4(mid, b, in, out, m);
+          #pragma omp taskwait
+          return;
+     }
+     src = in + a * 4;
+     for (row = a; row < b; ++row, src += 4) {
+          lo = out + row;
+          hi = lo + 2 * m;
+          lo[0] = src[0];
+          lo[m] = src[1];
+          hi[0] = src[2];
+          hi[m] = src[3];
+     }
+}
+void fft_unshuffle_4_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m)
+{
+     /* Serial form.  For each i in [a, b): out[i + k*m] = in[4*i + k], k = 0..3. */
+     int row;
+     const COMPLEX *src;
+     COMPLEX *lo, *hi;
+
+     if ((b - a) >= 128) {      /* wide range: lower half, then upper half */
+          int mid = (a + b) / 2;
+          fft_unshuffle_4_seq(a, mid, in, out, m);
+          fft_unshuffle_4_seq(mid, b, in, out, m);
+          return;
+     }
+     src = in + a * 4;
+     for (row = a; row < b; ++row, src += 4) {
+          lo = out + row;
+          hi = lo + 2 * m;
+          lo[0] = src[0];
+          lo[m] = src[1];
+          hi[0] = src[2];
+          hi[m] = src[3];
+     }
+}
+void fft_base_8(COMPLEX * in, COMPLEX * out)
+{ /* Eight-point kernel: reads in[0..7], combines them in three add/subtract stages, writes out[0..7]. Presumably the 8-point DFT base case - confirm against the callers. */
+     REAL tmpr, tmpi;
+     {
+	  REAL r1_0, i1_0;
+	  REAL r1_1, i1_1;
+	  REAL r1_2, i1_2;
+	  REAL r1_3, i1_3;
+	  REAL r1_4, i1_4;
+	  REAL r1_5, i1_5;
+	  REAL r1_6, i1_6;
+	  REAL r1_7, i1_7;
+	  { /* even-indexed inputs 0, 4, 2, 6 -> r1/i1 _0, _2, _4, _6 */
+	       REAL r2_0, i2_0;
+	       REAL r2_2, i2_2;
+	       REAL r2_4, i2_4;
+	       REAL r2_6, i2_6;
+	       { /* inputs 0 and 4: sum and difference */
+		    REAL r3_0, i3_0;
+		    REAL r3_4, i3_4;
+		    r3_0 = c_re(in[0]);
+		    i3_0 = c_im(in[0]);
+		    r3_4 = c_re(in[4]);
+		    i3_4 = c_im(in[4]);
+		    r2_0 = (r3_0 + r3_4);
+		    i2_0 = (i3_0 + i3_4);
+		    r2_4 = (r3_0 - r3_4);
+		    i2_4 = (i3_0 - i3_4);
+	       }
+	       { /* inputs 2 and 6: sum and difference */
+		    REAL r3_2, i3_2;
+		    REAL r3_6, i3_6;
+		    r3_2 = c_re(in[2]);
+		    i3_2 = c_im(in[2]);
+		    r3_6 = c_re(in[6]);
+		    i3_6 = c_im(in[6]);
+		    r2_2 = (r3_2 + r3_6);
+		    i2_2 = (i3_2 + i3_6);
+		    r2_6 = (r3_2 - r3_6);
+		    i2_6 = (i3_2 - i3_6);
+	       }
+	       r1_0 = (r2_0 + r2_2);
+	       i1_0 = (i2_0 + i2_2);
+	       r1_4 = (r2_0 - r2_2);
+	       i1_4 = (i2_0 - i2_2);
+	       r1_2 = (r2_4 + i2_6);
+	       i1_2 = (i2_4 - r2_6);
+	       r1_6 = (r2_4 - i2_6);
+	       i1_6 = (i2_4 + r2_6);
+	  }
+	  { /* odd-indexed inputs 1, 5, 3, 7 -> r1/i1 _1, _3, _5, _7 */
+	       REAL r2_1, i2_1;
+	       REAL r2_3, i2_3;
+	       REAL r2_5, i2_5;
+	       REAL r2_7, i2_7;
+	       { /* inputs 1 and 5: sum and difference */
+		    REAL r3_1, i3_1;
+		    REAL r3_5, i3_5;
+		    r3_1 = c_re(in[1]);
+		    i3_1 = c_im(in[1]);
+		    r3_5 = c_re(in[5]);
+		    i3_5 = c_im(in[5]);
+		    r2_1 = (r3_1 + r3_5);
+		    i2_1 = (i3_1 + i3_5);
+		    r2_5 = (r3_1 - r3_5);
+		    i2_5 = (i3_1 - i3_5);
+	       }
+	       { /* inputs 3 and 7: sum and difference */
+		    REAL r3_3, i3_3;
+		    REAL r3_7, i3_7;
+		    r3_3 = c_re(in[3]);
+		    i3_3 = c_im(in[3]);
+		    r3_7 = c_re(in[7]);
+		    i3_7 = c_im(in[7]);
+		    r2_3 = (r3_3 + r3_7);
+		    i2_3 = (i3_3 + i3_7);
+		    r2_7 = (r3_3 - r3_7);
+		    i2_7 = (i3_3 - i3_7);
+	       }
+	       r1_1 = (r2_1 + r2_3);
+	       i1_1 = (i2_1 + i2_3);
+	       r1_5 = (r2_1 - r2_3);
+	       i1_5 = (i2_1 - i2_3);
+	       r1_3 = (r2_5 + i2_7);
+	       i1_3 = (i2_5 - r2_7);
+	       r1_7 = (r2_5 - i2_7);
+	       i1_7 = (i2_5 + r2_7);
+	  }
+	  c_re(out[0]) = (r1_0 + r1_1); /* every input has been read by now; stores to out start here */
+	  c_im(out[0]) = (i1_0 + i1_1);
+	  c_re(out[4]) = (r1_0 - r1_1);
+	  c_im(out[4]) = (i1_0 - i1_1);
+	  tmpr = (0.707106781187 * (r1_3 + i1_3)); /* 0.707106781187 approximates sqrt(2)/2 */
+	  tmpi = (0.707106781187 * (i1_3 - r1_3));
+	  c_re(out[1]) = (r1_2 + tmpr);
+	  c_im(out[1]) = (i1_2 + tmpi);
+	  c_re(out[5]) = (r1_2 - tmpr);
+	  c_im(out[5]) = (i1_2 - tmpi);
+	  c_re(out[2]) = (r1_4 + i1_5);
+	  c_im(out[2]) = (i1_4 - r1_5);
+	  c_re(out[6]) = (r1_4 - i1_5);
+	  c_im(out[6]) = (i1_4 + r1_5);
+	  tmpr = (0.707106781187 * (i1_7 - r1_7)); /* tmpr / tmpi are reused for the out[3] / out[7] pair */
+	  tmpi = (0.707106781187 * (r1_7 + i1_7));
+	  c_re(out[3]) = (r1_6 + tmpr);
+	  c_im(out[3]) = (i1_6 - tmpi);
+	  c_re(out[7]) = (r1_6 - tmpr);
+	  c_im(out[7]) = (i1_6 + tmpi);
+     }
+}
+void fft_twiddle_8(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m)
+{ /* For each i in [a, b): scale in[i + k*m] by W[k * nWdn * i] (k = 1..7), combine the eight values the way fft_base_8 does, store to out[i + k*m]. nW is not read here. */
+     int l1, i;
+     COMPLEX *jp, *kp;
+     REAL tmpr, tmpi, wr, wi;
+     if ((b - a) < 128) { /* fewer than 128 indices: handle them in this call */
+	  for (i = a, l1 = nWdn * i, kp = out + i; i < b;
+	       i++, l1 += nWdn, kp++) { /* l1 tracks nWdn * i, kp tracks out + i */
+	       jp = in + i;
+	       {
+		    REAL r1_0, i1_0;
+		    REAL r1_1, i1_1;
+		    REAL r1_2, i1_2;
+		    REAL r1_3, i1_3;
+		    REAL r1_4, i1_4;
+		    REAL r1_5, i1_5;
+		    REAL r1_6, i1_6;
+		    REAL r1_7, i1_7;
+		    { /* even-indexed inputs 0, 4, 2, 6 */
+			 REAL r2_0, i2_0;
+			 REAL r2_2, i2_2;
+			 REAL r2_4, i2_4;
+			 REAL r2_6, i2_6;
+			 { /* inputs 0 and 4; input 4 is first multiplied by W[4 * l1] */
+			      REAL r3_0, i3_0;
+			      REAL r3_4, i3_4;
+			      r3_0 = c_re(jp[0 * m]);
+			      i3_0 = c_im(jp[0 * m]);
+			      wr = c_re(W[4 * l1]);
+			      wi = c_im(W[4 * l1]);
+			      tmpr = c_re(jp[4 * m]);
+			      tmpi = c_im(jp[4 * m]);
+			      r3_4 = ((wr * tmpr) - (wi * tmpi));
+			      i3_4 = ((wi * tmpr) + (wr * tmpi));
+			      r2_0 = (r3_0 + r3_4);
+			      i2_0 = (i3_0 + i3_4);
+			      r2_4 = (r3_0 - r3_4);
+			      i2_4 = (i3_0 - i3_4);
+			 }
+			 { /* inputs 2 and 6, multiplied by W[2 * l1] and W[6 * l1] */
+			      REAL r3_2, i3_2;
+			      REAL r3_6, i3_6;
+			      wr = c_re(W[2 * l1]);
+			      wi = c_im(W[2 * l1]);
+			      tmpr = c_re(jp[2 * m]);
+			      tmpi = c_im(jp[2 * m]);
+			      r3_2 = ((wr * tmpr) - (wi * tmpi));
+			      i3_2 = ((wi * tmpr) + (wr * tmpi));
+			      wr = c_re(W[6 * l1]);
+			      wi = c_im(W[6 * l1]);
+			      tmpr = c_re(jp[6 * m]);
+			      tmpi = c_im(jp[6 * m]);
+			      r3_6 = ((wr * tmpr) - (wi * tmpi));
+			      i3_6 = ((wi * tmpr) + (wr * tmpi));
+			      r2_2 = (r3_2 + r3_6);
+			      i2_2 = (i3_2 + i3_6);
+			      r2_6 = (r3_2 - r3_6);
+			      i2_6 = (i3_2 - i3_6);
+			 }
+			 r1_0 = (r2_0 + r2_2);
+			 i1_0 = (i2_0 + i2_2);
+			 r1_4 = (r2_0 - r2_2);
+			 i1_4 = (i2_0 - i2_2);
+			 r1_2 = (r2_4 + i2_6);
+			 i1_2 = (i2_4 - r2_6);
+			 r1_6 = (r2_4 - i2_6);
+			 i1_6 = (i2_4 + r2_6);
+		    }
+		    { /* odd-indexed inputs 1, 5, 3, 7 */
+			 REAL r2_1, i2_1;
+			 REAL r2_3, i2_3;
+			 REAL r2_5, i2_5;
+			 REAL r2_7, i2_7;
+			 { /* inputs 1 and 5, multiplied by W[1 * l1] and W[5 * l1] */
+			      REAL r3_1, i3_1;
+			      REAL r3_5, i3_5;
+			      wr = c_re(W[1 * l1]);
+			      wi = c_im(W[1 * l1]);
+			      tmpr = c_re(jp[1 * m]);
+			      tmpi = c_im(jp[1 * m]);
+			      r3_1 = ((wr * tmpr) - (wi * tmpi));
+			      i3_1 = ((wi * tmpr) + (wr * tmpi));
+			      wr = c_re(W[5 * l1]);
+			      wi = c_im(W[5 * l1]);
+			      tmpr = c_re(jp[5 * m]);
+			      tmpi = c_im(jp[5 * m]);
+			      r3_5 = ((wr * tmpr) - (wi * tmpi));
+			      i3_5 = ((wi * tmpr) + (wr * tmpi));
+			      r2_1 = (r3_1 + r3_5);
+			      i2_1 = (i3_1 + i3_5);
+			      r2_5 = (r3_1 - r3_5);
+			      i2_5 = (i3_1 - i3_5);
+			 }
+			 { /* inputs 3 and 7, multiplied by W[3 * l1] and W[7 * l1] */
+			      REAL r3_3, i3_3;
+			      REAL r3_7, i3_7;
+			      wr = c_re(W[3 * l1]);
+			      wi = c_im(W[3 * l1]);
+			      tmpr = c_re(jp[3 * m]);
+			      tmpi = c_im(jp[3 * m]);
+			      r3_3 = ((wr * tmpr) - (wi * tmpi));
+			      i3_3 = ((wi * tmpr) + (wr * tmpi));
+			      wr = c_re(W[7 * l1]);
+			      wi = c_im(W[7 * l1]);
+			      tmpr = c_re(jp[7 * m]);
+			      tmpi = c_im(jp[7 * m]);
+			      r3_7 = ((wr * tmpr) - (wi * tmpi));
+			      i3_7 = ((wi * tmpr) + (wr * tmpi));
+			      r2_3 = (r3_3 + r3_7);
+			      i2_3 = (i3_3 + i3_7);
+			      r2_7 = (r3_3 - r3_7);
+			      i2_7 = (i3_3 - i3_7);
+			 }
+			 r1_1 = (r2_1 + r2_3);
+			 i1_1 = (i2_1 + i2_3);
+			 r1_5 = (r2_1 - r2_3);
+			 i1_5 = (i2_1 - i2_3);
+			 r1_3 = (r2_5 + i2_7);
+			 i1_3 = (i2_5 - r2_7);
+			 r1_7 = (r2_5 - i2_7);
+			 i1_7 = (i2_5 + r2_7);
+		    }
+		    c_re(kp[0 * m]) = (r1_0 + r1_1); /* all eight inputs were read above; stores start here */
+		    c_im(kp[0 * m]) = (i1_0 + i1_1);
+		    c_re(kp[4 * m]) = (r1_0 - r1_1);
+		    c_im(kp[4 * m]) = (i1_0 - i1_1);
+		    tmpr = (0.707106781187 * (r1_3 + i1_3)); /* 0.707106781187 approximates sqrt(2)/2 */
+		    tmpi = (0.707106781187 * (i1_3 - r1_3));
+		    c_re(kp[1 * m]) = (r1_2 + tmpr);
+		    c_im(kp[1 * m]) = (i1_2 + tmpi);
+		    c_re(kp[5 * m]) = (r1_2 - tmpr);
+		    c_im(kp[5 * m]) = (i1_2 - tmpi);
+		    c_re(kp[2 * m]) = (r1_4 + i1_5);
+		    c_im(kp[2 * m]) = (i1_4 - r1_5);
+		    c_re(kp[6 * m]) = (r1_4 - i1_5);
+		    c_im(kp[6 * m]) = (i1_4 + r1_5);
+		    tmpr = (0.707106781187 * (i1_7 - r1_7)); /* tmpr / tmpi are reused for the kp[3 * m] / kp[7 * m] pair */
+		    tmpi = (0.707106781187 * (r1_7 + i1_7));
+		    c_re(kp[3 * m]) = (r1_6 + tmpr);
+		    c_im(kp[3 * m]) = (i1_6 - tmpi);
+		    c_re(kp[7 * m]) = (r1_6 - tmpr);
+		    c_im(kp[7 * m]) = (i1_6 + tmpi);
+	       }
+	  }
+     } else { /* 128 or more indices: halve [a, b) into two untied tasks and wait for both */
+	  int ab = (a + b) / 2;
+          #pragma omp task untied
+	  fft_twiddle_8(a, ab, in, out, W, nW, nWdn, m);
+          #pragma omp task untied
+	  fft_twiddle_8(ab, b, in, out, W, nW, nWdn, m);
+          #pragma omp taskwait
+     }
+}
+void fft_twiddle_8_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m)
+{ /* Serial form: for each i in [a, b), scale in[i + k*m] by W[k * nWdn * i] (k = 1..7), combine the eight values, store to out[i + k*m]. nW is not read here. */
+     int l1, i;
+     COMPLEX *jp, *kp;
+     REAL tmpr, tmpi, wr, wi;
+     if ((b - a) < 128) { /* fewer than 128 indices: handle them in this call */
+	  for (i = a, l1 = nWdn * i, kp = out + i; i < b;
+	       i++, l1 += nWdn, kp++) { /* l1 tracks nWdn * i, kp tracks out + i */
+	       jp = in + i;
+	       {
+		    REAL r1_0, i1_0;
+		    REAL r1_1, i1_1;
+		    REAL r1_2, i1_2;
+		    REAL r1_3, i1_3;
+		    REAL r1_4, i1_4;
+		    REAL r1_5, i1_5;
+		    REAL r1_6, i1_6;
+		    REAL r1_7, i1_7;
+		    { /* even-indexed inputs 0, 4, 2, 6 */
+			 REAL r2_0, i2_0;
+			 REAL r2_2, i2_2;
+			 REAL r2_4, i2_4;
+			 REAL r2_6, i2_6;
+			 { /* inputs 0 and 4; input 4 is first multiplied by W[4 * l1] */
+			      REAL r3_0, i3_0;
+			      REAL r3_4, i3_4;
+			      r3_0 = c_re(jp[0 * m]);
+			      i3_0 = c_im(jp[0 * m]);
+			      wr = c_re(W[4 * l1]);
+			      wi = c_im(W[4 * l1]);
+			      tmpr = c_re(jp[4 * m]);
+			      tmpi = c_im(jp[4 * m]);
+			      r3_4 = ((wr * tmpr) - (wi * tmpi));
+			      i3_4 = ((wi * tmpr) + (wr * tmpi));
+			      r2_0 = (r3_0 + r3_4);
+			      i2_0 = (i3_0 + i3_4);
+			      r2_4 = (r3_0 - r3_4);
+			      i2_4 = (i3_0 - i3_4);
+			 }
+			 { /* inputs 2 and 6, multiplied by W[2 * l1] and W[6 * l1] */
+			      REAL r3_2, i3_2;
+			      REAL r3_6, i3_6;
+			      wr = c_re(W[2 * l1]);
+			      wi = c_im(W[2 * l1]);
+			      tmpr = c_re(jp[2 * m]);
+			      tmpi = c_im(jp[2 * m]);
+			      r3_2 = ((wr * tmpr) - (wi * tmpi));
+			      i3_2 = ((wi * tmpr) + (wr * tmpi));
+			      wr = c_re(W[6 * l1]);
+			      wi = c_im(W[6 * l1]);
+			      tmpr = c_re(jp[6 * m]);
+			      tmpi = c_im(jp[6 * m]);
+			      r3_6 = ((wr * tmpr) - (wi * tmpi));
+			      i3_6 = ((wi * tmpr) + (wr * tmpi));
+			      r2_2 = (r3_2 + r3_6);
+			      i2_2 = (i3_2 + i3_6);
+			      r2_6 = (r3_2 - r3_6);
+			      i2_6 = (i3_2 - i3_6);
+			 }
+			 r1_0 = (r2_0 + r2_2);
+			 i1_0 = (i2_0 + i2_2);
+			 r1_4 = (r2_0 - r2_2);
+			 i1_4 = (i2_0 - i2_2);
+			 r1_2 = (r2_4 + i2_6);
+			 i1_2 = (i2_4 - r2_6);
+			 r1_6 = (r2_4 - i2_6);
+			 i1_6 = (i2_4 + r2_6);
+		    }
+		    { /* odd-indexed inputs 1, 5, 3, 7 */
+			 REAL r2_1, i2_1;
+			 REAL r2_3, i2_3;
+			 REAL r2_5, i2_5;
+			 REAL r2_7, i2_7;
+			 { /* inputs 1 and 5, multiplied by W[1 * l1] and W[5 * l1] */
+			      REAL r3_1, i3_1;
+			      REAL r3_5, i3_5;
+			      wr = c_re(W[1 * l1]);
+			      wi = c_im(W[1 * l1]);
+			      tmpr = c_re(jp[1 * m]);
+			      tmpi = c_im(jp[1 * m]);
+			      r3_1 = ((wr * tmpr) - (wi * tmpi));
+			      i3_1 = ((wi * tmpr) + (wr * tmpi));
+			      wr = c_re(W[5 * l1]);
+			      wi = c_im(W[5 * l1]);
+			      tmpr = c_re(jp[5 * m]);
+			      tmpi = c_im(jp[5 * m]);
+			      r3_5 = ((wr * tmpr) - (wi * tmpi));
+			      i3_5 = ((wi * tmpr) + (wr * tmpi));
+			      r2_1 = (r3_1 + r3_5);
+			      i2_1 = (i3_1 + i3_5);
+			      r2_5 = (r3_1 - r3_5);
+			      i2_5 = (i3_1 - i3_5);
+			 }
+			 { /* inputs 3 and 7, multiplied by W[3 * l1] and W[7 * l1] */
+			      REAL r3_3, i3_3;
+			      REAL r3_7, i3_7;
+			      wr = c_re(W[3 * l1]);
+			      wi = c_im(W[3 * l1]);
+			      tmpr = c_re(jp[3 * m]);
+			      tmpi = c_im(jp[3 * m]);
+			      r3_3 = ((wr * tmpr) - (wi * tmpi));
+			      i3_3 = ((wi * tmpr) + (wr * tmpi));
+			      wr = c_re(W[7 * l1]);
+			      wi = c_im(W[7 * l1]);
+			      tmpr = c_re(jp[7 * m]);
+			      tmpi = c_im(jp[7 * m]);
+			      r3_7 = ((wr * tmpr) - (wi * tmpi));
+			      i3_7 = ((wi * tmpr) + (wr * tmpi));
+			      r2_3 = (r3_3 + r3_7);
+			      i2_3 = (i3_3 + i3_7);
+			      r2_7 = (r3_3 - r3_7);
+			      i2_7 = (i3_3 - i3_7);
+			 }
+			 r1_1 = (r2_1 + r2_3);
+			 i1_1 = (i2_1 + i2_3);
+			 r1_5 = (r2_1 - r2_3);
+			 i1_5 = (i2_1 - i2_3);
+			 r1_3 = (r2_5 + i2_7);
+			 i1_3 = (i2_5 - r2_7);
+			 r1_7 = (r2_5 - i2_7);
+			 i1_7 = (i2_5 + r2_7);
+		    }
+		    c_re(kp[0 * m]) = (r1_0 + r1_1); /* all eight inputs were read above; stores start here */
+		    c_im(kp[0 * m]) = (i1_0 + i1_1);
+		    c_re(kp[4 * m]) = (r1_0 - r1_1);
+		    c_im(kp[4 * m]) = (i1_0 - i1_1);
+		    tmpr = (0.707106781187 * (r1_3 + i1_3)); /* 0.707106781187 approximates sqrt(2)/2 */
+		    tmpi = (0.707106781187 * (i1_3 - r1_3));
+		    c_re(kp[1 * m]) = (r1_2 + tmpr);
+		    c_im(kp[1 * m]) = (i1_2 + tmpi);
+		    c_re(kp[5 * m]) = (r1_2 - tmpr);
+		    c_im(kp[5 * m]) = (i1_2 - tmpi);
+		    c_re(kp[2 * m]) = (r1_4 + i1_5);
+		    c_im(kp[2 * m]) = (i1_4 - r1_5);
+		    c_re(kp[6 * m]) = (r1_4 - i1_5);
+		    c_im(kp[6 * m]) = (i1_4 + r1_5);
+		    tmpr = (0.707106781187 * (i1_7 - r1_7)); /* tmpr / tmpi are reused for the kp[3 * m] / kp[7 * m] pair */
+		    tmpi = (0.707106781187 * (r1_7 + i1_7));
+		    c_re(kp[3 * m]) = (r1_6 + tmpr);
+		    c_im(kp[3 * m]) = (i1_6 - tmpi);
+		    c_re(kp[7 * m]) = (r1_6 - tmpr);
+		    c_im(kp[7 * m]) = (i1_6 + tmpi);
+	       }
+	  }
+     } else { /* 128 or more indices: recurse on [a, ab), then on [ab, b) */
+	  int ab = (a + b) / 2;
+	  fft_twiddle_8_seq(a, ab, in, out, W, nW, nWdn, m);
+	  fft_twiddle_8_seq(ab, b, in, out, W, nW, nWdn, m);
+     }
+}
+void fft_unshuffle_8(int a, int b, COMPLEX * in, COMPLEX * out, int m)
+{
+     /*
+      * For each i in [a, b): out[i + k*m] = in[8*i + k] for k = 0 .. 7.
+      */
+     int row;
+     const COMPLEX *src;
+     COMPLEX *dst;
+
+     if ((b - a) >= 128) {      /* wide range: two untied tasks, then join */
+          int mid = (a + b) / 2;
+          #pragma omp task untied
+          fft_unshuffle_8(a, mid, in, out, m);
+          #pragma omp task untied
+          fft_unshuffle_8(mid, b, in, out, m);
+          #pragma omp taskwait
+          return;
+     }
+     src = in + a * 8;
+     for (row = a; row < b; ++row, src += 8) {
+          dst = out + row;
+          dst[0] = src[0];
+          dst[m] = src[1];
+          dst += 2 * m;
+          dst[0] = src[2];
+          dst[m] = src[3];
+          dst += 2 * m;
+          dst[0] = src[4];
+          dst[m] = src[5];
+          dst += 2 * m;
+          dst[0] = src[6];
+          dst[m] = src[7];
+     }
+}
+void fft_unshuffle_8_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m)
+{
+     /*
+      * Serial form.  For each i in [a, b): out[i + k*m] = in[8*i + k], k = 0 .. 7.
+      */
+     int row;
+     const COMPLEX *src;
+     COMPLEX *dst;
+
+     if ((b - a) >= 128) {      /* wide range: lower half, then upper half */
+          int mid = (a + b) / 2;
+          fft_unshuffle_8_seq(a, mid, in, out, m);
+          fft_unshuffle_8_seq(mid, b, in, out, m);
+          return;
+     }
+     src = in + a * 8;
+     for (row = a; row < b; ++row, src += 8) {
+          dst = out + row;
+          dst[0] = src[0];
+          dst[m] = src[1];
+          dst += 2 * m;
+          dst[0] = src[2];
+          dst[m] = src[3];
+          dst += 2 * m;
+          dst[0] = src[4];
+          dst[m] = src[5];
+          dst += 2 * m;
+          dst[0] = src[6];
+          dst[m] = src[7];
+     }
+}
+void fft_base_16(COMPLEX * in, COMPLEX * out)	/* fully unrolled 16-point transform: reads in[0..15], writes out[0..15] */
+{
+     REAL tmpr, tmpi;	/* scratch pair holding the rotated operand of the current butterfly */
+     {
+	  REAL r1_0, i1_0;	/* naming: rN_k / iN_k = real / imaginary part of value k at nesting level N */
+	  REAL r1_1, i1_1;
+	  REAL r1_2, i1_2;
+	  REAL r1_3, i1_3;
+	  REAL r1_4, i1_4;
+	  REAL r1_5, i1_5;
+	  REAL r1_6, i1_6;
+	  REAL r1_7, i1_7;
+	  REAL r1_8, i1_8;
+	  REAL r1_9, i1_9;
+	  REAL r1_10, i1_10;
+	  REAL r1_11, i1_11;
+	  REAL r1_12, i1_12;
+	  REAL r1_13, i1_13;
+	  REAL r1_14, i1_14;
+	  REAL r1_15, i1_15;
+	  {	/* first half: even-indexed inputs in[0], in[2], ..., in[14] -> r1_k / i1_k for even k */
+	       REAL r2_0, i2_0;
+	       REAL r2_2, i2_2;
+	       REAL r2_4, i2_4;
+	       REAL r2_6, i2_6;
+	       REAL r2_8, i2_8;
+	       REAL r2_10, i2_10;
+	       REAL r2_12, i2_12;
+	       REAL r2_14, i2_14;
+	       {	/* inputs in[0], in[8], in[4], in[12] */
+		    REAL r3_0, i3_0;
+		    REAL r3_4, i3_4;
+		    REAL r3_8, i3_8;
+		    REAL r3_12, i3_12;
+		    {	/* sum and difference of in[0] and in[8] */
+			 REAL r4_0, i4_0;
+			 REAL r4_8, i4_8;
+			 r4_0 = c_re(in[0]);
+			 i4_0 = c_im(in[0]);
+			 r4_8 = c_re(in[8]);
+			 i4_8 = c_im(in[8]);
+			 r3_0 = (r4_0 + r4_8);
+			 i3_0 = (i4_0 + i4_8);
+			 r3_8 = (r4_0 - r4_8);
+			 i3_8 = (i4_0 - i4_8);
+		    }
+		    {	/* sum and difference of in[4] and in[12] */
+			 REAL r4_4, i4_4;
+			 REAL r4_12, i4_12;
+			 r4_4 = c_re(in[4]);
+			 i4_4 = c_im(in[4]);
+			 r4_12 = c_re(in[12]);
+			 i4_12 = c_im(in[12]);
+			 r3_4 = (r4_4 + r4_12);
+			 i3_4 = (i4_4 + i4_12);
+			 r3_12 = (r4_4 - r4_12);
+			 i3_12 = (i4_4 - i4_12);
+		    }
+		    r2_0 = (r3_0 + r3_4);
+		    i2_0 = (i3_0 + i3_4);
+		    r2_8 = (r3_0 - r3_4);
+		    i2_8 = (i3_0 - i3_4);
+		    r2_4 = (r3_8 + i3_12);	/* adds (r3_12 + i*i3_12) multiplied by -i: real part gains +i3_12 ... */
+		    i2_4 = (i3_8 - r3_12);	/* ... and imaginary part gains -r3_12 */
+		    r2_12 = (r3_8 - i3_12);	/* same rotated operand, subtracted */
+		    i2_12 = (i3_8 + r3_12);
+	       }
+	       {	/* inputs in[2], in[10], in[6], in[14] */
+		    REAL r3_2, i3_2;
+		    REAL r3_6, i3_6;
+		    REAL r3_10, i3_10;
+		    REAL r3_14, i3_14;
+		    {	/* sum and difference of in[2] and in[10] */
+			 REAL r4_2, i4_2;
+			 REAL r4_10, i4_10;
+			 r4_2 = c_re(in[2]);
+			 i4_2 = c_im(in[2]);
+			 r4_10 = c_re(in[10]);
+			 i4_10 = c_im(in[10]);
+			 r3_2 = (r4_2 + r4_10);
+			 i3_2 = (i4_2 + i4_10);
+			 r3_10 = (r4_2 - r4_10);
+			 i3_10 = (i4_2 - i4_10);
+		    }
+		    {	/* sum and difference of in[6] and in[14] */
+			 REAL r4_6, i4_6;
+			 REAL r4_14, i4_14;
+			 r4_6 = c_re(in[6]);
+			 i4_6 = c_im(in[6]);
+			 r4_14 = c_re(in[14]);
+			 i4_14 = c_im(in[14]);
+			 r3_6 = (r4_6 + r4_14);
+			 i3_6 = (i4_6 + i4_14);
+			 r3_14 = (r4_6 - r4_14);
+			 i3_14 = (i4_6 - i4_14);
+		    }
+		    r2_2 = (r3_2 + r3_6);
+		    i2_2 = (i3_2 + i3_6);
+		    r2_10 = (r3_2 - r3_6);
+		    i2_10 = (i3_2 - i3_6);
+		    r2_6 = (r3_10 + i3_14);	/* (r3_14 + i*i3_14) multiplied by -i, then added */
+		    i2_6 = (i3_10 - r3_14);
+		    r2_14 = (r3_10 - i3_14);
+		    i2_14 = (i3_10 + r3_14);
+	       }
+	       r1_0 = (r2_0 + r2_2);
+	       i1_0 = (i2_0 + i2_2);
+	       r1_8 = (r2_0 - r2_2);
+	       i1_8 = (i2_0 - i2_2);
+	       tmpr = (0.707106781187 * (r2_6 + i2_6));	/* 0.707106781187 ~= sqrt(2)/2; (tmpr, tmpi) = (r2_6 + i*i2_6) * (1 - i)/sqrt(2) */
+	       tmpi = (0.707106781187 * (i2_6 - r2_6));
+	       r1_2 = (r2_4 + tmpr);
+	       i1_2 = (i2_4 + tmpi);
+	       r1_10 = (r2_4 - tmpr);
+	       i1_10 = (i2_4 - tmpi);
+	       r1_4 = (r2_8 + i2_10);	/* (r2_10 + i*i2_10) multiplied by -i, then added */
+	       i1_4 = (i2_8 - r2_10);
+	       r1_12 = (r2_8 - i2_10);
+	       i1_12 = (i2_8 + r2_10);
+	       tmpr = (0.707106781187 * (i2_14 - r2_14));	/* note the sign pattern below: (tmpr, -tmpi) is what gets added */
+	       tmpi = (0.707106781187 * (r2_14 + i2_14));
+	       r1_6 = (r2_12 + tmpr);
+	       i1_6 = (i2_12 - tmpi);
+	       r1_14 = (r2_12 - tmpr);
+	       i1_14 = (i2_12 + tmpi);
+	  }
+	  {	/* second half: odd-indexed inputs in[1], in[3], ..., in[15] -> r1_k / i1_k for odd k */
+	       REAL r2_1, i2_1;
+	       REAL r2_3, i2_3;
+	       REAL r2_5, i2_5;
+	       REAL r2_7, i2_7;
+	       REAL r2_9, i2_9;
+	       REAL r2_11, i2_11;
+	       REAL r2_13, i2_13;
+	       REAL r2_15, i2_15;
+	       {	/* inputs in[1], in[9], in[5], in[13] */
+		    REAL r3_1, i3_1;
+		    REAL r3_5, i3_5;
+		    REAL r3_9, i3_9;
+		    REAL r3_13, i3_13;
+		    {	/* sum and difference of in[1] and in[9] */
+			 REAL r4_1, i4_1;
+			 REAL r4_9, i4_9;
+			 r4_1 = c_re(in[1]);
+			 i4_1 = c_im(in[1]);
+			 r4_9 = c_re(in[9]);
+			 i4_9 = c_im(in[9]);
+			 r3_1 = (r4_1 + r4_9);
+			 i3_1 = (i4_1 + i4_9);
+			 r3_9 = (r4_1 - r4_9);
+			 i3_9 = (i4_1 - i4_9);
+		    }
+		    {	/* sum and difference of in[5] and in[13] */
+			 REAL r4_5, i4_5;
+			 REAL r4_13, i4_13;
+			 r4_5 = c_re(in[5]);
+			 i4_5 = c_im(in[5]);
+			 r4_13 = c_re(in[13]);
+			 i4_13 = c_im(in[13]);
+			 r3_5 = (r4_5 + r4_13);
+			 i3_5 = (i4_5 + i4_13);
+			 r3_13 = (r4_5 - r4_13);
+			 i3_13 = (i4_5 - i4_13);
+		    }
+		    r2_1 = (r3_1 + r3_5);
+		    i2_1 = (i3_1 + i3_5);
+		    r2_9 = (r3_1 - r3_5);
+		    i2_9 = (i3_1 - i3_5);
+		    r2_5 = (r3_9 + i3_13);	/* (r3_13 + i*i3_13) multiplied by -i, then added */
+		    i2_5 = (i3_9 - r3_13);
+		    r2_13 = (r3_9 - i3_13);
+		    i2_13 = (i3_9 + r3_13);
+	       }
+	       {	/* inputs in[3], in[11], in[7], in[15] */
+		    REAL r3_3, i3_3;
+		    REAL r3_7, i3_7;
+		    REAL r3_11, i3_11;
+		    REAL r3_15, i3_15;
+		    {	/* sum and difference of in[3] and in[11] */
+			 REAL r4_3, i4_3;
+			 REAL r4_11, i4_11;
+			 r4_3 = c_re(in[3]);
+			 i4_3 = c_im(in[3]);
+			 r4_11 = c_re(in[11]);
+			 i4_11 = c_im(in[11]);
+			 r3_3 = (r4_3 + r4_11);
+			 i3_3 = (i4_3 + i4_11);
+			 r3_11 = (r4_3 - r4_11);
+			 i3_11 = (i4_3 - i4_11);
+		    }
+		    {	/* sum and difference of in[7] and in[15] */
+			 REAL r4_7, i4_7;
+			 REAL r4_15, i4_15;
+			 r4_7 = c_re(in[7]);
+			 i4_7 = c_im(in[7]);
+			 r4_15 = c_re(in[15]);
+			 i4_15 = c_im(in[15]);
+			 r3_7 = (r4_7 + r4_15);
+			 i3_7 = (i4_7 + i4_15);
+			 r3_15 = (r4_7 - r4_15);
+			 i3_15 = (i4_7 - i4_15);
+		    }
+		    r2_3 = (r3_3 + r3_7);
+		    i2_3 = (i3_3 + i3_7);
+		    r2_11 = (r3_3 - r3_7);
+		    i2_11 = (i3_3 - i3_7);
+		    r2_7 = (r3_11 + i3_15);	/* (r3_15 + i*i3_15) multiplied by -i, then added */
+		    i2_7 = (i3_11 - r3_15);
+		    r2_15 = (r3_11 - i3_15);
+		    i2_15 = (i3_11 + r3_15);
+	       }
+	       r1_1 = (r2_1 + r2_3);
+	       i1_1 = (i2_1 + i2_3);
+	       r1_9 = (r2_1 - r2_3);
+	       i1_9 = (i2_1 - i2_3);
+	       tmpr = (0.707106781187 * (r2_7 + i2_7));	/* (r2_7 + i*i2_7) * (1 - i)/sqrt(2) */
+	       tmpi = (0.707106781187 * (i2_7 - r2_7));
+	       r1_3 = (r2_5 + tmpr);
+	       i1_3 = (i2_5 + tmpi);
+	       r1_11 = (r2_5 - tmpr);
+	       i1_11 = (i2_5 - tmpi);
+	       r1_5 = (r2_9 + i2_11);	/* (r2_11 + i*i2_11) multiplied by -i, then added */
+	       i1_5 = (i2_9 - r2_11);
+	       r1_13 = (r2_9 - i2_11);
+	       i1_13 = (i2_9 + r2_11);
+	       tmpr = (0.707106781187 * (i2_15 - r2_15));	/* as above, (tmpr, -tmpi) is the value added */
+	       tmpi = (0.707106781187 * (r2_15 + i2_15));
+	       r1_7 = (r2_13 + tmpr);
+	       i1_7 = (i2_13 - tmpi);
+	       r1_15 = (r2_13 - tmpr);
+	       i1_15 = (i2_13 + tmpi);
+	  }
+	  c_re(out[0]) = (r1_0 + r1_1);	/* final merge: out[k] / out[k + 8] = even-half value k plus / minus rotated odd-half value k */
+	  c_im(out[0]) = (i1_0 + i1_1);
+	  c_re(out[8]) = (r1_0 - r1_1);
+	  c_im(out[8]) = (i1_0 - i1_1);
+	  tmpr = ((0.923879532511 * r1_3) + (0.382683432365 * i1_3));	/* 0.923879532511 ~= cos(pi/8), 0.382683432365 ~= sin(pi/8) */
+	  tmpi = ((0.923879532511 * i1_3) - (0.382683432365 * r1_3));
+	  c_re(out[1]) = (r1_2 + tmpr);
+	  c_im(out[1]) = (i1_2 + tmpi);
+	  c_re(out[9]) = (r1_2 - tmpr);
+	  c_im(out[9]) = (i1_2 - tmpi);
+	  tmpr = (0.707106781187 * (r1_5 + i1_5));
+	  tmpi = (0.707106781187 * (i1_5 - r1_5));
+	  c_re(out[2]) = (r1_4 + tmpr);
+	  c_im(out[2]) = (i1_4 + tmpi);
+	  c_re(out[10]) = (r1_4 - tmpr);
+	  c_im(out[10]) = (i1_4 - tmpi);
+	  tmpr = ((0.382683432365 * r1_7) + (0.923879532511 * i1_7));
+	  tmpi = ((0.382683432365 * i1_7) - (0.923879532511 * r1_7));
+	  c_re(out[3]) = (r1_6 + tmpr);
+	  c_im(out[3]) = (i1_6 + tmpi);
+	  c_re(out[11]) = (r1_6 - tmpr);
+	  c_im(out[11]) = (i1_6 - tmpi);
+	  c_re(out[4]) = (r1_8 + i1_9);	/* (r1_9 + i*i1_9) multiplied by -i, no scratch needed */
+	  c_im(out[4]) = (i1_8 - r1_9);
+	  c_re(out[12]) = (r1_8 - i1_9);
+	  c_im(out[12]) = (i1_8 + r1_9);
+	  tmpr = ((0.923879532511 * i1_11) - (0.382683432365 * r1_11));	/* from here on (tmpr, -tmpi) is the value added to out[k] */
+	  tmpi = ((0.923879532511 * r1_11) + (0.382683432365 * i1_11));
+	  c_re(out[5]) = (r1_10 + tmpr);
+	  c_im(out[5]) = (i1_10 - tmpi);
+	  c_re(out[13]) = (r1_10 - tmpr);
+	  c_im(out[13]) = (i1_10 + tmpi);
+	  tmpr = (0.707106781187 * (i1_13 - r1_13));
+	  tmpi = (0.707106781187 * (r1_13 + i1_13));
+	  c_re(out[6]) = (r1_12 + tmpr);
+	  c_im(out[6]) = (i1_12 - tmpi);
+	  c_re(out[14]) = (r1_12 - tmpr);
+	  c_im(out[14]) = (i1_12 + tmpi);
+	  tmpr = ((0.382683432365 * i1_15) - (0.923879532511 * r1_15));
+	  tmpi = ((0.382683432365 * r1_15) + (0.923879532511 * i1_15));
+	  c_re(out[7]) = (r1_14 + tmpr);
+	  c_im(out[7]) = (i1_14 - tmpi);
+	  c_re(out[15]) = (r1_14 - tmpr);
+	  c_im(out[15]) = (i1_14 + tmpi);
+     }
+}
+void fft_twiddle_16(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m)	/* task-parallel 16-point butterflies with twiddle factors for i in [a, b); nW is only forwarded to the recursive calls */
+{
+     int l1, i;	/* i: current index in [a, b); l1: nWdn * i, the base index into W for that i */
+     COMPLEX *jp, *kp;	/* jp = in + i (inputs read at jp[k * m]); kp = out + i (outputs written at kp[k * m]) */
+     REAL tmpr, tmpi, wr, wi;	/* (wr, wi): twiddle factor just loaded from W; (tmpr, tmpi): scratch operand */
+     if ((b - a) < 128) {	/* fewer than 128 indices: process them directly in this call */
+	  for (i = a, l1 = nWdn * i, kp = out + i; i < b;
+	       i++, l1 += nWdn, kp++) {
+	       jp = in + i;
+	       {
+		    REAL r1_0, i1_0;	/* naming: rN_k / iN_k = real / imaginary part of value k at nesting level N */
+		    REAL r1_1, i1_1;
+		    REAL r1_2, i1_2;
+		    REAL r1_3, i1_3;
+		    REAL r1_4, i1_4;
+		    REAL r1_5, i1_5;
+		    REAL r1_6, i1_6;
+		    REAL r1_7, i1_7;
+		    REAL r1_8, i1_8;
+		    REAL r1_9, i1_9;
+		    REAL r1_10, i1_10;
+		    REAL r1_11, i1_11;
+		    REAL r1_12, i1_12;
+		    REAL r1_13, i1_13;
+		    REAL r1_14, i1_14;
+		    REAL r1_15, i1_15;
+		    {	/* first half: even-numbered elements jp[0 * m], jp[2 * m], ..., jp[14 * m] */
+			 REAL r2_0, i2_0;
+			 REAL r2_2, i2_2;
+			 REAL r2_4, i2_4;
+			 REAL r2_6, i2_6;
+			 REAL r2_8, i2_8;
+			 REAL r2_10, i2_10;
+			 REAL r2_12, i2_12;
+			 REAL r2_14, i2_14;
+			 {
+			      REAL r3_0, i3_0;
+			      REAL r3_4, i3_4;
+			      REAL r3_8, i3_8;
+			      REAL r3_12, i3_12;
+			      {
+				   REAL r4_0, i4_0;
+				   REAL r4_8, i4_8;
+				   r4_0 = c_re(jp[0 * m]);	/* element 0 is used as is, without a twiddle multiplication */
+				   i4_0 = c_im(jp[0 * m]);
+				   wr = c_re(W[8 * l1]);	/* element k (k = 1..15) is first multiplied by W[k * l1] */
+				   wi = c_im(W[8 * l1]);
+				   tmpr = c_re(jp[8 * m]);
+				   tmpi = c_im(jp[8 * m]);
+				   r4_8 = ((wr * tmpr) - (wi * tmpi));	/* real part of (wr + i*wi) * (tmpr + i*tmpi) */
+				   i4_8 = ((wi * tmpr) + (wr * tmpi));	/* imaginary part of the same product */
+				   r3_0 = (r4_0 + r4_8);
+				   i3_0 = (i4_0 + i4_8);
+				   r3_8 = (r4_0 - r4_8);
+				   i3_8 = (i4_0 - i4_8);
+			      }
+			      {
+				   REAL r4_4, i4_4;
+				   REAL r4_12, i4_12;
+				   wr = c_re(W[4 * l1]);
+				   wi = c_im(W[4 * l1]);
+				   tmpr = c_re(jp[4 * m]);
+				   tmpi = c_im(jp[4 * m]);
+				   r4_4 = ((wr * tmpr) - (wi * tmpi));
+				   i4_4 = ((wi * tmpr) + (wr * tmpi));
+				   wr = c_re(W[12 * l1]);
+				   wi = c_im(W[12 * l1]);
+				   tmpr = c_re(jp[12 * m]);
+				   tmpi = c_im(jp[12 * m]);
+				   r4_12 = ((wr * tmpr) - (wi * tmpi));
+				   i4_12 = ((wi * tmpr) + (wr * tmpi));
+				   r3_4 = (r4_4 + r4_12);
+				   i3_4 = (i4_4 + i4_12);
+				   r3_12 = (r4_4 - r4_12);
+				   i3_12 = (i4_4 - i4_12);
+			      }
+			      r2_0 = (r3_0 + r3_4);
+			      i2_0 = (i3_0 + i3_4);
+			      r2_8 = (r3_0 - r3_4);
+			      i2_8 = (i3_0 - i3_4);
+			      r2_4 = (r3_8 + i3_12);	/* (r3_12 + i*i3_12) multiplied by -i, then added */
+			      i2_4 = (i3_8 - r3_12);
+			      r2_12 = (r3_8 - i3_12);
+			      i2_12 = (i3_8 + r3_12);
+			 }
+			 {
+			      REAL r3_2, i3_2;
+			      REAL r3_6, i3_6;
+			      REAL r3_10, i3_10;
+			      REAL r3_14, i3_14;
+			      {
+				   REAL r4_2, i4_2;
+				   REAL r4_10, i4_10;
+				   wr = c_re(W[2 * l1]);
+				   wi = c_im(W[2 * l1]);
+				   tmpr = c_re(jp[2 * m]);
+				   tmpi = c_im(jp[2 * m]);
+				   r4_2 = ((wr * tmpr) - (wi * tmpi));
+				   i4_2 = ((wi * tmpr) + (wr * tmpi));
+				   wr = c_re(W[10 * l1]);
+				   wi = c_im(W[10 * l1]);
+				   tmpr = c_re(jp[10 * m]);
+				   tmpi = c_im(jp[10 * m]);
+				   r4_10 = ((wr * tmpr) - (wi * tmpi));
+				   i4_10 = ((wi * tmpr) + (wr * tmpi));
+				   r3_2 = (r4_2 + r4_10);
+				   i3_2 = (i4_2 + i4_10);
+				   r3_10 = (r4_2 - r4_10);
+				   i3_10 = (i4_2 - i4_10);
+			      }
+			      {
+				   REAL r4_6, i4_6;
+				   REAL r4_14, i4_14;
+				   wr = c_re(W[6 * l1]);
+				   wi = c_im(W[6 * l1]);
+				   tmpr = c_re(jp[6 * m]);
+				   tmpi = c_im(jp[6 * m]);
+				   r4_6 = ((wr * tmpr) - (wi * tmpi));
+				   i4_6 = ((wi * tmpr) + (wr * tmpi));
+				   wr = c_re(W[14 * l1]);
+				   wi = c_im(W[14 * l1]);
+				   tmpr = c_re(jp[14 * m]);
+				   tmpi = c_im(jp[14 * m]);
+				   r4_14 = ((wr * tmpr) - (wi * tmpi));
+				   i4_14 = ((wi * tmpr) + (wr * tmpi));
+				   r3_6 = (r4_6 + r4_14);
+				   i3_6 = (i4_6 + i4_14);
+				   r3_14 = (r4_6 - r4_14);
+				   i3_14 = (i4_6 - i4_14);
+			      }
+			      r2_2 = (r3_2 + r3_6);
+			      i2_2 = (i3_2 + i3_6);
+			      r2_10 = (r3_2 - r3_6);
+			      i2_10 = (i3_2 - i3_6);
+			      r2_6 = (r3_10 + i3_14);
+			      i2_6 = (i3_10 - r3_14);
+			      r2_14 = (r3_10 - i3_14);
+			      i2_14 = (i3_10 + r3_14);
+			 }
+			 r1_0 = (r2_0 + r2_2);
+			 i1_0 = (i2_0 + i2_2);
+			 r1_8 = (r2_0 - r2_2);
+			 i1_8 = (i2_0 - i2_2);
+			 tmpr = (0.707106781187 * (r2_6 + i2_6));	/* 0.707106781187 ~= sqrt(2)/2; (tmpr, tmpi) = (r2_6 + i*i2_6) * (1 - i)/sqrt(2) */
+			 tmpi = (0.707106781187 * (i2_6 - r2_6));
+			 r1_2 = (r2_4 + tmpr);
+			 i1_2 = (i2_4 + tmpi);
+			 r1_10 = (r2_4 - tmpr);
+			 i1_10 = (i2_4 - tmpi);
+			 r1_4 = (r2_8 + i2_10);
+			 i1_4 = (i2_8 - r2_10);
+			 r1_12 = (r2_8 - i2_10);
+			 i1_12 = (i2_8 + r2_10);
+			 tmpr = (0.707106781187 * (i2_14 - r2_14));	/* note the sign pattern below: (tmpr, -tmpi) is what gets added */
+			 tmpi = (0.707106781187 * (r2_14 + i2_14));
+			 r1_6 = (r2_12 + tmpr);
+			 i1_6 = (i2_12 - tmpi);
+			 r1_14 = (r2_12 - tmpr);
+			 i1_14 = (i2_12 + tmpi);
+		    }
+		    {	/* second half: odd-numbered elements jp[1 * m], jp[3 * m], ..., jp[15 * m] */
+			 REAL r2_1, i2_1;
+			 REAL r2_3, i2_3;
+			 REAL r2_5, i2_5;
+			 REAL r2_7, i2_7;
+			 REAL r2_9, i2_9;
+			 REAL r2_11, i2_11;
+			 REAL r2_13, i2_13;
+			 REAL r2_15, i2_15;
+			 {
+			      REAL r3_1, i3_1;
+			      REAL r3_5, i3_5;
+			      REAL r3_9, i3_9;
+			      REAL r3_13, i3_13;
+			      {
+				   REAL r4_1, i4_1;
+				   REAL r4_9, i4_9;
+				   wr = c_re(W[1 * l1]);
+				   wi = c_im(W[1 * l1]);
+				   tmpr = c_re(jp[1 * m]);
+				   tmpi = c_im(jp[1 * m]);
+				   r4_1 = ((wr * tmpr) - (wi * tmpi));
+				   i4_1 = ((wi * tmpr) + (wr * tmpi));
+				   wr = c_re(W[9 * l1]);
+				   wi = c_im(W[9 * l1]);
+				   tmpr = c_re(jp[9 * m]);
+				   tmpi = c_im(jp[9 * m]);
+				   r4_9 = ((wr * tmpr) - (wi * tmpi));
+				   i4_9 = ((wi * tmpr) + (wr * tmpi));
+				   r3_1 = (r4_1 + r4_9);
+				   i3_1 = (i4_1 + i4_9);
+				   r3_9 = (r4_1 - r4_9);
+				   i3_9 = (i4_1 - i4_9);
+			      }
+			      {
+				   REAL r4_5, i4_5;
+				   REAL r4_13, i4_13;
+				   wr = c_re(W[5 * l1]);
+				   wi = c_im(W[5 * l1]);
+				   tmpr = c_re(jp[5 * m]);
+				   tmpi = c_im(jp[5 * m]);
+				   r4_5 = ((wr * tmpr) - (wi * tmpi));
+				   i4_5 = ((wi * tmpr) + (wr * tmpi));
+				   wr = c_re(W[13 * l1]);
+				   wi = c_im(W[13 * l1]);
+				   tmpr = c_re(jp[13 * m]);
+				   tmpi = c_im(jp[13 * m]);
+				   r4_13 = ((wr * tmpr) - (wi * tmpi));
+				   i4_13 = ((wi * tmpr) + (wr * tmpi));
+				   r3_5 = (r4_5 + r4_13);
+				   i3_5 = (i4_5 + i4_13);
+				   r3_13 = (r4_5 - r4_13);
+				   i3_13 = (i4_5 - i4_13);
+			      }
+			      r2_1 = (r3_1 + r3_5);
+			      i2_1 = (i3_1 + i3_5);
+			      r2_9 = (r3_1 - r3_5);
+			      i2_9 = (i3_1 - i3_5);
+			      r2_5 = (r3_9 + i3_13);
+			      i2_5 = (i3_9 - r3_13);
+			      r2_13 = (r3_9 - i3_13);
+			      i2_13 = (i3_9 + r3_13);
+			 }
+			 {
+			      REAL r3_3, i3_3;
+			      REAL r3_7, i3_7;
+			      REAL r3_11, i3_11;
+			      REAL r3_15, i3_15;
+			      {
+				   REAL r4_3, i4_3;
+				   REAL r4_11, i4_11;
+				   wr = c_re(W[3 * l1]);
+				   wi = c_im(W[3 * l1]);
+				   tmpr = c_re(jp[3 * m]);
+				   tmpi = c_im(jp[3 * m]);
+				   r4_3 = ((wr * tmpr) - (wi * tmpi));
+				   i4_3 = ((wi * tmpr) + (wr * tmpi));
+				   wr = c_re(W[11 * l1]);
+				   wi = c_im(W[11 * l1]);
+				   tmpr = c_re(jp[11 * m]);
+				   tmpi = c_im(jp[11 * m]);
+				   r4_11 = ((wr * tmpr) - (wi * tmpi));
+				   i4_11 = ((wi * tmpr) + (wr * tmpi));
+				   r3_3 = (r4_3 + r4_11);
+				   i3_3 = (i4_3 + i4_11);
+				   r3_11 = (r4_3 - r4_11);
+				   i3_11 = (i4_3 - i4_11);
+			      }
+			      {
+				   REAL r4_7, i4_7;
+				   REAL r4_15, i4_15;
+				   wr = c_re(W[7 * l1]);
+				   wi = c_im(W[7 * l1]);
+				   tmpr = c_re(jp[7 * m]);
+				   tmpi = c_im(jp[7 * m]);
+				   r4_7 = ((wr * tmpr) - (wi * tmpi));
+				   i4_7 = ((wi * tmpr) + (wr * tmpi));
+				   wr = c_re(W[15 * l1]);
+				   wi = c_im(W[15 * l1]);
+				   tmpr = c_re(jp[15 * m]);
+				   tmpi = c_im(jp[15 * m]);
+				   r4_15 = ((wr * tmpr) - (wi * tmpi));
+				   i4_15 = ((wi * tmpr) + (wr * tmpi));
+				   r3_7 = (r4_7 + r4_15);
+				   i3_7 = (i4_7 + i4_15);
+				   r3_15 = (r4_7 - r4_15);
+				   i3_15 = (i4_7 - i4_15);
+			      }
+			      r2_3 = (r3_3 + r3_7);
+			      i2_3 = (i3_3 + i3_7);
+			      r2_11 = (r3_3 - r3_7);
+			      i2_11 = (i3_3 - i3_7);
+			      r2_7 = (r3_11 + i3_15);
+			      i2_7 = (i3_11 - r3_15);
+			      r2_15 = (r3_11 - i3_15);
+			      i2_15 = (i3_11 + r3_15);
+			 }
+			 r1_1 = (r2_1 + r2_3);
+			 i1_1 = (i2_1 + i2_3);
+			 r1_9 = (r2_1 - r2_3);
+			 i1_9 = (i2_1 - i2_3);
+			 tmpr = (0.707106781187 * (r2_7 + i2_7));
+			 tmpi = (0.707106781187 * (i2_7 - r2_7));
+			 r1_3 = (r2_5 + tmpr);
+			 i1_3 = (i2_5 + tmpi);
+			 r1_11 = (r2_5 - tmpr);
+			 i1_11 = (i2_5 - tmpi);
+			 r1_5 = (r2_9 + i2_11);
+			 i1_5 = (i2_9 - r2_11);
+			 r1_13 = (r2_9 - i2_11);
+			 i1_13 = (i2_9 + r2_11);
+			 tmpr = (0.707106781187 * (i2_15 - r2_15));
+			 tmpi = (0.707106781187 * (r2_15 + i2_15));
+			 r1_7 = (r2_13 + tmpr);
+			 i1_7 = (i2_13 - tmpi);
+			 r1_15 = (r2_13 - tmpr);
+			 i1_15 = (i2_13 + tmpi);
+		    }
+		    c_re(kp[0 * m]) = (r1_0 + r1_1);	/* final merge: kp[k * m] / kp[(k + 8) * m] = even-half value k plus / minus rotated odd-half value k */
+		    c_im(kp[0 * m]) = (i1_0 + i1_1);
+		    c_re(kp[8 * m]) = (r1_0 - r1_1);
+		    c_im(kp[8 * m]) = (i1_0 - i1_1);
+		    tmpr = ((0.923879532511 * r1_3) + (0.382683432365 * i1_3));	/* 0.923879532511 ~= cos(pi/8), 0.382683432365 ~= sin(pi/8) */
+		    tmpi = ((0.923879532511 * i1_3) - (0.382683432365 * r1_3));
+		    c_re(kp[1 * m]) = (r1_2 + tmpr);
+		    c_im(kp[1 * m]) = (i1_2 + tmpi);
+		    c_re(kp[9 * m]) = (r1_2 - tmpr);
+		    c_im(kp[9 * m]) = (i1_2 - tmpi);
+		    tmpr = (0.707106781187 * (r1_5 + i1_5));
+		    tmpi = (0.707106781187 * (i1_5 - r1_5));
+		    c_re(kp[2 * m]) = (r1_4 + tmpr);
+		    c_im(kp[2 * m]) = (i1_4 + tmpi);
+		    c_re(kp[10 * m]) = (r1_4 - tmpr);
+		    c_im(kp[10 * m]) = (i1_4 - tmpi);
+		    tmpr = ((0.382683432365 * r1_7) + (0.923879532511 * i1_7));
+		    tmpi = ((0.382683432365 * i1_7) - (0.923879532511 * r1_7));
+		    c_re(kp[3 * m]) = (r1_6 + tmpr);
+		    c_im(kp[3 * m]) = (i1_6 + tmpi);
+		    c_re(kp[11 * m]) = (r1_6 - tmpr);
+		    c_im(kp[11 * m]) = (i1_6 - tmpi);
+		    c_re(kp[4 * m]) = (r1_8 + i1_9);
+		    c_im(kp[4 * m]) = (i1_8 - r1_9);
+		    c_re(kp[12 * m]) = (r1_8 - i1_9);
+		    c_im(kp[12 * m]) = (i1_8 + r1_9);
+		    tmpr = ((0.923879532511 * i1_11) - (0.382683432365 * r1_11));
+		    tmpi = ((0.923879532511 * r1_11) + (0.382683432365 * i1_11));
+		    c_re(kp[5 * m]) = (r1_10 + tmpr);
+		    c_im(kp[5 * m]) = (i1_10 - tmpi);
+		    c_re(kp[13 * m]) = (r1_10 - tmpr);
+		    c_im(kp[13 * m]) = (i1_10 + tmpi);
+		    tmpr = (0.707106781187 * (i1_13 - r1_13));
+		    tmpi = (0.707106781187 * (r1_13 + i1_13));
+		    c_re(kp[6 * m]) = (r1_12 + tmpr);
+		    c_im(kp[6 * m]) = (i1_12 - tmpi);
+		    c_re(kp[14 * m]) = (r1_12 - tmpr);
+		    c_im(kp[14 * m]) = (i1_12 + tmpi);
+		    tmpr = ((0.382683432365 * i1_15) - (0.923879532511 * r1_15));
+		    tmpi = ((0.382683432365 * r1_15) + (0.923879532511 * i1_15));
+		    c_re(kp[7 * m]) = (r1_14 + tmpr);
+		    c_im(kp[7 * m]) = (i1_14 - tmpi);
+		    c_re(kp[15 * m]) = (r1_14 - tmpr);
+		    c_im(kp[15 * m]) = (i1_14 + tmpi);
+	       }
+	  }
+     } else {	/* 128 indices or more: split at the midpoint, run each half as an untied task, then wait for both */
+	  int ab = (a + b) / 2;
+          #pragma omp task untied
+	  fft_twiddle_16(a, ab, in, out, W, nW, nWdn, m);
+          #pragma omp task untied
+	  fft_twiddle_16(ab, b, in, out, W, nW, nWdn, m);
+          #pragma omp taskwait
+     }
+}
+void fft_twiddle_16_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m)	/* sequential 16-point butterflies with twiddle factors for i in [a, b); nW is only forwarded to the recursive calls */
+{
+     int l1, i;	/* i: current index in [a, b); l1: nWdn * i, the base index into W for that i */
+     COMPLEX *jp, *kp;	/* jp = in + i (inputs read at jp[k * m]); kp = out + i (outputs written at kp[k * m]) */
+     REAL tmpr, tmpi, wr, wi;	/* (wr, wi): twiddle factor just loaded from W; (tmpr, tmpi): scratch operand */
+     if ((b - a) < 128) {	/* fewer than 128 indices: process them directly in this call */
+	  for (i = a, l1 = nWdn * i, kp = out + i; i < b;
+	       i++, l1 += nWdn, kp++) {
+	       jp = in + i;
+	       {
+		    REAL r1_0, i1_0;	/* naming: rN_k / iN_k = real / imaginary part of value k at nesting level N */
+		    REAL r1_1, i1_1;
+		    REAL r1_2, i1_2;
+		    REAL r1_3, i1_3;
+		    REAL r1_4, i1_4;
+		    REAL r1_5, i1_5;
+		    REAL r1_6, i1_6;
+		    REAL r1_7, i1_7;
+		    REAL r1_8, i1_8;
+		    REAL r1_9, i1_9;
+		    REAL r1_10, i1_10;
+		    REAL r1_11, i1_11;
+		    REAL r1_12, i1_12;
+		    REAL r1_13, i1_13;
+		    REAL r1_14, i1_14;
+		    REAL r1_15, i1_15;
+		    {	/* first half: even-numbered elements jp[0 * m], jp[2 * m], ..., jp[14 * m] */
+			 REAL r2_0, i2_0;
+			 REAL r2_2, i2_2;
+			 REAL r2_4, i2_4;
+			 REAL r2_6, i2_6;
+			 REAL r2_8, i2_8;
+			 REAL r2_10, i2_10;
+			 REAL r2_12, i2_12;
+			 REAL r2_14, i2_14;
+			 {
+			      REAL r3_0, i3_0;
+			      REAL r3_4, i3_4;
+			      REAL r3_8, i3_8;
+			      REAL r3_12, i3_12;
+			      {
+				   REAL r4_0, i4_0;
+				   REAL r4_8, i4_8;
+				   r4_0 = c_re(jp[0 * m]);	/* element 0 is used as is, without a twiddle multiplication */
+				   i4_0 = c_im(jp[0 * m]);
+				   wr = c_re(W[8 * l1]);	/* element k (k = 1..15) is first multiplied by W[k * l1] */
+				   wi = c_im(W[8 * l1]);
+				   tmpr = c_re(jp[8 * m]);
+				   tmpi = c_im(jp[8 * m]);
+				   r4_8 = ((wr * tmpr) - (wi * tmpi));	/* real part of (wr + i*wi) * (tmpr + i*tmpi) */
+				   i4_8 = ((wi * tmpr) + (wr * tmpi));	/* imaginary part of the same product */
+				   r3_0 = (r4_0 + r4_8);
+				   i3_0 = (i4_0 + i4_8);
+				   r3_8 = (r4_0 - r4_8);
+				   i3_8 = (i4_0 - i4_8);
+			      }
+			      {
+				   REAL r4_4, i4_4;
+				   REAL r4_12, i4_12;
+				   wr = c_re(W[4 * l1]);
+				   wi = c_im(W[4 * l1]);
+				   tmpr = c_re(jp[4 * m]);
+				   tmpi = c_im(jp[4 * m]);
+				   r4_4 = ((wr * tmpr) - (wi * tmpi));
+				   i4_4 = ((wi * tmpr) + (wr * tmpi));
+				   wr = c_re(W[12 * l1]);
+				   wi = c_im(W[12 * l1]);
+				   tmpr = c_re(jp[12 * m]);
+				   tmpi = c_im(jp[12 * m]);
+				   r4_12 = ((wr * tmpr) - (wi * tmpi));
+				   i4_12 = ((wi * tmpr) + (wr * tmpi));
+				   r3_4 = (r4_4 + r4_12);
+				   i3_4 = (i4_4 + i4_12);
+				   r3_12 = (r4_4 - r4_12);
+				   i3_12 = (i4_4 - i4_12);
+			      }
+			      r2_0 = (r3_0 + r3_4);
+			      i2_0 = (i3_0 + i3_4);
+			      r2_8 = (r3_0 - r3_4);
+			      i2_8 = (i3_0 - i3_4);
+			      r2_4 = (r3_8 + i3_12);	/* (r3_12 + i*i3_12) multiplied by -i, then added */
+			      i2_4 = (i3_8 - r3_12);
+			      r2_12 = (r3_8 - i3_12);
+			      i2_12 = (i3_8 + r3_12);
+			 }
+			 {
+			      REAL r3_2, i3_2;
+			      REAL r3_6, i3_6;
+			      REAL r3_10, i3_10;
+			      REAL r3_14, i3_14;
+			      {
+				   REAL r4_2, i4_2;
+				   REAL r4_10, i4_10;
+				   wr = c_re(W[2 * l1]);
+				   wi = c_im(W[2 * l1]);
+				   tmpr = c_re(jp[2 * m]);
+				   tmpi = c_im(jp[2 * m]);
+				   r4_2 = ((wr * tmpr) - (wi * tmpi));
+				   i4_2 = ((wi * tmpr) + (wr * tmpi));
+				   wr = c_re(W[10 * l1]);
+				   wi = c_im(W[10 * l1]);
+				   tmpr = c_re(jp[10 * m]);
+				   tmpi = c_im(jp[10 * m]);
+				   r4_10 = ((wr * tmpr) - (wi * tmpi));
+				   i4_10 = ((wi * tmpr) + (wr * tmpi));
+				   r3_2 = (r4_2 + r4_10);
+				   i3_2 = (i4_2 + i4_10);
+				   r3_10 = (r4_2 - r4_10);
+				   i3_10 = (i4_2 - i4_10);
+			      }
+			      {
+				   REAL r4_6, i4_6;
+				   REAL r4_14, i4_14;
+				   wr = c_re(W[6 * l1]);
+				   wi = c_im(W[6 * l1]);
+				   tmpr = c_re(jp[6 * m]);
+				   tmpi = c_im(jp[6 * m]);
+				   r4_6 = ((wr * tmpr) - (wi * tmpi));
+				   i4_6 = ((wi * tmpr) + (wr * tmpi));
+				   wr = c_re(W[14 * l1]);
+				   wi = c_im(W[14 * l1]);
+				   tmpr = c_re(jp[14 * m]);
+				   tmpi = c_im(jp[14 * m]);
+				   r4_14 = ((wr * tmpr) - (wi * tmpi));
+				   i4_14 = ((wi * tmpr) + (wr * tmpi));
+				   r3_6 = (r4_6 + r4_14);
+				   i3_6 = (i4_6 + i4_14);
+				   r3_14 = (r4_6 - r4_14);
+				   i3_14 = (i4_6 - i4_14);
+			      }
+			      r2_2 = (r3_2 + r3_6);
+			      i2_2 = (i3_2 + i3_6);
+			      r2_10 = (r3_2 - r3_6);
+			      i2_10 = (i3_2 - i3_6);
+			      r2_6 = (r3_10 + i3_14);
+			      i2_6 = (i3_10 - r3_14);
+			      r2_14 = (r3_10 - i3_14);
+			      i2_14 = (i3_10 + r3_14);
+			 }
+			 r1_0 = (r2_0 + r2_2);
+			 i1_0 = (i2_0 + i2_2);
+			 r1_8 = (r2_0 - r2_2);
+			 i1_8 = (i2_0 - i2_2);
+			 tmpr = (0.707106781187 * (r2_6 + i2_6));	/* 0.707106781187 ~= sqrt(2)/2; (tmpr, tmpi) = (r2_6 + i*i2_6) * (1 - i)/sqrt(2) */
+			 tmpi = (0.707106781187 * (i2_6 - r2_6));
+			 r1_2 = (r2_4 + tmpr);
+			 i1_2 = (i2_4 + tmpi);
+			 r1_10 = (r2_4 - tmpr);
+			 i1_10 = (i2_4 - tmpi);
+			 r1_4 = (r2_8 + i2_10);
+			 i1_4 = (i2_8 - r2_10);
+			 r1_12 = (r2_8 - i2_10);
+			 i1_12 = (i2_8 + r2_10);
+			 tmpr = (0.707106781187 * (i2_14 - r2_14));	/* note the sign pattern below: (tmpr, -tmpi) is what gets added */
+			 tmpi = (0.707106781187 * (r2_14 + i2_14));
+			 r1_6 = (r2_12 + tmpr);
+			 i1_6 = (i2_12 - tmpi);
+			 r1_14 = (r2_12 - tmpr);
+			 i1_14 = (i2_12 + tmpi);
+		    }
+		    {	/* second half: odd-numbered elements jp[1 * m], jp[3 * m], ..., jp[15 * m] */
+			 REAL r2_1, i2_1;
+			 REAL r2_3, i2_3;
+			 REAL r2_5, i2_5;
+			 REAL r2_7, i2_7;
+			 REAL r2_9, i2_9;
+			 REAL r2_11, i2_11;
+			 REAL r2_13, i2_13;
+			 REAL r2_15, i2_15;
+			 {
+			      REAL r3_1, i3_1;
+			      REAL r3_5, i3_5;
+			      REAL r3_9, i3_9;
+			      REAL r3_13, i3_13;
+			      {
+				   REAL r4_1, i4_1;
+				   REAL r4_9, i4_9;
+				   wr = c_re(W[1 * l1]);
+				   wi = c_im(W[1 * l1]);
+				   tmpr = c_re(jp[1 * m]);
+				   tmpi = c_im(jp[1 * m]);
+				   r4_1 = ((wr * tmpr) - (wi * tmpi));
+				   i4_1 = ((wi * tmpr) + (wr * tmpi));
+				   wr = c_re(W[9 * l1]);
+				   wi = c_im(W[9 * l1]);
+				   tmpr = c_re(jp[9 * m]);
+				   tmpi = c_im(jp[9 * m]);
+				   r4_9 = ((wr * tmpr) - (wi * tmpi));
+				   i4_9 = ((wi * tmpr) + (wr * tmpi));
+				   r3_1 = (r4_1 + r4_9);
+				   i3_1 = (i4_1 + i4_9);
+				   r3_9 = (r4_1 - r4_9);
+				   i3_9 = (i4_1 - i4_9);
+			      }
+			      {
+				   REAL r4_5, i4_5;
+				   REAL r4_13, i4_13;
+				   wr = c_re(W[5 * l1]);
+				   wi = c_im(W[5 * l1]);
+				   tmpr = c_re(jp[5 * m]);
+				   tmpi = c_im(jp[5 * m]);
+				   r4_5 = ((wr * tmpr) - (wi * tmpi));
+				   i4_5 = ((wi * tmpr) + (wr * tmpi));
+				   wr = c_re(W[13 * l1]);
+				   wi = c_im(W[13 * l1]);
+				   tmpr = c_re(jp[13 * m]);
+				   tmpi = c_im(jp[13 * m]);
+				   r4_13 = ((wr * tmpr) - (wi * tmpi));
+				   i4_13 = ((wi * tmpr) + (wr * tmpi));
+				   r3_5 = (r4_5 + r4_13);
+				   i3_5 = (i4_5 + i4_13);
+				   r3_13 = (r4_5 - r4_13);
+				   i3_13 = (i4_5 - i4_13);
+			      }
+			      r2_1 = (r3_1 + r3_5);
+			      i2_1 = (i3_1 + i3_5);
+			      r2_9 = (r3_1 - r3_5);
+			      i2_9 = (i3_1 - i3_5);
+			      r2_5 = (r3_9 + i3_13);
+			      i2_5 = (i3_9 - r3_13);
+			      r2_13 = (r3_9 - i3_13);
+			      i2_13 = (i3_9 + r3_13);
+			 }
+			 {
+			      REAL r3_3, i3_3;
+			      REAL r3_7, i3_7;
+			      REAL r3_11, i3_11;
+			      REAL r3_15, i3_15;
+			      {
+				   REAL r4_3, i4_3;
+				   REAL r4_11, i4_11;
+				   wr = c_re(W[3 * l1]);
+				   wi = c_im(W[3 * l1]);
+				   tmpr = c_re(jp[3 * m]);
+				   tmpi = c_im(jp[3 * m]);
+				   r4_3 = ((wr * tmpr) - (wi * tmpi));
+				   i4_3 = ((wi * tmpr) + (wr * tmpi));
+				   wr = c_re(W[11 * l1]);
+				   wi = c_im(W[11 * l1]);
+				   tmpr = c_re(jp[11 * m]);
+				   tmpi = c_im(jp[11 * m]);
+				   r4_11 = ((wr * tmpr) - (wi * tmpi));
+				   i4_11 = ((wi * tmpr) + (wr * tmpi));
+				   r3_3 = (r4_3 + r4_11);
+				   i3_3 = (i4_3 + i4_11);
+				   r3_11 = (r4_3 - r4_11);
+				   i3_11 = (i4_3 - i4_11);
+			      }
+			      {
+				   REAL r4_7, i4_7;
+				   REAL r4_15, i4_15;
+				   wr = c_re(W[7 * l1]);
+				   wi = c_im(W[7 * l1]);
+				   tmpr = c_re(jp[7 * m]);
+				   tmpi = c_im(jp[7 * m]);
+				   r4_7 = ((wr * tmpr) - (wi * tmpi));
+				   i4_7 = ((wi * tmpr) + (wr * tmpi));
+				   wr = c_re(W[15 * l1]);
+				   wi = c_im(W[15 * l1]);
+				   tmpr = c_re(jp[15 * m]);
+				   tmpi = c_im(jp[15 * m]);
+				   r4_15 = ((wr * tmpr) - (wi * tmpi));
+				   i4_15 = ((wi * tmpr) + (wr * tmpi));
+				   r3_7 = (r4_7 + r4_15);
+				   i3_7 = (i4_7 + i4_15);
+				   r3_15 = (r4_7 - r4_15);
+				   i3_15 = (i4_7 - i4_15);
+			      }
+			      r2_3 = (r3_3 + r3_7);
+			      i2_3 = (i3_3 + i3_7);
+			      r2_11 = (r3_3 - r3_7);
+			      i2_11 = (i3_3 - i3_7);
+			      r2_7 = (r3_11 + i3_15);
+			      i2_7 = (i3_11 - r3_15);
+			      r2_15 = (r3_11 - i3_15);
+			      i2_15 = (i3_11 + r3_15);
+			 }
+			 r1_1 = (r2_1 + r2_3);
+			 i1_1 = (i2_1 + i2_3);
+			 r1_9 = (r2_1 - r2_3);
+			 i1_9 = (i2_1 - i2_3);
+			 tmpr = (0.707106781187 * (r2_7 + i2_7));
+			 tmpi = (0.707106781187 * (i2_7 - r2_7));
+			 r1_3 = (r2_5 + tmpr);
+			 i1_3 = (i2_5 + tmpi);
+			 r1_11 = (r2_5 - tmpr);
+			 i1_11 = (i2_5 - tmpi);
+			 r1_5 = (r2_9 + i2_11);
+			 i1_5 = (i2_9 - r2_11);
+			 r1_13 = (r2_9 - i2_11);
+			 i1_13 = (i2_9 + r2_11);
+			 tmpr = (0.707106781187 * (i2_15 - r2_15));
+			 tmpi = (0.707106781187 * (r2_15 + i2_15));
+			 r1_7 = (r2_13 + tmpr);
+			 i1_7 = (i2_13 - tmpi);
+			 r1_15 = (r2_13 - tmpr);
+			 i1_15 = (i2_13 + tmpi);
+		    }
+		    c_re(kp[0 * m]) = (r1_0 + r1_1);	/* final merge: kp[k * m] / kp[(k + 8) * m] = even-half value k plus / minus rotated odd-half value k */
+		    c_im(kp[0 * m]) = (i1_0 + i1_1);
+		    c_re(kp[8 * m]) = (r1_0 - r1_1);
+		    c_im(kp[8 * m]) = (i1_0 - i1_1);
+		    tmpr = ((0.923879532511 * r1_3) + (0.382683432365 * i1_3));	/* 0.923879532511 ~= cos(pi/8), 0.382683432365 ~= sin(pi/8) */
+		    tmpi = ((0.923879532511 * i1_3) - (0.382683432365 * r1_3));
+		    c_re(kp[1 * m]) = (r1_2 + tmpr);
+		    c_im(kp[1 * m]) = (i1_2 + tmpi);
+		    c_re(kp[9 * m]) = (r1_2 - tmpr);
+		    c_im(kp[9 * m]) = (i1_2 - tmpi);
+		    tmpr = (0.707106781187 * (r1_5 + i1_5));
+		    tmpi = (0.707106781187 * (i1_5 - r1_5));
+		    c_re(kp[2 * m]) = (r1_4 + tmpr);
+		    c_im(kp[2 * m]) = (i1_4 + tmpi);
+		    c_re(kp[10 * m]) = (r1_4 - tmpr);
+		    c_im(kp[10 * m]) = (i1_4 - tmpi);
+		    tmpr = ((0.382683432365 * r1_7) + (0.923879532511 * i1_7));
+		    tmpi = ((0.382683432365 * i1_7) - (0.923879532511 * r1_7));
+		    c_re(kp[3 * m]) = (r1_6 + tmpr);
+		    c_im(kp[3 * m]) = (i1_6 + tmpi);
+		    c_re(kp[11 * m]) = (r1_6 - tmpr);
+		    c_im(kp[11 * m]) = (i1_6 - tmpi);
+		    c_re(kp[4 * m]) = (r1_8 + i1_9);
+		    c_im(kp[4 * m]) = (i1_8 - r1_9);
+		    c_re(kp[12 * m]) = (r1_8 - i1_9);
+		    c_im(kp[12 * m]) = (i1_8 + r1_9);
+		    tmpr = ((0.923879532511 * i1_11) - (0.382683432365 * r1_11));
+		    tmpi = ((0.923879532511 * r1_11) + (0.382683432365 * i1_11));
+		    c_re(kp[5 * m]) = (r1_10 + tmpr);
+		    c_im(kp[5 * m]) = (i1_10 - tmpi);
+		    c_re(kp[13 * m]) = (r1_10 - tmpr);
+		    c_im(kp[13 * m]) = (i1_10 + tmpi);
+		    tmpr = (0.707106781187 * (i1_13 - r1_13));
+		    tmpi = (0.707106781187 * (r1_13 + i1_13));
+		    c_re(kp[6 * m]) = (r1_12 + tmpr);
+		    c_im(kp[6 * m]) = (i1_12 - tmpi);
+		    c_re(kp[14 * m]) = (r1_12 - tmpr);
+		    c_im(kp[14 * m]) = (i1_12 + tmpi);
+		    tmpr = ((0.382683432365 * i1_15) - (0.923879532511 * r1_15));
+		    tmpi = ((0.382683432365 * r1_15) + (0.923879532511 * i1_15));
+		    c_re(kp[7 * m]) = (r1_14 + tmpr);
+		    c_im(kp[7 * m]) = (i1_14 - tmpi);
+		    c_re(kp[15 * m]) = (r1_14 - tmpr);
+		    c_im(kp[15 * m]) = (i1_14 + tmpi);
+	       }
+	  }
+     } else {	/* 128 indices or more: split at the midpoint and process the halves one after the other */
+	  int ab = (a + b) / 2;
+	  fft_twiddle_16_seq(a, ab, in, out, W, nW, nWdn, m);
+	  fft_twiddle_16_seq(ab, b, in, out, W, nW, nWdn, m);
+     }
+}
+/*
+ * fft_unshuffle_16: parallel scatter of 16-element groups.
+ *
+ * For every row i in [a, b) and every k in [0, 16) this performs
+ *     out[i + k * m] = in[16 * i + k]
+ * i.e. 16 consecutive elements of `in` are spread over `out` with stride m.
+ *
+ *   a, b  half-open range of rows handled by this call
+ *   in    source array, read starting at in[16 * a]
+ *   out   destination array, written at out[i + k * m]
+ *   m     distance, in elements, between consecutive outputs of one row
+ *
+ * A range of 128 rows or more is cut in half; each half is run as an untied
+ * OpenMP task and the taskwait blocks until both have finished.
+ *
+ * Returns nothing; `out` is modified in place.
+ */
+void fft_unshuffle_16(int a, int b, COMPLEX * in, COMPLEX * out, int m)
+{
+     int row, pair;
+     const COMPLEX *src;
+     COMPLEX *dst;
+
+     if ((b - a) >= 128) {
+	  /* Large range: recurse on both halves in parallel. */
+	  int mid = (a + b) / 2;
+          #pragma omp task untied
+	  fft_unshuffle_16(a, mid, in, out, m);
+          #pragma omp task untied
+	  fft_unshuffle_16(mid, b, in, out, m);
+          #pragma omp taskwait
+	  return;
+     }
+
+     /* Small range: copy directly; src walks `in` sequentially. */
+     src = in + a * 16;
+     for (row = a; row < b; ++row) {
+	  dst = out + row;
+	  for (pair = 0; pair < 8; ++pair) {
+	       /* Step to the next pair of output slots (not before the
+		* first pair, and never past the last one written). */
+	       if (pair > 0)
+		    dst += 2 * m;
+	       /* Two consecutive inputs land m elements apart. */
+	       dst[0] = src[0];
+	       dst[m] = src[1];
+	       src += 2;
+	  }
+     }
+}
+/*
+ * fft_unshuffle_16_seq: sequential scatter of 16-element groups.
+ *
+ * For every row i in [a, b) and every k in [0, 16) this performs
+ *     out[i + k * m] = in[16 * i + k]
+ * i.e. 16 consecutive elements of `in` are spread over `out` with stride m.
+ *
+ *   a, b  half-open range of rows handled by this call
+ *   in    source array, read starting at in[16 * a]
+ *   out   destination array, written at out[i + k * m]
+ *   m     distance, in elements, between consecutive outputs of one row
+ *
+ * A range of 128 rows or more is cut in half and the halves are processed
+ * one after the other by plain recursion; no tasks are created.
+ *
+ * Returns nothing; `out` is modified in place.
+ */
+void fft_unshuffle_16_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m)
+{
+     int row, pair;
+     const COMPLEX *src;
+     COMPLEX *dst;
+
+     if ((b - a) >= 128) {
+	  /* Large range: handle the lower half, then the upper half. */
+	  int mid = (a + b) / 2;
+	  fft_unshuffle_16_seq(a, mid, in, out, m);
+	  fft_unshuffle_16_seq(mid, b, in, out, m);
+	  return;
+     }
+
+     /* Small range: copy directly; src walks `in` sequentially. */
+     src = in + a * 16;
+     for (row = a; row < b; ++row) {
+	  dst = out + row;
+	  for (pair = 0; pair < 8; ++pair) {
+	       /* Step to the next pair of output slots (not before the
+		* first pair, and never past the last one written). */
+	       if (pair > 0)
+		    dst += 2 * m;
+	       /* Two consecutive inputs land m elements apart. */
+	       dst[0] = src[0];
+	       dst[m] = src[1];
+	       src += 2;
+	  }
+     }
+}
+void fft_base_32(COMPLEX * in, COMPLEX * out)  /* fully unrolled radix-2 32-point DFT: reads in[0..31], writes out[0..31], no scaling applied */
+{  /* every in[] read sits in a depth-5 block below; out[] is written only in the final stage at the end of the body */
+     REAL tmpr, tmpi;  /* scratch: real/imag parts of the operand after it has been rotated by a twiddle factor */
+     {
+	  REAL r1_0, i1_0;  /* naming: rS_k / iS_k = real / imag part of value k at nesting depth S; depth-5 values are loaded directly from in[k] */
+	  REAL r1_1, i1_1;
+	  REAL r1_2, i1_2;
+	  REAL r1_3, i1_3;
+	  REAL r1_4, i1_4;
+	  REAL r1_5, i1_5;
+	  REAL r1_6, i1_6;
+	  REAL r1_7, i1_7;
+	  REAL r1_8, i1_8;
+	  REAL r1_9, i1_9;
+	  REAL r1_10, i1_10;
+	  REAL r1_11, i1_11;
+	  REAL r1_12, i1_12;
+	  REAL r1_13, i1_13;
+	  REAL r1_14, i1_14;
+	  REAL r1_15, i1_15;
+	  REAL r1_16, i1_16;
+	  REAL r1_17, i1_17;
+	  REAL r1_18, i1_18;
+	  REAL r1_19, i1_19;
+	  REAL r1_20, i1_20;
+	  REAL r1_21, i1_21;
+	  REAL r1_22, i1_22;
+	  REAL r1_23, i1_23;
+	  REAL r1_24, i1_24;
+	  REAL r1_25, i1_25;
+	  REAL r1_26, i1_26;
+	  REAL r1_27, i1_27;
+	  REAL r1_28, i1_28;
+	  REAL r1_29, i1_29;
+	  REAL r1_30, i1_30;
+	  REAL r1_31, i1_31;
+	  {  /* even half: in[0], in[2], ..., in[30] -> r1_0, r1_2, ..., r1_30 */
+	       REAL r2_0, i2_0;
+	       REAL r2_2, i2_2;
+	       REAL r2_4, i2_4;
+	       REAL r2_6, i2_6;
+	       REAL r2_8, i2_8;
+	       REAL r2_10, i2_10;
+	       REAL r2_12, i2_12;
+	       REAL r2_14, i2_14;
+	       REAL r2_16, i2_16;
+	       REAL r2_18, i2_18;
+	       REAL r2_20, i2_20;
+	       REAL r2_22, i2_22;
+	       REAL r2_24, i2_24;
+	       REAL r2_26, i2_26;
+	       REAL r2_28, i2_28;
+	       REAL r2_30, i2_30;
+	       {  /* in[0], in[4], ..., in[28] -> r2_0, r2_4, ..., r2_28 */
+		    REAL r3_0, i3_0;
+		    REAL r3_4, i3_4;
+		    REAL r3_8, i3_8;
+		    REAL r3_12, i3_12;
+		    REAL r3_16, i3_16;
+		    REAL r3_20, i3_20;
+		    REAL r3_24, i3_24;
+		    REAL r3_28, i3_28;
+		    {  /* in[0], in[8], in[16], in[24] -> r3_0, r3_8, r3_16, r3_24 */
+			 REAL r4_0, i4_0;
+			 REAL r4_8, i4_8;
+			 REAL r4_16, i4_16;
+			 REAL r4_24, i4_24;
+			 {  /* sum/difference butterfly on in[0] and in[16] */
+			      REAL r5_0, i5_0;
+			      REAL r5_16, i5_16;
+			      r5_0 = c_re(in[0]);
+			      i5_0 = c_im(in[0]);
+			      r5_16 = c_re(in[16]);
+			      i5_16 = c_im(in[16]);
+			      r4_0 = (r5_0 + r5_16);
+			      i4_0 = (i5_0 + i5_16);
+			      r4_16 = (r5_0 - r5_16);
+			      i4_16 = (i5_0 - i5_16);
+			 }
+			 {  /* sum/difference butterfly on in[8] and in[24] */
+			      REAL r5_8, i5_8;
+			      REAL r5_24, i5_24;
+			      r5_8 = c_re(in[8]);
+			      i5_8 = c_im(in[8]);
+			      r5_24 = c_re(in[24]);
+			      i5_24 = c_im(in[24]);
+			      r4_8 = (r5_8 + r5_24);
+			      i4_8 = (i5_8 + i5_24);
+			      r4_24 = (r5_8 - r5_24);
+			      i4_24 = (i5_8 - i5_24);
+			 }
+			 r3_0 = (r4_0 + r4_8);
+			 i3_0 = (i4_0 + i4_8);
+			 r3_16 = (r4_0 - r4_8);
+			 i3_16 = (i4_0 - i4_8);
+			 r3_8 = (r4_16 + i4_24);  /* with the next line: x3_8 = x4_16 - i * x4_24 (re/im swapped instead of multiplying) */
+			 i3_8 = (i4_16 - r4_24);
+			 r3_24 = (r4_16 - i4_24);  /* with the next line: x3_24 = x4_16 + i * x4_24 */
+			 i3_24 = (i4_16 + r4_24);
+		    }
+		    {  /* in[4], in[12], in[20], in[28] -> r3_4, r3_12, r3_20, r3_28 */
+			 REAL r4_4, i4_4;
+			 REAL r4_12, i4_12;
+			 REAL r4_20, i4_20;
+			 REAL r4_28, i4_28;
+			 {
+			      REAL r5_4, i5_4;
+			      REAL r5_20, i5_20;
+			      r5_4 = c_re(in[4]);
+			      i5_4 = c_im(in[4]);
+			      r5_20 = c_re(in[20]);
+			      i5_20 = c_im(in[20]);
+			      r4_4 = (r5_4 + r5_20);
+			      i4_4 = (i5_4 + i5_20);
+			      r4_20 = (r5_4 - r5_20);
+			      i4_20 = (i5_4 - i5_20);
+			 }
+			 {
+			      REAL r5_12, i5_12;
+			      REAL r5_28, i5_28;
+			      r5_12 = c_re(in[12]);
+			      i5_12 = c_im(in[12]);
+			      r5_28 = c_re(in[28]);
+			      i5_28 = c_im(in[28]);
+			      r4_12 = (r5_12 + r5_28);
+			      i4_12 = (i5_12 + i5_28);
+			      r4_28 = (r5_12 - r5_28);
+			      i4_28 = (i5_12 - i5_28);
+			 }
+			 r3_4 = (r4_4 + r4_12);
+			 i3_4 = (i4_4 + i4_12);
+			 r3_20 = (r4_4 - r4_12);
+			 i3_20 = (i4_4 - i4_12);
+			 r3_12 = (r4_20 + i4_28);
+			 i3_12 = (i4_20 - r4_28);
+			 r3_28 = (r4_20 - i4_28);
+			 i3_28 = (i4_20 + r4_28);
+		    }
+		    r2_0 = (r3_0 + r3_4);
+		    i2_0 = (i3_0 + i3_4);
+		    r2_16 = (r3_0 - r3_4);
+		    i2_16 = (i3_0 - i3_4);
+		    tmpr = (0.707106781187 * (r3_12 + i3_12));  /* 0.707106781187 ~ sqrt(2)/2 = cos(pi/4) = sin(pi/4); (tmpr, tmpi) = x3_12 * (1 - i) * sqrt(2)/2 */
+		    tmpi = (0.707106781187 * (i3_12 - r3_12));
+		    r2_4 = (r3_8 + tmpr);
+		    i2_4 = (i3_8 + tmpi);
+		    r2_20 = (r3_8 - tmpr);
+		    i2_20 = (i3_8 - tmpi);
+		    r2_8 = (r3_16 + i3_20);
+		    i2_8 = (i3_16 - r3_20);
+		    r2_24 = (r3_16 - i3_20);
+		    i2_24 = (i3_16 + r3_20);
+		    tmpr = (0.707106781187 * (i3_28 - r3_28));  /* (tmpr, -tmpi) = x3_28 * (-1 - i) * sqrt(2)/2; note tmpi is subtracted/added below with flipped sign */
+		    tmpi = (0.707106781187 * (r3_28 + i3_28));
+		    r2_12 = (r3_24 + tmpr);
+		    i2_12 = (i3_24 - tmpi);
+		    r2_28 = (r3_24 - tmpr);
+		    i2_28 = (i3_24 + tmpi);
+	       }
+	       {  /* in[2], in[6], ..., in[30] -> r2_2, r2_6, ..., r2_30 */
+		    REAL r3_2, i3_2;
+		    REAL r3_6, i3_6;
+		    REAL r3_10, i3_10;
+		    REAL r3_14, i3_14;
+		    REAL r3_18, i3_18;
+		    REAL r3_22, i3_22;
+		    REAL r3_26, i3_26;
+		    REAL r3_30, i3_30;
+		    {  /* in[2], in[10], in[18], in[26] -> r3_2, r3_10, r3_18, r3_26 */
+			 REAL r4_2, i4_2;
+			 REAL r4_10, i4_10;
+			 REAL r4_18, i4_18;
+			 REAL r4_26, i4_26;
+			 {
+			      REAL r5_2, i5_2;
+			      REAL r5_18, i5_18;
+			      r5_2 = c_re(in[2]);
+			      i5_2 = c_im(in[2]);
+			      r5_18 = c_re(in[18]);
+			      i5_18 = c_im(in[18]);
+			      r4_2 = (r5_2 + r5_18);
+			      i4_2 = (i5_2 + i5_18);
+			      r4_18 = (r5_2 - r5_18);
+			      i4_18 = (i5_2 - i5_18);
+			 }
+			 {
+			      REAL r5_10, i5_10;
+			      REAL r5_26, i5_26;
+			      r5_10 = c_re(in[10]);
+			      i5_10 = c_im(in[10]);
+			      r5_26 = c_re(in[26]);
+			      i5_26 = c_im(in[26]);
+			      r4_10 = (r5_10 + r5_26);
+			      i4_10 = (i5_10 + i5_26);
+			      r4_26 = (r5_10 - r5_26);
+			      i4_26 = (i5_10 - i5_26);
+			 }
+			 r3_2 = (r4_2 + r4_10);
+			 i3_2 = (i4_2 + i4_10);
+			 r3_18 = (r4_2 - r4_10);
+			 i3_18 = (i4_2 - i4_10);
+			 r3_10 = (r4_18 + i4_26);
+			 i3_10 = (i4_18 - r4_26);
+			 r3_26 = (r4_18 - i4_26);
+			 i3_26 = (i4_18 + r4_26);
+		    }
+		    {  /* in[6], in[14], in[22], in[30] -> r3_6, r3_14, r3_22, r3_30 */
+			 REAL r4_6, i4_6;
+			 REAL r4_14, i4_14;
+			 REAL r4_22, i4_22;
+			 REAL r4_30, i4_30;
+			 {
+			      REAL r5_6, i5_6;
+			      REAL r5_22, i5_22;
+			      r5_6 = c_re(in[6]);
+			      i5_6 = c_im(in[6]);
+			      r5_22 = c_re(in[22]);
+			      i5_22 = c_im(in[22]);
+			      r4_6 = (r5_6 + r5_22);
+			      i4_6 = (i5_6 + i5_22);
+			      r4_22 = (r5_6 - r5_22);
+			      i4_22 = (i5_6 - i5_22);
+			 }
+			 {
+			      REAL r5_14, i5_14;
+			      REAL r5_30, i5_30;
+			      r5_14 = c_re(in[14]);
+			      i5_14 = c_im(in[14]);
+			      r5_30 = c_re(in[30]);
+			      i5_30 = c_im(in[30]);
+			      r4_14 = (r5_14 + r5_30);
+			      i4_14 = (i5_14 + i5_30);
+			      r4_30 = (r5_14 - r5_30);
+			      i4_30 = (i5_14 - i5_30);
+			 }
+			 r3_6 = (r4_6 + r4_14);
+			 i3_6 = (i4_6 + i4_14);
+			 r3_22 = (r4_6 - r4_14);
+			 i3_22 = (i4_6 - i4_14);
+			 r3_14 = (r4_22 + i4_30);
+			 i3_14 = (i4_22 - r4_30);
+			 r3_30 = (r4_22 - i4_30);
+			 i3_30 = (i4_22 + r4_30);
+		    }
+		    r2_2 = (r3_2 + r3_6);
+		    i2_2 = (i3_2 + i3_6);
+		    r2_18 = (r3_2 - r3_6);
+		    i2_18 = (i3_2 - i3_6);
+		    tmpr = (0.707106781187 * (r3_14 + i3_14));
+		    tmpi = (0.707106781187 * (i3_14 - r3_14));
+		    r2_6 = (r3_10 + tmpr);
+		    i2_6 = (i3_10 + tmpi);
+		    r2_22 = (r3_10 - tmpr);
+		    i2_22 = (i3_10 - tmpi);
+		    r2_10 = (r3_18 + i3_22);
+		    i2_10 = (i3_18 - r3_22);
+		    r2_26 = (r3_18 - i3_22);
+		    i2_26 = (i3_18 + r3_22);
+		    tmpr = (0.707106781187 * (i3_30 - r3_30));
+		    tmpi = (0.707106781187 * (r3_30 + i3_30));
+		    r2_14 = (r3_26 + tmpr);
+		    i2_14 = (i3_26 - tmpi);
+		    r2_30 = (r3_26 - tmpr);
+		    i2_30 = (i3_26 + tmpi);
+	       }
+	       r1_0 = (r2_0 + r2_2);  /* combine the two depth-2 quarters into the even half; r2_(4k+2) is rotated before being added to / subtracted from r2_(4k) */
+	       i1_0 = (i2_0 + i2_2);
+	       r1_16 = (r2_0 - r2_2);
+	       i1_16 = (i2_0 - i2_2);
+	       tmpr = ((0.923879532511 * r2_6) + (0.382683432365 * i2_6));  /* 0.923879532511 ~ cos(pi/8), 0.382683432365 ~ sin(pi/8) */
+	       tmpi = ((0.923879532511 * i2_6) - (0.382683432365 * r2_6));
+	       r1_2 = (r2_4 + tmpr);
+	       i1_2 = (i2_4 + tmpi);
+	       r1_18 = (r2_4 - tmpr);
+	       i1_18 = (i2_4 - tmpi);
+	       tmpr = (0.707106781187 * (r2_10 + i2_10));
+	       tmpi = (0.707106781187 * (i2_10 - r2_10));
+	       r1_4 = (r2_8 + tmpr);
+	       i1_4 = (i2_8 + tmpi);
+	       r1_20 = (r2_8 - tmpr);
+	       i1_20 = (i2_8 - tmpi);
+	       tmpr = ((0.382683432365 * r2_14) + (0.923879532511 * i2_14));
+	       tmpi = ((0.382683432365 * i2_14) - (0.923879532511 * r2_14));
+	       r1_6 = (r2_12 + tmpr);
+	       i1_6 = (i2_12 + tmpi);
+	       r1_22 = (r2_12 - tmpr);
+	       i1_22 = (i2_12 - tmpi);
+	       r1_8 = (r2_16 + i2_18);  /* rotation by exactly -i: re/im swapped, no multiply */
+	       i1_8 = (i2_16 - r2_18);
+	       r1_24 = (r2_16 - i2_18);
+	       i1_24 = (i2_16 + r2_18);
+	       tmpr = ((0.923879532511 * i2_22) - (0.382683432365 * r2_22));  /* from here on tmpi carries the opposite sign: it is subtracted for the first output and added for the second */
+	       tmpi = ((0.923879532511 * r2_22) + (0.382683432365 * i2_22));
+	       r1_10 = (r2_20 + tmpr);
+	       i1_10 = (i2_20 - tmpi);
+	       r1_26 = (r2_20 - tmpr);
+	       i1_26 = (i2_20 + tmpi);
+	       tmpr = (0.707106781187 * (i2_26 - r2_26));
+	       tmpi = (0.707106781187 * (r2_26 + i2_26));
+	       r1_12 = (r2_24 + tmpr);
+	       i1_12 = (i2_24 - tmpi);
+	       r1_28 = (r2_24 - tmpr);
+	       i1_28 = (i2_24 + tmpi);
+	       tmpr = ((0.382683432365 * i2_30) - (0.923879532511 * r2_30));
+	       tmpi = ((0.382683432365 * r2_30) + (0.923879532511 * i2_30));
+	       r1_14 = (r2_28 + tmpr);
+	       i1_14 = (i2_28 - tmpi);
+	       r1_30 = (r2_28 - tmpr);
+	       i1_30 = (i2_28 + tmpi);
+	  }
+	  {  /* odd half: in[1], in[3], ..., in[31] -> r1_1, r1_3, ..., r1_31; same arithmetic pattern as the even half with every index shifted by one */
+	       REAL r2_1, i2_1;
+	       REAL r2_3, i2_3;
+	       REAL r2_5, i2_5;
+	       REAL r2_7, i2_7;
+	       REAL r2_9, i2_9;
+	       REAL r2_11, i2_11;
+	       REAL r2_13, i2_13;
+	       REAL r2_15, i2_15;
+	       REAL r2_17, i2_17;
+	       REAL r2_19, i2_19;
+	       REAL r2_21, i2_21;
+	       REAL r2_23, i2_23;
+	       REAL r2_25, i2_25;
+	       REAL r2_27, i2_27;
+	       REAL r2_29, i2_29;
+	       REAL r2_31, i2_31;
+	       {  /* in[1], in[5], ..., in[29] -> r2_1, r2_5, ..., r2_29 */
+		    REAL r3_1, i3_1;
+		    REAL r3_5, i3_5;
+		    REAL r3_9, i3_9;
+		    REAL r3_13, i3_13;
+		    REAL r3_17, i3_17;
+		    REAL r3_21, i3_21;
+		    REAL r3_25, i3_25;
+		    REAL r3_29, i3_29;
+		    {  /* in[1], in[9], in[17], in[25] -> r3_1, r3_9, r3_17, r3_25 */
+			 REAL r4_1, i4_1;
+			 REAL r4_9, i4_9;
+			 REAL r4_17, i4_17;
+			 REAL r4_25, i4_25;
+			 {
+			      REAL r5_1, i5_1;
+			      REAL r5_17, i5_17;
+			      r5_1 = c_re(in[1]);
+			      i5_1 = c_im(in[1]);
+			      r5_17 = c_re(in[17]);
+			      i5_17 = c_im(in[17]);
+			      r4_1 = (r5_1 + r5_17);
+			      i4_1 = (i5_1 + i5_17);
+			      r4_17 = (r5_1 - r5_17);
+			      i4_17 = (i5_1 - i5_17);
+			 }
+			 {
+			      REAL r5_9, i5_9;
+			      REAL r5_25, i5_25;
+			      r5_9 = c_re(in[9]);
+			      i5_9 = c_im(in[9]);
+			      r5_25 = c_re(in[25]);
+			      i5_25 = c_im(in[25]);
+			      r4_9 = (r5_9 + r5_25);
+			      i4_9 = (i5_9 + i5_25);
+			      r4_25 = (r5_9 - r5_25);
+			      i4_25 = (i5_9 - i5_25);
+			 }
+			 r3_1 = (r4_1 + r4_9);
+			 i3_1 = (i4_1 + i4_9);
+			 r3_17 = (r4_1 - r4_9);
+			 i3_17 = (i4_1 - i4_9);
+			 r3_9 = (r4_17 + i4_25);
+			 i3_9 = (i4_17 - r4_25);
+			 r3_25 = (r4_17 - i4_25);
+			 i3_25 = (i4_17 + r4_25);
+		    }
+		    {  /* in[5], in[13], in[21], in[29] -> r3_5, r3_13, r3_21, r3_29 */
+			 REAL r4_5, i4_5;
+			 REAL r4_13, i4_13;
+			 REAL r4_21, i4_21;
+			 REAL r4_29, i4_29;
+			 {
+			      REAL r5_5, i5_5;
+			      REAL r5_21, i5_21;
+			      r5_5 = c_re(in[5]);
+			      i5_5 = c_im(in[5]);
+			      r5_21 = c_re(in[21]);
+			      i5_21 = c_im(in[21]);
+			      r4_5 = (r5_5 + r5_21);
+			      i4_5 = (i5_5 + i5_21);
+			      r4_21 = (r5_5 - r5_21);
+			      i4_21 = (i5_5 - i5_21);
+			 }
+			 {
+			      REAL r5_13, i5_13;
+			      REAL r5_29, i5_29;
+			      r5_13 = c_re(in[13]);
+			      i5_13 = c_im(in[13]);
+			      r5_29 = c_re(in[29]);
+			      i5_29 = c_im(in[29]);
+			      r4_13 = (r5_13 + r5_29);
+			      i4_13 = (i5_13 + i5_29);
+			      r4_29 = (r5_13 - r5_29);
+			      i4_29 = (i5_13 - i5_29);
+			 }
+			 r3_5 = (r4_5 + r4_13);
+			 i3_5 = (i4_5 + i4_13);
+			 r3_21 = (r4_5 - r4_13);
+			 i3_21 = (i4_5 - i4_13);
+			 r3_13 = (r4_21 + i4_29);
+			 i3_13 = (i4_21 - r4_29);
+			 r3_29 = (r4_21 - i4_29);
+			 i3_29 = (i4_21 + r4_29);
+		    }
+		    r2_1 = (r3_1 + r3_5);
+		    i2_1 = (i3_1 + i3_5);
+		    r2_17 = (r3_1 - r3_5);
+		    i2_17 = (i3_1 - i3_5);
+		    tmpr = (0.707106781187 * (r3_13 + i3_13));
+		    tmpi = (0.707106781187 * (i3_13 - r3_13));
+		    r2_5 = (r3_9 + tmpr);
+		    i2_5 = (i3_9 + tmpi);
+		    r2_21 = (r3_9 - tmpr);
+		    i2_21 = (i3_9 - tmpi);
+		    r2_9 = (r3_17 + i3_21);
+		    i2_9 = (i3_17 - r3_21);
+		    r2_25 = (r3_17 - i3_21);
+		    i2_25 = (i3_17 + r3_21);
+		    tmpr = (0.707106781187 * (i3_29 - r3_29));
+		    tmpi = (0.707106781187 * (r3_29 + i3_29));
+		    r2_13 = (r3_25 + tmpr);
+		    i2_13 = (i3_25 - tmpi);
+		    r2_29 = (r3_25 - tmpr);
+		    i2_29 = (i3_25 + tmpi);
+	       }
+	       {  /* in[3], in[7], ..., in[31] -> r2_3, r2_7, ..., r2_31 */
+		    REAL r3_3, i3_3;
+		    REAL r3_7, i3_7;
+		    REAL r3_11, i3_11;
+		    REAL r3_15, i3_15;
+		    REAL r3_19, i3_19;
+		    REAL r3_23, i3_23;
+		    REAL r3_27, i3_27;
+		    REAL r3_31, i3_31;
+		    {  /* in[3], in[11], in[19], in[27] -> r3_3, r3_11, r3_19, r3_27 */
+			 REAL r4_3, i4_3;
+			 REAL r4_11, i4_11;
+			 REAL r4_19, i4_19;
+			 REAL r4_27, i4_27;
+			 {
+			      REAL r5_3, i5_3;
+			      REAL r5_19, i5_19;
+			      r5_3 = c_re(in[3]);
+			      i5_3 = c_im(in[3]);
+			      r5_19 = c_re(in[19]);
+			      i5_19 = c_im(in[19]);
+			      r4_3 = (r5_3 + r5_19);
+			      i4_3 = (i5_3 + i5_19);
+			      r4_19 = (r5_3 - r5_19);
+			      i4_19 = (i5_3 - i5_19);
+			 }
+			 {
+			      REAL r5_11, i5_11;
+			      REAL r5_27, i5_27;
+			      r5_11 = c_re(in[11]);
+			      i5_11 = c_im(in[11]);
+			      r5_27 = c_re(in[27]);
+			      i5_27 = c_im(in[27]);
+			      r4_11 = (r5_11 + r5_27);
+			      i4_11 = (i5_11 + i5_27);
+			      r4_27 = (r5_11 - r5_27);
+			      i4_27 = (i5_11 - i5_27);
+			 }
+			 r3_3 = (r4_3 + r4_11);
+			 i3_3 = (i4_3 + i4_11);
+			 r3_19 = (r4_3 - r4_11);
+			 i3_19 = (i4_3 - i4_11);
+			 r3_11 = (r4_19 + i4_27);
+			 i3_11 = (i4_19 - r4_27);
+			 r3_27 = (r4_19 - i4_27);
+			 i3_27 = (i4_19 + r4_27);
+		    }
+		    {  /* in[7], in[15], in[23], in[31] -> r3_7, r3_15, r3_23, r3_31 */
+			 REAL r4_7, i4_7;
+			 REAL r4_15, i4_15;
+			 REAL r4_23, i4_23;
+			 REAL r4_31, i4_31;
+			 {
+			      REAL r5_7, i5_7;
+			      REAL r5_23, i5_23;
+			      r5_7 = c_re(in[7]);
+			      i5_7 = c_im(in[7]);
+			      r5_23 = c_re(in[23]);
+			      i5_23 = c_im(in[23]);
+			      r4_7 = (r5_7 + r5_23);
+			      i4_7 = (i5_7 + i5_23);
+			      r4_23 = (r5_7 - r5_23);
+			      i4_23 = (i5_7 - i5_23);
+			 }
+			 {
+			      REAL r5_15, i5_15;
+			      REAL r5_31, i5_31;
+			      r5_15 = c_re(in[15]);
+			      i5_15 = c_im(in[15]);
+			      r5_31 = c_re(in[31]);
+			      i5_31 = c_im(in[31]);
+			      r4_15 = (r5_15 + r5_31);
+			      i4_15 = (i5_15 + i5_31);
+			      r4_31 = (r5_15 - r5_31);
+			      i4_31 = (i5_15 - i5_31);
+			 }
+			 r3_7 = (r4_7 + r4_15);
+			 i3_7 = (i4_7 + i4_15);
+			 r3_23 = (r4_7 - r4_15);
+			 i3_23 = (i4_7 - i4_15);
+			 r3_15 = (r4_23 + i4_31);
+			 i3_15 = (i4_23 - r4_31);
+			 r3_31 = (r4_23 - i4_31);
+			 i3_31 = (i4_23 + r4_31);
+		    }
+		    r2_3 = (r3_3 + r3_7);
+		    i2_3 = (i3_3 + i3_7);
+		    r2_19 = (r3_3 - r3_7);
+		    i2_19 = (i3_3 - i3_7);
+		    tmpr = (0.707106781187 * (r3_15 + i3_15));
+		    tmpi = (0.707106781187 * (i3_15 - r3_15));
+		    r2_7 = (r3_11 + tmpr);
+		    i2_7 = (i3_11 + tmpi);
+		    r2_23 = (r3_11 - tmpr);
+		    i2_23 = (i3_11 - tmpi);
+		    r2_11 = (r3_19 + i3_23);
+		    i2_11 = (i3_19 - r3_23);
+		    r2_27 = (r3_19 - i3_23);
+		    i2_27 = (i3_19 + r3_23);
+		    tmpr = (0.707106781187 * (i3_31 - r3_31));
+		    tmpi = (0.707106781187 * (r3_31 + i3_31));
+		    r2_15 = (r3_27 + tmpr);
+		    i2_15 = (i3_27 - tmpi);
+		    r2_31 = (r3_27 - tmpr);
+		    i2_31 = (i3_27 + tmpi);
+	       }
+	       r1_1 = (r2_1 + r2_3);  /* combine the two depth-2 quarters into the odd half, mirroring the even-half combine above */
+	       i1_1 = (i2_1 + i2_3);
+	       r1_17 = (r2_1 - r2_3);
+	       i1_17 = (i2_1 - i2_3);
+	       tmpr = ((0.923879532511 * r2_7) + (0.382683432365 * i2_7));
+	       tmpi = ((0.923879532511 * i2_7) - (0.382683432365 * r2_7));
+	       r1_3 = (r2_5 + tmpr);
+	       i1_3 = (i2_5 + tmpi);
+	       r1_19 = (r2_5 - tmpr);
+	       i1_19 = (i2_5 - tmpi);
+	       tmpr = (0.707106781187 * (r2_11 + i2_11));
+	       tmpi = (0.707106781187 * (i2_11 - r2_11));
+	       r1_5 = (r2_9 + tmpr);
+	       i1_5 = (i2_9 + tmpi);
+	       r1_21 = (r2_9 - tmpr);
+	       i1_21 = (i2_9 - tmpi);
+	       tmpr = ((0.382683432365 * r2_15) + (0.923879532511 * i2_15));
+	       tmpi = ((0.382683432365 * i2_15) - (0.923879532511 * r2_15));
+	       r1_7 = (r2_13 + tmpr);
+	       i1_7 = (i2_13 + tmpi);
+	       r1_23 = (r2_13 - tmpr);
+	       i1_23 = (i2_13 - tmpi);
+	       r1_9 = (r2_17 + i2_19);
+	       i1_9 = (i2_17 - r2_19);
+	       r1_25 = (r2_17 - i2_19);
+	       i1_25 = (i2_17 + r2_19);
+	       tmpr = ((0.923879532511 * i2_23) - (0.382683432365 * r2_23));
+	       tmpi = ((0.923879532511 * r2_23) + (0.382683432365 * i2_23));
+	       r1_11 = (r2_21 + tmpr);
+	       i1_11 = (i2_21 - tmpi);
+	       r1_27 = (r2_21 - tmpr);
+	       i1_27 = (i2_21 + tmpi);
+	       tmpr = (0.707106781187 * (i2_27 - r2_27));
+	       tmpi = (0.707106781187 * (r2_27 + i2_27));
+	       r1_13 = (r2_25 + tmpr);
+	       i1_13 = (i2_25 - tmpi);
+	       r1_29 = (r2_25 - tmpr);
+	       i1_29 = (i2_25 + tmpi);
+	       tmpr = ((0.382683432365 * i2_31) - (0.923879532511 * r2_31));
+	       tmpi = ((0.382683432365 * r2_31) + (0.923879532511 * i2_31));
+	       r1_15 = (r2_29 + tmpr);
+	       i1_15 = (i2_29 - tmpi);
+	       r1_31 = (r2_29 - tmpr);
+	       i1_31 = (i2_29 + tmpi);
+	  }
+	  c_re(out[0]) = (r1_0 + r1_1);  /* final stage: for k = 0..15, out[k] and out[k+16] = r1_(2k) plus / minus r1_(2k+1) rotated by k*pi/16 */
+	  c_im(out[0]) = (i1_0 + i1_1);
+	  c_re(out[16]) = (r1_0 - r1_1);
+	  c_im(out[16]) = (i1_0 - i1_1);
+	  tmpr = ((0.980785280403 * r1_3) + (0.195090322016 * i1_3));  /* 0.980785280403 ~ cos(pi/16), 0.195090322016 ~ sin(pi/16) */
+	  tmpi = ((0.980785280403 * i1_3) - (0.195090322016 * r1_3));
+	  c_re(out[1]) = (r1_2 + tmpr);
+	  c_im(out[1]) = (i1_2 + tmpi);
+	  c_re(out[17]) = (r1_2 - tmpr);
+	  c_im(out[17]) = (i1_2 - tmpi);
+	  tmpr = ((0.923879532511 * r1_5) + (0.382683432365 * i1_5));
+	  tmpi = ((0.923879532511 * i1_5) - (0.382683432365 * r1_5));
+	  c_re(out[2]) = (r1_4 + tmpr);
+	  c_im(out[2]) = (i1_4 + tmpi);
+	  c_re(out[18]) = (r1_4 - tmpr);
+	  c_im(out[18]) = (i1_4 - tmpi);
+	  tmpr = ((0.831469612303 * r1_7) + (0.55557023302 * i1_7));  /* 0.831469612303 ~ cos(3*pi/16), 0.55557023302 ~ sin(3*pi/16) */
+	  tmpi = ((0.831469612303 * i1_7) - (0.55557023302 * r1_7));
+	  c_re(out[3]) = (r1_6 + tmpr);
+	  c_im(out[3]) = (i1_6 + tmpi);
+	  c_re(out[19]) = (r1_6 - tmpr);
+	  c_im(out[19]) = (i1_6 - tmpi);
+	  tmpr = (0.707106781187 * (r1_9 + i1_9));
+	  tmpi = (0.707106781187 * (i1_9 - r1_9));
+	  c_re(out[4]) = (r1_8 + tmpr);
+	  c_im(out[4]) = (i1_8 + tmpi);
+	  c_re(out[20]) = (r1_8 - tmpr);
+	  c_im(out[20]) = (i1_8 - tmpi);
+	  tmpr = ((0.55557023302 * r1_11) + (0.831469612303 * i1_11));
+	  tmpi = ((0.55557023302 * i1_11) - (0.831469612303 * r1_11));
+	  c_re(out[5]) = (r1_10 + tmpr);
+	  c_im(out[5]) = (i1_10 + tmpi);
+	  c_re(out[21]) = (r1_10 - tmpr);
+	  c_im(out[21]) = (i1_10 - tmpi);
+	  tmpr = ((0.382683432365 * r1_13) + (0.923879532511 * i1_13));
+	  tmpi = ((0.382683432365 * i1_13) - (0.923879532511 * r1_13));
+	  c_re(out[6]) = (r1_12 + tmpr);
+	  c_im(out[6]) = (i1_12 + tmpi);
+	  c_re(out[22]) = (r1_12 - tmpr);
+	  c_im(out[22]) = (i1_12 - tmpi);
+	  tmpr = ((0.195090322016 * r1_15) + (0.980785280403 * i1_15));
+	  tmpi = ((0.195090322016 * i1_15) - (0.980785280403 * r1_15));
+	  c_re(out[7]) = (r1_14 + tmpr);
+	  c_im(out[7]) = (i1_14 + tmpi);
+	  c_re(out[23]) = (r1_14 - tmpr);
+	  c_im(out[23]) = (i1_14 - tmpi);
+	  c_re(out[8]) = (r1_16 + i1_17);  /* k = 8: the rotation is exactly -i, so re/im of r1_17 are swapped with no multiply */
+	  c_im(out[8]) = (i1_16 - r1_17);
+	  c_re(out[24]) = (r1_16 - i1_17);
+	  c_im(out[24]) = (i1_16 + r1_17);
+	  tmpr = ((0.980785280403 * i1_19) - (0.195090322016 * r1_19));  /* k = 9..15: tmpi holds the negated imaginary part, hence "- tmpi" for out[k] and "+ tmpi" for out[k+16] */
+	  tmpi = ((0.980785280403 * r1_19) + (0.195090322016 * i1_19));
+	  c_re(out[9]) = (r1_18 + tmpr);
+	  c_im(out[9]) = (i1_18 - tmpi);
+	  c_re(out[25]) = (r1_18 - tmpr);
+	  c_im(out[25]) = (i1_18 + tmpi);
+	  tmpr = ((0.923879532511 * i1_21) - (0.382683432365 * r1_21));
+	  tmpi = ((0.923879532511 * r1_21) + (0.382683432365 * i1_21));
+	  c_re(out[10]) = (r1_20 + tmpr);
+	  c_im(out[10]) = (i1_20 - tmpi);
+	  c_re(out[26]) = (r1_20 - tmpr);
+	  c_im(out[26]) = (i1_20 + tmpi);
+	  tmpr = ((0.831469612303 * i1_23) - (0.55557023302 * r1_23));
+	  tmpi = ((0.831469612303 * r1_23) + (0.55557023302 * i1_23));
+	  c_re(out[11]) = (r1_22 + tmpr);
+	  c_im(out[11]) = (i1_22 - tmpi);
+	  c_re(out[27]) = (r1_22 - tmpr);
+	  c_im(out[27]) = (i1_22 + tmpi);
+	  tmpr = (0.707106781187 * (i1_25 - r1_25));
+	  tmpi = (0.707106781187 * (r1_25 + i1_25));
+	  c_re(out[12]) = (r1_24 + tmpr);
+	  c_im(out[12]) = (i1_24 - tmpi);
+	  c_re(out[28]) = (r1_24 - tmpr);
+	  c_im(out[28]) = (i1_24 + tmpi);
+	  tmpr = ((0.55557023302 * i1_27) - (0.831469612303 * r1_27));
+	  tmpi = ((0.55557023302 * r1_27) + (0.831469612303 * i1_27));
+	  c_re(out[13]) = (r1_26 + tmpr);
+	  c_im(out[13]) = (i1_26 - tmpi);
+	  c_re(out[29]) = (r1_26 - tmpr);
+	  c_im(out[29]) = (i1_26 + tmpi);
+	  tmpr = ((0.382683432365 * i1_29) - (0.923879532511 * r1_29));
+	  tmpi = ((0.382683432365 * r1_29) + (0.923879532511 * i1_29));
+	  c_re(out[14]) = (r1_28 + tmpr);
+	  c_im(out[14]) = (i1_28 - tmpi);
+	  c_re(out[30]) = (r1_28 - tmpr);
+	  c_im(out[30]) = (i1_28 + tmpi);
+	  tmpr = ((0.195090322016 * i1_31) - (0.980785280403 * r1_31));
+	  tmpi = ((0.195090322016 * r1_31) + (0.980785280403 * i1_31));
+	  c_re(out[15]) = (r1_30 + tmpr);
+	  c_im(out[15]) = (i1_30 - tmpi);
+	  c_re(out[31]) = (r1_30 - tmpr);
+	  c_im(out[31]) = (i1_30 + tmpi);
+     }
+}
+void fft_twiddle_32(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m)
+{
+     int l1, i;
+     COMPLEX *jp, *kp;
+     REAL tmpr, tmpi, wr, wi;
+     if ((b - a) < 128) {
+	  for (i = a, l1 = nWdn * i, kp = out + i; i < b;
+	       i++, l1 += nWdn, kp++) {
+	       jp = in + i;
+	       {
+		    REAL r1_0, i1_0;
+		    REAL r1_1, i1_1;
+		    REAL r1_2, i1_2;
+		    REAL r1_3, i1_3;
+		    REAL r1_4, i1_4;
+		    REAL r1_5, i1_5;
+		    REAL r1_6, i1_6;
+		    REAL r1_7, i1_7;
+		    REAL r1_8, i1_8;
+		    REAL r1_9, i1_9;
+		    REAL r1_10, i1_10;
+		    REAL r1_11, i1_11;
+		    REAL r1_12, i1_12;
+		    REAL r1_13, i1_13;
+		    REAL r1_14, i1_14;
+		    REAL r1_15, i1_15;
+		    REAL r1_16, i1_16;
+		    REAL r1_17, i1_17;
+		    REAL r1_18, i1_18;
+		    REAL r1_19, i1_19;
+		    REAL r1_20, i1_20;
+		    REAL r1_21, i1_21;
+		    REAL r1_22, i1_22;
+		    REAL r1_23, i1_23;
+		    REAL r1_24, i1_24;
+		    REAL r1_25, i1_25;
+		    REAL r1_26, i1_26;
+		    REAL r1_27, i1_27;
+		    REAL r1_28, i1_28;
+		    REAL r1_29, i1_29;
+		    REAL r1_30, i1_30;
+		    REAL r1_31, i1_31;
+		    {
+			 REAL r2_0, i2_0;
+			 REAL r2_2, i2_2;
+			 REAL r2_4, i2_4;
+			 REAL r2_6, i2_6;
+			 REAL r2_8, i2_8;
+			 REAL r2_10, i2_10;
+			 REAL r2_12, i2_12;
+			 REAL r2_14, i2_14;
+			 REAL r2_16, i2_16;
+			 REAL r2_18, i2_18;
+			 REAL r2_20, i2_20;
+			 REAL r2_22, i2_22;
+			 REAL r2_24, i2_24;
+			 REAL r2_26, i2_26;
+			 REAL r2_28, i2_28;
+			 REAL r2_30, i2_30;
+			 {
+			      REAL r3_0, i3_0;
+			      REAL r3_4, i3_4;
+			      REAL r3_8, i3_8;
+			      REAL r3_12, i3_12;
+			      REAL r3_16, i3_16;
+			      REAL r3_20, i3_20;
+			      REAL r3_24, i3_24;
+			      REAL r3_28, i3_28;
+			      {
+				   REAL r4_0, i4_0;
+				   REAL r4_8, i4_8;
+				   REAL r4_16, i4_16;
+				   REAL r4_24, i4_24;
+				   {
+					REAL r5_0, i5_0;
+					REAL r5_16, i5_16;
+					r5_0 = c_re(jp[0 * m]);
+					i5_0 = c_im(jp[0 * m]);
+					wr = c_re(W[16 * l1]);
+					wi = c_im(W[16 * l1]);
+					tmpr = c_re(jp[16 * m]);
+					tmpi = c_im(jp[16 * m]);
+					r5_16 = ((wr * tmpr) - (wi * tmpi));
+					i5_16 = ((wi * tmpr) + (wr * tmpi));
+					r4_0 = (r5_0 + r5_16);
+					i4_0 = (i5_0 + i5_16);
+					r4_16 = (r5_0 - r5_16);
+					i4_16 = (i5_0 - i5_16);
+				   }
+				   {
+					REAL r5_8, i5_8;
+					REAL r5_24, i5_24;
+					wr = c_re(W[8 * l1]);
+					wi = c_im(W[8 * l1]);
+					tmpr = c_re(jp[8 * m]);
+					tmpi = c_im(jp[8 * m]);
+					r5_8 = ((wr * tmpr) - (wi * tmpi));
+					i5_8 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[24 * l1]);
+					wi = c_im(W[24 * l1]);
+					tmpr = c_re(jp[24 * m]);
+					tmpi = c_im(jp[24 * m]);
+					r5_24 = ((wr * tmpr) - (wi * tmpi));
+					i5_24 = ((wi * tmpr) + (wr * tmpi));
+					r4_8 = (r5_8 + r5_24);
+					i4_8 = (i5_8 + i5_24);
+					r4_24 = (r5_8 - r5_24);
+					i4_24 = (i5_8 - i5_24);
+				   }
+				   r3_0 = (r4_0 + r4_8);
+				   i3_0 = (i4_0 + i4_8);
+				   r3_16 = (r4_0 - r4_8);
+				   i3_16 = (i4_0 - i4_8);
+				   r3_8 = (r4_16 + i4_24);
+				   i3_8 = (i4_16 - r4_24);
+				   r3_24 = (r4_16 - i4_24);
+				   i3_24 = (i4_16 + r4_24);
+			      }
+			      {
+				   REAL r4_4, i4_4;
+				   REAL r4_12, i4_12;
+				   REAL r4_20, i4_20;
+				   REAL r4_28, i4_28;
+				   {
+					REAL r5_4, i5_4;
+					REAL r5_20, i5_20;
+					wr = c_re(W[4 * l1]);
+					wi = c_im(W[4 * l1]);
+					tmpr = c_re(jp[4 * m]);
+					tmpi = c_im(jp[4 * m]);
+					r5_4 = ((wr * tmpr) - (wi * tmpi));
+					i5_4 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[20 * l1]);
+					wi = c_im(W[20 * l1]);
+					tmpr = c_re(jp[20 * m]);
+					tmpi = c_im(jp[20 * m]);
+					r5_20 = ((wr * tmpr) - (wi * tmpi));
+					i5_20 = ((wi * tmpr) + (wr * tmpi));
+					r4_4 = (r5_4 + r5_20);
+					i4_4 = (i5_4 + i5_20);
+					r4_20 = (r5_4 - r5_20);
+					i4_20 = (i5_4 - i5_20);
+				   }
+				   {
+					REAL r5_12, i5_12;
+					REAL r5_28, i5_28;
+					wr = c_re(W[12 * l1]);
+					wi = c_im(W[12 * l1]);
+					tmpr = c_re(jp[12 * m]);
+					tmpi = c_im(jp[12 * m]);
+					r5_12 = ((wr * tmpr) - (wi * tmpi));
+					i5_12 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[28 * l1]);
+					wi = c_im(W[28 * l1]);
+					tmpr = c_re(jp[28 * m]);
+					tmpi = c_im(jp[28 * m]);
+					r5_28 = ((wr * tmpr) - (wi * tmpi));
+					i5_28 = ((wi * tmpr) + (wr * tmpi));
+					r4_12 = (r5_12 + r5_28);
+					i4_12 = (i5_12 + i5_28);
+					r4_28 = (r5_12 - r5_28);
+					i4_28 = (i5_12 - i5_28);
+				   }
+				   r3_4 = (r4_4 + r4_12);
+				   i3_4 = (i4_4 + i4_12);
+				   r3_20 = (r4_4 - r4_12);
+				   i3_20 = (i4_4 - i4_12);
+				   r3_12 = (r4_20 + i4_28);
+				   i3_12 = (i4_20 - r4_28);
+				   r3_28 = (r4_20 - i4_28);
+				   i3_28 = (i4_20 + r4_28);
+			      }
+			      r2_0 = (r3_0 + r3_4);
+			      i2_0 = (i3_0 + i3_4);
+			      r2_16 = (r3_0 - r3_4);
+			      i2_16 = (i3_0 - i3_4);
+			      tmpr = (0.707106781187 * (r3_12 + i3_12));
+			      tmpi = (0.707106781187 * (i3_12 - r3_12));
+			      r2_4 = (r3_8 + tmpr);
+			      i2_4 = (i3_8 + tmpi);
+			      r2_20 = (r3_8 - tmpr);
+			      i2_20 = (i3_8 - tmpi);
+			      r2_8 = (r3_16 + i3_20);
+			      i2_8 = (i3_16 - r3_20);
+			      r2_24 = (r3_16 - i3_20);
+			      i2_24 = (i3_16 + r3_20);
+			      tmpr = (0.707106781187 * (i3_28 - r3_28));
+			      tmpi = (0.707106781187 * (r3_28 + i3_28));
+			      r2_12 = (r3_24 + tmpr);
+			      i2_12 = (i3_24 - tmpi);
+			      r2_28 = (r3_24 - tmpr);
+			      i2_28 = (i3_24 + tmpi);
+			 }
+			 {
+			      REAL r3_2, i3_2;
+			      REAL r3_6, i3_6;
+			      REAL r3_10, i3_10;
+			      REAL r3_14, i3_14;
+			      REAL r3_18, i3_18;
+			      REAL r3_22, i3_22;
+			      REAL r3_26, i3_26;
+			      REAL r3_30, i3_30;
+			      {
+				   REAL r4_2, i4_2;
+				   REAL r4_10, i4_10;
+				   REAL r4_18, i4_18;
+				   REAL r4_26, i4_26;
+				   {
+					REAL r5_2, i5_2;
+					REAL r5_18, i5_18;
+					wr = c_re(W[2 * l1]);
+					wi = c_im(W[2 * l1]);
+					tmpr = c_re(jp[2 * m]);
+					tmpi = c_im(jp[2 * m]);
+					r5_2 = ((wr * tmpr) - (wi * tmpi));
+					i5_2 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[18 * l1]);
+					wi = c_im(W[18 * l1]);
+					tmpr = c_re(jp[18 * m]);
+					tmpi = c_im(jp[18 * m]);
+					r5_18 = ((wr * tmpr) - (wi * tmpi));
+					i5_18 = ((wi * tmpr) + (wr * tmpi));
+					r4_2 = (r5_2 + r5_18);
+					i4_2 = (i5_2 + i5_18);
+					r4_18 = (r5_2 - r5_18);
+					i4_18 = (i5_2 - i5_18);
+				   }
+				   {
+					REAL r5_10, i5_10;
+					REAL r5_26, i5_26;
+					wr = c_re(W[10 * l1]);
+					wi = c_im(W[10 * l1]);
+					tmpr = c_re(jp[10 * m]);
+					tmpi = c_im(jp[10 * m]);
+					r5_10 = ((wr * tmpr) - (wi * tmpi));
+					i5_10 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[26 * l1]);
+					wi = c_im(W[26 * l1]);
+					tmpr = c_re(jp[26 * m]);
+					tmpi = c_im(jp[26 * m]);
+					r5_26 = ((wr * tmpr) - (wi * tmpi));
+					i5_26 = ((wi * tmpr) + (wr * tmpi));
+					r4_10 = (r5_10 + r5_26);
+					i4_10 = (i5_10 + i5_26);
+					r4_26 = (r5_10 - r5_26);
+					i4_26 = (i5_10 - i5_26);
+				   }
+				   r3_2 = (r4_2 + r4_10);
+				   i3_2 = (i4_2 + i4_10);
+				   r3_18 = (r4_2 - r4_10);
+				   i3_18 = (i4_2 - i4_10);
+				   r3_10 = (r4_18 + i4_26);
+				   i3_10 = (i4_18 - r4_26);
+				   r3_26 = (r4_18 - i4_26);
+				   i3_26 = (i4_18 + r4_26);
+			      }
+			      {
+				   REAL r4_6, i4_6;
+				   REAL r4_14, i4_14;
+				   REAL r4_22, i4_22;
+				   REAL r4_30, i4_30;
+				   {
+					REAL r5_6, i5_6;
+					REAL r5_22, i5_22;
+					wr = c_re(W[6 * l1]);
+					wi = c_im(W[6 * l1]);
+					tmpr = c_re(jp[6 * m]);
+					tmpi = c_im(jp[6 * m]);
+					r5_6 = ((wr * tmpr) - (wi * tmpi));
+					i5_6 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[22 * l1]);
+					wi = c_im(W[22 * l1]);
+					tmpr = c_re(jp[22 * m]);
+					tmpi = c_im(jp[22 * m]);
+					r5_22 = ((wr * tmpr) - (wi * tmpi));
+					i5_22 = ((wi * tmpr) + (wr * tmpi));
+					r4_6 = (r5_6 + r5_22);
+					i4_6 = (i5_6 + i5_22);
+					r4_22 = (r5_6 - r5_22);
+					i4_22 = (i5_6 - i5_22);
+				   }
+				   {
+					REAL r5_14, i5_14;
+					REAL r5_30, i5_30;
+					wr = c_re(W[14 * l1]);
+					wi = c_im(W[14 * l1]);
+					tmpr = c_re(jp[14 * m]);
+					tmpi = c_im(jp[14 * m]);
+					r5_14 = ((wr * tmpr) - (wi * tmpi));
+					i5_14 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[30 * l1]);
+					wi = c_im(W[30 * l1]);
+					tmpr = c_re(jp[30 * m]);
+					tmpi = c_im(jp[30 * m]);
+					r5_30 = ((wr * tmpr) - (wi * tmpi));
+					i5_30 = ((wi * tmpr) + (wr * tmpi));
+					r4_14 = (r5_14 + r5_30);
+					i4_14 = (i5_14 + i5_30);
+					r4_30 = (r5_14 - r5_30);
+					i4_30 = (i5_14 - i5_30);
+				   }
+				   r3_6 = (r4_6 + r4_14);
+				   i3_6 = (i4_6 + i4_14);
+				   r3_22 = (r4_6 - r4_14);
+				   i3_22 = (i4_6 - i4_14);
+				   r3_14 = (r4_22 + i4_30);
+				   i3_14 = (i4_22 - r4_30);
+				   r3_30 = (r4_22 - i4_30);
+				   i3_30 = (i4_22 + r4_30);
+			      }
+			      r2_2 = (r3_2 + r3_6);
+			      i2_2 = (i3_2 + i3_6);
+			      r2_18 = (r3_2 - r3_6);
+			      i2_18 = (i3_2 - i3_6);
+			      tmpr = (0.707106781187 * (r3_14 + i3_14));
+			      tmpi = (0.707106781187 * (i3_14 - r3_14));
+			      r2_6 = (r3_10 + tmpr);
+			      i2_6 = (i3_10 + tmpi);
+			      r2_22 = (r3_10 - tmpr);
+			      i2_22 = (i3_10 - tmpi);
+			      r2_10 = (r3_18 + i3_22);
+			      i2_10 = (i3_18 - r3_22);
+			      r2_26 = (r3_18 - i3_22);
+			      i2_26 = (i3_18 + r3_22);
+			      tmpr = (0.707106781187 * (i3_30 - r3_30));
+			      tmpi = (0.707106781187 * (r3_30 + i3_30));
+			      r2_14 = (r3_26 + tmpr);
+			      i2_14 = (i3_26 - tmpi);
+			      r2_30 = (r3_26 - tmpr);
+			      i2_30 = (i3_26 + tmpi);
+			 }
+			 r1_0 = (r2_0 + r2_2);
+			 i1_0 = (i2_0 + i2_2);
+			 r1_16 = (r2_0 - r2_2);
+			 i1_16 = (i2_0 - i2_2);
+			 tmpr = ((0.923879532511 * r2_6) + (0.382683432365 * i2_6));
+			 tmpi = ((0.923879532511 * i2_6) - (0.382683432365 * r2_6));
+			 r1_2 = (r2_4 + tmpr);
+			 i1_2 = (i2_4 + tmpi);
+			 r1_18 = (r2_4 - tmpr);
+			 i1_18 = (i2_4 - tmpi);
+			 tmpr = (0.707106781187 * (r2_10 + i2_10));
+			 tmpi = (0.707106781187 * (i2_10 - r2_10));
+			 r1_4 = (r2_8 + tmpr);
+			 i1_4 = (i2_8 + tmpi);
+			 r1_20 = (r2_8 - tmpr);
+			 i1_20 = (i2_8 - tmpi);
+			 tmpr = ((0.382683432365 * r2_14) + (0.923879532511 * i2_14));
+			 tmpi = ((0.382683432365 * i2_14) - (0.923879532511 * r2_14));
+			 r1_6 = (r2_12 + tmpr);
+			 i1_6 = (i2_12 + tmpi);
+			 r1_22 = (r2_12 - tmpr);
+			 i1_22 = (i2_12 - tmpi);
+			 r1_8 = (r2_16 + i2_18);
+			 i1_8 = (i2_16 - r2_18);
+			 r1_24 = (r2_16 - i2_18);
+			 i1_24 = (i2_16 + r2_18);
+			 tmpr = ((0.923879532511 * i2_22) - (0.382683432365 * r2_22));
+			 tmpi = ((0.923879532511 * r2_22) + (0.382683432365 * i2_22));
+			 r1_10 = (r2_20 + tmpr);
+			 i1_10 = (i2_20 - tmpi);
+			 r1_26 = (r2_20 - tmpr);
+			 i1_26 = (i2_20 + tmpi);
+			 tmpr = (0.707106781187 * (i2_26 - r2_26));
+			 tmpi = (0.707106781187 * (r2_26 + i2_26));
+			 r1_12 = (r2_24 + tmpr);
+			 i1_12 = (i2_24 - tmpi);
+			 r1_28 = (r2_24 - tmpr);
+			 i1_28 = (i2_24 + tmpi);
+			 tmpr = ((0.382683432365 * i2_30) - (0.923879532511 * r2_30));
+			 tmpi = ((0.382683432365 * r2_30) + (0.923879532511 * i2_30));
+			 r1_14 = (r2_28 + tmpr);
+			 i1_14 = (i2_28 - tmpi);
+			 r1_30 = (r2_28 - tmpr);
+			 i1_30 = (i2_28 + tmpi);
+		    }
+		    {
+			 REAL r2_1, i2_1;
+			 REAL r2_3, i2_3;
+			 REAL r2_5, i2_5;
+			 REAL r2_7, i2_7;
+			 REAL r2_9, i2_9;
+			 REAL r2_11, i2_11;
+			 REAL r2_13, i2_13;
+			 REAL r2_15, i2_15;
+			 REAL r2_17, i2_17;
+			 REAL r2_19, i2_19;
+			 REAL r2_21, i2_21;
+			 REAL r2_23, i2_23;
+			 REAL r2_25, i2_25;
+			 REAL r2_27, i2_27;
+			 REAL r2_29, i2_29;
+			 REAL r2_31, i2_31;
+			 {
+			      REAL r3_1, i3_1;
+			      REAL r3_5, i3_5;
+			      REAL r3_9, i3_9;
+			      REAL r3_13, i3_13;
+			      REAL r3_17, i3_17;
+			      REAL r3_21, i3_21;
+			      REAL r3_25, i3_25;
+			      REAL r3_29, i3_29;
+			      {
+				   REAL r4_1, i4_1;
+				   REAL r4_9, i4_9;
+				   REAL r4_17, i4_17;
+				   REAL r4_25, i4_25;
+				   {
+					REAL r5_1, i5_1;
+					REAL r5_17, i5_17;
+					wr = c_re(W[1 * l1]);
+					wi = c_im(W[1 * l1]);
+					tmpr = c_re(jp[1 * m]);
+					tmpi = c_im(jp[1 * m]);
+					r5_1 = ((wr * tmpr) - (wi * tmpi));
+					i5_1 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[17 * l1]);
+					wi = c_im(W[17 * l1]);
+					tmpr = c_re(jp[17 * m]);
+					tmpi = c_im(jp[17 * m]);
+					r5_17 = ((wr * tmpr) - (wi * tmpi));
+					i5_17 = ((wi * tmpr) + (wr * tmpi));
+					r4_1 = (r5_1 + r5_17);
+					i4_1 = (i5_1 + i5_17);
+					r4_17 = (r5_1 - r5_17);
+					i4_17 = (i5_1 - i5_17);
+				   }
+				   {
+					REAL r5_9, i5_9;
+					REAL r5_25, i5_25;
+					wr = c_re(W[9 * l1]);
+					wi = c_im(W[9 * l1]);
+					tmpr = c_re(jp[9 * m]);
+					tmpi = c_im(jp[9 * m]);
+					r5_9 = ((wr * tmpr) - (wi * tmpi));
+					i5_9 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[25 * l1]);
+					wi = c_im(W[25 * l1]);
+					tmpr = c_re(jp[25 * m]);
+					tmpi = c_im(jp[25 * m]);
+					r5_25 = ((wr * tmpr) - (wi * tmpi));
+					i5_25 = ((wi * tmpr) + (wr * tmpi));
+					r4_9 = (r5_9 + r5_25);
+					i4_9 = (i5_9 + i5_25);
+					r4_25 = (r5_9 - r5_25);
+					i4_25 = (i5_9 - i5_25);
+				   }
+				   r3_1 = (r4_1 + r4_9);
+				   i3_1 = (i4_1 + i4_9);
+				   r3_17 = (r4_1 - r4_9);
+				   i3_17 = (i4_1 - i4_9);
+				   r3_9 = (r4_17 + i4_25);
+				   i3_9 = (i4_17 - r4_25);
+				   r3_25 = (r4_17 - i4_25);
+				   i3_25 = (i4_17 + r4_25);
+			      }
+			      {
+				   REAL r4_5, i4_5;
+				   REAL r4_13, i4_13;
+				   REAL r4_21, i4_21;
+				   REAL r4_29, i4_29;
+				   {
+					REAL r5_5, i5_5;
+					REAL r5_21, i5_21;
+					wr = c_re(W[5 * l1]);
+					wi = c_im(W[5 * l1]);
+					tmpr = c_re(jp[5 * m]);
+					tmpi = c_im(jp[5 * m]);
+					r5_5 = ((wr * tmpr) - (wi * tmpi));
+					i5_5 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[21 * l1]);
+					wi = c_im(W[21 * l1]);
+					tmpr = c_re(jp[21 * m]);
+					tmpi = c_im(jp[21 * m]);
+					r5_21 = ((wr * tmpr) - (wi * tmpi));
+					i5_21 = ((wi * tmpr) + (wr * tmpi));
+					r4_5 = (r5_5 + r5_21);
+					i4_5 = (i5_5 + i5_21);
+					r4_21 = (r5_5 - r5_21);
+					i4_21 = (i5_5 - i5_21);
+				   }
+				   {
+					REAL r5_13, i5_13;
+					REAL r5_29, i5_29;
+					wr = c_re(W[13 * l1]);
+					wi = c_im(W[13 * l1]);
+					tmpr = c_re(jp[13 * m]);
+					tmpi = c_im(jp[13 * m]);
+					r5_13 = ((wr * tmpr) - (wi * tmpi));
+					i5_13 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[29 * l1]);
+					wi = c_im(W[29 * l1]);
+					tmpr = c_re(jp[29 * m]);
+					tmpi = c_im(jp[29 * m]);
+					r5_29 = ((wr * tmpr) - (wi * tmpi));
+					i5_29 = ((wi * tmpr) + (wr * tmpi));
+					r4_13 = (r5_13 + r5_29);
+					i4_13 = (i5_13 + i5_29);
+					r4_29 = (r5_13 - r5_29);
+					i4_29 = (i5_13 - i5_29);
+				   }
+				   r3_5 = (r4_5 + r4_13);
+				   i3_5 = (i4_5 + i4_13);
+				   r3_21 = (r4_5 - r4_13);
+				   i3_21 = (i4_5 - i4_13);
+				   r3_13 = (r4_21 + i4_29);
+				   i3_13 = (i4_21 - r4_29);
+				   r3_29 = (r4_21 - i4_29);
+				   i3_29 = (i4_21 + r4_29);
+			      }
+			      r2_1 = (r3_1 + r3_5);
+			      i2_1 = (i3_1 + i3_5);
+			      r2_17 = (r3_1 - r3_5);
+			      i2_17 = (i3_1 - i3_5);
+			      tmpr = (0.707106781187 * (r3_13 + i3_13));
+			      tmpi = (0.707106781187 * (i3_13 - r3_13));
+			      r2_5 = (r3_9 + tmpr);
+			      i2_5 = (i3_9 + tmpi);
+			      r2_21 = (r3_9 - tmpr);
+			      i2_21 = (i3_9 - tmpi);
+			      r2_9 = (r3_17 + i3_21);
+			      i2_9 = (i3_17 - r3_21);
+			      r2_25 = (r3_17 - i3_21);
+			      i2_25 = (i3_17 + r3_21);
+			      tmpr = (0.707106781187 * (i3_29 - r3_29));
+			      tmpi = (0.707106781187 * (r3_29 + i3_29));
+			      r2_13 = (r3_25 + tmpr);
+			      i2_13 = (i3_25 - tmpi);
+			      r2_29 = (r3_25 - tmpr);
+			      i2_29 = (i3_25 + tmpi);
+			 }
+			 {
+			      REAL r3_3, i3_3;
+			      REAL r3_7, i3_7;
+			      REAL r3_11, i3_11;
+			      REAL r3_15, i3_15;
+			      REAL r3_19, i3_19;
+			      REAL r3_23, i3_23;
+			      REAL r3_27, i3_27;
+			      REAL r3_31, i3_31;
+			      {
+				   REAL r4_3, i4_3;
+				   REAL r4_11, i4_11;
+				   REAL r4_19, i4_19;
+				   REAL r4_27, i4_27;
+				   {
+					REAL r5_3, i5_3;
+					REAL r5_19, i5_19;
+					wr = c_re(W[3 * l1]);
+					wi = c_im(W[3 * l1]);
+					tmpr = c_re(jp[3 * m]);
+					tmpi = c_im(jp[3 * m]);
+					r5_3 = ((wr * tmpr) - (wi * tmpi));
+					i5_3 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[19 * l1]);
+					wi = c_im(W[19 * l1]);
+					tmpr = c_re(jp[19 * m]);
+					tmpi = c_im(jp[19 * m]);
+					r5_19 = ((wr * tmpr) - (wi * tmpi));
+					i5_19 = ((wi * tmpr) + (wr * tmpi));
+					r4_3 = (r5_3 + r5_19);
+					i4_3 = (i5_3 + i5_19);
+					r4_19 = (r5_3 - r5_19);
+					i4_19 = (i5_3 - i5_19);
+				   }
+				   {
+					REAL r5_11, i5_11;
+					REAL r5_27, i5_27;
+					wr = c_re(W[11 * l1]);
+					wi = c_im(W[11 * l1]);
+					tmpr = c_re(jp[11 * m]);
+					tmpi = c_im(jp[11 * m]);
+					r5_11 = ((wr * tmpr) - (wi * tmpi));
+					i5_11 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[27 * l1]);
+					wi = c_im(W[27 * l1]);
+					tmpr = c_re(jp[27 * m]);
+					tmpi = c_im(jp[27 * m]);
+					r5_27 = ((wr * tmpr) - (wi * tmpi));
+					i5_27 = ((wi * tmpr) + (wr * tmpi));
+					r4_11 = (r5_11 + r5_27);
+					i4_11 = (i5_11 + i5_27);
+					r4_27 = (r5_11 - r5_27);
+					i4_27 = (i5_11 - i5_27);
+				   }
+				   r3_3 = (r4_3 + r4_11);
+				   i3_3 = (i4_3 + i4_11);
+				   r3_19 = (r4_3 - r4_11);
+				   i3_19 = (i4_3 - i4_11);
+				   r3_11 = (r4_19 + i4_27);
+				   i3_11 = (i4_19 - r4_27);
+				   r3_27 = (r4_19 - i4_27);
+				   i3_27 = (i4_19 + r4_27);
+			      }
+			      {
+				   REAL r4_7, i4_7;
+				   REAL r4_15, i4_15;
+				   REAL r4_23, i4_23;
+				   REAL r4_31, i4_31;
+				   {
+					REAL r5_7, i5_7;
+					REAL r5_23, i5_23;
+					wr = c_re(W[7 * l1]);
+					wi = c_im(W[7 * l1]);
+					tmpr = c_re(jp[7 * m]);
+					tmpi = c_im(jp[7 * m]);
+					r5_7 = ((wr * tmpr) - (wi * tmpi));
+					i5_7 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[23 * l1]);
+					wi = c_im(W[23 * l1]);
+					tmpr = c_re(jp[23 * m]);
+					tmpi = c_im(jp[23 * m]);
+					r5_23 = ((wr * tmpr) - (wi * tmpi));
+					i5_23 = ((wi * tmpr) + (wr * tmpi));
+					r4_7 = (r5_7 + r5_23);
+					i4_7 = (i5_7 + i5_23);
+					r4_23 = (r5_7 - r5_23);
+					i4_23 = (i5_7 - i5_23);
+				   }
+				   {
+					REAL r5_15, i5_15;
+					REAL r5_31, i5_31;
+					wr = c_re(W[15 * l1]);
+					wi = c_im(W[15 * l1]);
+					tmpr = c_re(jp[15 * m]);
+					tmpi = c_im(jp[15 * m]);
+					r5_15 = ((wr * tmpr) - (wi * tmpi));
+					i5_15 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[31 * l1]);
+					wi = c_im(W[31 * l1]);
+					tmpr = c_re(jp[31 * m]);
+					tmpi = c_im(jp[31 * m]);
+					r5_31 = ((wr * tmpr) - (wi * tmpi));
+					i5_31 = ((wi * tmpr) + (wr * tmpi));
+					r4_15 = (r5_15 + r5_31);
+					i4_15 = (i5_15 + i5_31);
+					r4_31 = (r5_15 - r5_31);
+					i4_31 = (i5_15 - i5_31);
+				   }
+				   r3_7 = (r4_7 + r4_15);
+				   i3_7 = (i4_7 + i4_15);
+				   r3_23 = (r4_7 - r4_15);
+				   i3_23 = (i4_7 - i4_15);
+				   r3_15 = (r4_23 + i4_31);
+				   i3_15 = (i4_23 - r4_31);
+				   r3_31 = (r4_23 - i4_31);
+				   i3_31 = (i4_23 + r4_31);
+			      }
+			      r2_3 = (r3_3 + r3_7);
+			      i2_3 = (i3_3 + i3_7);
+			      r2_19 = (r3_3 - r3_7);
+			      i2_19 = (i3_3 - i3_7);
+			      tmpr = (0.707106781187 * (r3_15 + i3_15));
+			      tmpi = (0.707106781187 * (i3_15 - r3_15));
+			      r2_7 = (r3_11 + tmpr);
+			      i2_7 = (i3_11 + tmpi);
+			      r2_23 = (r3_11 - tmpr);
+			      i2_23 = (i3_11 - tmpi);
+			      r2_11 = (r3_19 + i3_23);
+			      i2_11 = (i3_19 - r3_23);
+			      r2_27 = (r3_19 - i3_23);
+			      i2_27 = (i3_19 + r3_23);
+			      tmpr = (0.707106781187 * (i3_31 - r3_31));
+			      tmpi = (0.707106781187 * (r3_31 + i3_31));
+			      r2_15 = (r3_27 + tmpr);
+			      i2_15 = (i3_27 - tmpi);
+			      r2_31 = (r3_27 - tmpr);
+			      i2_31 = (i3_27 + tmpi);
+			 }
+			 r1_1 = (r2_1 + r2_3);
+			 i1_1 = (i2_1 + i2_3);
+			 r1_17 = (r2_1 - r2_3);
+			 i1_17 = (i2_1 - i2_3);
+			 tmpr = ((0.923879532511 * r2_7) + (0.382683432365 * i2_7));
+			 tmpi = ((0.923879532511 * i2_7) - (0.382683432365 * r2_7));
+			 r1_3 = (r2_5 + tmpr);
+			 i1_3 = (i2_5 + tmpi);
+			 r1_19 = (r2_5 - tmpr);
+			 i1_19 = (i2_5 - tmpi);
+			 tmpr = (0.707106781187 * (r2_11 + i2_11));
+			 tmpi = (0.707106781187 * (i2_11 - r2_11));
+			 r1_5 = (r2_9 + tmpr);
+			 i1_5 = (i2_9 + tmpi);
+			 r1_21 = (r2_9 - tmpr);
+			 i1_21 = (i2_9 - tmpi);
+			 tmpr = ((0.382683432365 * r2_15) + (0.923879532511 * i2_15));
+			 tmpi = ((0.382683432365 * i2_15) - (0.923879532511 * r2_15));
+			 r1_7 = (r2_13 + tmpr);
+			 i1_7 = (i2_13 + tmpi);
+			 r1_23 = (r2_13 - tmpr);
+			 i1_23 = (i2_13 - tmpi);
+			 r1_9 = (r2_17 + i2_19);
+			 i1_9 = (i2_17 - r2_19);
+			 r1_25 = (r2_17 - i2_19);
+			 i1_25 = (i2_17 + r2_19);
+			 tmpr = ((0.923879532511 * i2_23) - (0.382683432365 * r2_23));
+			 tmpi = ((0.923879532511 * r2_23) + (0.382683432365 * i2_23));
+			 r1_11 = (r2_21 + tmpr);
+			 i1_11 = (i2_21 - tmpi);
+			 r1_27 = (r2_21 - tmpr);
+			 i1_27 = (i2_21 + tmpi);
+			 tmpr = (0.707106781187 * (i2_27 - r2_27));
+			 tmpi = (0.707106781187 * (r2_27 + i2_27));
+			 r1_13 = (r2_25 + tmpr);
+			 i1_13 = (i2_25 - tmpi);
+			 r1_29 = (r2_25 - tmpr);
+			 i1_29 = (i2_25 + tmpi);
+			 tmpr = ((0.382683432365 * i2_31) - (0.923879532511 * r2_31));
+			 tmpi = ((0.382683432365 * r2_31) + (0.923879532511 * i2_31));
+			 r1_15 = (r2_29 + tmpr);
+			 i1_15 = (i2_29 - tmpi);
+			 r1_31 = (r2_29 - tmpr);
+			 i1_31 = (i2_29 + tmpi);
+		    }
+		    c_re(kp[0 * m]) = (r1_0 + r1_1);
+		    c_im(kp[0 * m]) = (i1_0 + i1_1);
+		    c_re(kp[16 * m]) = (r1_0 - r1_1);
+		    c_im(kp[16 * m]) = (i1_0 - i1_1);
+		    tmpr = ((0.980785280403 * r1_3) + (0.195090322016 * i1_3));
+		    tmpi = ((0.980785280403 * i1_3) - (0.195090322016 * r1_3));
+		    c_re(kp[1 * m]) = (r1_2 + tmpr);
+		    c_im(kp[1 * m]) = (i1_2 + tmpi);
+		    c_re(kp[17 * m]) = (r1_2 - tmpr);
+		    c_im(kp[17 * m]) = (i1_2 - tmpi);
+		    tmpr = ((0.923879532511 * r1_5) + (0.382683432365 * i1_5));
+		    tmpi = ((0.923879532511 * i1_5) - (0.382683432365 * r1_5));
+		    c_re(kp[2 * m]) = (r1_4 + tmpr);
+		    c_im(kp[2 * m]) = (i1_4 + tmpi);
+		    c_re(kp[18 * m]) = (r1_4 - tmpr);
+		    c_im(kp[18 * m]) = (i1_4 - tmpi);
+		    tmpr = ((0.831469612303 * r1_7) + (0.55557023302 * i1_7));
+		    tmpi = ((0.831469612303 * i1_7) - (0.55557023302 * r1_7));
+		    c_re(kp[3 * m]) = (r1_6 + tmpr);
+		    c_im(kp[3 * m]) = (i1_6 + tmpi);
+		    c_re(kp[19 * m]) = (r1_6 - tmpr);
+		    c_im(kp[19 * m]) = (i1_6 - tmpi);
+		    tmpr = (0.707106781187 * (r1_9 + i1_9));
+		    tmpi = (0.707106781187 * (i1_9 - r1_9));
+		    c_re(kp[4 * m]) = (r1_8 + tmpr);
+		    c_im(kp[4 * m]) = (i1_8 + tmpi);
+		    c_re(kp[20 * m]) = (r1_8 - tmpr);
+		    c_im(kp[20 * m]) = (i1_8 - tmpi);
+		    tmpr = ((0.55557023302 * r1_11) + (0.831469612303 * i1_11));
+		    tmpi = ((0.55557023302 * i1_11) - (0.831469612303 * r1_11));
+		    c_re(kp[5 * m]) = (r1_10 + tmpr);
+		    c_im(kp[5 * m]) = (i1_10 + tmpi);
+		    c_re(kp[21 * m]) = (r1_10 - tmpr);
+		    c_im(kp[21 * m]) = (i1_10 - tmpi);
+		    tmpr = ((0.382683432365 * r1_13) + (0.923879532511 * i1_13));
+		    tmpi = ((0.382683432365 * i1_13) - (0.923879532511 * r1_13));
+		    c_re(kp[6 * m]) = (r1_12 + tmpr);
+		    c_im(kp[6 * m]) = (i1_12 + tmpi);
+		    c_re(kp[22 * m]) = (r1_12 - tmpr);
+		    c_im(kp[22 * m]) = (i1_12 - tmpi);
+		    tmpr = ((0.195090322016 * r1_15) + (0.980785280403 * i1_15));
+		    tmpi = ((0.195090322016 * i1_15) - (0.980785280403 * r1_15));
+		    c_re(kp[7 * m]) = (r1_14 + tmpr);
+		    c_im(kp[7 * m]) = (i1_14 + tmpi);
+		    c_re(kp[23 * m]) = (r1_14 - tmpr);
+		    c_im(kp[23 * m]) = (i1_14 - tmpi);
+		    c_re(kp[8 * m]) = (r1_16 + i1_17);
+		    c_im(kp[8 * m]) = (i1_16 - r1_17);
+		    c_re(kp[24 * m]) = (r1_16 - i1_17);
+		    c_im(kp[24 * m]) = (i1_16 + r1_17);
+		    tmpr = ((0.980785280403 * i1_19) - (0.195090322016 * r1_19));
+		    tmpi = ((0.980785280403 * r1_19) + (0.195090322016 * i1_19));
+		    c_re(kp[9 * m]) = (r1_18 + tmpr);
+		    c_im(kp[9 * m]) = (i1_18 - tmpi);
+		    c_re(kp[25 * m]) = (r1_18 - tmpr);
+		    c_im(kp[25 * m]) = (i1_18 + tmpi);
+		    tmpr = ((0.923879532511 * i1_21) - (0.382683432365 * r1_21));
+		    tmpi = ((0.923879532511 * r1_21) + (0.382683432365 * i1_21));
+		    c_re(kp[10 * m]) = (r1_20 + tmpr);
+		    c_im(kp[10 * m]) = (i1_20 - tmpi);
+		    c_re(kp[26 * m]) = (r1_20 - tmpr);
+		    c_im(kp[26 * m]) = (i1_20 + tmpi);
+		    tmpr = ((0.831469612303 * i1_23) - (0.55557023302 * r1_23));
+		    tmpi = ((0.831469612303 * r1_23) + (0.55557023302 * i1_23));
+		    c_re(kp[11 * m]) = (r1_22 + tmpr);
+		    c_im(kp[11 * m]) = (i1_22 - tmpi);
+		    c_re(kp[27 * m]) = (r1_22 - tmpr);
+		    c_im(kp[27 * m]) = (i1_22 + tmpi);
+		    tmpr = (0.707106781187 * (i1_25 - r1_25));
+		    tmpi = (0.707106781187 * (r1_25 + i1_25));
+		    c_re(kp[12 * m]) = (r1_24 + tmpr);
+		    c_im(kp[12 * m]) = (i1_24 - tmpi);
+		    c_re(kp[28 * m]) = (r1_24 - tmpr);
+		    c_im(kp[28 * m]) = (i1_24 + tmpi);
+		    tmpr = ((0.55557023302 * i1_27) - (0.831469612303 * r1_27));
+		    tmpi = ((0.55557023302 * r1_27) + (0.831469612303 * i1_27));
+		    c_re(kp[13 * m]) = (r1_26 + tmpr);
+		    c_im(kp[13 * m]) = (i1_26 - tmpi);
+		    c_re(kp[29 * m]) = (r1_26 - tmpr);
+		    c_im(kp[29 * m]) = (i1_26 + tmpi);
+		    tmpr = ((0.382683432365 * i1_29) - (0.923879532511 * r1_29));
+		    tmpi = ((0.382683432365 * r1_29) + (0.923879532511 * i1_29));
+		    c_re(kp[14 * m]) = (r1_28 + tmpr);
+		    c_im(kp[14 * m]) = (i1_28 - tmpi);
+		    c_re(kp[30 * m]) = (r1_28 - tmpr);
+		    c_im(kp[30 * m]) = (i1_28 + tmpi);
+		    tmpr = ((0.195090322016 * i1_31) - (0.980785280403 * r1_31));
+		    tmpi = ((0.195090322016 * r1_31) + (0.980785280403 * i1_31));
+		    c_re(kp[15 * m]) = (r1_30 + tmpr);
+		    c_im(kp[15 * m]) = (i1_30 - tmpi);
+		    c_re(kp[31 * m]) = (r1_30 - tmpr);
+		    c_im(kp[31 * m]) = (i1_30 + tmpi);
+	       }
+	  }
+     } else {
+	  int ab = (a + b) / 2;
+          #pragma omp task untied
+	  fft_twiddle_32(a, ab, in, out, W, nW, nWdn, m);
+          #pragma omp task untied
+	  fft_twiddle_32(ab, b, in, out, W, nW, nWdn, m);
+          #pragma omp taskwait
+     }
+}
+void fft_twiddle_32_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m)
+{
+     int l1, i;
+     COMPLEX *jp, *kp;
+     REAL tmpr, tmpi, wr, wi;
+     if ((b - a) < 128) {
+	  for (i = a, l1 = nWdn * i, kp = out + i; i < b;
+	       i++, l1 += nWdn, kp++) {
+	       jp = in + i;
+	       {
+		    REAL r1_0, i1_0;
+		    REAL r1_1, i1_1;
+		    REAL r1_2, i1_2;
+		    REAL r1_3, i1_3;
+		    REAL r1_4, i1_4;
+		    REAL r1_5, i1_5;
+		    REAL r1_6, i1_6;
+		    REAL r1_7, i1_7;
+		    REAL r1_8, i1_8;
+		    REAL r1_9, i1_9;
+		    REAL r1_10, i1_10;
+		    REAL r1_11, i1_11;
+		    REAL r1_12, i1_12;
+		    REAL r1_13, i1_13;
+		    REAL r1_14, i1_14;
+		    REAL r1_15, i1_15;
+		    REAL r1_16, i1_16;
+		    REAL r1_17, i1_17;
+		    REAL r1_18, i1_18;
+		    REAL r1_19, i1_19;
+		    REAL r1_20, i1_20;
+		    REAL r1_21, i1_21;
+		    REAL r1_22, i1_22;
+		    REAL r1_23, i1_23;
+		    REAL r1_24, i1_24;
+		    REAL r1_25, i1_25;
+		    REAL r1_26, i1_26;
+		    REAL r1_27, i1_27;
+		    REAL r1_28, i1_28;
+		    REAL r1_29, i1_29;
+		    REAL r1_30, i1_30;
+		    REAL r1_31, i1_31;
+		    {
+			 REAL r2_0, i2_0;
+			 REAL r2_2, i2_2;
+			 REAL r2_4, i2_4;
+			 REAL r2_6, i2_6;
+			 REAL r2_8, i2_8;
+			 REAL r2_10, i2_10;
+			 REAL r2_12, i2_12;
+			 REAL r2_14, i2_14;
+			 REAL r2_16, i2_16;
+			 REAL r2_18, i2_18;
+			 REAL r2_20, i2_20;
+			 REAL r2_22, i2_22;
+			 REAL r2_24, i2_24;
+			 REAL r2_26, i2_26;
+			 REAL r2_28, i2_28;
+			 REAL r2_30, i2_30;
+			 {
+			      REAL r3_0, i3_0;
+			      REAL r3_4, i3_4;
+			      REAL r3_8, i3_8;
+			      REAL r3_12, i3_12;
+			      REAL r3_16, i3_16;
+			      REAL r3_20, i3_20;
+			      REAL r3_24, i3_24;
+			      REAL r3_28, i3_28;
+			      {
+				   REAL r4_0, i4_0;
+				   REAL r4_8, i4_8;
+				   REAL r4_16, i4_16;
+				   REAL r4_24, i4_24;
+				   {
+					REAL r5_0, i5_0;
+					REAL r5_16, i5_16;
+					r5_0 = c_re(jp[0 * m]);
+					i5_0 = c_im(jp[0 * m]);
+					wr = c_re(W[16 * l1]);
+					wi = c_im(W[16 * l1]);
+					tmpr = c_re(jp[16 * m]);
+					tmpi = c_im(jp[16 * m]);
+					r5_16 = ((wr * tmpr) - (wi * tmpi));
+					i5_16 = ((wi * tmpr) + (wr * tmpi));
+					r4_0 = (r5_0 + r5_16);
+					i4_0 = (i5_0 + i5_16);
+					r4_16 = (r5_0 - r5_16);
+					i4_16 = (i5_0 - i5_16);
+				   }
+				   {
+					REAL r5_8, i5_8;
+					REAL r5_24, i5_24;
+					wr = c_re(W[8 * l1]);
+					wi = c_im(W[8 * l1]);
+					tmpr = c_re(jp[8 * m]);
+					tmpi = c_im(jp[8 * m]);
+					r5_8 = ((wr * tmpr) - (wi * tmpi));
+					i5_8 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[24 * l1]);
+					wi = c_im(W[24 * l1]);
+					tmpr = c_re(jp[24 * m]);
+					tmpi = c_im(jp[24 * m]);
+					r5_24 = ((wr * tmpr) - (wi * tmpi));
+					i5_24 = ((wi * tmpr) + (wr * tmpi));
+					r4_8 = (r5_8 + r5_24);
+					i4_8 = (i5_8 + i5_24);
+					r4_24 = (r5_8 - r5_24);
+					i4_24 = (i5_8 - i5_24);
+				   }
+				   r3_0 = (r4_0 + r4_8);
+				   i3_0 = (i4_0 + i4_8);
+				   r3_16 = (r4_0 - r4_8);
+				   i3_16 = (i4_0 - i4_8);
+				   r3_8 = (r4_16 + i4_24);
+				   i3_8 = (i4_16 - r4_24);
+				   r3_24 = (r4_16 - i4_24);
+				   i3_24 = (i4_16 + r4_24);
+			      }
+			      {
+				   REAL r4_4, i4_4;
+				   REAL r4_12, i4_12;
+				   REAL r4_20, i4_20;
+				   REAL r4_28, i4_28;
+				   {
+					REAL r5_4, i5_4;
+					REAL r5_20, i5_20;
+					wr = c_re(W[4 * l1]);
+					wi = c_im(W[4 * l1]);
+					tmpr = c_re(jp[4 * m]);
+					tmpi = c_im(jp[4 * m]);
+					r5_4 = ((wr * tmpr) - (wi * tmpi));
+					i5_4 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[20 * l1]);
+					wi = c_im(W[20 * l1]);
+					tmpr = c_re(jp[20 * m]);
+					tmpi = c_im(jp[20 * m]);
+					r5_20 = ((wr * tmpr) - (wi * tmpi));
+					i5_20 = ((wi * tmpr) + (wr * tmpi));
+					r4_4 = (r5_4 + r5_20);
+					i4_4 = (i5_4 + i5_20);
+					r4_20 = (r5_4 - r5_20);
+					i4_20 = (i5_4 - i5_20);
+				   }
+				   {
+					REAL r5_12, i5_12;
+					REAL r5_28, i5_28;
+					wr = c_re(W[12 * l1]);
+					wi = c_im(W[12 * l1]);
+					tmpr = c_re(jp[12 * m]);
+					tmpi = c_im(jp[12 * m]);
+					r5_12 = ((wr * tmpr) - (wi * tmpi));
+					i5_12 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[28 * l1]);
+					wi = c_im(W[28 * l1]);
+					tmpr = c_re(jp[28 * m]);
+					tmpi = c_im(jp[28 * m]);
+					r5_28 = ((wr * tmpr) - (wi * tmpi));
+					i5_28 = ((wi * tmpr) + (wr * tmpi));
+					r4_12 = (r5_12 + r5_28);
+					i4_12 = (i5_12 + i5_28);
+					r4_28 = (r5_12 - r5_28);
+					i4_28 = (i5_12 - i5_28);
+				   }
+				   r3_4 = (r4_4 + r4_12);
+				   i3_4 = (i4_4 + i4_12);
+				   r3_20 = (r4_4 - r4_12);
+				   i3_20 = (i4_4 - i4_12);
+				   r3_12 = (r4_20 + i4_28);
+				   i3_12 = (i4_20 - r4_28);
+				   r3_28 = (r4_20 - i4_28);
+				   i3_28 = (i4_20 + r4_28);
+			      }
+			      r2_0 = (r3_0 + r3_4);
+			      i2_0 = (i3_0 + i3_4);
+			      r2_16 = (r3_0 - r3_4);
+			      i2_16 = (i3_0 - i3_4);
+			      tmpr = (0.707106781187 * (r3_12 + i3_12));
+			      tmpi = (0.707106781187 * (i3_12 - r3_12));
+			      r2_4 = (r3_8 + tmpr);
+			      i2_4 = (i3_8 + tmpi);
+			      r2_20 = (r3_8 - tmpr);
+			      i2_20 = (i3_8 - tmpi);
+			      r2_8 = (r3_16 + i3_20);
+			      i2_8 = (i3_16 - r3_20);
+			      r2_24 = (r3_16 - i3_20);
+			      i2_24 = (i3_16 + r3_20);
+			      tmpr = (0.707106781187 * (i3_28 - r3_28));
+			      tmpi = (0.707106781187 * (r3_28 + i3_28));
+			      r2_12 = (r3_24 + tmpr);
+			      i2_12 = (i3_24 - tmpi);
+			      r2_28 = (r3_24 - tmpr);
+			      i2_28 = (i3_24 + tmpi);
+			 }
+			 {
+			      REAL r3_2, i3_2;
+			      REAL r3_6, i3_6;
+			      REAL r3_10, i3_10;
+			      REAL r3_14, i3_14;
+			      REAL r3_18, i3_18;
+			      REAL r3_22, i3_22;
+			      REAL r3_26, i3_26;
+			      REAL r3_30, i3_30;
+			      {
+				   REAL r4_2, i4_2;
+				   REAL r4_10, i4_10;
+				   REAL r4_18, i4_18;
+				   REAL r4_26, i4_26;
+				   {
+					REAL r5_2, i5_2;
+					REAL r5_18, i5_18;
+					wr = c_re(W[2 * l1]);
+					wi = c_im(W[2 * l1]);
+					tmpr = c_re(jp[2 * m]);
+					tmpi = c_im(jp[2 * m]);
+					r5_2 = ((wr * tmpr) - (wi * tmpi));
+					i5_2 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[18 * l1]);
+					wi = c_im(W[18 * l1]);
+					tmpr = c_re(jp[18 * m]);
+					tmpi = c_im(jp[18 * m]);
+					r5_18 = ((wr * tmpr) - (wi * tmpi));
+					i5_18 = ((wi * tmpr) + (wr * tmpi));
+					r4_2 = (r5_2 + r5_18);
+					i4_2 = (i5_2 + i5_18);
+					r4_18 = (r5_2 - r5_18);
+					i4_18 = (i5_2 - i5_18);
+				   }
+				   {
+					REAL r5_10, i5_10;
+					REAL r5_26, i5_26;
+					wr = c_re(W[10 * l1]);
+					wi = c_im(W[10 * l1]);
+					tmpr = c_re(jp[10 * m]);
+					tmpi = c_im(jp[10 * m]);
+					r5_10 = ((wr * tmpr) - (wi * tmpi));
+					i5_10 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[26 * l1]);
+					wi = c_im(W[26 * l1]);
+					tmpr = c_re(jp[26 * m]);
+					tmpi = c_im(jp[26 * m]);
+					r5_26 = ((wr * tmpr) - (wi * tmpi));
+					i5_26 = ((wi * tmpr) + (wr * tmpi));
+					r4_10 = (r5_10 + r5_26);
+					i4_10 = (i5_10 + i5_26);
+					r4_26 = (r5_10 - r5_26);
+					i4_26 = (i5_10 - i5_26);
+				   }
+				   r3_2 = (r4_2 + r4_10);
+				   i3_2 = (i4_2 + i4_10);
+				   r3_18 = (r4_2 - r4_10);
+				   i3_18 = (i4_2 - i4_10);
+				   r3_10 = (r4_18 + i4_26);
+				   i3_10 = (i4_18 - r4_26);
+				   r3_26 = (r4_18 - i4_26);
+				   i3_26 = (i4_18 + r4_26);
+			      }
+			      {
+				   REAL r4_6, i4_6;
+				   REAL r4_14, i4_14;
+				   REAL r4_22, i4_22;
+				   REAL r4_30, i4_30;
+				   {
+					REAL r5_6, i5_6;
+					REAL r5_22, i5_22;
+					wr = c_re(W[6 * l1]);
+					wi = c_im(W[6 * l1]);
+					tmpr = c_re(jp[6 * m]);
+					tmpi = c_im(jp[6 * m]);
+					r5_6 = ((wr * tmpr) - (wi * tmpi));
+					i5_6 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[22 * l1]);
+					wi = c_im(W[22 * l1]);
+					tmpr = c_re(jp[22 * m]);
+					tmpi = c_im(jp[22 * m]);
+					r5_22 = ((wr * tmpr) - (wi * tmpi));
+					i5_22 = ((wi * tmpr) + (wr * tmpi));
+					r4_6 = (r5_6 + r5_22);
+					i4_6 = (i5_6 + i5_22);
+					r4_22 = (r5_6 - r5_22);
+					i4_22 = (i5_6 - i5_22);
+				   }
+				   {
+					REAL r5_14, i5_14;
+					REAL r5_30, i5_30;
+					wr = c_re(W[14 * l1]);
+					wi = c_im(W[14 * l1]);
+					tmpr = c_re(jp[14 * m]);
+					tmpi = c_im(jp[14 * m]);
+					r5_14 = ((wr * tmpr) - (wi * tmpi));
+					i5_14 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[30 * l1]);
+					wi = c_im(W[30 * l1]);
+					tmpr = c_re(jp[30 * m]);
+					tmpi = c_im(jp[30 * m]);
+					r5_30 = ((wr * tmpr) - (wi * tmpi));
+					i5_30 = ((wi * tmpr) + (wr * tmpi));
+					r4_14 = (r5_14 + r5_30);
+					i4_14 = (i5_14 + i5_30);
+					r4_30 = (r5_14 - r5_30);
+					i4_30 = (i5_14 - i5_30);
+				   }
+				   r3_6 = (r4_6 + r4_14);
+				   i3_6 = (i4_6 + i4_14);
+				   r3_22 = (r4_6 - r4_14);
+				   i3_22 = (i4_6 - i4_14);
+				   r3_14 = (r4_22 + i4_30);
+				   i3_14 = (i4_22 - r4_30);
+				   r3_30 = (r4_22 - i4_30);
+				   i3_30 = (i4_22 + r4_30);
+			      }
+			      r2_2 = (r3_2 + r3_6);
+			      i2_2 = (i3_2 + i3_6);
+			      r2_18 = (r3_2 - r3_6);
+			      i2_18 = (i3_2 - i3_6);
+			      tmpr = (0.707106781187 * (r3_14 + i3_14));
+			      tmpi = (0.707106781187 * (i3_14 - r3_14));
+			      r2_6 = (r3_10 + tmpr);
+			      i2_6 = (i3_10 + tmpi);
+			      r2_22 = (r3_10 - tmpr);
+			      i2_22 = (i3_10 - tmpi);
+			      r2_10 = (r3_18 + i3_22);
+			      i2_10 = (i3_18 - r3_22);
+			      r2_26 = (r3_18 - i3_22);
+			      i2_26 = (i3_18 + r3_22);
+			      tmpr = (0.707106781187 * (i3_30 - r3_30));
+			      tmpi = (0.707106781187 * (r3_30 + i3_30));
+			      r2_14 = (r3_26 + tmpr);
+			      i2_14 = (i3_26 - tmpi);
+			      r2_30 = (r3_26 - tmpr);
+			      i2_30 = (i3_26 + tmpi);
+			 }
+			 r1_0 = (r2_0 + r2_2);
+			 i1_0 = (i2_0 + i2_2);
+			 r1_16 = (r2_0 - r2_2);
+			 i1_16 = (i2_0 - i2_2);
+			 tmpr = ((0.923879532511 * r2_6) + (0.382683432365 * i2_6));
+			 tmpi = ((0.923879532511 * i2_6) - (0.382683432365 * r2_6));
+			 r1_2 = (r2_4 + tmpr);
+			 i1_2 = (i2_4 + tmpi);
+			 r1_18 = (r2_4 - tmpr);
+			 i1_18 = (i2_4 - tmpi);
+			 tmpr = (0.707106781187 * (r2_10 + i2_10));
+			 tmpi = (0.707106781187 * (i2_10 - r2_10));
+			 r1_4 = (r2_8 + tmpr);
+			 i1_4 = (i2_8 + tmpi);
+			 r1_20 = (r2_8 - tmpr);
+			 i1_20 = (i2_8 - tmpi);
+			 tmpr = ((0.382683432365 * r2_14) + (0.923879532511 * i2_14));
+			 tmpi = ((0.382683432365 * i2_14) - (0.923879532511 * r2_14));
+			 r1_6 = (r2_12 + tmpr);
+			 i1_6 = (i2_12 + tmpi);
+			 r1_22 = (r2_12 - tmpr);
+			 i1_22 = (i2_12 - tmpi);
+			 r1_8 = (r2_16 + i2_18);
+			 i1_8 = (i2_16 - r2_18);
+			 r1_24 = (r2_16 - i2_18);
+			 i1_24 = (i2_16 + r2_18);
+			 tmpr = ((0.923879532511 * i2_22) - (0.382683432365 * r2_22));
+			 tmpi = ((0.923879532511 * r2_22) + (0.382683432365 * i2_22));
+			 r1_10 = (r2_20 + tmpr);
+			 i1_10 = (i2_20 - tmpi);
+			 r1_26 = (r2_20 - tmpr);
+			 i1_26 = (i2_20 + tmpi);
+			 tmpr = (0.707106781187 * (i2_26 - r2_26));
+			 tmpi = (0.707106781187 * (r2_26 + i2_26));
+			 r1_12 = (r2_24 + tmpr);
+			 i1_12 = (i2_24 - tmpi);
+			 r1_28 = (r2_24 - tmpr);
+			 i1_28 = (i2_24 + tmpi);
+			 tmpr = ((0.382683432365 * i2_30) - (0.923879532511 * r2_30));
+			 tmpi = ((0.382683432365 * r2_30) + (0.923879532511 * i2_30));
+			 r1_14 = (r2_28 + tmpr);
+			 i1_14 = (i2_28 - tmpi);
+			 r1_30 = (r2_28 - tmpr);
+			 i1_30 = (i2_28 + tmpi);
+		    }
+		    {
+			 REAL r2_1, i2_1;
+			 REAL r2_3, i2_3;
+			 REAL r2_5, i2_5;
+			 REAL r2_7, i2_7;
+			 REAL r2_9, i2_9;
+			 REAL r2_11, i2_11;
+			 REAL r2_13, i2_13;
+			 REAL r2_15, i2_15;
+			 REAL r2_17, i2_17;
+			 REAL r2_19, i2_19;
+			 REAL r2_21, i2_21;
+			 REAL r2_23, i2_23;
+			 REAL r2_25, i2_25;
+			 REAL r2_27, i2_27;
+			 REAL r2_29, i2_29;
+			 REAL r2_31, i2_31;
+			 {
+			      REAL r3_1, i3_1;
+			      REAL r3_5, i3_5;
+			      REAL r3_9, i3_9;
+			      REAL r3_13, i3_13;
+			      REAL r3_17, i3_17;
+			      REAL r3_21, i3_21;
+			      REAL r3_25, i3_25;
+			      REAL r3_29, i3_29;
+			      {
+				   REAL r4_1, i4_1;
+				   REAL r4_9, i4_9;
+				   REAL r4_17, i4_17;
+				   REAL r4_25, i4_25;
+				   {
+					REAL r5_1, i5_1;
+					REAL r5_17, i5_17;
+					wr = c_re(W[1 * l1]);
+					wi = c_im(W[1 * l1]);
+					tmpr = c_re(jp[1 * m]);
+					tmpi = c_im(jp[1 * m]);
+					r5_1 = ((wr * tmpr) - (wi * tmpi));
+					i5_1 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[17 * l1]);
+					wi = c_im(W[17 * l1]);
+					tmpr = c_re(jp[17 * m]);
+					tmpi = c_im(jp[17 * m]);
+					r5_17 = ((wr * tmpr) - (wi * tmpi));
+					i5_17 = ((wi * tmpr) + (wr * tmpi));
+					r4_1 = (r5_1 + r5_17);
+					i4_1 = (i5_1 + i5_17);
+					r4_17 = (r5_1 - r5_17);
+					i4_17 = (i5_1 - i5_17);
+				   }
+				   {
+					REAL r5_9, i5_9;
+					REAL r5_25, i5_25;
+					wr = c_re(W[9 * l1]);
+					wi = c_im(W[9 * l1]);
+					tmpr = c_re(jp[9 * m]);
+					tmpi = c_im(jp[9 * m]);
+					r5_9 = ((wr * tmpr) - (wi * tmpi));
+					i5_9 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[25 * l1]);
+					wi = c_im(W[25 * l1]);
+					tmpr = c_re(jp[25 * m]);
+					tmpi = c_im(jp[25 * m]);
+					r5_25 = ((wr * tmpr) - (wi * tmpi));
+					i5_25 = ((wi * tmpr) + (wr * tmpi));
+					r4_9 = (r5_9 + r5_25);
+					i4_9 = (i5_9 + i5_25);
+					r4_25 = (r5_9 - r5_25);
+					i4_25 = (i5_9 - i5_25);
+				   }
+				   r3_1 = (r4_1 + r4_9);
+				   i3_1 = (i4_1 + i4_9);
+				   r3_17 = (r4_1 - r4_9);
+				   i3_17 = (i4_1 - i4_9);
+				   r3_9 = (r4_17 + i4_25);
+				   i3_9 = (i4_17 - r4_25);
+				   r3_25 = (r4_17 - i4_25);
+				   i3_25 = (i4_17 + r4_25);
+			      }
+			      {
+				   REAL r4_5, i4_5;
+				   REAL r4_13, i4_13;
+				   REAL r4_21, i4_21;
+				   REAL r4_29, i4_29;
+				   {
+					REAL r5_5, i5_5;
+					REAL r5_21, i5_21;
+					wr = c_re(W[5 * l1]);
+					wi = c_im(W[5 * l1]);
+					tmpr = c_re(jp[5 * m]);
+					tmpi = c_im(jp[5 * m]);
+					r5_5 = ((wr * tmpr) - (wi * tmpi));
+					i5_5 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[21 * l1]);
+					wi = c_im(W[21 * l1]);
+					tmpr = c_re(jp[21 * m]);
+					tmpi = c_im(jp[21 * m]);
+					r5_21 = ((wr * tmpr) - (wi * tmpi));
+					i5_21 = ((wi * tmpr) + (wr * tmpi));
+					r4_5 = (r5_5 + r5_21);
+					i4_5 = (i5_5 + i5_21);
+					r4_21 = (r5_5 - r5_21);
+					i4_21 = (i5_5 - i5_21);
+				   }
+				   {
+					REAL r5_13, i5_13;
+					REAL r5_29, i5_29;
+					wr = c_re(W[13 * l1]);
+					wi = c_im(W[13 * l1]);
+					tmpr = c_re(jp[13 * m]);
+					tmpi = c_im(jp[13 * m]);
+					r5_13 = ((wr * tmpr) - (wi * tmpi));
+					i5_13 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[29 * l1]);
+					wi = c_im(W[29 * l1]);
+					tmpr = c_re(jp[29 * m]);
+					tmpi = c_im(jp[29 * m]);
+					r5_29 = ((wr * tmpr) - (wi * tmpi));
+					i5_29 = ((wi * tmpr) + (wr * tmpi));
+					r4_13 = (r5_13 + r5_29);
+					i4_13 = (i5_13 + i5_29);
+					r4_29 = (r5_13 - r5_29);
+					i4_29 = (i5_13 - i5_29);
+				   }
+				   r3_5 = (r4_5 + r4_13);
+				   i3_5 = (i4_5 + i4_13);
+				   r3_21 = (r4_5 - r4_13);
+				   i3_21 = (i4_5 - i4_13);
+				   r3_13 = (r4_21 + i4_29);
+				   i3_13 = (i4_21 - r4_29);
+				   r3_29 = (r4_21 - i4_29);
+				   i3_29 = (i4_21 + r4_29);
+			      }
+			      r2_1 = (r3_1 + r3_5);
+			      i2_1 = (i3_1 + i3_5);
+			      r2_17 = (r3_1 - r3_5);
+			      i2_17 = (i3_1 - i3_5);
+			      tmpr = (0.707106781187 * (r3_13 + i3_13));
+			      tmpi = (0.707106781187 * (i3_13 - r3_13));
+			      r2_5 = (r3_9 + tmpr);
+			      i2_5 = (i3_9 + tmpi);
+			      r2_21 = (r3_9 - tmpr);
+			      i2_21 = (i3_9 - tmpi);
+			      r2_9 = (r3_17 + i3_21);
+			      i2_9 = (i3_17 - r3_21);
+			      r2_25 = (r3_17 - i3_21);
+			      i2_25 = (i3_17 + r3_21);
+			      tmpr = (0.707106781187 * (i3_29 - r3_29));
+			      tmpi = (0.707106781187 * (r3_29 + i3_29));
+			      r2_13 = (r3_25 + tmpr);
+			      i2_13 = (i3_25 - tmpi);
+			      r2_29 = (r3_25 - tmpr);
+			      i2_29 = (i3_25 + tmpi);
+			 }
+			 {
+			      REAL r3_3, i3_3;
+			      REAL r3_7, i3_7;
+			      REAL r3_11, i3_11;
+			      REAL r3_15, i3_15;
+			      REAL r3_19, i3_19;
+			      REAL r3_23, i3_23;
+			      REAL r3_27, i3_27;
+			      REAL r3_31, i3_31;
+			      {
+				   REAL r4_3, i4_3;
+				   REAL r4_11, i4_11;
+				   REAL r4_19, i4_19;
+				   REAL r4_27, i4_27;
+				   {
+					REAL r5_3, i5_3;
+					REAL r5_19, i5_19;
+					wr = c_re(W[3 * l1]);
+					wi = c_im(W[3 * l1]);
+					tmpr = c_re(jp[3 * m]);
+					tmpi = c_im(jp[3 * m]);
+					r5_3 = ((wr * tmpr) - (wi * tmpi));
+					i5_3 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[19 * l1]);
+					wi = c_im(W[19 * l1]);
+					tmpr = c_re(jp[19 * m]);
+					tmpi = c_im(jp[19 * m]);
+					r5_19 = ((wr * tmpr) - (wi * tmpi));
+					i5_19 = ((wi * tmpr) + (wr * tmpi));
+					r4_3 = (r5_3 + r5_19);
+					i4_3 = (i5_3 + i5_19);
+					r4_19 = (r5_3 - r5_19);
+					i4_19 = (i5_3 - i5_19);
+				   }
+				   {
+					REAL r5_11, i5_11;
+					REAL r5_27, i5_27;
+					wr = c_re(W[11 * l1]);
+					wi = c_im(W[11 * l1]);
+					tmpr = c_re(jp[11 * m]);
+					tmpi = c_im(jp[11 * m]);
+					r5_11 = ((wr * tmpr) - (wi * tmpi));
+					i5_11 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[27 * l1]);
+					wi = c_im(W[27 * l1]);
+					tmpr = c_re(jp[27 * m]);
+					tmpi = c_im(jp[27 * m]);
+					r5_27 = ((wr * tmpr) - (wi * tmpi));
+					i5_27 = ((wi * tmpr) + (wr * tmpi));
+					r4_11 = (r5_11 + r5_27);
+					i4_11 = (i5_11 + i5_27);
+					r4_27 = (r5_11 - r5_27);
+					i4_27 = (i5_11 - i5_27);
+				   }
+				   r3_3 = (r4_3 + r4_11);
+				   i3_3 = (i4_3 + i4_11);
+				   r3_19 = (r4_3 - r4_11);
+				   i3_19 = (i4_3 - i4_11);
+				   r3_11 = (r4_19 + i4_27);
+				   i3_11 = (i4_19 - r4_27);
+				   r3_27 = (r4_19 - i4_27);
+				   i3_27 = (i4_19 + r4_27);
+			      }
+			      {
+				   REAL r4_7, i4_7;
+				   REAL r4_15, i4_15;
+				   REAL r4_23, i4_23;
+				   REAL r4_31, i4_31;
+				   {
+					REAL r5_7, i5_7;
+					REAL r5_23, i5_23;
+					wr = c_re(W[7 * l1]);
+					wi = c_im(W[7 * l1]);
+					tmpr = c_re(jp[7 * m]);
+					tmpi = c_im(jp[7 * m]);
+					r5_7 = ((wr * tmpr) - (wi * tmpi));
+					i5_7 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[23 * l1]);
+					wi = c_im(W[23 * l1]);
+					tmpr = c_re(jp[23 * m]);
+					tmpi = c_im(jp[23 * m]);
+					r5_23 = ((wr * tmpr) - (wi * tmpi));
+					i5_23 = ((wi * tmpr) + (wr * tmpi));
+					r4_7 = (r5_7 + r5_23);
+					i4_7 = (i5_7 + i5_23);
+					r4_23 = (r5_7 - r5_23);
+					i4_23 = (i5_7 - i5_23);
+				   }
+				   {
+					REAL r5_15, i5_15;
+					REAL r5_31, i5_31;
+					wr = c_re(W[15 * l1]);
+					wi = c_im(W[15 * l1]);
+					tmpr = c_re(jp[15 * m]);
+					tmpi = c_im(jp[15 * m]);
+					r5_15 = ((wr * tmpr) - (wi * tmpi));
+					i5_15 = ((wi * tmpr) + (wr * tmpi));
+					wr = c_re(W[31 * l1]);
+					wi = c_im(W[31 * l1]);
+					tmpr = c_re(jp[31 * m]);
+					tmpi = c_im(jp[31 * m]);
+					r5_31 = ((wr * tmpr) - (wi * tmpi));
+					i5_31 = ((wi * tmpr) + (wr * tmpi));
+					r4_15 = (r5_15 + r5_31);
+					i4_15 = (i5_15 + i5_31);
+					r4_31 = (r5_15 - r5_31);
+					i4_31 = (i5_15 - i5_31);
+				   }
+				   r3_7 = (r4_7 + r4_15);
+				   i3_7 = (i4_7 + i4_15);
+				   r3_23 = (r4_7 - r4_15);
+				   i3_23 = (i4_7 - i4_15);
+				   r3_15 = (r4_23 + i4_31);
+				   i3_15 = (i4_23 - r4_31);
+				   r3_31 = (r4_23 - i4_31);
+				   i3_31 = (i4_23 + r4_31);
+			      }
+			      r2_3 = (r3_3 + r3_7);
+			      i2_3 = (i3_3 + i3_7);
+			      r2_19 = (r3_3 - r3_7);
+			      i2_19 = (i3_3 - i3_7);
+			      tmpr = (0.707106781187 * (r3_15 + i3_15));
+			      tmpi = (0.707106781187 * (i3_15 - r3_15));
+			      r2_7 = (r3_11 + tmpr);
+			      i2_7 = (i3_11 + tmpi);
+			      r2_23 = (r3_11 - tmpr);
+			      i2_23 = (i3_11 - tmpi);
+			      r2_11 = (r3_19 + i3_23);
+			      i2_11 = (i3_19 - r3_23);
+			      r2_27 = (r3_19 - i3_23);
+			      i2_27 = (i3_19 + r3_23);
+			      tmpr = (0.707106781187 * (i3_31 - r3_31));
+			      tmpi = (0.707106781187 * (r3_31 + i3_31));
+			      r2_15 = (r3_27 + tmpr);
+			      i2_15 = (i3_27 - tmpi);
+			      r2_31 = (r3_27 - tmpr);
+			      i2_31 = (i3_27 + tmpi);
+			 }
+			 r1_1 = (r2_1 + r2_3);
+			 i1_1 = (i2_1 + i2_3);
+			 r1_17 = (r2_1 - r2_3);
+			 i1_17 = (i2_1 - i2_3);
+			 tmpr = ((0.923879532511 * r2_7) + (0.382683432365 * i2_7));
+			 tmpi = ((0.923879532511 * i2_7) - (0.382683432365 * r2_7));
+			 r1_3 = (r2_5 + tmpr);
+			 i1_3 = (i2_5 + tmpi);
+			 r1_19 = (r2_5 - tmpr);
+			 i1_19 = (i2_5 - tmpi);
+			 tmpr = (0.707106781187 * (r2_11 + i2_11));
+			 tmpi = (0.707106781187 * (i2_11 - r2_11));
+			 r1_5 = (r2_9 + tmpr);
+			 i1_5 = (i2_9 + tmpi);
+			 r1_21 = (r2_9 - tmpr);
+			 i1_21 = (i2_9 - tmpi);
+			 tmpr = ((0.382683432365 * r2_15) + (0.923879532511 * i2_15));
+			 tmpi = ((0.382683432365 * i2_15) - (0.923879532511 * r2_15));
+			 r1_7 = (r2_13 + tmpr);
+			 i1_7 = (i2_13 + tmpi);
+			 r1_23 = (r2_13 - tmpr);
+			 i1_23 = (i2_13 - tmpi);
+			 r1_9 = (r2_17 + i2_19);
+			 i1_9 = (i2_17 - r2_19);
+			 r1_25 = (r2_17 - i2_19);
+			 i1_25 = (i2_17 + r2_19);
+			 tmpr = ((0.923879532511 * i2_23) - (0.382683432365 * r2_23));
+			 tmpi = ((0.923879532511 * r2_23) + (0.382683432365 * i2_23));
+			 r1_11 = (r2_21 + tmpr);
+			 i1_11 = (i2_21 - tmpi);
+			 r1_27 = (r2_21 - tmpr);
+			 i1_27 = (i2_21 + tmpi);
+			 tmpr = (0.707106781187 * (i2_27 - r2_27));
+			 tmpi = (0.707106781187 * (r2_27 + i2_27));
+			 r1_13 = (r2_25 + tmpr);
+			 i1_13 = (i2_25 - tmpi);
+			 r1_29 = (r2_25 - tmpr);
+			 i1_29 = (i2_25 + tmpi);
+			 tmpr = ((0.382683432365 * i2_31) - (0.923879532511 * r2_31));
+			 tmpi = ((0.382683432365 * r2_31) + (0.923879532511 * i2_31));
+			 r1_15 = (r2_29 + tmpr);
+			 i1_15 = (i2_29 - tmpi);
+			 r1_31 = (r2_29 - tmpr);
+			 i1_31 = (i2_29 + tmpi);
+		    }
+		    c_re(kp[0 * m]) = (r1_0 + r1_1);
+		    c_im(kp[0 * m]) = (i1_0 + i1_1);
+		    c_re(kp[16 * m]) = (r1_0 - r1_1);
+		    c_im(kp[16 * m]) = (i1_0 - i1_1);
+		    tmpr = ((0.980785280403 * r1_3) + (0.195090322016 * i1_3));
+		    tmpi = ((0.980785280403 * i1_3) - (0.195090322016 * r1_3));
+		    c_re(kp[1 * m]) = (r1_2 + tmpr);
+		    c_im(kp[1 * m]) = (i1_2 + tmpi);
+		    c_re(kp[17 * m]) = (r1_2 - tmpr);
+		    c_im(kp[17 * m]) = (i1_2 - tmpi);
+		    tmpr = ((0.923879532511 * r1_5) + (0.382683432365 * i1_5));
+		    tmpi = ((0.923879532511 * i1_5) - (0.382683432365 * r1_5));
+		    c_re(kp[2 * m]) = (r1_4 + tmpr);
+		    c_im(kp[2 * m]) = (i1_4 + tmpi);
+		    c_re(kp[18 * m]) = (r1_4 - tmpr);
+		    c_im(kp[18 * m]) = (i1_4 - tmpi);
+		    tmpr = ((0.831469612303 * r1_7) + (0.55557023302 * i1_7));
+		    tmpi = ((0.831469612303 * i1_7) - (0.55557023302 * r1_7));
+		    c_re(kp[3 * m]) = (r1_6 + tmpr);
+		    c_im(kp[3 * m]) = (i1_6 + tmpi);
+		    c_re(kp[19 * m]) = (r1_6 - tmpr);
+		    c_im(kp[19 * m]) = (i1_6 - tmpi);
+		    tmpr = (0.707106781187 * (r1_9 + i1_9));
+		    tmpi = (0.707106781187 * (i1_9 - r1_9));
+		    c_re(kp[4 * m]) = (r1_8 + tmpr);
+		    c_im(kp[4 * m]) = (i1_8 + tmpi);
+		    c_re(kp[20 * m]) = (r1_8 - tmpr);
+		    c_im(kp[20 * m]) = (i1_8 - tmpi);
+		    tmpr = ((0.55557023302 * r1_11) + (0.831469612303 * i1_11));
+		    tmpi = ((0.55557023302 * i1_11) - (0.831469612303 * r1_11));
+		    c_re(kp[5 * m]) = (r1_10 + tmpr);
+		    c_im(kp[5 * m]) = (i1_10 + tmpi);
+		    c_re(kp[21 * m]) = (r1_10 - tmpr);
+		    c_im(kp[21 * m]) = (i1_10 - tmpi);
+		    tmpr = ((0.382683432365 * r1_13) + (0.923879532511 * i1_13));
+		    tmpi = ((0.382683432365 * i1_13) - (0.923879532511 * r1_13));
+		    c_re(kp[6 * m]) = (r1_12 + tmpr);
+		    c_im(kp[6 * m]) = (i1_12 + tmpi);
+		    c_re(kp[22 * m]) = (r1_12 - tmpr);
+		    c_im(kp[22 * m]) = (i1_12 - tmpi);
+		    tmpr = ((0.195090322016 * r1_15) + (0.980785280403 * i1_15));
+		    tmpi = ((0.195090322016 * i1_15) - (0.980785280403 * r1_15));
+		    c_re(kp[7 * m]) = (r1_14 + tmpr);
+		    c_im(kp[7 * m]) = (i1_14 + tmpi);
+		    c_re(kp[23 * m]) = (r1_14 - tmpr);
+		    c_im(kp[23 * m]) = (i1_14 - tmpi);
+		    c_re(kp[8 * m]) = (r1_16 + i1_17);
+		    c_im(kp[8 * m]) = (i1_16 - r1_17);
+		    c_re(kp[24 * m]) = (r1_16 - i1_17);
+		    c_im(kp[24 * m]) = (i1_16 + r1_17);
+		    tmpr = ((0.980785280403 * i1_19) - (0.195090322016 * r1_19));
+		    tmpi = ((0.980785280403 * r1_19) + (0.195090322016 * i1_19));
+		    c_re(kp[9 * m]) = (r1_18 + tmpr);
+		    c_im(kp[9 * m]) = (i1_18 - tmpi);
+		    c_re(kp[25 * m]) = (r1_18 - tmpr);
+		    c_im(kp[25 * m]) = (i1_18 + tmpi);
+		    tmpr = ((0.923879532511 * i1_21) - (0.382683432365 * r1_21));
+		    tmpi = ((0.923879532511 * r1_21) + (0.382683432365 * i1_21));
+		    c_re(kp[10 * m]) = (r1_20 + tmpr);
+		    c_im(kp[10 * m]) = (i1_20 - tmpi);
+		    c_re(kp[26 * m]) = (r1_20 - tmpr);
+		    c_im(kp[26 * m]) = (i1_20 + tmpi);
+		    tmpr = ((0.831469612303 * i1_23) - (0.55557023302 * r1_23));
+		    tmpi = ((0.831469612303 * r1_23) + (0.55557023302 * i1_23));
+		    c_re(kp[11 * m]) = (r1_22 + tmpr);
+		    c_im(kp[11 * m]) = (i1_22 - tmpi);
+		    c_re(kp[27 * m]) = (r1_22 - tmpr);
+		    c_im(kp[27 * m]) = (i1_22 + tmpi);
+		    tmpr = (0.707106781187 * (i1_25 - r1_25));
+		    tmpi = (0.707106781187 * (r1_25 + i1_25));
+		    c_re(kp[12 * m]) = (r1_24 + tmpr);
+		    c_im(kp[12 * m]) = (i1_24 - tmpi);
+		    c_re(kp[28 * m]) = (r1_24 - tmpr);
+		    c_im(kp[28 * m]) = (i1_24 + tmpi);
+		    tmpr = ((0.55557023302 * i1_27) - (0.831469612303 * r1_27));
+		    tmpi = ((0.55557023302 * r1_27) + (0.831469612303 * i1_27));
+		    c_re(kp[13 * m]) = (r1_26 + tmpr);
+		    c_im(kp[13 * m]) = (i1_26 - tmpi);
+		    c_re(kp[29 * m]) = (r1_26 - tmpr);
+		    c_im(kp[29 * m]) = (i1_26 + tmpi);
+		    tmpr = ((0.382683432365 * i1_29) - (0.923879532511 * r1_29));
+		    tmpi = ((0.382683432365 * r1_29) + (0.923879532511 * i1_29));
+		    c_re(kp[14 * m]) = (r1_28 + tmpr);
+		    c_im(kp[14 * m]) = (i1_28 - tmpi);
+		    c_re(kp[30 * m]) = (r1_28 - tmpr);
+		    c_im(kp[30 * m]) = (i1_28 + tmpi);
+		    tmpr = ((0.195090322016 * i1_31) - (0.980785280403 * r1_31));
+		    tmpi = ((0.195090322016 * r1_31) + (0.980785280403 * i1_31));
+		    c_re(kp[15 * m]) = (r1_30 + tmpr);
+		    c_im(kp[15 * m]) = (i1_30 - tmpi);
+		    c_re(kp[31 * m]) = (r1_30 - tmpr);
+		    c_im(kp[31 * m]) = (i1_30 + tmpi);
+	       }
+	  }
+     } else {
+	  int ab = (a + b) / 2;
+	  fft_twiddle_32_seq(a, ab, in, out, W, nW, nWdn, m);
+	  fft_twiddle_32_seq(ab, b, in, out, W, nW, nWdn, m);
+     }
+}
+void fft_unshuffle_32(int a, int b, COMPLEX * in, COMPLEX * out, int m)
+{	/* Radix-32 unshuffle: for each i in [a, b), out[i + k*m] = in[32*i + k] for k = 0..31. */
+     int i;
+     const COMPLEX *ip;	/* read cursor: advances 32 elements per value of i */
+     COMPLEX *jp;		/* write cursor: moves through out in strides of m */
+     if ((b - a) < 128) {	/* fewer than 128 indices: copy directly */
+	  ip = in + a * 32;
+	  for (i = a; i < b; ++i) {
+	       jp = out + i;	/* 16 unrolled two-element copies follow */
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;	/* last pair: jp is reset from out + i on the next iteration */
+	  }
+     } else {
+	  int ab = (a + b) / 2;	/* split [a, b) at the midpoint; each half runs as an untied task */
+          #pragma omp task untied
+	  fft_unshuffle_32(a, ab, in, out, m);
+          #pragma omp task untied
+	  fft_unshuffle_32(ab, b, in, out, m);
+          #pragma omp taskwait
+     }
+}
+void fft_unshuffle_32_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m)
+{	/* Serial radix-32 unshuffle: for each i in [a, b), out[i + k*m] = in[32*i + k] for k = 0..31. */
+     int i;
+     const COMPLEX *ip;	/* read cursor: advances 32 elements per value of i */
+     COMPLEX *jp;		/* write cursor: moves through out in strides of m */
+     if ((b - a) < 128) {	/* fewer than 128 indices: copy directly */
+	  ip = in + a * 32;
+	  for (i = a; i < b; ++i) {
+	       jp = out + i;	/* 16 unrolled two-element copies follow */
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;
+	       jp += 2 * m;
+	       jp[0] = ip[0];
+	       jp[m] = ip[1];
+	       ip += 2;	/* last pair: jp is reset from out + i on the next iteration */
+	  }
+     } else {
+	  int ab = (a + b) / 2;	/* split [a, b) at the midpoint and handle the halves one after the other */
+	  fft_unshuffle_32_seq(a, ab, in, out, m);
+	  fft_unshuffle_32_seq(ab, b, in, out, m);
+     }
+}
+/* end of machine-generated code */
+
+/*
+ * Recursive complex FFT on the n complex components of the array in:
+ * basic Cooley-Tukey algorithm, with some improvements for
+ * n power of two. The result is placed in the array out. n is arbitrary. 
+ * The algorithm runs in time O(n*(r1 + ... + rk)) where r1, ..., rk
+ * are prime numbers, and r1 * r2 * ... * rk = n.
+ *
+ * n: size of the input
+ * in: pointer to input
+ * out: pointer to output
+ * factors: list of factors of n, precomputed
+ * W: twiddle factors
+ * nW: size of W, that is, size of the original transform
+ *
+ */
+void fft_aux(int n, COMPLEX * in, COMPLEX * out, int *factors, COMPLEX * W, int nW)
+{
+     int r, m;
+     int k;
+
+     /* special cases: sizes 2..32 go straight to a base kernel, before *factors is read */
+     if (n == 32) {
+	  fft_base_32(in, out);
+	  return;
+     }
+     if (n == 16) {
+	  fft_base_16(in, out);
+	  return;
+     }
+     if (n == 8) {
+	  fft_base_8(in, out);
+	  return;
+     }
+     if (n == 4) {
+	  fft_base_4(in, out);
+	  return;
+     }
+     if (n == 2) {
+	  fft_base_2(in, out);
+	  return;
+     }
+     /*
+      * the cases n == 3, n == 5, and maybe 7 should be implemented as well
+      */
+
+     r = *factors;	/* radix used at this recursion level */
+     m = n / r;		/* length of each of the r sub-transforms */
+
+     if (r < n) {
+	  /*
+	   * split the DFT of length n into r DFTs of length n/r,  and
+	   * recurse
+	   */
+	  if (r == 32) {
+               #pragma omp task untied
+	       fft_unshuffle_32(0, m, in, out, m);
+	  } else if (r == 16) {
+               #pragma omp task untied
+	       fft_unshuffle_16(0, m, in, out, m);
+	  } else if (r == 8) {
+               #pragma omp task untied
+	       fft_unshuffle_8(0, m, in, out, m);
+	  } else if (r == 4) {
+               #pragma omp task untied
+	       fft_unshuffle_4(0, m, in, out, m);
+	  } else if (r == 2) {
+               #pragma omp task untied
+	       fft_unshuffle_2(0, m, in, out, m);
+	  } else
+	       unshuffle(0, m, in, out, r, m);	/* generic radix: called directly, not wrapped in a task here */
+	  /* the unshuffle writes out, which the sub-FFTs below read: wait for it */
+          #pragma omp taskwait
+
+	  for (k = 0; k < n; k += m) {	/* one task per block of m elements */
+               #pragma omp task untied
+	       fft_aux(m, out + k, in + k, factors + 1, W, nW);	/* reads out + k, writes in + k, next factor */
+	  }
+          #pragma omp taskwait
+     }
+     /*
+      * now multiply by the twiddle factors, and perform m FFTs
+      * of length r
+      */
+     if (r == 2) {
+          #pragma omp task untied
+	  fft_twiddle_2(0, m, in, out, W, nW, nW / n, m);
+     } else if (r == 4) {
+          #pragma omp task untied
+	  fft_twiddle_4(0, m, in, out, W, nW, nW / n, m);
+     } else if (r == 8) {
+          #pragma omp task untied
+	  fft_twiddle_8(0, m, in, out, W, nW, nW / n, m);
+     } else if (r == 16) {
+          #pragma omp task untied
+	  fft_twiddle_16(0, m, in, out, W, nW, nW / n, m);
+     } else if (r == 32) {
+          #pragma omp task untied
+	  fft_twiddle_32(0, m, in, out, W, nW, nW / n, m);
+     } else {
+          #pragma omp task untied
+	  fft_twiddle_gen(0, m, in, out, W, nW, nW / n, r, m);	/* generic radix takes r as an extra argument */
+     }
+     /* wait for the twiddle task created above before returning */
+     #pragma omp taskwait
+
+     return;
+}
+
+void fft_aux_seq(int n, COMPLEX * in, COMPLEX * out, int *factors, COMPLEX * W, int nW)
+{
+     int radix, sub;
+     int off;
+
+     /*
+      * Serial twin of fft_aux: identical dispatch, but only *_seq
+      * helpers are called and no tasks are created.
+      *
+      * Sizes 2, 4, 8, 16 and 32 go straight to their base kernels;
+      * the factor list is not read for them.
+      */
+     switch (n) {
+     case 32:
+	  fft_base_32(in, out);
+	  return;
+     case 16:
+	  fft_base_16(in, out);
+	  return;
+     case 8:
+	  fft_base_8(in, out);
+	  return;
+     case 4:
+	  fft_base_4(in, out);
+	  return;
+     case 2:
+	  fft_base_2(in, out);
+	  return;
+     default:
+	  break;
+     }
+
+     radix = *factors;		/* factor used at this recursion level */
+     sub = n / radix;		/* length of each sub-transform */
+
+     if (radix < n) {
+	  /* unshuffle in -> out; dedicated routines exist for radix 2..32 */
+	  switch (radix) {
+	  case 32: fft_unshuffle_32_seq(0, sub, in, out, sub); break;
+	  case 16: fft_unshuffle_16_seq(0, sub, in, out, sub); break;
+	  case  8: fft_unshuffle_8_seq(0, sub, in, out, sub); break;
+	  case  4: fft_unshuffle_4_seq(0, sub, in, out, sub); break;
+	  case  2: fft_unshuffle_2_seq(0, sub, in, out, sub); break;
+	  default: unshuffle_seq(0, sub, in, out, radix, sub); break;
+	  }
+
+	  /* transform each block of sub elements, reading out + off
+	   * and writing in + off, with the remaining factors */
+	  for (off = 0; off < n; off += sub)
+	       fft_aux_seq(sub, out + off, in + off, factors + 1, W, nW);
+     }
+
+     /* twiddle multiplication and the sub FFTs of length radix, in -> out */
+     switch (radix) {
+     case  2: fft_twiddle_2_seq(0, sub, in, out, W, nW, nW / n, sub); break;
+     case  4: fft_twiddle_4_seq(0, sub, in, out, W, nW, nW / n, sub); break;
+     case  8: fft_twiddle_8_seq(0, sub, in, out, W, nW, nW / n, sub); break;
+     case 16: fft_twiddle_16_seq(0, sub, in, out, W, nW, nW / n, sub); break;
+     case 32: fft_twiddle_32_seq(0, sub, in, out, W, nW, nW / n, sub); break;
+     default: fft_twiddle_gen_seq(0, sub, in, out, W, nW, nW / n, radix, sub); break;
+     }
+}
+/*
+ * user interface for fft_aux
+ */
+void fft(int n, COMPLEX * in, COMPLEX * out)
+{
+     int factors[40];		/* one radix per recursion level of fft_aux */
+     int nfactors = 0;
+     int remaining = n;
+     int radix;
+     COMPLEX *W;
+
+     /* coefficient table of n + 1 entries, handed to compute_w_coefficients */
+     bots_message("Computing coefficients ");
+     W = (COMPLEX *) malloc((n + 1) * sizeof(COMPLEX));
+     compute_w_coefficients(n, 0, n / 2, W);
+     bots_message(" completed!\n");
+
+     /*
+      * Peel factors off n with factor() until no more than 1 is left;
+      * the first factor is always recorded, whatever n is.
+      */
+     for (;;) {
+	  radix = factor(remaining);
+	  factors[nfactors++] = radix;
+	  remaining /= radix;
+	  if (remaining <= 1) break;
+     }
+
+     bots_message("Computing FFT ");
+     fft_aux(n, in, out, factors, W, n);
+     bots_message(" completed!\n");
+     free(W);
+}
+void fft_seq(int n, COMPLEX * in, COMPLEX * out)
+{
+     int factors[40];		/* one radix per recursion level of fft_aux_seq */
+     int *next = factors;
+     int remaining = n;
+     COMPLEX *W;
+
+     /*
+      * Serial counterpart of fft(): the same steps through the *_seq
+      * routines, without the progress messages.
+      */
+     W = (COMPLEX *) malloc((n + 1) * sizeof(COMPLEX));
+     compute_w_coefficients_seq(n, 0, n / 2, W);
+
+     /*
+      * Store factor() of what is left of n, divide it out, and repeat
+      * while more than 1 remains; the first factor is always stored.
+      */
+     do {
+	  *next = factor(remaining);
+	  remaining /= *next++;
+     } while (remaining > 1);
+
+     fft_aux_seq(n, in, out, factors, W, n);
+     free(W);
+}
+int test_correctness(int n, COMPLEX *out1, COMPLEX *out2)
+{
+  /* Largest per-element error of out1 against the reference out2 (n elements):
+   * |out1[i] - out2[i]|, divided by |out2[i]| when that magnitude exceeds 1e-10.
+   * Returns BOTS_RESULT_SUCCESSFUL only when that maximum is <= 1e-3. */
+  int i;
+  double a,d,error = 0.0;
+
+  for (i = 0; i < n; ++i) {
+       double dre = c_re(out1[i]) - c_re(out2[i]);
+       double dim = c_im(out1[i]) - c_im(out2[i]);
+       a = sqrt(dre * dre + dim * dim);
+       d = sqrt(c_re(out2[i]) * c_re(out2[i]) + c_im(out2[i]) * c_im(out2[i]));
+       if (d > 1.0e-10) a /= d;	/* d is a sqrt result, so a negative-side test is not needed */
+       if (a != a || a > error) error = a;	/* a != a latches a NaN error, which "a > error" alone would skip */
+  }
+  bots_message("relative error=%e\n", error);
+  return (error <= 1e-3) ? BOTS_RESULT_SUCCESSFUL : BOTS_RESULT_UNSUCCESSFUL;	/* NaN compares false, so it fails */
+}
+
diff --git a/ompss/fft/fft.h b/ompss/fft/fft.h
new file mode 100644
index 0000000..ebafa9f
--- /dev/null
+++ b/ompss/fft/fft.h
@@ -0,0 +1,55 @@
+#ifndef FFT_H
+#define FFT_H
+
+/* floating-point type used for the FFT arithmetic */
+typedef double REAL;
+
+/* Complex number as a (real, imaginary) pair of REALs */
+typedef struct {
+     REAL re, im;
+} COMPLEX;
+/* field accessors; they expand to the member itself, so they can also be assigned to */
+#define c_re(c)  ((c).re)
+#define c_im(c)  ((c).im)
+/* Names ending in _seq are the serial variants of the same-named routines. */
+void compute_w_coefficients(int n, int a, int b, COMPLEX * W);
+void compute_w_coefficients_seq(int n, int a, int b, COMPLEX * W);
+int factor(int n);
+void unshuffle(int a, int b, COMPLEX * in, COMPLEX * out, int r, int m);
+void unshuffle_seq(int a, int b, COMPLEX * in, COMPLEX * out, int r, int m);
+void fft_twiddle_gen1(COMPLEX * in, COMPLEX * out, COMPLEX * W, int r, int m, int nW, int nWdnti, int nWdntm);
+void fft_twiddle_gen(int i, int i1, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int r, int m);
+void fft_twiddle_gen_seq(int i, int i1, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int r, int m);
+void fft_base_2(COMPLEX * in, COMPLEX * out);
+void fft_twiddle_2(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m);
+void fft_twiddle_2_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m);
+void fft_unshuffle_2(int a, int b, COMPLEX * in, COMPLEX * out, int m);
+void fft_unshuffle_2_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m);
+void fft_base_4(COMPLEX * in, COMPLEX * out);
+void fft_twiddle_4(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m);
+void fft_twiddle_4_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m);
+void fft_unshuffle_4(int a, int b, COMPLEX * in, COMPLEX * out, int m);
+void fft_unshuffle_4_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m);
+void fft_base_8(COMPLEX * in, COMPLEX * out);
+void fft_twiddle_8(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m);
+void fft_twiddle_8_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m);
+void fft_unshuffle_8(int a, int b, COMPLEX * in, COMPLEX * out, int m);
+void fft_unshuffle_8_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m);
+void fft_base_16(COMPLEX * in, COMPLEX * out);
+void fft_twiddle_16(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m);
+void fft_twiddle_16_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m);
+void fft_unshuffle_16(int a, int b, COMPLEX * in, COMPLEX * out, int m);
+void fft_unshuffle_16_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m);
+void fft_base_32(COMPLEX * in, COMPLEX * out);
+void fft_twiddle_32(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m);
+void fft_twiddle_32_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m);
+void fft_unshuffle_32(int a, int b, COMPLEX * in, COMPLEX * out, int m);
+void fft_unshuffle_32_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m);
+void fft_aux(int n, COMPLEX * in, COMPLEX * out, int *factors, COMPLEX * W, int nW);
+void fft_aux_seq(int n, COMPLEX * in, COMPLEX * out, int *factors, COMPLEX * W, int nW);
+void fft(int n, COMPLEX * in, COMPLEX * out);	/* task-parallel entry point; wraps fft_aux */
+void fft_seq(int n, COMPLEX * in, COMPLEX * out);	/* serial entry point; wraps fft_aux_seq */
+int test_correctness(int n, COMPLEX *out1, COMPLEX *out2);	/* returns a BOTS_RESULT_* code */
+
+#endif
+
diff --git a/ompss/fib/Makefile b/ompss/fib/Makefile
new file mode 100644
index 0000000..c26481b
--- /dev/null
+++ b/ompss/fib/Makefile
@@ -0,0 +1,36 @@
+##############################################################################################
+#  This program is part of the Barcelona OpenMP Tasks Suite                                  #
+#  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  #
+#  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   #
+#                                                                                            #
+#  This program is free software; you can redistribute it and/or modify                      #
+#  it under the terms of the GNU General Public License as published by                      #
+#  the Free Software Foundation; either version 2 of the License, or                         #
+#  (at your option) any later version.                                                       #
+#                                                                                            #
+#  This program is distributed in the hope that it will be useful,                           #
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of                            #
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             #
+#  GNU General Public License for more details.                                              #
+#                                                                                            #
+#  You should have received a copy of the GNU General Public License                         #
+#  along with this program; if not, write to the Free Software                               #
+#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            #
+##############################################################################################
+
+#LIBS = 
+#PROGRAM_OBJS=
+# Variant selectors; common/Makefile.common tests both with ifdef.
+CUTOFF_VERSIONS = manual if_clause final 
+TIED_VERSIONS = yes
+# Repository root relative to this directory; used below to locate common/.
+BASE_DIR = ../../
+
+#
+# Don't change below here
+#
+
+include ../Makefile.version
+include $(BASE_DIR)/common/Makefile.common
+
+
diff --git a/ompss/fib/app-desc.h b/ompss/fib/app-desc.h
new file mode 100644
index 0000000..9b9f71b
--- /dev/null
+++ b/ompss/fib/app-desc.h
@@ -0,0 +1,47 @@
+/**********************************************************************************************/
+/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
+/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
+/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
+/*                                                                                            */
+/*  This program is free software; you can redistribute it and/or modify                      */
+/*  it under the terms of the GNU General Public License as published by                      */
+/*  the Free Software Foundation; either version 2 of the License, or                         */
+/*  (at your option) any later version.                                                       */
+/*                                                                                            */
+/*  This program is distributed in the hope that it will be useful,                           */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of                            */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             */
+/*  GNU General Public License for more details.                                              */
+/*                                                                                            */
+/*  You should have received a copy of the GNU General Public License                         */
+/*  along with this program; if not, write to the Free Software                               */
+/*  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            */
+/**********************************************************************************************/
+
+#include "ompss-app.h"
+/* Application name, plus a format string and the matching argument list for its parameters. */
+#define BOTS_APP_NAME "Fibonacci"
+#define BOTS_APP_PARAMETERS_DESC "N=%d"
+#define BOTS_APP_PARAMETERS_LIST ,bots_arg_size
+/* Size argument (bots_arg_size): enabled, default 10, with its description string. */
+#define BOTS_APP_USES_ARG_SIZE
+#define BOTS_APP_DEF_ARG_SIZE 10
+#define BOTS_APP_DESC_ARG_SIZE "Number to compute"
+/* Kernel entry points and checker; all three are defined in fib.c. */
+int fib_verify(int);
+void fib0 (int);
+void fib0_seq (int);
+/* Parallel kernel: fib0 on the size argument; the init/fini hooks are left undefined. */
+//#define KERNEL_INIT
+#define KERNEL_CALL fib0(bots_arg_size)
+//#define KERNEL_FINI
+/* Sequential kernel: fib0_seq on the same argument; again no init/fini hooks. */
+//#define KERNEL_SEQ_INIT
+#define KERNEL_SEQ_CALL fib0_seq(bots_arg_size)
+//#define KERNEL_SEQ_FINI
+
+/* Check expression: fib_verify on the size argument. */
+#define KERNEL_CHECK fib_verify(bots_arg_size)
+/* NOTE(review): presumably the default for bots_cutoff_value - confirm in the BOTS common code. */
+#define BOTS_CUTOFF_DEF_VALUE 10
+
diff --git a/ompss/fib/fib.c b/ompss/fib/fib.c
new file mode 100644
index 0000000..828778c
--- /dev/null
+++ b/ompss/fib/fib.c
@@ -0,0 +1,155 @@
+/**********************************************************************************************/
+/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
+/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
+/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
+/*                                                                                            */
+/*  This program is free software; you can redistribute it and/or modify                      */
+/*  it under the terms of the GNU General Public License as published by                      */
+/*  the Free Software Foundation; either version 2 of the License, or                         */
+/*  (at your option) any later version.                                                       */
+/*                                                                                            */
+/*  This program is distributed in the hope that it will be useful,                           */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of                            */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             */
+/*  GNU General Public License for more details.                                              */
+/*                                                                                            */
+/*  You should have received a copy of the GNU General Public License                         */
+/*  along with this program; if not, write to the Free Software                               */
+/*  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            */
+/**********************************************************************************************/
+
+#include "bots.h"
+#include "fib.h"
+/* fib_results[i] is Fibonacci(i) for i = 0 .. FIB_RESULTS_PRE - 1, i.e. 41 entries up to Fibonacci(40). */
+#define FIB_RESULTS_PRE 41
+long long fib_results[FIB_RESULTS_PRE] = {0,1,1,2,3,5,8,13,21,34,55,89,144,233,377,610,987,1597,2584,4181,6765,10946,17711,28657,46368,75025,121393,196418,317811,514229,832040,1346269,2178309,3524578,5702887,9227465,14930352,24157817,39088169,63245986,102334155};
+
+long long fib_seq (int n)
+{	/* Serial reference: Fibonacci(n) by plain recursion, no tasks; returns n itself when n < 2. */
+	long long x, y;	/* same width as the return type: as int these overflowed from n = 47 on (32-bit int) */
+	if (n < 2) return n;
+
+	x = fib_seq(n - 1);
+	y = fib_seq(n - 2);
+
+	return x + y;
+}
+
+#if defined(IF_CUTOFF)
+/* if-clause cutoff: both tasks carry if(d < bots_cutoff_value); d is the depth, 0 at the top call and d+1 per level. */
+long long fib (int n,int d)
+{
+	long long x, y;
+	if (n < 2) return n;
+
+	#pragma omp task untied shared(x) firstprivate(n) if(d < bots_cutoff_value)
+	x = fib(n - 1,d+1);
+
+	#pragma omp task untied shared(y) firstprivate(n) if(d < bots_cutoff_value)
+	y = fib(n - 2,d+1);
+	/* x and y are assigned inside the two tasks, so wait before reading them */
+	#pragma omp taskwait
+	return x + y;
+}
+
+#elif defined(FINAL_CUTOFF)
+/* final-clause cutoff: both tasks carry final(d+1 >= bots_cutoff_value) and mergeable. */
+long long fib (int n,int d)
+{
+	long long x, y;
+	if (n < 2) return n;
+
+	#pragma omp task untied shared(x) firstprivate(n) final(d+1 >= bots_cutoff_value) mergeable
+	x = fib(n - 1,d+1);
+
+	#pragma omp task untied shared(y) firstprivate(n) final(d+1 >= bots_cutoff_value) mergeable
+	y = fib(n - 2,d+1);
+	/* x and y are assigned inside the two tasks, so wait before reading them */
+	#pragma omp taskwait
+	return x + y;
+}
+
+#elif defined(MANUAL_CUTOFF)
+/* manual cutoff: tasks are created only while d < bots_cutoff_value; deeper calls use fib_seq directly. */
+long long fib (int n, int d)
+{
+	long long x, y;
+	if (n < 2) return n;
+
+	if ( d < bots_cutoff_value ) {
+		#pragma omp task untied shared(x) firstprivate(n)
+		x = fib(n - 1,d+1);
+
+		#pragma omp task untied shared(y) firstprivate(n)
+		y = fib(n - 2,d+1);
+		/* x and y are assigned inside the two tasks, so wait before leaving the branch */
+		#pragma omp taskwait
+	} else {
+		x = fib_seq(n-1);	/* at or past the cutoff depth: serial recursion, no tasks */
+		y = fib_seq(n-2);
+	}
+
+	return x + y;
+}
+
+#else
+/* no cutoff: every call with n >= 2 creates two tasks; this variant takes no depth argument. */
+long long fib (int n)
+{
+	long long x, y;
+	if (n < 2) return n;
+
+	#pragma omp task untied shared(x) firstprivate(n)
+	x = fib(n - 1);
+	#pragma omp task untied shared(y) firstprivate(n)
+	y = fib(n - 2);
+	/* x and y are assigned inside the two tasks, so wait before reading them */
+	#pragma omp taskwait
+	return x + y;
+}
+
+#endif
+
+static long long par_res, seq_res;	/* written by fib0 and by fib0_seq / fib_verify; compared in fib_verify */
+
+void fib0 (int n)
+{	/* Runs the fib variant selected at compile time, keeps the result in par_res and reports it. */
+#if !defined(MANUAL_CUTOFF) && !defined(IF_CUTOFF) && !defined(FINAL_CUTOFF)
+	par_res = fib(n);	/* plain variant: no depth argument */
+#else
+	par_res = fib(n,0);	/* cutoff variants start at depth 0 */
+#endif
+	bots_message("Fibonacci result for %d is %lld\n",n,par_res);
+}
+
+void fib0_seq (int n)
+{	/* Serial run: computes fib_seq(n), keeps it in seq_res and reports it. */
+	seq_res = fib_seq(n);
+	bots_message("Fibonacci result for %d is %lld\n",n,seq_res);
+}
+
+long long fib_verify_value(int n)
+{	/* Reference Fibonacci(n): table lookup below FIB_RESULTS_PRE, recursion on the table above it. */
+	if (n < FIB_RESULTS_PRE) return (n < 2) ? n : fib_results[n];	/* n < 2 yields n, as fib() does, so a negative n never indexes the table */
+	return ( fib_verify_value(n-1) + fib_verify_value(n-2));
+}
+
+int fib_verify (int n)
+{
+	/*
+	 * Checks par_res against a reference value for Fibonacci(n).
+	 *
+	 * When bots_sequential_flag is set, seq_res is used as it stands
+	 * (fib0_seq stores the serial result there). Otherwise seq_res is
+	 * first overwritten with fib_verify_value(n).
+	 *
+	 * Returns BOTS_RESULT_SUCCESSFUL when the two values are equal and
+	 * BOTS_RESULT_UNSUCCESSFUL when they differ.
+	 */
+	if (!bots_sequential_flag)
+		seq_res = fib_verify_value(n);
+
+	if (par_res != seq_res) return BOTS_RESULT_UNSUCCESSFUL;
+	return BOTS_RESULT_SUCCESSFUL;
+}
+
diff --git a/ompss/fib/fib.h b/ompss/fib/fib.h
new file mode 100644
index 0000000..e3d2983
--- /dev/null
+++ b/ompss/fib/fib.h
@@ -0,0 +1,40 @@
+/**********************************************************************************************/
+/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
+/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
+/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
+/*                                                                                            */
+/*  This program is free software; you can redistribute it and/or modify                      */
+/*  it under the terms of the GNU General Public License as published by                      */
+/*  the Free Software Foundation; either version 2 of the License, or                         */
+/*  (at your option) any later version.                                                       */
+/*                                                                                            */
+/*  This program is distributed in the hope that it will be useful,                           */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of                            */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             */
+/*  GNU General Public License for more details.                                              */
+/*                                                                                            */
+/*  You should have received a copy of the GNU General Public License                         */
+/*  along with this program; if not, write to the Free Software                               */
+/*  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            */
+/**********************************************************************************************/
+#ifndef FIB_H
+#define FIB_H
+/*
+ * Task-parallel Fibonacci. Every cut-off build (IF_CUTOFF, FINAL_CUTOFF,
+ * MANUAL_CUTOFF) declares fib() with a second int argument d.
+ */
+#if defined(IF_CUTOFF) || defined(FINAL_CUTOFF) || defined(MANUAL_CUTOFF)
+long long fib (int n,int d);
+#else
+long long fib (int n);
+#endif
+
+long long fib_seq (int n);
+
+void fib0 (int n);
+void fib0_seq (int n);
+
+int fib_verify (int n);
+long long fib_verify_value(int n);
+#endif /* FIB_H */
+
diff --git a/ompss/floorplan/Makefile b/ompss/floorplan/Makefile
new file mode 100644
index 0000000..c26481b
--- /dev/null
+++ b/ompss/floorplan/Makefile
@@ -0,0 +1,36 @@
+##############################################################################################
+#  This program is part of the Barcelona OpenMP Tasks Suite                                  #
+#  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  #
+#  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   #
+#                                                                                            #
+#  This program is free software; you can redistribute it and/or modify                      #
+#  it under the terms of the GNU General Public License as published by                      #
+#  the Free Software Foundation; either version 2 of the License, or                         #
+#  (at your option) any later version.                                                       #
+#                                                                                            #
+#  This program is distributed in the hope that it will be useful,                           #
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of                            #
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             #
+#  GNU General Public License for more details.                                              #
+#                                                                                            #
+#  You should have received a copy of the GNU General Public License                         #
+#  along with this program; if not, write to the Free Software                               #
+#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            #
+##############################################################################################
+
+#LIBS = 
+#PROGRAM_OBJS=
+# Extra binaries to build: one per listed cut-off version, plus -tied variants when TIED_VERSIONS is defined.
+CUTOFF_VERSIONS = manual if_clause final 
+TIED_VERSIONS = yes
+# Top of the source tree, relative to this directory (used to locate common/Makefile.common).
+BASE_DIR = ../../
+
+#
+# Don't change below here 
+#
+
+include ../Makefile.version
+include $(BASE_DIR)/common/Makefile.common
+
+
diff --git a/ompss/floorplan/app-desc.h b/ompss/floorplan/app-desc.h
new file mode 100644
index 0000000..a3bba17
--- /dev/null
+++ b/ompss/floorplan/app-desc.h
@@ -0,0 +1,43 @@
+/**********************************************************************************************/
+/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
+/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
+/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
+/*                                                                                            */
+/*  This program is free software; you can redistribute it and/or modify                      */
+/*  it under the terms of the GNU General Public License as published by                      */
+/*  the Free Software Foundation; either version 2 of the License, or                         */
+/*  (at your option) any later version.                                                       */
+/*                                                                                            */
+/*  This program is distributed in the hope that it will be useful,                           */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of                            */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             */
+/*  GNU General Public License for more details.                                              */
+/*                                                                                            */
+/*  You should have received a copy of the GNU General Public License                         */
+/*  along with this program; if not, write to the Free Software                               */
+/*  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            */
+/**********************************************************************************************/
+
+#include "ompss-app.h"
+/* Application description for the Floorplan benchmark. NOTE(review): presumably consumed by the common BOTS driver -- confirm. */
+#define BOTS_APP_NAME "Floorplan"
+#define BOTS_APP_PARAMETERS_DESC "%s"
+#define BOTS_APP_PARAMETERS_LIST ,bots_arg_file
+/* the only application parameter listed is the input file name (bots_arg_file) */
+#define BOTS_APP_USES_ARG_FILE
+#define BOTS_APP_DESC_ARG_FILE "Cell description file (mandatory)"
+/* NOTE(review): looks like the default for bots_cutoff_value -- confirm in the common code */
+#define BOTS_CUTOFF_DEF_VALUE 5
+/* kernel entry points, defined in floorplan.c */
+void floorplan_init(char *);
+void floorplan_end (void);
+void compute_floorplan(void);
+int floorplan_verify(void);
+/* hooks mapping the KERNEL_* names onto the entry points above */
+#define KERNEL_INIT floorplan_init(bots_arg_file)
+#define KERNEL_CALL compute_floorplan()
+#define KERNEL_FINI floorplan_end()
+
+#define KERNEL_CHECK floorplan_verify()
+
+
diff --git a/ompss/floorplan/floorplan.c b/ompss/floorplan/floorplan.c
new file mode 100644
index 0000000..83bdda9
--- /dev/null
+++ b/ompss/floorplan/floorplan.c
@@ -0,0 +1,644 @@
+/**********************************************************************************************/
+/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
+/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
+/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
+/*                                                                                            */
+/*  This program is free software; you can redistribute it and/or modify                      */
+/*  it under the terms of the GNU General Public License as published by                      */
+/*  the Free Software Foundation; either version 2 of the License, or                         */
+/*  (at your option) any later version.                                                       */
+/*                                                                                            */
+/*  This program is distributed in the hope that it will be useful,                           */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of                            */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             */
+/*  GNU General Public License for more details.                                              */
+/*                                                                                            */
+/*  You should have received a copy of the GNU General Public License                         */
+/*  along with this program; if not, write to the Free Software                               */
+/*  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            */
+/**********************************************************************************************/
+
+/* Original code from the Application Kernel Matrix by Cray */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include "app-desc.h"
+#include "bots.h"
+
+#define ROWS 64	/* board height: first dimension of ibrd */
+#define COLS 64	/* board width: second dimension of ibrd */
+#define DMAX 64	/* capacity of the NWS candidate arrays */
+#define max(a, b) (((a) > (b)) ? (a) : (b))	/* fully parenthesized; each argument is still evaluated twice */
+#define min(a, b) (((a) < (b)) ? (a) : (b))	/* fully parenthesized; each argument is still evaluated twice */
+
+int solution = -1;	/* expected minimum area read from the input file; -1 means none (floorplan_verify() then returns BOTS_RESULT_NA) */
+
+typedef int  coor[2];	/* pair of ints: [0] is the row part, [1] the column part */
+typedef char ibrd[ROWS][COLS];	/* board: 0 marks a free square, otherwise the id of the cell covering it */
+typedef char (*pibrd)[COLS];	/* pointer-to-row form of a board; not referenced elsewhere in this file */
+
+FILE * inputFile;	/* opened by floorplan_init(), read by read_inputs() */
+
+struct cell {
+  int   n;	/* number of alternative shapes */
+  coor *alt;	/* alt[s] = (rows, cols) of shape s */
+  int   top;	/* first row covered (inclusive) */
+  int   bot;	/* last row covered (inclusive) */
+  int   lhs;	/* first column covered (inclusive) */
+  int   rhs;	/* last column covered (inclusive) */
+  int   left;	/* index of the cell to the left; starts() uses it only when >= 0 */
+  int   above;	/* index of the cell above; starts() tests it for >= 0 */
+  int   next;	/* index of the next cell to place; 0 marks the last cell */
+};
+
+struct cell * gcells;	/* N+1 cells allocated by read_inputs(); entry 0 is a fixed placeholder */
+
+int  MIN_AREA;	/* smallest footprint area found so far */
+ibrd BEST_BOARD;	/* board belonging to MIN_AREA */
+coor MIN_FOOTPRINT;	/* (rows, cols) of the footprint belonging to MIN_AREA */
+
+int N;	/* number of cells announced by the input file */
+
+/* compute all possible locations for the nw corner of cell id in the given
+   shape; the (row, col) candidates go to NWS, their count is returned */
+static int starts(int id, int shape, coor *NWS, struct cell *cells) {
+  /* size of cell */
+  const int rows  = cells[id].alt[shape][0];
+  const int cols  = cells[id].alt[shape][1];
+  /* the cells to the left and above */
+  const int left  = cells[id].left;
+  const int above = cells[id].above;
+  int k, count, first, last;
+
+  if (left >= 0 && above >= 0) {
+     /* vertical and horizontal dependence: only one corner is possible */
+     const int top = cells[above].bot + 1;
+     const int lhs = cells[left].rhs + 1;
+     const int bot = top + rows;
+     const int rhs = lhs + cols;
+
+     /* no candidate unless the footprint touches both neighbours */
+     if (top > cells[left].bot  || bot < cells[left].top ||
+         lhs > cells[above].rhs || rhs < cells[above].lhs)
+        return 0;
+
+     NWS[0][0] = top;
+     NWS[0][1] = lhs;
+     return 1;
+  }
+
+  if (left >= 0) {
+     /* horizontal dependence only: rows from (top of left cell - rows + 1),
+        floored at 0, down to min(bottom of left cell, ROWS) */
+     first = max(cells[left].top - rows + 1, 0);
+     last  = min(cells[left].bot, ROWS);
+     count = last - first + 1;
+
+     for (k = 0; k < count; k++) {
+         NWS[k][0] = first + k;
+         NWS[k][1] = cells[left].rhs + 1;
+     }
+
+     return count;
+  }
+
+  /* otherwise: columns from (lhs of cell above - cols + 1), floored at 0,
+     up to min(rhs of cell above, COLS) */
+  first = max(cells[above].lhs - cols + 1, 0);
+  last  = min(cells[above].rhs, COLS);
+  count = last - first + 1;
+
+  for (k = 0; k < count; k++) {
+      NWS[k][0] = cells[above].bot + 1;
+      NWS[k][1] = first + k;
+  }
+
+  return count;
+}
+
+
+
+/* lay the cell down on the board in the rectangular space defined
+   by the cells top, bottom, left, and right edges. If the cell can
+   not be layed down (a square is already taken, or the rectangle does
+   not lie inside the ROWS x COLS board), return 0; else 1. */
+static int lay_down(int id, ibrd board, struct cell *cells) {
+  int  i, j, top, bot, lhs, rhs;
+
+  top = cells[id].top;
+  bot = cells[id].bot;
+  lhs = cells[id].lhs;
+  rhs = cells[id].rhs;
+/* an off-board rectangle must never be written: it would run past board[][] */
+  if (top < 0 || lhs < 0 || bot >= ROWS || rhs >= COLS) return(0);
+  for (i = top; i <= bot; i++) {
+  for (j = lhs; j <= rhs; j++) {
+      if (board[i][j] == 0) board[i][j] = (char)id;
+      else                  return(0);
+  } }
+  return (1);
+}
+
+
+#define read_integer(file,var) /* read one decimal int into var; abort on EOF or on malformed input */ \
+  do { if ( fscanf((file), "%d", &(var)) != 1 ) {\
+	bots_message(" Bogus input file\n");\
+	exit(-1);\
+  } } while (0)
+
+/* Read the cell descriptions from inputFile into gcells[1..N] (gcells[0] is a
+   fixed placeholder entry) and, if present, the expected area into solution. */
+static void read_inputs() {
+  int i, j, n;
+
+  read_integer(inputFile,n);
+  if (n < 1) { bots_message(" Bogus input file\n"); exit(-1); }
+  N = n;
+  gcells = (struct cell *) malloc((n + 1) * sizeof(struct cell));
+  if (gcells == NULL) { bots_message(" Out of memory\n"); exit(-1); }
+
+  gcells[0].n     =  0;
+  gcells[0].alt   =  0;
+  gcells[0].top   =  0;
+  gcells[0].bot   =  0;
+  gcells[0].lhs   = -1;
+  gcells[0].rhs   = -1;
+  gcells[0].left  =  0;
+  gcells[0].above =  0;
+  gcells[0].next  =  0;
+
+  for (i = 1; i < n + 1; i++) {
+      read_integer(inputFile, gcells[i].n);
+      gcells[i].alt = (coor *) malloc(gcells[i].n * sizeof(coor));
+      if (gcells[i].alt == NULL && gcells[i].n > 0) { bots_message(" Out of memory\n"); exit(-1); }
+      for (j = 0; j < gcells[i].n; j++) {
+          read_integer(inputFile, gcells[i].alt[j][0]);
+          read_integer(inputFile, gcells[i].alt[j][1]);
+      }
+      read_integer(inputFile, gcells[i].left);
+      read_integer(inputFile, gcells[i].above);
+      read_integer(inputFile, gcells[i].next);
+  }
+
+/* the expected area is optional (feof() is only set after a failed read, so it cannot guard this) */
+  if (fscanf(inputFile, "%d", &solution) != 1) solution = -1;
+}
+
+
+/* Print MIN_AREA and the best board: a blank for a free square, else a letter ('A' for cell 1). */
+static void write_outputs() {
+  int row, col;
+  bots_message("Minimum area = %d\n\n", MIN_AREA);
+  for (row = 0; row < MIN_FOOTPRINT[0]; row++) {
+    for (col = 0; col < MIN_FOOTPRINT[1]; col++) {
+      const char owner = BEST_BOARD[row][col];
+      if (owner != 0) { bots_message("%c", 'A' + owner - 1); }
+      else            { bots_message(" "); }
+    }
+    bots_message("\n");
+  }
+}
+
+#ifdef MANUAL_CUTOFF
+static int add_cell_ser (int id, coor FOOTPRINT, ibrd BOARD, struct cell *CELLS) {
+  int  i, j, nn, nn2, area;
+/* Serial search (creates no tasks); called by the MANUAL_CUTOFF add_cell() once level+1 reaches bots_cutoff_value. */
+  ibrd board;
+  coor footprint, NWS[DMAX];
+/* nn2: candidate positions counted by this call plus those returned by its recursive calls (the return value) */
+  nn2 = 0;
+
+/* for each possible shape */
+  for (i = 0; i < CELLS[id].n; i++) {
+/* compute all possible locations for nw corner */
+      nn = starts(id, i, NWS, CELLS);
+      nn2 += nn;
+/* for all possible locations */
+      for (j = 0; j < nn; j++) {
+	  struct cell *cells = CELLS; /* no copy: the caller's array is updated in place */
+/* extent of shape */
+          cells[id].top = NWS[j][0];
+          cells[id].bot = cells[id].top + cells[id].alt[i][0] - 1;
+          cells[id].lhs = NWS[j][1];
+          cells[id].rhs = cells[id].lhs + cells[id].alt[i][1] - 1;
+/* try the placement on a scratch copy of the caller's board */
+          memcpy(board, BOARD, sizeof(ibrd));
+
+/* if the cell cannot be layed down, prune search */
+          if (! lay_down(id, board, cells)) {
+             bots_debug("Chip %d, shape %d does not fit\n", id, i);
+             goto _end;
+          }
+
+/* calculate new footprint of board and area of footprint */
+          footprint[0] = max(FOOTPRINT[0], cells[id].bot+1);
+          footprint[1] = max(FOOTPRINT[1], cells[id].rhs+1);
+          area         = footprint[0] * footprint[1];
+
+/* if last cell */
+          if (cells[id].next == 0) {
+
+/* if area is minimum, update global values (unlocked pre-check, repeated inside the critical section) */
+		  if (area < MIN_AREA) {
+#pragma omp critical
+			  if (area < MIN_AREA) {
+				  MIN_AREA         = area;
+				  MIN_FOOTPRINT[0] = footprint[0];
+				  MIN_FOOTPRINT[1] = footprint[1];
+				  memcpy(BEST_BOARD, board, sizeof(ibrd));
+				  bots_debug("N  %d\n", MIN_AREA);
+			  }
+		  }
+
+/* if area is less than best area */
+          } else if (area < MIN_AREA) {
+            #pragma omp atomic
+             nn2 += add_cell_ser(cells[id].next, footprint, board,cells);
+
+/* if area is greater than or equal to best area, prune search */
+          } else {
+
+             bots_debug("T  %d, %d\n", area, MIN_AREA);
+ 
+	  }
+_end:;  
+}
+}
+  return nn2;
+  }
+#endif
+
+#if defined(IF_CUTOFF)
+
+static int add_cell(int id, coor FOOTPRINT, ibrd BOARD, struct cell *CELLS,int level) {
+  int  i, j, nn, area, nnc, nnl;
+/* IF_CUTOFF search: one untied task per candidate position, with the task if() clause testing level < bots_cutoff_value. */
+  ibrd board;
+  coor footprint, NWS[DMAX];
+/* nnl: positions generated at this level; nnc: counts returned by the recursive calls (updated by the tasks) */
+  nnc = nnl = 0;
+
+/* for each possible shape */
+  for (i = 0; i < CELLS[id].n; i++) {
+/* compute all possible locations for nw corner */
+      nn = starts(id, i, NWS, CELLS);
+      nnl += nn;
+/* for all possible locations */
+      for (j = 0; j < nn; j++) {
+#pragma omp task untied private(board, footprint,area) \
+        firstprivate(NWS,i,j,id,nn,level) \
+        shared(FOOTPRINT,BOARD,CELLS,MIN_AREA,MIN_FOOTPRINT,N,BEST_BOARD,nnc,bots_verbose_mode) \
+        if(level<bots_cutoff_value)
+{
+	  struct cell cells[N+1];
+	  memcpy(cells,CELLS,sizeof(struct cell)*(N+1));
+/* extent of shape */
+          cells[id].top = NWS[j][0];
+          cells[id].bot = cells[id].top + cells[id].alt[i][0] - 1;
+          cells[id].lhs = NWS[j][1];
+          cells[id].rhs = cells[id].lhs + cells[id].alt[i][1] - 1;
+/* the task works on its own copy of the cells (above) and of the board (private clause) */
+          memcpy(board, BOARD, sizeof(ibrd));
+
+/* if the cell cannot be layed down, prune search */
+          if (! lay_down(id, board, cells)) {
+             bots_debug("Chip %d, shape %d does not fit\n", id, i);
+             goto _end;
+          }
+
+/* calculate new footprint of board and area of footprint */
+          footprint[0] = max(FOOTPRINT[0], cells[id].bot+1);
+          footprint[1] = max(FOOTPRINT[1], cells[id].rhs+1);
+          area         = footprint[0] * footprint[1];
+
+/* if last cell */
+          if (cells[id].next == 0) {
+
+/* if area is minimum, update global values (unlocked pre-check, repeated inside the critical section) */
+		  if (area < MIN_AREA) {
+#pragma omp critical
+			  if (area < MIN_AREA) {
+				  MIN_AREA         = area;
+				  MIN_FOOTPRINT[0] = footprint[0];
+				  MIN_FOOTPRINT[1] = footprint[1];
+				  memcpy(BEST_BOARD, board, sizeof(ibrd));
+				  bots_debug("N  %d\n", MIN_AREA);
+			  }
+		  }
+
+/* if area is less than best area */
+          } else if (area < MIN_AREA) {
+ 	    #pragma omp atomic
+                nnc += add_cell(cells[id].next, footprint, board,cells,level+1);
+/* if area is greater than or equal to best area, prune search */
+          } else {
+
+             bots_debug("T  %d, %d\n", area, MIN_AREA);
+
+	  }
+_end:;
+}
+      }
+}
+#pragma omp taskwait
+return nnc+nnl; /* read after the taskwait above, so the child tasks have finished updating nnc */
+}
+
+#elif defined(FINAL_CUTOFF)
+
+static int add_cell(int id, coor FOOTPRINT, ibrd BOARD, struct cell *CELLS,int level) {
+  int  i, j, nn, area, nnc, nnl;
+/* FINAL_CUTOFF search: one untied, mergeable task per candidate position, final once level >= bots_cutoff_value. */
+  coor footprint, NWS[DMAX];
+/* nnl: positions generated at this level; nnc: counts returned by the recursive calls (updated by the tasks) */
+  nnc = nnl = 0;
+
+/* for each possible shape */
+  for (i = 0; i < CELLS[id].n; i++) {
+/* compute all possible locations for nw corner */
+      nn = starts(id, i, NWS, CELLS);
+      nnl += nn;
+/* for all possible locations */
+      for (j = 0; j < nn; j++) {
+#pragma omp task untied private(footprint,area) \
+        firstprivate(NWS,i,j,id,nn,level,bots_cutoff_value) \
+        shared(FOOTPRINT,BOARD,CELLS,MIN_AREA,MIN_FOOTPRINT,N,BEST_BOARD,nnc,bots_verbose_mode) \
+        final(level >= bots_cutoff_value) mergeable
+{
+          ibrd board;
+          struct cell *cells;
+/* inside a final region past the cut-off level CELLS is reused in place; otherwise the task gets an alloca() copy */
+          if ( omp_in_final() && level > bots_cutoff_value ) {
+            cells = CELLS;
+          } else {
+            cells = alloca(sizeof(struct cell)*(N+1));
+	    memcpy(cells,CELLS,sizeof(struct cell)*(N+1));
+          }
+
+/* extent of shape */
+          cells[id].top = NWS[j][0];
+          cells[id].bot = cells[id].top + cells[id].alt[i][0] - 1;
+          cells[id].lhs = NWS[j][1];
+          cells[id].rhs = cells[id].lhs + cells[id].alt[i][1] - 1;
+
+          memcpy(board, BOARD, sizeof(ibrd));
+
+/* if the cell cannot be layed down, prune search */
+          if (! lay_down(id, board, cells)) {
+             bots_debug("Chip %d, shape %d does not fit\n", id, i);
+             goto _end;
+          }
+
+/* calculate new footprint of board and area of footprint */
+          footprint[0] = max(FOOTPRINT[0], cells[id].bot+1);
+          footprint[1] = max(FOOTPRINT[1], cells[id].rhs+1);
+          area         = footprint[0] * footprint[1];
+
+/* if last cell */
+          if (cells[id].next == 0) {
+
+/* if area is minimum, update global values (unlocked pre-check, repeated inside the critical section) */
+		  if (area < MIN_AREA) {
+#pragma omp critical
+			  if (area < MIN_AREA) {
+				  MIN_AREA         = area;
+				  MIN_FOOTPRINT[0] = footprint[0];
+				  MIN_FOOTPRINT[1] = footprint[1];
+				  memcpy(BEST_BOARD, board, sizeof(ibrd));
+				  bots_debug("N  %d\n", MIN_AREA);
+			  }
+		  }
+
+/* if area is less than best area */
+          } else if (area < MIN_AREA) {
+ 	    #pragma omp atomic
+                nnc += add_cell(cells[id].next, footprint, board,cells,level+1);
+/* if area is greater than or equal to best area, prune search */
+          } else {
+
+             bots_debug("T  %d, %d\n", area, MIN_AREA);
+
+	  }
+_end:;
+}
+      }
+}
+#pragma omp taskwait
+return nnc+nnl; /* read after the taskwait above, so the child tasks have finished updating nnc */
+}
+
+#elif defined(MANUAL_CUTOFF)
+
+static int add_cell(int id, coor FOOTPRINT, ibrd BOARD, struct cell *CELLS,int level) {
+  int  i, j, nn, area, nnc, nnl;
+/* MANUAL_CUTOFF search: one untied task per candidate position; the recursion itself picks add_cell() or add_cell_ser(). */
+  ibrd board;
+  coor footprint, NWS[DMAX];
+/* nnl: positions generated at this level; nnc: counts returned by the recursive calls (updated by the tasks) */
+  nnc = nnl = 0;
+
+/* for each possible shape */
+  for (i = 0; i < CELLS[id].n; i++) {
+/* compute all possible locations for nw corner */
+      nn = starts(id, i, NWS, CELLS);
+      nnl += nn;
+/* for all possible locations */
+      for (j = 0; j < nn; j++) {
+#pragma omp task untied private(board, footprint,area) \
+        firstprivate(NWS,i,j,id,nn,level,bots_cutoff_value) shared(nnc) \
+        shared(FOOTPRINT,BOARD,CELLS,MIN_AREA,MIN_FOOTPRINT,N,BEST_BOARD,bots_verbose_mode)
+{
+          struct cell *cells;
+/* every task works on its own alloca() copy of the caller's cells */
+          cells = alloca(sizeof(struct cell)*(N+1));
+          memcpy(cells,CELLS,sizeof(struct cell)*(N+1));
+
+/* extent of shape */
+          cells[id].top = NWS[j][0];
+          cells[id].bot = cells[id].top + cells[id].alt[i][0] - 1;
+          cells[id].lhs = NWS[j][1];
+          cells[id].rhs = cells[id].lhs + cells[id].alt[i][1] - 1;
+
+          memcpy(board, BOARD, sizeof(ibrd));
+
+/* if the cell cannot be layed down, prune search */
+          if (! lay_down(id, board, cells)) {
+             bots_debug("Chip %d, shape %d does not fit\n", id, i);
+             goto _end;
+          }
+
+/* calculate new footprint of board and area of footprint */
+          footprint[0] = max(FOOTPRINT[0], cells[id].bot+1);
+          footprint[1] = max(FOOTPRINT[1], cells[id].rhs+1);
+          area         = footprint[0] * footprint[1];
+
+/* if last cell */
+          if (cells[id].next == 0) {
+
+/* if area is minimum, update global values (unlocked pre-check, repeated inside the critical section) */
+		  if (area < MIN_AREA) {
+#pragma omp critical
+			  if (area < MIN_AREA) {
+				  MIN_AREA         = area;
+				  MIN_FOOTPRINT[0] = footprint[0];
+				  MIN_FOOTPRINT[1] = footprint[1];
+				  memcpy(BEST_BOARD, board, sizeof(ibrd));
+				  bots_debug("N  %d\n", MIN_AREA);
+			  }
+		  }
+
+/* if area is less than best area: keep spawning tasks below the cut-off level, otherwise continue serially */
+          } else if (area < MIN_AREA) {
+	     if(level+1 < bots_cutoff_value ) {
+ 	       #pragma omp atomic
+                nnc += add_cell(cells[id].next, footprint, board,cells,level+1);
+	     } else {
+ 	       #pragma omp atomic
+		nnc += add_cell_ser(cells[id].next, footprint, board,cells);
+	     }
+/* if area is greater than or equal to best area, prune search */
+          } else {
+             bots_debug("T  %d, %d\n", area, MIN_AREA);
+	  }
+_end:;
+}
+      }
+}
+#pragma omp taskwait
+  
+return nnc+nnl; /* read after the taskwait above, so the child tasks have finished updating nnc */
+}
+
+#else
+
+static int add_cell(int id, coor FOOTPRINT, ibrd BOARD, struct cell *CELLS) {
+  int  i, j, nn, area, nnc,nnl;
+/* Search without cut-off: one untied task per candidate position at every level of the recursion. */
+  ibrd board;
+  coor footprint, NWS[DMAX];
+/* nnl: positions generated at this level; nnc: counts returned by the recursive calls (updated by the tasks) */
+  nnc = nnl = 0;
+
+/* for each possible shape */
+  for (i = 0; i < CELLS[id].n; i++) {
+/* compute all possible locations for nw corner */
+      nn = starts(id, i, NWS, CELLS);
+      nnl += nn;
+/* for all possible locations */
+      for (j = 0; j < nn; j++) {
+#pragma omp task untied private(board, footprint,area) \
+        firstprivate(NWS,i,j,id,nn) \
+        shared(FOOTPRINT,BOARD,CELLS,MIN_AREA,MIN_FOOTPRINT,N,BEST_BOARD,nnc,bots_verbose_mode) 
+{
+	  struct cell cells[N+1];
+	  memcpy(cells,CELLS,sizeof(struct cell)*(N+1));
+/* extent of shape */
+          cells[id].top = NWS[j][0];
+          cells[id].bot = cells[id].top + cells[id].alt[i][0] - 1;
+          cells[id].lhs = NWS[j][1];
+          cells[id].rhs = cells[id].lhs + cells[id].alt[i][1] - 1;
+/* the task works on its own copy of the cells (above) and of the board (private clause) */
+          memcpy(board, BOARD, sizeof(ibrd));
+
+/* if the cell cannot be layed down, prune search */
+          if (! lay_down(id, board, cells)) {
+             bots_debug("Chip %d, shape %d does not fit\n", id, i);
+             goto _end;
+          }
+
+/* calculate new footprint of board and area of footprint */
+          footprint[0] = max(FOOTPRINT[0], cells[id].bot+1);
+          footprint[1] = max(FOOTPRINT[1], cells[id].rhs+1);
+          area         = footprint[0] * footprint[1];
+
+/* if last cell */
+          if (cells[id].next == 0) {
+
+/* if area is minimum, update global values (unlocked pre-check, repeated inside the critical section) */
+		  if (area < MIN_AREA) {
+#pragma omp critical
+			  if (area < MIN_AREA) {
+				  MIN_AREA         = area;
+				  MIN_FOOTPRINT[0] = footprint[0];
+				  MIN_FOOTPRINT[1] = footprint[1];
+				  memcpy(BEST_BOARD, board, sizeof(ibrd));
+				  bots_debug("N  %d\n", MIN_AREA);
+			  }
+		  }
+
+/* if area is less than best area */
+          } else if (area < MIN_AREA) {
+ 	    #pragma omp atomic
+ 	      nnc += add_cell(cells[id].next, footprint, board,cells);
+/* if area is greater than or equal to best area, prune search */
+          } else {
+
+             bots_debug("T  %d, %d\n", area, MIN_AREA);
+ 
+	  }
+_end:;  
+}
+      }
+}
+#pragma omp taskwait
+return nnc+nnl; /* read after the taskwait above, so the child tasks have finished updating nnc */
+}
+
+#endif
+
+ibrd board;	/* initial board: cleared by floorplan_init(), passed to add_cell() by compute_floorplan() */
+
+/* Open and parse the input file, close it again, then reset MIN_AREA and the initial board. */
+void floorplan_init (char *filename)
+{
+    int i,j;
+    inputFile = fopen(filename, "r");
+    if(NULL == inputFile) {
+        bots_message("Couldn't open %s file for reading\n", filename);
+        exit(1);
+    }
+
+    /* read input file, release the handle (nothing reads it afterwards) and initialize global minimum area */
+    read_inputs();
+    fclose(inputFile);
+    inputFile = NULL;
+    MIN_AREA = ROWS * COLS;
+
+    /* initialize board is empty */
+    for (i = 0; i < ROWS; i++)
+    for (j = 0; j < COLS; j++) board[i][j] = 0;
+}
+
+/* Run the search from cell 1 on the initial board and store the value
+   returned by add_cell() in bots_number_of_tasks. */
+void compute_floorplan (void)
+{
+    /* footprint of initial board is zero */
+    coor empty_footprint = { 0, 0 };
+
+    bots_message("Computing floorplan ");
+
+#if !defined(MANUAL_CUTOFF) && !defined(IF_CUTOFF) && !defined(FINAL_CUTOFF)
+    bots_number_of_tasks = add_cell(1, empty_footprint, board, gcells);
+#else
+    /* cut-off builds also pass the starting level, 0 */
+    bots_number_of_tasks = add_cell(1, empty_footprint, board, gcells, 0);
+#endif
+
+    bots_message(" completed!\n");
+}
+
+void floorplan_end (void)
+{
+    /* write results: the minimum area and the best board, via write_outputs() */
+    write_outputs();
+}
+
+int floorplan_verify (void)
+{
+    /* solution still holds -1: there is no reference area to compare with */
+    if (solution == -1) return BOTS_RESULT_NA;
+    if (MIN_AREA != solution) return BOTS_RESULT_UNSUCCESSFUL;
+    return BOTS_RESULT_SUCCESSFUL;
+}
diff --git a/ompss/health/Makefile b/ompss/health/Makefile
new file mode 100644
index 0000000..85e92c5
--- /dev/null
+++ b/ompss/health/Makefile
@@ -0,0 +1,36 @@
+##############################################################################################
+#  This program is part of the Barcelona OpenMP Tasks Suite                                  #
+#  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  #
+#  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   #
+#                                                                                            #
+#  This program is free software; you can redistribute it and/or modify                      #
+#  it under the terms of the GNU General Public License as published by                      #
+#  the Free Software Foundation; either version 2 of the License, or                         #
+#  (at your option) any later version.                                                       #
+#                                                                                            #
+#  This program is distributed in the hope that it will be useful,                           #
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of                            #
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             #
+#  GNU General Public License for more details.                                              #
+#                                                                                            #
+#  You should have received a copy of the GNU General Public License                         #
+#  along with this program; if not, write to the Free Software                               #
+#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            #
+##############################################################################################
+
+LIBS = -lm
+#PROGRAM_OBJS=
+# Extra binaries to build: one per listed cut-off version, plus -tied variants when TIED_VERSIONS is defined.
+CUTOFF_VERSIONS = manual if_clause
+TIED_VERSIONS = YES
+# Top of the source tree, relative to this directory (used to locate common/Makefile.common).
+BASE_DIR = ../../
+
+#
+# Don't change below here 
+#
+
+include ../Makefile.version
+include $(BASE_DIR)/common/Makefile.common
+
+
diff --git a/ompss/health/app-desc.h b/ompss/health/app-desc.h
new file mode 100644
index 0000000..f654474
--- /dev/null
+++ b/ompss/health/app-desc.h
@@ -0,0 +1,52 @@
+/**********************************************************************************************/
+/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
+/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
+/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
+/*                                                                                            */
+/*  This program is free software; you can redistribute it and/or modify                      */
+/*  it under the terms of the GNU General Public License as published by                      */
+/*  the Free Software Foundation; either version 2 of the License, or                         */
+/*  (at your option) any later version.                                                       */
+/*                                                                                            */
+/*  This program is distributed in the hope that it will be useful,                           */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of                            */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             */
+/*  GNU General Public License for more details.                                              */
+/*                                                                                            */
+/*  You should have received a copy of the GNU General Public License                         */
+/*  along with this program; if not, write to the Free Software                               */
+/*  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            */
+/**********************************************************************************************/
+
+#include "ompss-app.h"
+#include "health.h"
+/* Application description for the Health benchmark. NOTE(review): presumably consumed by the common BOTS driver -- confirm. */
+#define BOTS_APP_NAME "Health"
+#define BOTS_APP_PARAMETERS_DESC "%s"
+#define BOTS_APP_PARAMETERS_LIST ,bots_arg_file
+
+//#define BOTS_APP_SELF_TIMING
+
+#define BOTS_APP_USES_ARG_FILE
+#define BOTS_APP_DEF_ARG_FILE "Input filename"
+#define BOTS_APP_DESC_ARG_FILE "Health input file (mandatory)"
+
+#define BOTS_CUTOFF_DEF_VALUE 2
+/* BOTS_APP_INIT declares the variable top and reads the input file; the KERNEL_* macros below all refer to that top */
+#define BOTS_APP_INIT \
+   struct Village *top;\
+   read_input_data(bots_arg_file);
+/* KERNEL_INIT passes &top to allocate_village() together with sim_level */
+#define KERNEL_INIT \
+   allocate_village(&top, NULL, NULL, sim_level, 0);
+/* note: the BOTS_APP_INIT, KERNEL_INIT, KERNEL_CALL and KERNEL_CHECK bodies already end in ';' */
+#define KERNEL_CALL sim_village_main_par(top);
+ 
+#define KERNEL_FINI
+
+//#define KERNEL_SEQ_INIT
+//#define KERNEL_SEQ_CALL
+//#define KERNEL_SEQ_FINI
+/* KERNEL_CHECK hands the same top to check_village() */
+#define KERNEL_CHECK check_village(top);
+
diff --git a/ompss/health/health.c b/ompss/health/health.c
new file mode 100644
index 0000000..7f670c2
--- /dev/null
+++ b/ompss/health/health.c
@@ -0,0 +1,637 @@
+/**********************************************************************************************/
+/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
+/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
+/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
+/**********************************************************************************************/
+
+/* OLDEN parallel C for dynamic structures: compiler, runtime system
+ * and benchmarks
+ *       
+ * Copyright (C) 1994-1996 by Anne Rogers (amr@cs.princeton.edu) and
+ * Martin Carlisle (mcc@cs.princeton.edu)
+ * ALL RIGHTS RESERVED.
+ *
+ * OLDEN is distributed under the following conditions:
+ *
+ * You may make copies of OLDEN for your own use and modify those copies.
+ *
+ * All copies of OLDEN must retain our names and copyright notice.
+ *
+ * You may not sell OLDEN or distribute OLDEN in conjunction with a
+ * commercial product or service without the expressed written consent of
+ * Anne Rogers and Martin Carlisle.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE.
+ *
+ */
+
+
+/******************************************************************* 
+ *  Health.c : Model of the Colombian Health Care System           *
+ *******************************************************************/ 
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <assert.h>
+#include "app-desc.h"
+#include "bots.h"
+#include "health.h"
+
+/* Simulation parameters; read_input_data() fills in all of them except sim_pid. */
+int sim_level;               /* level given to the root village */
+int sim_cities;              /* child villages created under every village */
+int sim_population_ratio;    /* residents created per member of hospital staff */
+int sim_time;                /* number of simulated time steps */
+int sim_assess_time;         /* time steps a patient spends in assessment */
+int sim_convalescence_time;  /* time steps a patient spends inside the hospital */
+int32_t sim_seed;            /* base value for the per-village seeds */
+float sim_get_sick_p;        /* a resident checks in when the draw is below this */
+float sim_convalescence_p;   /* an assessed patient needs treatment when the draw is below this */
+float sim_realloc_p;         /* treatment stays local when the draw is above this */
+int sim_pid = 0;             /* next patient id handed out by allocate_village() */
+/* Expected totals read from the input file; check_village() compares and prints them. */
+int res_population;
+int res_hospitals;
+int res_personnel;
+int res_checkin;
+int res_village;
+int res_waiting;
+int res_assess;
+int res_inside;
+float res_avg_stay;          /* printed only, not part of the verdict */
+
+/**********************************************************
+ * Handles math routines for health.c                     *
+ **********************************************************/
+float my_rand(int32_t *seed) 
+{
+   int32_t k;
+   int32_t idum = *seed;
+   /* One generator step: *seed is replaced by the next state and a float derived from it is returned. */
+   idum ^= MASK;
+   k = idum / IQ;
+   idum = IA * (idum - k * IQ) - IR * k;   /* algebraically IA*idum - k*IM, because IM == IA*IQ + IR */
+   idum ^= MASK;
+   if (idum < 0) idum  += IM;
+   *seed = idum * IM;   /* NOTE(review): this int32_t product can overflow; the expected results presumably depend on it -- confirm before touching */
+   return (float) AM * idum;   /* AM is 1.0 / IM */
+}
+/********************************************************************
+ * Handles lists.                                                   *
+ ********************************************************************/
+void addList(struct Patient **list, struct Patient *patient)
+{
+   /* Append `patient` at the tail of the doubly linked list headed by *list. */
+   struct Patient *tail = *list;
+   if (tail == NULL)
+   {
+      *list = patient;           /* empty list: the new node becomes the head */
+   }
+   else
+   {
+      /* walk to the current last node and hook the new one behind it */
+      while (tail->forward != NULL) tail = tail->forward;
+      tail->forward = patient;
+   }
+   patient->back = tail;         /* NULL when the list was empty */
+   patient->forward = NULL;      /* the appended node is always the last one */
+} 
+void removeList(struct Patient **list, struct Patient *patient) 
+{
+   /*
+    * Unlink `patient` from the doubly linked list headed by *list.
+    *
+    * No search is performed: the node is trusted to be a member of that
+    * list and only its own back/forward links are consulted, so
+    * `patient` must not be NULL. The node's own links are left as they
+    * were; addList() rewrites both of them when the node is queued
+    * again.
+    */
+   struct Patient *prev = patient->back;
+   struct Patient *succ = patient->forward;
+
+   /* take the node out of the forward chain (or move the head past it) */
+   if (prev == NULL) *list = succ;
+   else prev->forward = succ;
+
+   /* take it out of the backward chain, unless it was the last node */
+   if (succ != NULL) succ->back = prev;
+}
+/**********************************************************************/
+void allocate_village( struct Village **capital, struct Village *back,
+   struct Village *next, int level, int32_t vid)
+{ 
+   int i, population, personnel;
+   struct Village *current, *inext;
+   struct Patient *patient;
+   /* Recursively builds the village tree: *capital receives a subtree whose root has `level` (NULL when level <= 0). */
+   if (level <= 0) *capital = NULL;   /* <= 0, so a negative level from a bad input file cannot recurse without end */
+   else
+   {
+      personnel = (int) pow(2, level);   /* staff doubles with every level */
+      population = personnel * sim_population_ratio;
+      /* Allocate Village */
+      *capital = (struct Village *) malloc(sizeof(struct Village));
+      /* Initialize Village: `back` is the parent, `next` the sibling that follows this one */
+      (*capital)->back  = back;
+      (*capital)->next  = next;
+      (*capital)->level = level;
+      (*capital)->id    = vid;
+      (*capital)->seed  = vid * (IQ + sim_seed);
+      (*capital)->population = NULL;
+      for(i=0;i<population;i++)
+      {
+         patient = (struct Patient *)malloc(sizeof(struct Patient));
+         patient->id = sim_pid++;
+         patient->seed = (*capital)->seed;   /* each patient starts from the village's current state */
+         // advance the village seed so that the next patient gets a different one:
+         my_rand(&((*capital)->seed));
+         patient->hosps_visited = 0;
+         patient->time          = 0;
+         patient->time_left     = 0;
+         patient->home_village = *capital; 
+         addList(&((*capital)->population), patient);
+      }
+      /* Initialize Hospital: all staff free, every patient list empty */
+      (*capital)->hosp.personnel = personnel;
+      (*capital)->hosp.free_personnel = personnel;
+      (*capital)->hosp.assess = NULL;
+      (*capital)->hosp.waiting = NULL;
+      (*capital)->hosp.inside = NULL;
+      (*capital)->hosp.realloc = NULL;
+      omp_init_lock(&(*capital)->hosp.realloc_lock);
+      // Create Cities (lower level), last child first so that each one can point at its successor
+      inext = NULL;
+      for (i = sim_cities; i>0; i--)
+      {
+         allocate_village(&current, *capital, inext, level-1, (vid * (int32_t) sim_cities)+ (int32_t) i);
+         inext = current;
+      }
+      (*capital)->forward = inext;   /* first child; inext (not current) because current is never assigned when sim_cities <= 0 */
+   }
+}
+/**********************************************************************/
+struct Results get_results(struct Village *village)
+{
+   struct Village *vlist;
+   struct Patient *p;
+   struct Results t_res, p_res;
+   /* Totals for the subtree rooted at `village` (all zero for NULL), accumulated in integer arithmetic only. */
+   t_res.hosps_number     = 0;
+   t_res.hosps_personnel  = 0;
+   t_res.total_patients   = 0;
+   t_res.total_in_village = 0;
+   t_res.total_waiting    = 0;
+   t_res.total_assess     = 0;
+   t_res.total_inside     = 0;
+   t_res.total_hosps_v    = 0;
+   t_res.total_time       = 0;
+
+   if (village == NULL) return t_res;
+
+   /* Traverse village hierarchy (lower level first)*/
+   vlist = village->forward;
+   while(vlist)
+   {
+      p_res = get_results(vlist);
+      t_res.hosps_number     += p_res.hosps_number;
+      t_res.hosps_personnel  += p_res.hosps_personnel;
+      t_res.total_patients   += p_res.total_patients;
+      t_res.total_in_village += p_res.total_in_village;
+      t_res.total_waiting    += p_res.total_waiting;
+      t_res.total_assess     += p_res.total_assess;
+      t_res.total_inside     += p_res.total_inside;
+      t_res.total_hosps_v    += p_res.total_hosps_v;
+      t_res.total_time       += p_res.total_time;
+      vlist = vlist->next;
+   }
+   t_res.hosps_number     += 1;   /* this village's own hospital */
+   t_res.hosps_personnel  += village->hosp.personnel;
+   /* The counters are long: adding through a float would round once a sum exceeds 2^24. */
+   // Patients in the village
+   p = village->population;
+   while (p != NULL) 
+   {
+      t_res.total_patients   += 1;
+      t_res.total_in_village += 1;
+      t_res.total_hosps_v    += p->hosps_visited;
+      t_res.total_time       += p->time; 
+      p = p->forward; 
+   }
+   // Patients in hospital: waiting
+   p = village->hosp.waiting;
+   while (p != NULL) 
+   {
+      t_res.total_patients += 1;
+      t_res.total_waiting  += 1;
+      t_res.total_hosps_v  += p->hosps_visited;
+      t_res.total_time     += p->time; 
+      p = p->forward; 
+   }
+   // Patients in hospital: assess
+   p = village->hosp.assess;
+   while (p != NULL) 
+   {
+      t_res.total_patients += 1;
+      t_res.total_assess   += 1;
+      t_res.total_hosps_v  += p->hosps_visited;
+      t_res.total_time     += p->time; 
+      p = p->forward; 
+   }
+   // Patients in hospital: inside
+   p = village->hosp.inside;
+   while (p != NULL) 
+   {
+      t_res.total_patients += 1;
+      t_res.total_inside   += 1;
+      t_res.total_hosps_v  += p->hosps_visited;
+      t_res.total_time     += p->time; 
+      p = p->forward; 
+   }  
+
+   return t_res; 
+}
+/**********************************************************************/
+/**********************************************************************/
+/**********************************************************************/
+void check_patients_inside(struct Village *village) 
+{
+   /* One time step for the patients under treatment: every countdown is */
+   /* decremented and whoever reaches zero goes back to the population.   */
+   struct Patient *cur, *succ;
+
+   for (cur = village->hosp.inside; cur != NULL; cur = succ)
+   {
+      /* saved first: cur may be moved to another list below */
+      succ = cur->forward;
+      if (--cur->time_left != 0) continue;
+
+      /* treatment finished: free the staff member, send the patient home */
+      village->hosp.free_personnel++;
+      removeList(&(village->hosp.inside), cur);
+      addList(&(village->population), cur);
+   }
+}
+/**********************************************************************/
+void check_patients_assess_par(struct Village *village) 
+{
+   struct Patient *list = village->hosp.assess;
+   float draw;
+   struct Patient *p;
+   /* One time step for the patients under assessment; when a countdown ends the patient is treated here, sent up or sent home. */
+   while (list != NULL) 
+   {
+      p = list;
+      list = list->forward;   /* saved first: p may be moved to another list below */
+      p->time_left--;
+
+      if (p->time_left == 0) 
+      { 
+         draw = my_rand(&(p->seed));
+         /* below sim_convalescence_p: the patient needs treatment */
+         if (draw < sim_convalescence_p)
+         {
+            draw = my_rand(&(p->seed));
+            /* above sim_realloc_p, or root hospital (no upper level): treat here, staff stays busy */
+            if (draw > sim_realloc_p || village->level == sim_level) 
+            {
+               removeList(&(village->hosp.assess), p);
+               addList(&(village->hosp.inside), p);
+               p->time_left = sim_convalescence_time;
+               p->time += p->time_left;
+            }
+            else /* move to upper level hospital !!! */
+            {
+               village->hosp.free_personnel++;
+               removeList(&(village->hosp.assess), p);
+               omp_set_lock(&(village->back->hosp.realloc_lock));   /* the PARENT's lock: sibling tasks append to the same list */
+               addList(&(village->back->hosp.realloc), p); 
+               omp_unset_lock(&(village->back->hosp.realloc_lock));
+            } 
+         }
+         else /* move to village */
+         {
+            village->hosp.free_personnel++;
+            removeList(&(village->hosp.assess), p);
+            addList(&(village->population), p); 
+         }
+      }
+   } 
+}
+/**********************************************************************/
+void check_patients_waiting(struct Village *village) 
+{
+   /* One time step for the waiting queue: patients are served in queue order */
+   /* while staff is free; everybody else accumulates one more unit of time.  */
+   struct Hosp *hosp = &(village->hosp);
+   struct Patient *cur, *succ;
+
+   for (cur = hosp->waiting; cur != NULL; cur = succ)
+   {
+      succ = cur->forward;   /* saved now: cur may leave this list below */
+      if (hosp->free_personnel <= 0)
+      {
+         cur->time++;        /* nobody available: keep waiting */
+         continue;
+      }
+      /* a staff member is free: start the assessment right away */
+      hosp->free_personnel--;
+      cur->time_left = sim_assess_time;
+      cur->time += sim_assess_time;
+      removeList(&(hosp->waiting), cur);
+      addList(&(hosp->assess), cur);
+   }
+}
+/**********************************************************************/
+void check_patients_realloc(struct Village *village)
+{
+   /* Admit every patient queued on this hospital's realloc list, lowest id */
+   /* first, so the admission order does not depend on the queueing order.  */
+   struct Hosp *hosp = &(village->hosp);
+   struct Patient *scan, *lowest;
+
+   while ((lowest = hosp->realloc) != NULL)
+   {
+      /* selection step: find the smallest id still queued */
+      for (scan = lowest->forward; scan != NULL; scan = scan->forward)
+         if (scan->id < lowest->id) lowest = scan;
+      removeList(&(hosp->realloc), lowest);
+      put_in_hosp(hosp, lowest);
+   }
+}
+/**********************************************************************/
+void check_patients_population(struct Village *village) 
+{
+   /* One time step for the residents: each one draws a number from their  */
+   /* own generator and checks in at the local hospital when the draw is   */
+   /* below sim_get_sick_p.                                                */
+   struct Patient *resident, *succ;
+   float draw;
+
+   for (resident = village->population; resident != NULL; resident = succ)
+   {
+      succ = resident->forward;            /* saved first: the node may be re-linked below */
+      draw = my_rand(&(resident->seed));   /* exactly one draw per resident and step */
+      if (draw < sim_get_sick_p)
+      {
+         removeList(&(village->population), resident);
+         put_in_hosp(&(village->hosp), resident);
+      }
+   }
+
+}
+/**********************************************************************/
+void put_in_hosp(struct Hosp *hosp, struct Patient *patient) 
+{
+   /* Check `patient` in at `hosp`: straight into assessment when a staff */
+   /* member is free, otherwise at the end of the waiting queue.          */
+   patient->hosps_visited += 1;
+   if (hosp->free_personnel <= 0)
+   {
+      addList(&(hosp->waiting), patient);
+      return;
+   }
+   /* reserve one staff member and charge the whole assessment time up front */
+   hosp->free_personnel -= 1;
+   patient->time_left = sim_assess_time;
+   patient->time += sim_assess_time;
+   addList(&(hosp->assess), patient);
+}
+/**********************************************************************/
+#if defined (IF_CUTOFF)
+void sim_village_par(struct Village *village)
+{
+   struct Village *vlist;
+   /* One simulation step for `village` and its whole subtree (IF_CUTOFF build: every child is a task with an if clause). */
+   // An empty tree is the only way to get here with NULL:
+   // the loop below never recurses into a NULL child,
+   // so this guard only matters for the outermost call.
+   if (village == NULL) return;
+
+   /* Children first: one task per child; the if clause holds only while (sim_level - village->level) < bots_cutoff_value */
+   vlist = village->forward;
+   while(vlist)
+   {
+#pragma omp task untied if((sim_level - village->level) < bots_cutoff_value)
+      sim_village_par(vlist);
+      vlist = vlist->next;
+   }
+
+   /* Uses lists v->hosp.inside and v->population */
+   check_patients_inside(village);
+
+   /* Uses lists v->hosp.assess, v->hosp.inside, v->population and (v->back->hosp.realloc) !!! */
+   check_patients_assess_par(village);
+
+   /* Uses lists v->hosp.waiting and v->hosp.assess */
+   check_patients_waiting(village);
+   /* The children append to v->hosp.realloc, so every child task must be done before that list is drained. */
+#pragma omp taskwait
+
+   /* Uses lists v->hosp.realloc, v->hosp.assess and v->hosp.waiting */
+   check_patients_realloc(village);
+
+   /* Uses lists v->population, v->hosp.assess and v->hosp.waiting */
+   check_patients_population(village);
+}
+#elif defined (MANUAL_CUTOFF)
+void sim_village_par(struct Village *village)
+{
+   struct Village *vlist;
+   /* One simulation step for `village` and its whole subtree (MANUAL_CUTOFF build: tasks only above the cut-off depth). */
+   // An empty tree is the only way to get here with NULL:
+   // the loops below never recurse into a NULL child,
+   // so this guard only matters for the outermost call.
+   if (village == NULL) return;
+
+   /* Children first: tasks while (sim_level - village->level) < bots_cutoff_value, plain recursive calls otherwise */
+   vlist = village->forward;
+   if ((sim_level-village->level) < bots_cutoff_value)
+   {
+      while(vlist)
+      {
+#pragma omp task untied
+         sim_village_par(vlist);
+         vlist = vlist->next;
+      }
+   }
+   else
+   {
+      while(vlist)
+      {
+         sim_village_par(vlist);
+         vlist = vlist->next;
+      }
+   }
+
+   /* Uses lists v->hosp.inside and v->population */
+   check_patients_inside(village);
+
+   /* Uses lists v->hosp.assess, v->hosp.inside, v->population and (v->back->hosp.realloc) !!! */
+   check_patients_assess_par(village);
+
+   /* Uses lists v->hosp.waiting and v->hosp.assess */
+   check_patients_waiting(village);
+   /* Same condition as the task creation above: wait only when child tasks were actually spawned. */
+   if ((sim_level-village->level) < bots_cutoff_value)
+   {
+#pragma omp taskwait
+   }
+
+   /* Uses lists v->hosp.realloc, v->hosp.assess and v->hosp.waiting */
+   check_patients_realloc(village);
+
+   /* Uses lists v->population, v->hosp.assess and v->hosp.waiting */
+   check_patients_population(village);
+}
+#else
+void sim_village_par(struct Village *village)
+{
+   struct Village *vlist;
+   /* One simulation step for `village` and its whole subtree (default build: every child is a task, no cut-off). */
+   // An empty tree is the only way to get here with NULL:
+   // the loop below never recurses into a NULL child,
+   // so this guard only matters for the outermost call.
+   if (village == NULL) return;
+
+   /* Children first: one task per child village */
+   vlist = village->forward;
+   while(vlist)
+   {
+#pragma omp task untied
+      sim_village_par(vlist);
+      vlist = vlist->next;
+   }
+
+   /* Uses lists v->hosp.inside and v->population */
+   check_patients_inside(village);
+
+   /* Uses lists v->hosp.assess, v->hosp.inside, v->population and (v->back->hosp.realloc) !!! */
+   check_patients_assess_par(village);
+
+   /* Uses lists v->hosp.waiting and v->hosp.assess */
+   check_patients_waiting(village);
+   /* The children append to v->hosp.realloc, so every child task must be done before that list is drained. */
+#pragma omp taskwait
+
+   /* Uses lists v->hosp.realloc, v->hosp.assess and v->hosp.waiting */
+   check_patients_realloc(village);
+
+   /* Uses lists v->population, v->hosp.assess and v->hosp.waiting */
+   check_patients_population(village);
+}
+#endif
+/**********************************************************************/
+void my_print(struct Village *village)
+{
+   /* Debug dump of a subtree, children before their parent. For every  */
+   /* village this emits one "[pid:N]" per node of its population list  */
+   /* and then "[vid:N]" with the village's own id. Output goes through */
+   /* bots_debug().                                                     */
+   struct Village *child;
+   struct Patient *resident;
+
+   if (village == NULL) return;
+
+   /* lower levels first */
+   for (child = village->forward; child != NULL; child = child->next) {
+      my_print(child);
+   }
+
+   /* only the population list is walked; the hospital lists are not dumped */
+   for (resident = village->population; resident != NULL; resident = resident->forward) {
+      bots_debug("[pid:%d]",resident->id);
+   }
+   bots_debug("[vid:%d]\n",village->id);
+
+}
+/* read_input_data: loads the simulation parameters and the expected results from `filename`; exits on any error. */
+void read_input_data(char *filename)
+{
+   FILE *fin;
+   int res;
+   long seed_in = 0;   /* target of %ld; copied into the int32_t sim_seed once the read succeeded */
+   if ((fin = fopen(filename, "r")) == NULL) {
+      bots_message("Could not open sequence file (%s)\n", filename);
+      exit (-1);
+   }
+   res = fscanf(fin,"%d %d %d %d %d %d %ld %f %f %f %d %d %d %d %d %d %d %d %f", 
+             &sim_level,
+             &sim_cities,
+             &sim_population_ratio,
+             &sim_time, 
+             &sim_assess_time,
+             &sim_convalescence_time,
+             &seed_in, 
+             &sim_get_sick_p,
+             &sim_convalescence_p,
+             &sim_realloc_p,
+             &res_population,
+             &res_hospitals,
+             &res_personnel,
+             &res_checkin,
+             &res_village,
+             &res_waiting,
+             &res_assess,
+             &res_inside,
+             &res_avg_stay
+   );
+   if ( res != 19 ) {   /* all 19 fields are mandatory; testing for EOF alone lets truncated files through */
+      bots_message("Bogus input file (%s)\n", filename);
+      exit(-1);
+   }
+   fclose(fin);
+   sim_seed = (int32_t) seed_in;
+      // Printing input data
+   bots_message("\n");
+   bots_message("Number of levels    = %d\n", (int) sim_level);
+   bots_message("Cities per level    = %d\n", (int) sim_cities);
+   bots_message("Population ratio    = %d\n", (int) sim_population_ratio);
+   bots_message("Simulation time     = %d\n", (int) sim_time);
+   bots_message("Assess time         = %d\n", (int) sim_assess_time);
+   bots_message("Convalescence time  = %d\n", (int) sim_convalescence_time);
+   bots_message("Initial seed        = %d\n", (int) sim_seed);
+   bots_message("Get sick prob.      = %f\n", (float) sim_get_sick_p);
+   bots_message("Convalescence prob. = %f\n", (float) sim_convalescence_p);
+   bots_message("Realloc prob.       = %f\n", (float) sim_realloc_p);
+}
+int check_village(struct Village *top)
+{
+   /* Compare the totals of the final tree with the expected values from the */
+   /* input file, print both side by side and return the BOTS verdict.       */
+   struct Results result = get_results(top);
+   int mismatch =
+         res_population != result.total_patients
+      || res_hospitals  != result.hosps_number
+      || res_personnel  != result.hosps_personnel
+      || res_checkin    != result.total_hosps_v
+      || res_village    != result.total_in_village
+      || res_waiting    != result.total_waiting
+      || res_assess     != result.total_assess
+      || res_inside     != result.total_inside;   /* the average stay is reported but not verified */
+
+   bots_message("\n");
+   bots_message("Sim. Variables      = expect / result\n");
+   bots_message("Total population    = %6d / %6d people\n", (int)   res_population, (int) result.total_patients);
+   bots_message("Hospitals           = %6d / %6d people\n", (int)   res_hospitals, (int) result.hosps_number);
+   bots_message("Personnel           = %6d / %6d people\n", (int)   res_personnel, (int) result.hosps_personnel);
+   bots_message("Check-in's          = %6d / %6d people\n", (int)   res_checkin, (int) result.total_hosps_v);
+   bots_message("In Villages         = %6d / %6d people\n", (int)   res_village, (int) result.total_in_village);
+   bots_message("In Waiting List     = %6d / %6d people\n", (int)   res_waiting, (int) result.total_waiting);
+   bots_message("In Assess           = %6d / %6d people\n", (int)   res_assess, (int) result.total_assess);
+   bots_message("Inside Hospital     = %6d / %6d people\n", (int)   res_inside, (int) result.total_inside);
+   bots_message("Average Stay        = %6f / %6f u/time\n", (float) res_avg_stay,(float) result.total_time/result.total_patients);
+
+   my_print(top);
+   return mismatch ? BOTS_RESULT_UNSUCCESSFUL : BOTS_RESULT_SUCCESSFUL;
+}
+/**********************************************************************/
+void sim_village_main_par(struct Village *top)
+{
+   long step = 0;   /* simulated time steps completed so far */
+   while (step < sim_time) { sim_village_par(top); step++; }   /* one traversal of the whole tree per step */
+}
+
diff --git a/ompss/health/health.h b/ompss/health/health.h
new file mode 100644
index 0000000..4fc293c
--- /dev/null
+++ b/ompss/health/health.h
@@ -0,0 +1,106 @@
+/**********************************************************************************************/
+/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
+/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
+/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
+/*                                                                                            */
+/*  This program is free software; you can redistribute it and/or modify                      */
+/*  it under the terms of the GNU General Public License as published by                      */
+/*  the Free Software Foundation; either version 2 of the License, or                         */
+/*  (at your option) any later version.                                                       */
+/*                                                                                            */
+/*  This program is distributed in the hope that it will be useful,                           */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of                            */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             */
+/*  GNU General Public License for more details.                                              */
+/*                                                                                            */
+/*  You should have received a copy of the GNU General Public License                         */
+/*  along with this program; if not, write to the Free Software                               */
+/*  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            */
+/**********************************************************************************************/
+#ifndef _HEALTH_H
+#define _HEALTH_H
+/* random defines: constants of the generator implemented by my_rand() */
+#define IA 16807            /* multiplier */
+#define IM 2147483647       /* modulus, 2^31 - 1 */
+#define AM (1.0 / IM)       /* scales a state value by 1/IM */
+#define IQ 127773           /* IM / IA (integer quotient) */
+#define IR 2836             /* IM % IA, so IM == IA * IQ + IR */
+#define MASK 123459876      /* XORed into the state twice per my_rand() call */
+
+struct Results {            /* totals over a village subtree, as computed by get_results() */
+   long hosps_number;       /* villages (one hospital each) in the subtree */
+   long hosps_personnel;    /* sum of hosp.personnel */
+   long total_patients;     /* patients found on the population, waiting, assess and inside lists */
+   long total_in_village;   /* patients on population lists */
+   long total_waiting;      /* patients on hosp.waiting lists */
+   long total_assess;       /* patients on hosp.assess lists */
+   long total_inside;       /* patients on hosp.inside lists */
+   long total_time;         /* sum of Patient.time */
+   long total_hosps_v;      /* sum of Patient.hosps_visited */
+};
+
+extern int sim_level;
+
+struct Patient {                /* node of a doubly linked patient list */
+   int id;                      /* taken from sim_pid when the patient is created */
+   int32_t seed;                /* state of this patient's own random generator */
+   int time;                    /* time accumulated while waiting, in assessment and under treatment */
+   int time_left;               /* countdown for the current assessment or treatment */
+   int hosps_visited;           /* incremented by every put_in_hosp() */
+   struct Village *home_village; /* village the patient was created in */
+   struct Patient *back;        /* previous node, NULL for the head */
+   struct Patient *forward;     /* next node, NULL for the tail */
+};
+struct Hosp {                  /* hospital embedded in every Village */
+   int personnel;              /* total staff, fixed at creation */
+   int free_personnel;         /* staff not currently assigned to a patient */
+   struct Patient *waiting;    /* checked in, no staff available yet */
+   struct Patient *assess;     /* being assessed */
+   struct Patient *inside;     /* under treatment */
+   struct Patient *realloc;    /* sent here by child villages, drained by check_patients_realloc() */
+   omp_lock_t  realloc_lock;   /* initialised in allocate_village(); taken in check_patients_assess_par() around the realloc insertion */
+};
+struct Village {               /* node of the village tree */
+   int id;                     /* 0 for the root; children get parent id * sim_cities + 1..sim_cities */
+   struct Village *back;       /* parent village, NULL for the root */
+   struct Village *next;       /* next sibling, NULL for the last one */
+   struct Village *forward;    /* first child */
+   struct Patient *population; /* residents currently not in hospital */
+   struct Hosp hosp;           /* the village's hospital */
+   int level;                  /* sim_level at the root, one less per generation */
+   int32_t  seed;              /* generator state used to seed the village's patients */
+};
+
+float my_rand(int32_t *seed);
+/* Hospital admission */
+struct Patient *generate_patient(struct Village *village);   /* NOTE(review): no definition in health.c -- stale? */
+void put_in_hosp(struct Hosp *hosp, struct Patient *patient);
+/* Doubly linked patient lists */
+void addList(struct Patient **list, struct Patient *patient);
+void removeList(struct Patient **list, struct Patient *patient);
+/* Per-village phases of one simulation step */
+void check_patients_inside(struct Village *village);
+void check_patients_waiting(struct Village *village);
+void check_patients_realloc(struct Village *village);
+
+void check_patients_assess_par(struct Village *village);
+/* NOTE(review): the three getters below have no definition in health.c -- stale? */
+float get_num_people(struct Village *village);
+float get_total_time(struct Village *village);
+float get_total_hosps(struct Village *village);
+
+struct Results get_results(struct Village *village);
+/* Set-up, simulation driver and verification */
+void read_input_data(char *filename);
+void allocate_village( struct Village **capital, struct Village *back, struct Village *next, int level, int32_t vid);
+void sim_village_main_par(struct Village *top);
+
+void sim_village_par(struct Village *village);
+int check_village(struct Village *top);
+
+void check_patients_assess(struct Village *village);   /* NOTE(review): no definition in health.c -- stale? */
+void check_patients_population(struct Village *village);
+void sim_village(struct Village *village);             /* NOTE(review): no definition in health.c -- stale? */
+void my_print(struct Village *village);
+
+#endif
diff --git a/ompss/nqueens/Makefile b/ompss/nqueens/Makefile
new file mode 100644
index 0000000..c26481b
--- /dev/null
+++ b/ompss/nqueens/Makefile
@@ -0,0 +1,36 @@
+##############################################################################################
+#  This program is part of the Barcelona OpenMP Tasks Suite                                  #
+#  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  #
+#  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   #
+#                                                                                            #
+#  This program is free software; you can redistribute it and/or modify                      #
+#  it under the terms of the GNU General Public License as published by                      #
+#  the Free Software Foundation; either version 2 of the License, or                         #
+#  (at your option) any later version.                                                       #
+#                                                                                            #
+#  This program is distributed in the hope that it will be useful,                           #
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of                            #
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             #
+#  GNU General Public License for more details.                                              #
+#                                                                                            #
+#  You should have received a copy of the GNU General Public License                         #
+#  along with this program; if not, write to the Free Software                               #
+#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            #
+##############################################################################################
+
+#LIBS = 
+#PROGRAM_OBJS=
+# Extra binaries built by common/Makefile.common: one per cut-off flavour listed here, plus a -tied variant of every target.
+CUTOFF_VERSIONS = manual if_clause final 
+TIED_VERSIONS = yes
+# Relative path from this directory to the top of the suite.
+BASE_DIR = ../../
+
+#
+# Don't change below here 
+#
+
+include ../Makefile.version
+include $(BASE_DIR)/common/Makefile.common
+
+
diff --git a/ompss/nqueens/app-desc.h b/ompss/nqueens/app-desc.h
new file mode 100644
index 0000000..60c7b99
--- /dev/null
+++ b/ompss/nqueens/app-desc.h
@@ -0,0 +1,43 @@
+/**********************************************************************************************/
+/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
+/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
+/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
+/*                                                                                            */
+/*  This program is free software; you can redistribute it and/or modify                      */
+/*  it under the terms of the GNU General Public License as published by                      */
+/*  the Free Software Foundation; either version 2 of the License, or                         */
+/*  (at your option) any later version.                                                       */
+/*                                                                                            */
+/*  This program is distributed in the hope that it will be useful,                           */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of                            */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             */
+/*  GNU General Public License for more details.                                              */
+/*                                                                                            */
+/*  You should have received a copy of the GNU General Public License                         */
+/*  along with this program; if not, write to the Free Software                               */
+/*  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            */
+/**********************************************************************************************/
+
+#include "ompss-app.h"
+
+#define BOTS_APP_NAME "N Queens"
+#define BOTS_APP_PARAMETERS_DESC "N=%d"
+#define BOTS_APP_PARAMETERS_LIST ,bots_arg_size
+/* Size argument; NOTE(review): 14 is presumably the board size used when none is given -- confirm. */
+#define BOTS_APP_USES_ARG_SIZE
+#define BOTS_APP_DEF_ARG_SIZE 14
+#define BOTS_APP_DESC_ARG_SIZE "Board size"
+/* Kernel functions; NOTE(review): defined outside this header, presumably in nqueens.c. */
+int ok(int n, char *a);
+
+void nqueens(int n, int j, char *a, int *solutions, int depth);
+
+void nqueens_ser (int n, int j, char *a, int *solutions);
+
+int verify_queens(int);
+void find_queens (int);
+/* Harness hooks: both are called with the board size. */
+#define KERNEL_CALL find_queens(bots_arg_size)
+#define KERNEL_CHECK verify_queens(bots_arg_size)
+/* NOTE(review): presumably the default cut-off value for the cut-off builds -- confirm against the harness. */
+#define BOTS_CUTOFF_DEF_VALUE 3
diff --git a/ompss/nqueens/nqueens.c b/ompss/nqueens/nqueens.c
new file mode 100644
index 0000000..471cfeb
--- /dev/null
+++ b/ompss/nqueens/nqueens.c
@@ -0,0 +1,290 @@
+/**********************************************************************************************/
+/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
+/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
+/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
+/*                                                                                            */
+/*  This program is free software; you can redistribute it and/or modify                      */
+/*  it under the terms of the GNU General Public License as published by                      */
+/*  the Free Software Foundation; either version 2 of the License, or                         */
+/*  (at your option) any later version.                                                       */
+/*                                                                                            */
+/*  This program is distributed in the hope that it will be useful,                           */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of                            */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             */
+/*  GNU General Public License for more details.                                              */
+/*                                                                                            */
+/*  You should have received a copy of the GNU General Public License                         */
+/*  along with this program; if not, write to the Free Software                               */
+/*  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            */
+/**********************************************************************************************/
+
+/*
+ * Original code from the Cilk project (by Keith Randall)
+ * 
+ * Copyright (c) 2000 Massachusetts Institute of Technology
+ * Copyright (c) 2000 Matteo Frigo
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <memory.h>
+#include <alloca.h>
+#include "bots.h"
+#include "app-desc.h"
+#include <omp.h>
+
+
+/* Checking information: solutions[k] is the expected solution count for board size k+1 */
+
+static int solutions[] = {
+        1,
+        0,
+        0,
+        2,
+        10, /* 5 */
+        4,
+        40,
+        92,
+        352,
+        724, /* 10 */
+        2680,
+        14200,
+        73712,
+        365596,
+};
+#define MAX_SOLUTIONS (sizeof(solutions)/sizeof(solutions[0])) /* number of table entries; parenthesized so it expands safely inside larger expressions */
+
+int total_count;
+
+/*
+ * Checks the first <n> entries of <a>, where a[k] is the position of queen k.
+ * Returns 1 if none of those queens conflict, and returns 0 otherwise.
+ */
+int ok(int n, char *a)
+{
+     int i, j;
+     char p, q;
+
+     for (i = 0; i < n; i++) {
+	  p = a[i];
+
+	  for (j = i + 1; j < n; j++) {
+	       q = a[j];
+	       if (q == p || q == p - (j - i) || q == p + (j - i)) /* same position, or same diagonal */
+		    return 0;
+	  }
+     }
+     return 1;
+}
+/* Serial search: stores in *solutions the number of completions of the first <j> queens in <a> on an <n>-queen board. */
+void nqueens_ser (int n, int j, char *a, int *solutions)
+{
+	int res;
+	int i;
+
+	if (n == j) {
+		/* good solution, count it */
+		*solutions = 1;
+		return;
+	}
+
+	*solutions = 0;
+
+     	/* try each possible position for queen <j> */
+	for (i = 0; i < n; i++) {
+		{
+	  		/* no temporary copy here: queen <j> is written straight into <a> */
+	  		a[j] = (char) i;
+	  		if (ok(j + 1, a)) {
+	       			nqueens_ser(n, j + 1, a,&res);
+				*solutions += res;
+			}
+		}
+	}
+}
+
+#if defined(IF_CUTOFF)
+/* if-clause cutoff variant: every candidate position becomes a task carrying if(depth < bots_cutoff_value). */
+void nqueens(int n, int j, char *a, int *solutions, int depth)
+{
+	int *csols;
+	int i;
+
+	if (n == j) {
+		/* good solution, count it */
+		*solutions = 1;
+		return;
+	}
+
+	/* csols holds one zero-initialised result slot per candidate position of queen <j> */
+	*solutions = 0;
+	csols = alloca(n*sizeof(int));
+	memset(csols,0,n*sizeof(int));
+
+     	/* try each possible position for queen <j> */
+	for (i = 0; i < n; i++) {
+ 		#pragma omp task untied if(depth < bots_cutoff_value)
+		{
+	  		/* allocate a temporary array and copy <a> into it */
+	  		char * b = alloca(n * sizeof(char));
+	  		memcpy(b, a, j * sizeof(char));
+	  		b[j] = (char) i;
+	  		if (ok(j + 1, b))
+	       			nqueens(n, j + 1, b,&csols[i],depth+1);
+		}
+	}
+	/* wait for the child tasks, then add up their per-position counts */
+	#pragma omp taskwait
+	for ( i = 0; i < n; i++) *solutions += csols[i];
+}
+
+#elif defined(FINAL_CUTOFF)
+/* final-clause cutoff variant: child tasks carry final(depth+1 >= bots_cutoff_value) and are mergeable. */
+void nqueens(int n, int j, char *a, int *solutions, int depth)
+{
+	int *csols;
+	int i;
+
+
+	if (n == j) {
+		/* good solution, count it */
+		*solutions += 1; /* += (not =): past the cutoff several calls add into the same counter */
+		return;
+	}
+
+	/* remember whether this call already runs inside a final task; csols is only set up when it does not */
+        char final = omp_in_final();
+        if ( !final ) {
+	  *solutions = 0;
+	  csols = alloca(n*sizeof(int));
+	  memset(csols,0,n*sizeof(int));
+        }
+
+     	/* try each possible position for queen <j> */
+	for (i = 0; i < n; i++) {
+ 		#pragma omp task untied final(depth+1 >= bots_cutoff_value) mergeable
+		{
+                        char *b;
+                        int *sol;
+			if ( omp_in_final() && depth+1 > bots_cutoff_value ) {
+		           b = a; /* reuse the caller's board instead of copying it */
+                           sol = solutions; /* and accumulate straight into the caller's counter */
+                        } else {
+	  		/* allocate a temporary array and copy <a> into it */
+	  		   b = alloca(n * sizeof(char));
+	  		   memcpy(b, a, j * sizeof(char));
+                           sol = &csols[i];
+                        } 
+	  		b[j] = i;
+	  		if (ok(j + 1, b))
+	       			nqueens(n, j + 1, b,sol,depth+1);
+		}
+	}
+	/* the per-position slots are summed only by calls that set csols up above */
+	#pragma omp taskwait
+       if ( !final ) {
+	for ( i = 0; i < n; i++) *solutions += csols[i];
+       }
+}
+
+#elif defined(MANUAL_CUTOFF)
+/* manual cutoff variant: spawns tasks while depth < bots_cutoff_value, otherwise calls nqueens_ser directly. */
+void nqueens(int n, int j, char *a, int *solutions, int depth)
+{
+	int *csols;
+	int i;
+
+
+	if (n == j) {
+		/* good solution, count it */
+		*solutions = 1;
+		return;
+	}
+
+	/* csols holds one zero-initialised result slot per candidate position of queen <j> */
+	*solutions = 0;
+	csols = alloca(n*sizeof(int));
+	memset(csols,0,n*sizeof(int));
+
+     	/* try each possible position for queen <j> */
+	for (i = 0; i < n; i++) {
+		if ( depth < bots_cutoff_value ) {
+ 			#pragma omp task untied
+			{
+	  			/* allocate a temporary array and copy <a> into it */
+	  			char * b = alloca(n * sizeof(char));
+	  			memcpy(b, a, j * sizeof(char));
+	  			b[j] = (char) i;
+	  			if (ok(j + 1, b))
+	       				nqueens(n, j + 1, b,&csols[i],depth+1);
+			}
+		} else {
+  			a[j] = (char) i; /* no task and no copy: queen <j> is written into <a> itself */
+  			if (ok(j + 1, a))
+       				nqueens_ser(n, j + 1, a,&csols[i]);
+		}
+	}
+	/* wait for any child tasks, then add up the per-position counts */
+	#pragma omp taskwait
+	for ( i = 0; i < n; i++) *solutions += csols[i];
+}
+
+
+#else 
+/* no-cutoff variant: every candidate position of queen <j> is explored in its own untied task. */
+void nqueens(int n, int j, char *a, int *solutions, int depth)
+{
+	int *csols;
+	int i;
+
+
+	if (n == j) {
+		/* good solution, count it */
+		*solutions = 1;
+		return;
+	}
+
+	*solutions = 0;
+	csols = alloca(n*sizeof(int));
+	memset(csols,0,n*sizeof(int));
+
+     	/* try each possible position for queen <j> */
+	for (i = 0; i < n; i++) {
+ 		#pragma omp task untied
+		{
+	  		/* allocate a temporary array and copy <a> into it */
+	  		char * b = alloca(n * sizeof(char));
+	  		memcpy(b, a, j * sizeof(char));
+	  		b[j] = (char) i;
+	  		if (ok(j + 1, b))
+       				nqueens(n, j + 1, b,&csols[i],depth+1); /* depth+1 as in the cutoff variants; depth is only forwarded here */
+		}
+	}
+	/* wait for the child tasks, then add up their per-position counts */
+	#pragma omp taskwait
+	for ( i = 0; i < n; i++) *solutions += csols[i];
+}
+
+#endif
+/* Kernel entry point: resets total_count and runs nqueens for <size> queens, leaving the result in total_count. */
+void find_queens (int size)
+{
+	total_count=0;
+
+        bots_message("Computing N-Queens algorithm (n=%d) ", size);
+
+			char *a;
+			/* board buffer with one entry per queen; nqueens starts from queen 0 at depth 0 */
+			a = alloca(size * sizeof(char));
+			nqueens(size, 0, a, &total_count,0);
+
+	bots_message(" completed!\n");
+}
+
+/* Compares total_count with the table entry for <size>; sizes without an entry (below 1 or past the table) yield BOTS_RESULT_NA. */
+int verify_queens (int size)
+{
+	if ( size < 1 || size > MAX_SOLUTIONS ) return BOTS_RESULT_NA;
+	if ( total_count == solutions[size-1]) return BOTS_RESULT_SUCCESSFUL;
+	return BOTS_RESULT_UNSUCCESSFUL;
+}
diff --git a/ompss/sort/Makefile b/ompss/sort/Makefile
new file mode 100644
index 0000000..2e5be6a
--- /dev/null
+++ b/ompss/sort/Makefile
@@ -0,0 +1,35 @@
+##############################################################################################
+#  This program is part of the Barcelona OpenMP Tasks Suite                                  #
+#  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  #
+#  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   #
+#                                                                                            #
+#  This program is free software; you can redistribute it and/or modify                      #
+#  it under the terms of the GNU General Public License as published by                      #
+#  the Free Software Foundation; either version 2 of the License, or                         #
+#  (at your option) any later version.                                                       #
+#                                                                                            #
+#  This program is distributed in the hope that it will be useful,                           #
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of                            #
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             #
+#  GNU General Public License for more details.                                              #
+#                                                                                            #
+#  You should have received a copy of the GNU General Public License                         #
+#  along with this program; if not, write to the Free Software                               #
+#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            #
+##############################################################################################
+
+#LIBS = 
+#PROGRAM_OBJS=
+
+TIED_VERSIONS = yes
+
+BASE_DIR = ../../
+
+#
+# Don't change below here 
+#
+
+include ../Makefile.version
+include $(BASE_DIR)/common/Makefile.common
+
+
diff --git a/ompss/sort/app-desc.h b/ompss/sort/app-desc.h
new file mode 100644
index 0000000..bea28f8
--- /dev/null
+++ b/ompss/sort/app-desc.h
@@ -0,0 +1,66 @@
+/**********************************************************************************************/
+/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
+/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
+/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
+/*                                                                                            */
+/*  This program is free software; you can redistribute it and/or modify                      */
+/*  it under the terms of the GNU General Public License as published by                      */
+/*  the Free Software Foundation; either version 2 of the License, or                         */
+/*  (at your option) any later version.                                                       */
+/*                                                                                            */
+/*  This program is distributed in the hope that it will be useful,                           */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of                            */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             */
+/*  GNU General Public License for more details.                                              */
+/*                                                                                            */
+/*  You should have received a copy of the GNU General Public License                         */
+/*  along with this program; if not, write to the Free Software                               */
+/*  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            */
+/**********************************************************************************************/
+
+#include "ompss-app.h"
+
+#define BOTS_APP_NAME "Sort"
+#define BOTS_APP_PARAMETERS_DESC "N=%d:Q=%d:I=%d:M=%d"
+#define BOTS_APP_PARAMETERS_LIST ,bots_arg_size,bots_app_cutoff_value_1,bots_app_cutoff_value_2,bots_app_cutoff_value
+
+#define BOTS_APP_USES_ARG_SIZE
+#define BOTS_APP_DEF_ARG_SIZE (32*1024*1024)
+#define BOTS_APP_DESC_ARG_SIZE "Array size"
+
+#define BOTS_APP_USES_ARG_CUTOFF
+#define BOTS_APP_DEF_ARG_CUTOFF (2*1024)
+#define BOTS_APP_DESC_ARG_CUTOFF "Sequential Merge cutoff value"
+
+#define BOTS_APP_USES_ARG_CUTOFF_1
+#define BOTS_APP_DEF_ARG_CUTOFF_1 (2*1024)
+#define BOTS_APP_DESC_ARG_CUTOFF_1 "Sequential Quicksort cutoff value"
+
+#define BOTS_APP_USES_ARG_CUTOFF_2
+#define BOTS_APP_DEF_ARG_CUTOFF_2 (20)
+#define BOTS_APP_DESC_ARG_CUTOFF_2 "Sequential Insertion cutoff value"
+
+typedef long ELM;
+
+void seqquick(ELM *low, ELM *high); 
+void seqmerge(ELM *low1, ELM *high1, ELM *low2, ELM *high2, ELM *lowdest);
+ELM *binsplit(ELM val, ELM *low, ELM *high); 
+void cilkmerge(ELM *low1, ELM *high1, ELM *low2, ELM *high2, ELM *lowdest);
+void cilkmerge_par(ELM *low1, ELM *high1, ELM *low2, ELM *high2, ELM *lowdest);
+void cilksort(ELM *low, ELM *tmp, long size);
+void cilksort_par(ELM *low, ELM *tmp, long size);
+void scramble_array( ELM *array ); 
+void fill_array( ELM *array ); 
+void sort ( void ); 
+
+void sort_par (void);
+void sort_init (void);
+int sort_verify (void);
+
+#define BOTS_APP_INIT sort_init()
+
+#define KERNEL_INIT
+#define KERNEL_CALL sort_par()
+#define KERNEL_CHECK sort_verify()
+
+
diff --git a/ompss/sort/sort.c b/ompss/sort/sort.c
new file mode 100644
index 0000000..9109afd
--- /dev/null
+++ b/ompss/sort/sort.c
@@ -0,0 +1,485 @@
+/**********************************************************************************************/
+/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
+/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
+/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
+/*                                                                                            */
+/*  This program is free software; you can redistribute it and/or modify                      */
+/*  it under the terms of the GNU General Public License as published by                      */
+/*  the Free Software Foundation; either version 2 of the License, or                         */
+/*  (at your option) any later version.                                                       */
+/*                                                                                            */
+/*  This program is distributed in the hope that it will be useful,                           */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of                            */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             */
+/*  GNU General Public License for more details.                                              */
+/*                                                                                            */
+/*  You should have received a copy of the GNU General Public License                         */
+/*  along with this program; if not, write to the Free Software                               */
+/*  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            */
+/**********************************************************************************************/
+
+/*
+ *  Original code from the Cilk project
+ *
+ * Copyright (c) 2000 Massachusetts Institute of Technology
+ * Copyright (c) 2000 Matteo Frigo
+ */
+
+/*
+ * this program uses an algorithm that we call `cilksort'.
+ * The algorithm is essentially mergesort:
+ *
+ *   cilksort(in[1..n]) =
+ *       spawn cilksort(in[1..n/2], tmp[1..n/2])
+ *       spawn cilksort(in[n/2..n], tmp[n/2..n])
+ *       sync
+ *       spawn cilkmerge(tmp[1..n/2], tmp[n/2..n], in[1..n])
+ *
+ *
+ * The procedure cilkmerge does the following:
+ *       
+ *       cilkmerge(A[1..n], B[1..m], C[1..(n+m)]) =
+ *          find the median of A \union B using binary
+ *          search.  The binary search gives a pair
+ *          (ma, mb) such that ma + mb = (n + m)/2
+ *          and all elements in A[1..ma] are smaller than
+ *          B[mb..m], and all the B[1..mb] are smaller
+ *          than all elements in A[ma..n].
+ *
+ *          spawn cilkmerge(A[1..ma], B[1..mb], C[1..(n+m)/2])
+ *          spawn cilkmerge(A[ma..n], B[mb..m], C[(n+m)/2 .. (n+m)])
+ *          sync
+ *
+ * The algorithm appears for the first time (AFAIK) in S. G. Akl and
+ * N. Santoro, "Optimal Parallel Merging and Sorting Without Memory
+ * Conflicts", IEEE Trans. Comp., Vol. C-36 No. 11, Nov. 1987 .  The
+ * paper does not express the algorithm using recursion, but the
+ * idea of finding the median is there.
+ *
+ * For cilksort of n elements, T_1 = O(n log n) and
+ * T_\infty = O(log^3 n).  There is a way to shave a
+ * log factor in the critical path (left as homework).
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "bots.h"
+#include "app-desc.h"
+
+ELM *array, *tmp;
+/* State of the file-private linear congruential generator below. */
+static unsigned long rand_nxt = 0;
+
+static inline unsigned long my_rand(void)
+{
+     rand_nxt = rand_nxt * 1103515245 + 12345; /* advance the state; wraps in unsigned long arithmetic */
+     return rand_nxt;
+}
+
+static inline void my_srand(unsigned long seed)
+{
+     rand_nxt = seed; /* the next my_rand() continues from this value */
+}
+/* Returns the median of the three values a, b and c. */
+static inline ELM med3(ELM a, ELM b, ELM c)
+{
+     if (a < b) {
+	  if (b < c) {
+	       return b; /* a < b < c */
+	  } else {
+	       if (a < c)
+		    return c; /* a < c <= b */
+	       else
+		    return a; /* c <= a < b */
+	  }
+     } else {
+	  if (b > c) {
+	       return b; /* c < b <= a */
+	  } else {
+	       if (a > c)
+		    return c; /* b <= c < a */
+	       else
+		    return a; /* b <= a <= c */
+	  }
+     }
+}
+
+/*
+ * Picks the pivot as the median of *low, *high and the middle element of [low, high].
+ * Simple approach for now; a better median-finding may be preferable.
+ */
+static inline ELM choose_pivot(ELM *low, ELM *high)
+{
+     return med3(*low, *high, low[(high - low) / 2]);
+}
+/* Partitions [low, high] around a pivot and returns the pointer p at which seqquick splits: [low, p] and [p+1, high]. */
+static ELM *seqpart(ELM *low, ELM *high)
+{
+     ELM pivot;
+     ELM h, l;
+     ELM *curr_low = low;
+     ELM *curr_high = high;
+
+     pivot = choose_pivot(low, high);
+
+     while (1) {
+	  while ((h = *curr_high) > pivot) /* skip elements on the right that are above the pivot */
+	       curr_high--;
+
+	  while ((l = *curr_low) < pivot) /* skip elements on the left that are below the pivot */
+	       curr_low++;
+
+	  if (curr_low >= curr_high) /* the two cursors met or crossed: partition done */
+	       break;
+
+	  *curr_high-- = l; /* exchange the two out-of-place elements and step both cursors */
+	  *curr_low++ = h;
+     }
+
+     /*
+      * I don't know if this is really necessary.
+      * The problem is that the pivot is not always the
+      * first element, and the partition may be trivial.
+      * However, if the partition is trivial, then
+      * *high is the largest element, whence the following
+      * code.
+      */
+     if (curr_high < high)
+	  return curr_high;
+     else
+	  return curr_high - 1;
+}
+/* Exchanges two ELM lvalues; uses a block-local `tmp`, so the arguments must not themselves be named tmp. */
+#define swap(a, b) \
+{ \
+  ELM tmp;\
+  tmp = a;\
+  a = b;\
+  b = tmp;\
+}
+/* Sorts the inclusive range [low, high] in ascending order by straight insertion. */
+static void insertion_sort(ELM *low, ELM *high)
+{
+     ELM *p, *q;
+     ELM a, b;
+
+     for (q = low + 1; q <= high; ++q) {
+	  a = q[0]; /* element being inserted into the already sorted prefix [low, q-1] */
+	  for (p = q - 1; p >= low && (b = p[0]) > a; p--)
+	       p[1] = b; /* shift larger elements one slot to the right */
+	  p[1] = a;
+     }
+}
+
+/*
+ * Sequential quicksort of [low, high]: recurses on the left part and loops on the right part.
+ */
+void seqquick(ELM *low, ELM *high)
+{
+     ELM *p;
+
+     while (high - low >= bots_app_cutoff_value_2) {
+	  p = seqpart(low, high);
+	  seqquick(low, p);
+	  low = p + 1; /* continue with the right part instead of making a second recursive call */
+     }
+
+     insertion_sort(low, high); /* what remains (high - low below the cutoff) is finished by insertion sort */
+}
+/* Sequentially merges the inclusive ranges [low1, high1] and [low2, high2] into the buffer starting at lowdest. */
+void seqmerge(ELM *low1, ELM *high1, ELM *low2, ELM *high2,
+	      ELM *lowdest)
+{
+     ELM a1, a2;
+
+     /*
+      * The following 'if' statement is not necessary
+      * for the correctness of the algorithm, and is
+      * in fact subsumed by the rest of the function.
+      * However, it is a few percent faster.  Here is why.
+      *
+      * The merging loop below has something like
+      *   if (a1 < a2) {
+      *        *dest++ = a1;
+      *        ++low1;
+      *        if (end of array) break;
+      *        a1 = *low1;
+      *   }
+      *
+      * Now, a1 is needed immediately in the next iteration
+      * and there is no way to mask the latency of the load.
+      * A better approach is to load a1 *before* the end-of-array
+      * check; the problem is that we may be speculatively
+      * loading an element out of range.  While this is
+      * probably not a problem in practice, yet I don't feel
+      * comfortable with an incorrect algorithm.  Therefore,
+      * I use the 'fast' loop on the array (except for the last 
+      * element) and the 'slow' loop for the rest, saving both
+      * performance and correctness.
+      */
+
+     if (low1 < high1 && low2 < high2) {
+	  a1 = *low1;
+	  a2 = *low2;
+	  for (;;) { /* 'fast' loop: loads the next element before the bounds check, stops at the last element */
+	       if (a1 < a2) {
+		    *lowdest++ = a1;
+		    a1 = *++low1;
+		    if (low1 >= high1)
+			 break;
+	       } else {
+		    *lowdest++ = a2;
+		    a2 = *++low2;
+		    if (low2 >= high2)
+			 break;
+	       }
+	  }
+     }
+     if (low1 <= high1 && low2 <= high2) {
+	  a1 = *low1;
+	  a2 = *low2;
+	  for (;;) { /* 'slow' loop: checks the bounds before loading, runs until one range is used up */
+	       if (a1 < a2) {
+		    *lowdest++ = a1;
+		    ++low1;
+		    if (low1 > high1)
+			 break;
+		    a1 = *low1;
+	       } else {
+		    *lowdest++ = a2;
+		    ++low2;
+		    if (low2 > high2)
+			 break;
+		    a2 = *low2;
+	       }
+	  }
+     }
+     if (low1 > high1) { /* range 1 is used up: copy the rest of range 2 */
+	  memcpy(lowdest, low2, sizeof(ELM) * (high2 - low2 + 1));
+     } else { /* otherwise copy the rest of range 1 */
+	  memcpy(lowdest, low1, sizeof(ELM) * (high1 - low1 + 1));
+     }
+}
+/* Exchanges two ELM* lvalues; uses a block-local `tmp`, so the arguments must not themselves be named tmp. */
+#define swap_indices(a, b) \
+{ \
+  ELM *tmp;\
+  tmp = a;\
+  a = b;\
+  b = tmp;\
+}
+
+ELM *binsplit(ELM val, ELM *low, ELM *high)
+{
+     /*
+      * Binary search over [low, high]: returns a pointer to the split element for val, or low-1 if val is less than all elements.
+      * NOTE(review): the loop drops elements equal to val (val <= *mid), so "greatest element <= val" holds only without ties - confirm.
+      */
+     ELM *mid;
+
+     while (low != high) {
+	  mid = low + ((high - low + 1) >> 1); /* upper middle, so that low = mid always makes progress */
+	  if (val <= *mid)
+	       high = mid - 1;
+	  else
+	       low = mid;
+     }
+
+     if (*low > val)
+	  return low - 1;
+     else
+	  return low;
+}
+
+/* Parallel merge: splits both ranges around a median element and merges the two halves as untied tasks. */
+void cilkmerge_par(ELM *low1, ELM *high1, ELM *low2, ELM *high2, ELM *lowdest)
+{
+     /*
+      * Cilkmerge: Merges range [low1, high1] with range [low2, high2]
+      * into the range [lowdest, ...] (all bounds are inclusive)
+      */
+
+     ELM *split1, *split2;	/*
+				 * where each of the ranges are broken for
+				 * recursive merge
+				 */
+     long int lowsize;		/*
+				 * total size of lower halves of two
+				 * ranges - 1
+				 */
+
+     /*
+      * We want to take the middle element (indexed by split1) from the
+      * larger of the two arrays.  The following code assumes that split1
+      * is taken from range [low1, high1].  So if [low1, high1] is
+      * actually the smaller range, we should swap it with [low2, high2]
+      */
+
+     if (high2 - low2 > high1 - low1) {
+	  swap_indices(low1, low2);
+	  swap_indices(high1, high2);
+     }
+     if (high2 < low2) {
+	  /* smaller range is empty: copy all of [low1, high1]; +1 because high1 is inclusive */
+	  memcpy(lowdest, low1, sizeof(ELM) * (high1 - low1 + 1));
+	  return;
+     }
+     if (high2 - low2 < bots_app_cutoff_value ) {
+	  seqmerge(low1, high1, low2, high2, lowdest);
+	  return;
+     }
+     /*
+      * Basic approach: Find the middle element of one range (indexed by
+      * split1). Find where this element would fit in the other range
+      * (indexed by split 2). Then merge the two lower halves and the two
+      * upper halves.
+      */
+
+     split1 = ((high1 - low1 + 1) / 2) + low1;
+     split2 = binsplit(*split1, low2, high2);
+     lowsize = split1 - low1 + split2 - low2;
+
+     /*
+      * directly put the splitting element into
+      * the appropriate location
+      */
+     *(lowdest + lowsize + 1) = *split1;
+#pragma omp task untied
+     cilkmerge_par(low1, split1 - 1, low2, split2, lowdest);
+#pragma omp task untied
+     cilkmerge_par(split1 + 1, high1, split2 + 1, high2,
+		     lowdest + lowsize + 2);
+#pragma omp taskwait
+
+     return;
+}
+/* Parallel sort of low[0..size-1], using tmp[0..size-1] as scratch space for the intermediate merges. */
+void cilksort_par(ELM *low, ELM *tmp, long size)
+{
+     /*
+      * divide the input in four parts of the same size (A, B, C, D)
+      * Then:
+      *   1) recursively sort A, B, C, and D (in parallel)
+      *   2) merge A and B into tmp1, and C and D into tmp2 (in parallel)
+      *   3) merge tmp1 and tmp2 into the original array
+      */
+     long quarter = size / 4;
+     ELM *A, *B, *C, *D, *tmpA, *tmpB, *tmpC, *tmpD;
+
+     if (size < 4 || size < bots_app_cutoff_value_1 ) {
+	  /* quicksort below the cutoff; also when size < 4, where quarter is 0 and part D would recurse on <size> forever */
+	  seqquick(low, low + size - 1);
+	  return;
+     }
+     A = low;
+     tmpA = tmp;
+     B = A + quarter;
+     tmpB = tmpA + quarter;
+     C = B + quarter;
+     tmpC = tmpB + quarter;
+     D = C + quarter;
+     tmpD = tmpC + quarter;
+
+#pragma omp task untied
+     cilksort_par(A, tmpA, quarter);
+#pragma omp task untied
+     cilksort_par(B, tmpB, quarter);
+#pragma omp task untied
+     cilksort_par(C, tmpC, quarter);
+#pragma omp task untied
+     cilksort_par(D, tmpD, size - 3 * quarter);
+#pragma omp taskwait
+
+#pragma omp task untied
+     cilkmerge_par(A, A + quarter - 1, B, B + quarter - 1, tmpA);
+#pragma omp task untied
+     cilkmerge_par(C, C + quarter - 1, D, low + size - 1, tmpC);
+#pragma omp taskwait
+
+     cilkmerge_par(tmpA, tmpC - 1, tmpC, tmpA + size - 1, A);
+}
+/* Shuffles the first bots_arg_size elements: each position i is swapped with a position drawn from my_rand(). */
+void scramble_array( ELM *array )
+{
+     unsigned long i;
+     unsigned long j;
+
+     for (i = 0; i < bots_arg_size; ++i) {
+	  j = my_rand();
+	  j = j % bots_arg_size; /* reduce the random value to a valid index */
+	  swap(array[i], array[j]);
+     }
+}
+/* Resets the generator seed and stores the identity sequence in the first bots_arg_size elements. */
+void fill_array( ELM *array )
+{
+     unsigned long i;
+
+     my_srand(1); /* fixed seed for the my_rand() generator */
+     /* first, fill with integers 0..size-1 (sort_verify expects array[i] == i) */
+     for (i = 0; i < bots_arg_size; ++i) {
+	  array[i] = i;
+     }
+}
+/* Clamps the size and cutoff arguments to usable values, then allocates, fills and scrambles the input array. */
+void sort_init ( void )
+{
+     /* Checking arguments */
+     if (bots_arg_size < 4) {
+        bots_message("%s can not be less than 4, using 4 as a parameter.\n", BOTS_APP_DESC_ARG_SIZE );
+        bots_arg_size = 4;
+     }
+
+     if (bots_app_cutoff_value < 2) {
+        bots_message("%s can not be less than 2, using 2 as a parameter.\n", BOTS_APP_DESC_ARG_CUTOFF);
+        bots_app_cutoff_value = 2;
+     }
+     else if (bots_app_cutoff_value > bots_arg_size ) {
+        bots_message("%s can not be greather than vector size, using %d as a parameter.\n", BOTS_APP_DESC_ARG_CUTOFF, bots_arg_size);
+        bots_app_cutoff_value = bots_arg_size;
+     }
+
+     if (bots_app_cutoff_value_1 > bots_arg_size ) { /* only an upper bound is enforced for this cutoff */
+        bots_message("%s can not be greather than vector size, using %d as a parameter.\n", BOTS_APP_DESC_ARG_CUTOFF_1, bots_arg_size);
+        bots_app_cutoff_value_1 = bots_arg_size;
+     }
+     if (bots_app_cutoff_value_2 > bots_arg_size ) {
+        bots_message("%s can not be greather than vector size, using %d as a parameter.\n", BOTS_APP_DESC_ARG_CUTOFF_2, bots_arg_size);
+        bots_app_cutoff_value_2 = bots_arg_size;
+     }
+
+     if (bots_app_cutoff_value_2 > bots_app_cutoff_value_1) { /* the insertion cutoff may not exceed the quicksort cutoff */
+        bots_message("%s can not be greather than %s, using %d as a parameter.\n",
+		BOTS_APP_DESC_ARG_CUTOFF_2,
+		BOTS_APP_DESC_ARG_CUTOFF_1,
+		bots_app_cutoff_value_1
+	);
+        bots_app_cutoff_value_2 = bots_app_cutoff_value_1;
+     }
+     /* NOTE(review): neither malloc result is checked before use - confirm allocation failure is acceptable here */
+     array = (ELM *) malloc(bots_arg_size * sizeof(ELM));
+     tmp = (ELM *) malloc(bots_arg_size * sizeof(ELM));
+     fill_array(array);
+     scramble_array(array);
+}
+/* Kernel entry point: sorts the global array (bots_arg_size elements) with cilksort_par, using tmp as scratch. */
+void sort_par ( void )
+{
+	bots_message("Computing multisort algorithm (n=%d) - It might take a while...\n", bots_arg_size);
+
+	cilksort_par(array, tmp, bots_arg_size);
+
+	bots_message("Multisort execution has finished\n");
+}
+/* Returns BOTS_RESULT_SUCCESSFUL when array[i] == i for every i below bots_arg_size, otherwise BOTS_RESULT_UNSUCCESSFUL. */
+int sort_verify ( void )
+{
+   int i, success = 1;
+   for (i = 0; i < bots_arg_size; ++i) /* scans the whole array; no early exit on the first mismatch */
+      if (array[i] != i)
+         success = 0;
+
+   return success ? BOTS_RESULT_SUCCESSFUL : BOTS_RESULT_UNSUCCESSFUL;
+}
+
diff --git a/ompss/strassen/Makefile b/ompss/strassen/Makefile
new file mode 100644
index 0000000..a4923b1
--- /dev/null
+++ b/ompss/strassen/Makefile
@@ -0,0 +1,36 @@
+##############################################################################################
+#  This program is part of the Barcelona OpenMP Tasks Suite                                  #
+#  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  #
+#  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   #
+#                                                                                            #
+#  This program is free software; you can redistribute it and/or modify                      #
+#  it under the terms of the GNU General Public License as published by                      #
+#  the Free Software Foundation; either version 2 of the License, or                         #
+#  (at your option) any later version.                                                       #
+#                                                                                            #
+#  This program is distributed in the hope that it will be useful,                           #
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of                            #
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             #
+#  GNU General Public License for more details.                                              #
+#                                                                                            #
+#  You should have received a copy of the GNU General Public License                         #
+#  along with this program; if not, write to the Free Software                               #
+#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            #
+##############################################################################################
+
+#LIBS = 
+#PROGRAM_OBJS=
+
+CUTOFF_VERSIONS = manual if_clause
+TIED_VERSIONS = yes
+
+BASE_DIR = ../../
+
+#
+# Don't change below here 
+#
+
+include ../Makefile.version
+include $(BASE_DIR)/common/Makefile.common
+
+
diff --git a/ompss/strassen/app-desc.h b/ompss/strassen/app-desc.h
new file mode 100644
index 0000000..267cc9a
--- /dev/null
+++ b/ompss/strassen/app-desc.h
@@ -0,0 +1,77 @@
+/**********************************************************************************************/
+/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
+/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
+/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
+/*                                                                                            */
+/*  This program is free software; you can redistribute it and/or modify                      */
+/*  it under the terms of the GNU General Public License as published by                      */
+/*  the Free Software Foundation; either version 2 of the License, or                         */
+/*  (at your option) any later version.                                                       */
+/*                                                                                            */
+/*  This program is distributed in the hope that it will be useful,                           */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of                            */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             */
+/*  GNU General Public License for more details.                                              */
+/*                                                                                            */
+/*  You should have received a copy of the GNU General Public License                         */
+/*  along with this program; if not, write to the Free Software                               */
+/*  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            */
+/**********************************************************************************************/
+#include <stdint.h>
+#include "ompss-app.h"
+
+#define BOTS_APP_NAME "Strassen"
+#define BOTS_APP_PARAMETERS_DESC "N=%d:Y=%d"
+#define BOTS_APP_PARAMETERS_LIST ,bots_arg_size,bots_app_cutoff_value
+/* Matrix dimension N; BOTS_APP_INIT in this header rejects values that are not a power of 2 and a multiple of 16 */
+#define BOTS_APP_USES_ARG_SIZE
+#define BOTS_APP_DEF_ARG_SIZE 1024
+#define BOTS_APP_DESC_ARG_SIZE "Matrix Size"
+/* NOTE(review): nothing visible in this header reads the block size argument - confirm it is used */
+#define BOTS_APP_USES_ARG_BLOCK
+#define BOTS_APP_DEF_ARG_BLOCK 32
+#define BOTS_APP_DESC_ARG_BLOCK "Matrix Block Size"
+
+/* Strassen cutoff (presumably the default of bots_app_cutoff_value): matrices of at most this size use MultiplyByDivideAndConquer() */
+#define BOTS_APP_USES_ARG_CUTOFF
+#define BOTS_APP_DEF_ARG_CUTOFF 64
+#define BOTS_APP_DESC_ARG_CUTOFF "Strassen Cutoff"
+
+/* Task creation cut off (presumably the default of bots_cutoff_value, which strassen.c compares with the recursion Depth) */
+#define BOTS_CUTOFF_DEF_VALUE 3
+
+/***********************************************************************
+ * REAL is the matrix element type; PTR is an integer that can hold a pointer
+ **********************************************************************/
+typedef double REAL;   /* NOTE(review): strassen.c shifts by 3 in places, i.e. assumes 8-byte elements */
+typedef uintptr_t PTR; /* wide enough for any object pointer (unsigned long is not on LLP64 targets) */
+void init_matrix(int n, REAL *A, int an);
+void strassen_main_par(REAL *A, REAL *B, REAL *C, int n);
+void strassen_main_seq(REAL *A, REAL *B, REAL *C, int n);
+int compare_matrix(int n, REAL *A, int an, REAL *B, int bn);
+
+#define BOTS_APP_INIT\
+    double *A, *B, *C, *D;\
+    if ((bots_arg_size & (bots_arg_size - 1)) != 0 || (bots_arg_size % 16) != 0) {\
+        bots_message("Error: matrix size (%d) must be a power of 2 and a multiple of %d\n", bots_arg_size, 16);\
+        exit (1);\
+    }\
+    A = (double *) malloc (bots_arg_size * bots_arg_size * sizeof(double));\
+    B = (double *) malloc (bots_arg_size * bots_arg_size * sizeof(double));\
+    C = (double *) malloc (bots_arg_size * bots_arg_size * sizeof(double));\
+    D = (double *) malloc (bots_arg_size * bots_arg_size * sizeof(double));\
+    init_matrix(bots_arg_size,A,bots_arg_size);\
+    init_matrix(bots_arg_size,B,bots_arg_size);
+
+//#define KERNEL_INIT
+#define KERNEL_CALL strassen_main_par(C,A,B,bots_arg_size);
+//#define KERNEL_FINI
+/* NOTE(review): C and D are passed first here although the prototypes name the parameters A, B, C - confirm the order */
+//#define KERNEL_SEQ_INIT
+#define KERNEL_SEQ_CALL strassen_main_seq(D,A,B,bots_arg_size);
+//#define KERNEL_SEQ_FINI
+/* The check compares C (handed to the parallel kernel) with D (handed to the sequential kernel) */
+#define BOTS_APP_CHECK_USES_SEQ_RESULT
+#define KERNEL_CHECK compare_matrix(bots_arg_size,C,bots_arg_size,D,bots_arg_size);
+
+
diff --git a/ompss/strassen/strassen.c b/ompss/strassen/strassen.c
new file mode 100644
index 0000000..27da6f6
--- /dev/null
+++ b/ompss/strassen/strassen.c
@@ -0,0 +1,1279 @@
+/**********************************************************************************************/
+/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
+/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
+/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
+/*                                                                                            */
+/**********************************************************************************************/
+
+/*
+ * Copyright (c) 1996 Massachusetts Institute of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to use, copy, modify, and distribute the Software without
+ * restriction, provided the Software, including any modified copies made
+ * under this license, is not distributed for a fee, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE MASSACHUSETTS INSTITUTE OF TECHNOLOGY BE LIABLE
+ * FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+ * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Except as contained in this notice, the name of the Massachusetts
+ * Institute of Technology shall not be used in advertising or otherwise
+ * to promote the sale, use or other dealings in this Software without
+ * prior written authorization from the Massachusetts Institute of
+ * Technology.
+ *
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "app-desc.h"
+#include "bots.h"
+#include "strassen.h"
+
+/***********************************************************************
+ * Plain triple-loop product C = A x B, kept for comparison purposes
+ **********************************************************************/
+void matrixmul(int n, REAL *A, int an, REAL *B, int bn, REAL *C, int cn)
+{
+   int row, col, idx;
+
+   /* an, bn and cn are handed to ELEM unchanged (presumably the row strides) */
+   for (row = 0; row < n; ++row)
+   {
+      for (col = 0; col < n; ++col)
+      {
+         REAL acc = 0.0;
+         for (idx = 0; idx < n; ++idx) acc += ELEM(A, an, row, idx) * ELEM(B, bn, idx, col);
+         ELEM(C, cn, row, col) = acc;
+      }
+   }
+}
+/*****************************************************************************
+**
+** FastNaiveMatrixMultiply
+**
+** For small to medium sized matrices A, B, and C of size
+** MatrixSize * MatrixSize this function performs the operation
+** C = A x B, producing eight adjacent elements of a C row at a time.
+**
+** Note MatrixSize must be divisible by 8.
+**
+** INPUT:
+**    C = (*C WRITE) Address of top left element of matrix C.
+**    A = (*A IS READ ONLY) Address of top left element of matrix A.
+**    B = (*B IS READ ONLY) Address of top left element of matrix B.
+**    MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
+**    RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1]
+**    RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1]
+**    RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1]
+**
+** OUTPUT:
+**    C = (*C WRITE) Matrix C contains A x B. (Initial value of *C undefined.)
+**
+*****************************************************************************/
+void FastNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
+     unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB)
+{
+  /* Every address is formed with REAL pointer arithmetic, so the code
+  ** neither assumes sizeof(REAL) == 8 nor round-trips pointers through
+  ** an integer type. */
+  unsigned Horizontal, Vertical, Products;
+
+  for (Vertical = 0; Vertical < MatrixSize; Vertical++) {
+    /* First element of the current row of A and of C */
+    REAL *ARow = A + (size_t) Vertical * RowWidthA;
+    REAL *CRow = C + (size_t) Vertical * RowWidthC;
+
+    /* Eight adjacent columns of C are produced per iteration */
+    for (Horizontal = 0; Horizontal < MatrixSize; Horizontal += 8) {
+      REAL *COut = CRow + Horizontal;
+      REAL *BRow = B + Horizontal;
+      REAL FirstARowValue = ARow[0];
+
+      /* Seed the sums with the first product so that C is never read */
+      REAL Sum0 = FirstARowValue * BRow[0];
+      REAL Sum1 = FirstARowValue * BRow[1];
+      REAL Sum2 = FirstARowValue * BRow[2];
+      REAL Sum3 = FirstARowValue * BRow[3];
+      REAL Sum4 = FirstARowValue * BRow[4];
+      REAL Sum5 = FirstARowValue * BRow[5];
+      REAL Sum6 = FirstARowValue * BRow[6];
+      REAL Sum7 = FirstARowValue * BRow[7];
+
+      /* Remaining products: along the row of A and down the rows of B */
+      for (Products = 1; Products < MatrixSize; Products++) {
+        REAL ARowValue = ARow[Products];
+        BRow = B + Horizontal + (size_t) Products * RowWidthB;
+        Sum0 += ARowValue * BRow[0];
+        Sum1 += ARowValue * BRow[1];
+        Sum2 += ARowValue * BRow[2];
+        Sum3 += ARowValue * BRow[3];
+        Sum4 += ARowValue * BRow[4];
+        Sum5 += ARowValue * BRow[5];
+        Sum6 += ARowValue * BRow[6];
+        Sum7 += ARowValue * BRow[7];
+      }
+
+      COut[0] = Sum0;
+      COut[1] = Sum1;
+      COut[2] = Sum2;
+      COut[3] = Sum3;
+      COut[4] = Sum4;
+      COut[5] = Sum5;
+      COut[6] = Sum6;
+      COut[7] = Sum7;
+    }
+  }
+}
+/*****************************************************************************
+**
+** FastAdditiveNaiveMatrixMultiply
+**
+** For small to medium sized matrices A, B, and C of size
+** MatrixSize * MatrixSize this function performs the operation
+** C += A x B, updating eight adjacent elements of a C row at a time.
+**
+** Note MatrixSize must be divisible by 8.
+**
+** INPUT:
+**    C = (*C READ/WRITE) Address of top left element of matrix C.
+**    A = (*A IS READ ONLY) Address of top left element of matrix A.
+**    B = (*B IS READ ONLY) Address of top left element of matrix B.
+**    MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
+**    RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1]
+**    RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1]
+**    RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1]
+**
+** OUTPUT:
+**    C = (*C READ/WRITE) Matrix C contains C + A x B.
+**
+*****************************************************************************/
+void FastAdditiveNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
+     unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB)
+{
+  /* Every address is formed with REAL pointer arithmetic, so the code
+  ** neither assumes sizeof(REAL) == 8 nor round-trips pointers through
+  ** an integer type. */
+  unsigned Horizontal, Vertical, Products;
+
+  for (Vertical = 0; Vertical < MatrixSize; Vertical++) {
+    /* First element of the current row of A and of C
+    ** (row widths are counted in elements, not bytes) */
+    REAL *ARow = A + (size_t) Vertical * RowWidthA;
+    REAL *CRow = C + (size_t) Vertical * RowWidthC;
+
+    /* Eight adjacent columns of C are updated per iteration */
+    for (Horizontal = 0; Horizontal < MatrixSize; Horizontal += 8) {
+      REAL *COut = CRow + Horizontal;
+
+      /* Start from the values already stored in C, because this
+      ** variant accumulates (C += A x B) */
+      REAL Sum0 = COut[0];
+      REAL Sum1 = COut[1];
+      REAL Sum2 = COut[2];
+      REAL Sum3 = COut[3];
+      REAL Sum4 = COut[4];
+      REAL Sum5 = COut[5];
+      REAL Sum6 = COut[6];
+      REAL Sum7 = COut[7];
+
+      /* Walk along the row of A and down the rows of B; the B row is
+      ** recomputed from its index so no out-of-range pointer is formed */
+      for (Products = 0; Products < MatrixSize; Products++) {
+        REAL ARowValue = ARow[Products];
+        REAL *BRow = B + Horizontal + (size_t) Products * RowWidthB;
+
+        Sum0 += ARowValue * BRow[0];
+        Sum1 += ARowValue * BRow[1];
+        Sum2 += ARowValue * BRow[2];
+        Sum3 += ARowValue * BRow[3];
+        Sum4 += ARowValue * BRow[4];
+        Sum5 += ARowValue * BRow[5];
+        Sum6 += ARowValue * BRow[6];
+        Sum7 += ARowValue * BRow[7];
+      }
+
+      /* Write the eight finished sums back */
+      COut[0] = Sum0;
+      COut[1] = Sum1;
+      COut[2] = Sum2;
+      COut[3] = Sum3;
+      COut[4] = Sum4;
+      COut[5] = Sum5;
+      COut[6] = Sum6;
+      COut[7] = Sum7;
+    }
+  }
+}
+/*****************************************************************************
+**
+** MultiplyByDivideAndConquer
+**
+** Blocked multiply for mid-sized square matrices. Each operand is split
+** into four quadrants and the eight quadrant products are written to
+** (or added to) the matching quadrant of C:
+**    C  = A x B (if AdditiveMode == 0)
+**    C += A x B (if AdditiveMode != 0)
+**
+** Note MatrixSize must be divisible by 16.
+**
+** INPUT:
+**    C = (*C READ/WRITE) Address of top left element of matrix C.
+**    A = (*A IS READ ONLY) Address of top left element of matrix A.
+**    B = (*B IS READ ONLY) Address of top left element of matrix B.
+**    MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
+**    RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1]
+**    RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1]
+**    RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1]
+**    AdditiveMode = 0 if we want C = A x B, otherwise we'll do C += A x B
+**
+** OUTPUT:
+**    C (+)= A x B. (+ if AdditiveMode != 0)
+**
+*****************************************************************************/
+void MultiplyByDivideAndConquer(REAL *C, REAL *A, REAL *B,
+                                unsigned MatrixSize,
+                                unsigned RowWidthC,
+                                unsigned RowWidthA,
+                                unsigned RowWidthB,
+                                int AdditiveMode
+                               )
+{
+  #define A00 A
+  #define B00 B
+  #define C00 C
+
+  /* Side length of one quadrant (half of the matrix side) */
+  const unsigned Half = MatrixSize >> 1;
+
+  /* Top-right (01), bottom-left (10) and bottom-right (11) quadrant of
+  ** each operand. The top-left quadrants start at A, B and C themselves,
+  ** which is what the A00/B00/C00 aliases above stand for. */
+  REAL *const ATopRight = A + Half;
+  REAL *const ABotLeft  = A + RowWidthA * Half;
+  REAL *const ABotRight = ABotLeft + Half;
+
+  REAL *const BTopRight = B + Half;
+  REAL *const BBotLeft  = B + RowWidthB * Half;
+  REAL *const BBotRight = BBotLeft + Half;
+
+  REAL *const CTopRight = C + Half;
+  REAL *const CBotLeft  = C + RowWidthC * Half;
+  REAL *const CBotRight = CBotLeft + Half;
+
+  /* Leaf kernel for the first product of each C quadrant: accumulate
+  ** when the caller asked for C += A x B, overwrite otherwise. */
+  void (*const FirstPass)(REAL *, REAL *, REAL *,
+                          unsigned, unsigned, unsigned, unsigned) =
+    AdditiveMode ? FastAdditiveNaiveMatrixMultiply : FastNaiveMatrixMultiply;
+
+  if (Half > SizeAtWhichNaiveAlgorithmIsMoreEfficient) {
+    /* Quadrants are still large: recurse on each of the eight products.
+    ** The first product of every C quadrant inherits AdditiveMode. */
+    MultiplyByDivideAndConquer(C, A, B, Half,
+                               RowWidthC, RowWidthA, RowWidthB,
+                               AdditiveMode);
+
+    MultiplyByDivideAndConquer(CTopRight, A, BTopRight, Half,
+                               RowWidthC, RowWidthA, RowWidthB,
+                               AdditiveMode);
+
+    MultiplyByDivideAndConquer(CBotRight, ABotLeft, BTopRight, Half,
+                               RowWidthC, RowWidthA, RowWidthB,
+                               AdditiveMode);
+
+    MultiplyByDivideAndConquer(CBotLeft, ABotLeft, B, Half,
+                               RowWidthC, RowWidthA, RowWidthB,
+                               AdditiveMode);
+
+    /* The second product of every quadrant is always added on top of
+    ** the first one, whatever AdditiveMode says. */
+    MultiplyByDivideAndConquer(C, ATopRight, BBotLeft, Half,
+                               RowWidthC, RowWidthA, RowWidthB,
+                               1);
+
+    MultiplyByDivideAndConquer(CTopRight, ATopRight, BBotRight, Half,
+                               RowWidthC, RowWidthA, RowWidthB,
+                               1);
+
+    MultiplyByDivideAndConquer(CBotRight, ABotRight, BBotRight, Half,
+                               RowWidthC, RowWidthA, RowWidthB,
+                               1);
+
+    MultiplyByDivideAndConquer(CBotLeft, ABotRight, BBotLeft, Half,
+                               RowWidthC, RowWidthA, RowWidthB,
+                               1);
+    return;
+  }
+
+  /* Quadrants are small enough for the unrolled kernels: first product
+  ** of every quadrant, in the same order as the recursive case. */
+  FirstPass(C, A, B, Half,
+            RowWidthC, RowWidthA, RowWidthB);
+
+  FirstPass(CTopRight, A, BTopRight, Half,
+            RowWidthC, RowWidthA, RowWidthB);
+
+  FirstPass(CBotRight, ABotLeft, BTopRight, Half,
+            RowWidthC, RowWidthA, RowWidthB);
+
+  FirstPass(CBotLeft, ABotLeft, B, Half,
+            RowWidthC, RowWidthA, RowWidthB);
+
+  /* Second product of every quadrant: these four calls always
+  ** accumulate into what the first pass left in C. */
+  FastAdditiveNaiveMatrixMultiply(C, ATopRight, BBotLeft, Half,
+                                  RowWidthC, RowWidthA, RowWidthB);
+
+  FastAdditiveNaiveMatrixMultiply(CTopRight, ATopRight, BBotRight, Half,
+                                  RowWidthC, RowWidthA, RowWidthB);
+
+  FastAdditiveNaiveMatrixMultiply(CBotRight, ABotRight, BBotRight, Half,
+                                  RowWidthC, RowWidthA, RowWidthB);
+
+  FastAdditiveNaiveMatrixMultiply(CBotLeft, ABotRight, BBotLeft, Half,
+                                  RowWidthC, RowWidthA, RowWidthB);
+  return;
+}
+/*****************************************************************************
+**
+** OptimizedStrassenMultiply_seq
+**
+** Sequential Strassen product C = A x B for large square matrices. Exits
+** the process with status 1 if the temporary storage cannot be allocated.
+**
+** INPUT:
+**    C = (*C WRITE) Address of top left element of matrix C.
+**    A = (*A IS READ ONLY) Address of top left element of matrix A.
+**    B = (*B IS READ ONLY) Address of top left element of matrix B.
+**    MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
+**    RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1]
+**    RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1]
+**    RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1]
+**    Depth = Recursion level; only forwarded (as Depth+1) to the recursive calls
+**
+** OUTPUT:
+**    C = (*C WRITE) Matrix C contains A x B. (Initial value of *C undefined.)
+**
+*****************************************************************************/
+void OptimizedStrassenMultiply_seq(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
+     unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth)
+{
+  unsigned QuadrantSize = MatrixSize >> 1; /* MatrixSize / 2 */
+  size_t QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * QuadrantSize + 32;
+  unsigned Column, Row;
+  
+  /************************************************************************
+  ** For each matrix A, B, and C, we'll want pointers to each quadrant
+  ** in the matrix. These quadrants will be addressed as follows:
+  **  --        --
+  **  | A11  A12 |
+  **  |          |
+  **  | A21  A22 |
+  **  --        --
+  ************************************************************************/
+  REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12,
+       *A21, *B21, *C21, *A22, *B22, *C22;
+
+  REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT;
+  #define T2sMULT C22
+  #define NumberOfVariables 11
+
+  PTR TempMatrixOffset = 0;
+  PTR MatrixOffsetA = 0;
+  PTR MatrixOffsetB = 0;
+
+  char *Heap;
+  void *StartHeap;
+
+  /* Distance in bytes between the end of a quadrant row and the start of the next */
+  PTR RowIncrementA = ( RowWidthA - QuadrantSize ) * sizeof(REAL);
+  PTR RowIncrementB = ( RowWidthB - QuadrantSize ) * sizeof(REAL);
+  PTR RowIncrementC = ( RowWidthC - QuadrantSize ) * sizeof(REAL);
+
+  if (MatrixSize <= bots_app_cutoff_value) {
+    MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0);
+    return;
+  }
+
+  /* Initialize quadrant matrices */
+  #define A11 A
+  #define B11 B
+  #define C11 C
+  A12 = A11 + QuadrantSize;
+  B12 = B11 + QuadrantSize;
+  C12 = C11 + QuadrantSize;
+  A21 = A + (RowWidthA * QuadrantSize);
+  B21 = B + (RowWidthB * QuadrantSize);
+  C21 = C + (RowWidthC * QuadrantSize);
+  A22 = A21 + QuadrantSize;
+  B22 = B21 + QuadrantSize;
+  C22 = C21 + QuadrantSize;
+
+  /* Allocate Heap Space Here (the size is a size_t so the product cannot wrap at 32 bits) */
+  StartHeap = Heap = malloc(QuadrantSizeInBytes * NumberOfVariables);
+  if (StartHeap == NULL) {
+    fprintf(stderr, "OptimizedStrassenMultiply_seq: out of memory for temporary matrices\n");
+    exit(1);
+  }
+  /* ensure that heap is on cache boundary (the 32 spare bytes per variable cover the shift) */
+  if ( ((PTR) Heap) & 31)
+     Heap += 32 - ( ((PTR) Heap) & 31 );
+  
+  /* Distribute the heap space over the variables */
+  S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S5 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  M2 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  
+  /***************************************************************************
+  ** Step through all columns row by row (vertically)
+  ** (jumps in memory by RowWidth => bad locality)
+  ** (but we want the best locality on the innermost loop)
+  ***************************************************************************/
+  for (Row = 0; Row < QuadrantSize; Row++) {
+    
+    /*************************************************************************
+    ** Step through each row horizontally (addressing elements in each column)
+    ** (jumps linearly through memory => good locality)
+    *************************************************************************/
+    for (Column = 0; Column < QuadrantSize; Column++) {
+      
+      /***********************************************************
+      ** Within this loop, the following holds for MatrixOffset:
+      ** MatrixOffset = (Row * RowWidth) + Column
+      ** (note: the offsets used below are kept in bytes)
+      ***********************************************************/
+      /* Element of Global Matrix, such as A, B, C */
+      #define E(Matrix)   (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) )
+      #define EA(Matrix)  (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) )
+      #define EB(Matrix)  (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) )
+
+      /* FIXME - may pay to expand these out - got higher speed-ups below */
+      /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */
+      E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) );
+
+      /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */
+      E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21);
+
+      /* S3 = A11 - A21 */
+      E(S3) = EA(A11) - EA(A21);
+      
+      /* S7 = B22 - B12 */
+      E(S7) = EB(B22) - EB(B12);
+
+      TempMatrixOffset += sizeof(REAL);
+      MatrixOffsetA += sizeof(REAL);
+      MatrixOffsetB += sizeof(REAL);
+    } /* end column loop */
+
+    MatrixOffsetA += RowIncrementA;
+    MatrixOffsetB += RowIncrementB;
+  } /* end row loop */
+
+  /* M2 = A11 x B11 */
+  OptimizedStrassenMultiply_seq(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1);
+
+  /* M5 = S1 * S5 */
+  OptimizedStrassenMultiply_seq(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1);
+
+  /* Step 1 of T1 = S2 x S6 + M2 */
+  OptimizedStrassenMultiply_seq(T1sMULT, S2, S6,  QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1);
+
+  /* Step 1 of T2 = T1 + S3 x S7 */
+  OptimizedStrassenMultiply_seq(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1);
+
+  /* Step 1 of C11 = M2 + A12 * B21 */
+  OptimizedStrassenMultiply_seq(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1);
+  
+  /* Step 1 of C12 = S4 x B22 + T1 + M5 */
+  OptimizedStrassenMultiply_seq(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1);
+
+  /* Step 1 of C21 = T2 - A22 * S8 */
+  OptimizedStrassenMultiply_seq(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1);
+
+  /***************************************************************************
+  ** Combine the seven products into the four quadrants of C, row by row
+  ***************************************************************************/
+  for (Row = 0; Row < QuadrantSize; Row++) {
+    /* Four columns per iteration; M5, M2 and T1sMULT are stored without
+    ** row padding, so only the C pointers need the row increment below */
+    for (Column = 0; Column < QuadrantSize; Column += 4) {
+      REAL LocalM5_0 = *(M5);
+      REAL LocalM5_1 = *(M5+1);
+      REAL LocalM5_2 = *(M5+2);
+      REAL LocalM5_3 = *(M5+3);
+      REAL LocalM2_0 = *(M2);
+      REAL LocalM2_1 = *(M2+1);
+      REAL LocalM2_2 = *(M2+2);
+      REAL LocalM2_3 = *(M2+3);
+      REAL T1_0 = *(T1sMULT) + LocalM2_0;
+      REAL T1_1 = *(T1sMULT+1) + LocalM2_1;
+      REAL T1_2 = *(T1sMULT+2) + LocalM2_2;
+      REAL T1_3 = *(T1sMULT+3) + LocalM2_3;
+      REAL T2_0 = *(C22) + T1_0;
+      REAL T2_1 = *(C22+1) + T1_1;
+      REAL T2_2 = *(C22+2) + T1_2;
+      REAL T2_3 = *(C22+3) + T1_3;
+      (*(C11))   += LocalM2_0;
+      (*(C11+1)) += LocalM2_1;
+      (*(C11+2)) += LocalM2_2;
+      (*(C11+3)) += LocalM2_3;
+      (*(C12))   += LocalM5_0 + T1_0;
+      (*(C12+1)) += LocalM5_1 + T1_1;
+      (*(C12+2)) += LocalM5_2 + T1_2;
+      (*(C12+3)) += LocalM5_3 + T1_3;
+      (*(C22))   = LocalM5_0 + T2_0;
+      (*(C22+1)) = LocalM5_1 + T2_1;
+      (*(C22+2)) = LocalM5_2 + T2_2;
+      (*(C22+3)) = LocalM5_3 + T2_3;
+      (*(C21  )) = (- *(C21  )) + T2_0;
+      (*(C21+1)) = (- *(C21+1)) + T2_1;
+      (*(C21+2)) = (- *(C21+2)) + T2_2;
+      (*(C21+3)) = (- *(C21+3)) + T2_3;
+      M5 += 4;
+      M2 += 4;
+      T1sMULT += 4;
+      C11 += 4;
+      C12 += 4;
+      C21 += 4;
+      C22 += 4;
+    }
+    C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC);
+    C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC);
+    C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC);
+    C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC);
+  }
+  free(StartHeap);
+}
+#if defined(IF_CUTOFF)
+void OptimizedStrassenMultiply_par(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
+     unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth)
+{
+  unsigned QuadrantSize = MatrixSize >> 1; /* MatixSize / 2 */
+  unsigned QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * QuadrantSize
+                                 + 32;
+  unsigned Column, Row;
+  
+  /************************************************************************
+  ** For each matrix A, B, and C, we'll want pointers to each quandrant
+  ** in the matrix. These quandrants will be addressed as follows:
+  **  --        --
+  **  | A11  A12 |
+  **  |          |
+  **  | A21  A22 |
+  **  --        --
+  ************************************************************************/
+  REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12,
+       *A21, *B21, *C21, *A22, *B22, *C22;
+
+  REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT;
+  #define T2sMULT C22
+  #define NumberOfVariables 11
+
+  PTR TempMatrixOffset = 0;
+  PTR MatrixOffsetA = 0;
+  PTR MatrixOffsetB = 0;
+
+  char *Heap;
+  void *StartHeap;
+
+  /* Distance between the end of a matrix row and the start of the next row */
+  PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3;
+  PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3;
+  PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3;
+
+  if (MatrixSize <= bots_app_cutoff_value) {
+    MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0);
+    return;
+  }
+
+  /* Initialize quandrant matrices */
+  #define A11 A
+  #define B11 B
+  #define C11 C
+  A12 = A11 + QuadrantSize;
+  B12 = B11 + QuadrantSize;
+  C12 = C11 + QuadrantSize;
+  A21 = A + (RowWidthA * QuadrantSize);
+  B21 = B + (RowWidthB * QuadrantSize);
+  C21 = C + (RowWidthC * QuadrantSize);
+  A22 = A21 + QuadrantSize;
+  B22 = B21 + QuadrantSize;
+  C22 = C21 + QuadrantSize;
+
+  /* Allocate Heap Space Here */
+  StartHeap = Heap = malloc(QuadrantSizeInBytes * NumberOfVariables);
+  /* ensure that heap is on cache boundary */
+  if ( ((PTR) Heap) & 31)
+     Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) );
+  
+  /* Distribute the heap space over the variables */
+  S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S5 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  M2 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  
+  /***************************************************************************
+  ** Step through all columns row by row (vertically)
+  ** (jumps in memory by RowWidth => bad locality)
+  ** (but we want the best locality on the innermost loop)
+  ***************************************************************************/
+  for (Row = 0; Row < QuadrantSize; Row++) {
+    
+    /*************************************************************************
+    ** Step through each row horizontally (addressing elements in each column)
+    ** (jumps linearly througn memory => good locality)
+    *************************************************************************/
+    for (Column = 0; Column < QuadrantSize; Column++) {
+      
+      /***********************************************************
+      ** Within this loop, the following holds for MatrixOffset:
+      ** MatrixOffset = (Row * RowWidth) + Column
+      ** (note: that the unit of the offset is number of reals)
+      ***********************************************************/
+      /* Element of Global Matrix, such as A, B, C */
+      #define E(Matrix)   (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) )
+      #define EA(Matrix)  (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) )
+      #define EB(Matrix)  (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) )
+
+      /* FIXME - may pay to expand these out - got higher speed-ups below */
+      /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */
+      E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) );
+
+      /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */
+      E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21);
+
+      /* S3 = A11 - A21 */
+      E(S3) = EA(A11) - EA(A21);
+      
+      /* S7 = B22 - B12 */
+      E(S7) = EB(B22) - EB(B12);
+
+      TempMatrixOffset += sizeof(REAL);
+      MatrixOffsetA += sizeof(REAL);
+      MatrixOffsetB += sizeof(REAL);
+    } /* end row loop*/
+
+    MatrixOffsetA += RowIncrementA;
+    MatrixOffsetB += RowIncrementB;
+  } /* end column loop */
+
+  /* M2 = A11 x B11 */
+  #pragma omp task untied if (Depth < bots_cutoff_value)
+  OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1);
+
+  /* M5 = S1 * S5 */
+  #pragma omp task untied if (Depth < bots_cutoff_value)
+  OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1);
+
+  /* Step 1 of T1 = S2 x S6 + M2 */
+  #pragma omp task untied if (Depth < bots_cutoff_value)
+  OptimizedStrassenMultiply_par(T1sMULT, S2, S6,  QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1);
+
+  /* Step 1 of T2 = T1 + S3 x S7 */
+  #pragma omp task untied if (Depth < bots_cutoff_value)
+  OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1);
+
+  /* Step 1 of C11 = M2 + A12 * B21 */
+  #pragma omp task untied if (Depth < bots_cutoff_value)
+  OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1);
+  
+  /* Step 1 of C12 = S4 x B22 + T1 + M5 */
+  #pragma omp task untied if (Depth < bots_cutoff_value)
+  OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1);
+
+  /* Step 1 of C21 = T2 - A22 * S8 */
+  #pragma omp task untied if (Depth < bots_cutoff_value)
+  OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1);
+
+  /**********************************************
+  ** Synchronization Point
+  **********************************************/
+  #pragma omp taskwait
+  /***************************************************************************
+  ** Step through all columns row by row (vertically)
+  ** (jumps in memory by RowWidth => bad locality)
+  ** (but we want the best locality on the innermost loop)
+  ***************************************************************************/
+  for (Row = 0; Row < QuadrantSize; Row++) {
+    /*************************************************************************
+    ** Step through each row horizontally (addressing elements in each column)
+    ** (jumps linearly througn memory => good locality)
+    *************************************************************************/
+    for (Column = 0; Column < QuadrantSize; Column += 4) {
+      REAL LocalM5_0 = *(M5);
+      REAL LocalM5_1 = *(M5+1);
+      REAL LocalM5_2 = *(M5+2);
+      REAL LocalM5_3 = *(M5+3);
+      REAL LocalM2_0 = *(M2);
+      REAL LocalM2_1 = *(M2+1);
+      REAL LocalM2_2 = *(M2+2);
+      REAL LocalM2_3 = *(M2+3);
+      REAL T1_0 = *(T1sMULT) + LocalM2_0;
+      REAL T1_1 = *(T1sMULT+1) + LocalM2_1;
+      REAL T1_2 = *(T1sMULT+2) + LocalM2_2;
+      REAL T1_3 = *(T1sMULT+3) + LocalM2_3;
+      REAL T2_0 = *(C22) + T1_0;
+      REAL T2_1 = *(C22+1) + T1_1;
+      REAL T2_2 = *(C22+2) + T1_2;
+      REAL T2_3 = *(C22+3) + T1_3;
+      (*(C11))   += LocalM2_0;
+      (*(C11+1)) += LocalM2_1;
+      (*(C11+2)) += LocalM2_2;
+      (*(C11+3)) += LocalM2_3;
+      (*(C12))   += LocalM5_0 + T1_0;
+      (*(C12+1)) += LocalM5_1 + T1_1;
+      (*(C12+2)) += LocalM5_2 + T1_2;
+      (*(C12+3)) += LocalM5_3 + T1_3;
+      (*(C22))   = LocalM5_0 + T2_0;
+      (*(C22+1)) = LocalM5_1 + T2_1;
+      (*(C22+2)) = LocalM5_2 + T2_2;
+      (*(C22+3)) = LocalM5_3 + T2_3;
+      (*(C21  )) = (- *(C21  )) + T2_0;
+      (*(C21+1)) = (- *(C21+1)) + T2_1;
+      (*(C21+2)) = (- *(C21+2)) + T2_2;
+      (*(C21+3)) = (- *(C21+3)) + T2_3;
+      M5 += 4;
+      M2 += 4;
+      T1sMULT += 4;
+      C11 += 4;
+      C12 += 4;
+      C21 += 4;
+      C22 += 4;
+    }
+    C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC);
+    C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC);
+    C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC);
+    C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC);
+  }
+  free(StartHeap);
+}
+#elif defined(MANUAL_CUTOFF)
+void OptimizedStrassenMultiply_par(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
+     unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth)
+{
+  unsigned QuadrantSize = MatrixSize >> 1; /* MatrixSize / 2 */
+  size_t QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * QuadrantSize
+                                 + 32; /* size_t: 11 of these must not wrap at 32 bits */
+  unsigned Column, Row;
+  /* MANUAL_CUTOFF variant: the seven half-size products become tasks only while Depth < bots_cutoff_value */
+  /************************************************************************
+  ** Computes C = A x B for MatrixSize x MatrixSize operands; RowWidthX is
+  ** the row stride of X in elements. Each matrix is split into quadrants:
+  **  --        --
+  **  | A11  A12 |
+  **  |          |
+  **  | A21  A22 |
+  **  --        --
+  ************************************************************************/
+  REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12,
+       *A21, *B21, *C21, *A22, *B22, *C22;
+
+  REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT;
+  #define T2sMULT C22
+  #define NumberOfVariables 11
+
+  PTR TempMatrixOffset = 0;
+  PTR MatrixOffsetA = 0;
+  PTR MatrixOffsetB = 0;
+
+  char *Heap;
+  void *StartHeap;
+
+  /* Byte distance from the end of a quadrant row to the start of the next row (<< 3 hard-codes 8-byte REAL) */
+  PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3;
+  PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3;
+  PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3;
+
+  if (MatrixSize <= bots_app_cutoff_value) { /* base case: hand off to divide-and-conquer */
+    MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0);
+    return;
+  }
+
+  /* Initialize quadrant matrices */
+  #define A11 A
+  #define B11 B
+  #define C11 C
+  A12 = A11 + QuadrantSize;
+  B12 = B11 + QuadrantSize;
+  C12 = C11 + QuadrantSize;
+  A21 = A + (RowWidthA * QuadrantSize);
+  B21 = B + (RowWidthB * QuadrantSize);
+  C21 = C + (RowWidthC * QuadrantSize);
+  A22 = A21 + QuadrantSize;
+  B22 = B21 + QuadrantSize;
+  C22 = C21 + QuadrantSize;
+
+  /* One allocation holds all eleven temporaries; fail loudly instead of writing through NULL */
+  StartHeap = Heap = malloc(QuadrantSizeInBytes * NumberOfVariables);
+  if (StartHeap == NULL) { bots_message("Strassen: out of memory\n"); exit(1); }
+  /* round Heap up to the next 32-byte boundary (the +32 slack above leaves room for this) */
+  if ( ((PTR) Heap) & 31)
+     Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) );
+  /* Distribute the heap space over the variables */
+  S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S5 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  M2 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  
+  /***************************************************************************
+  ** Pre-additions: build S1..S8 from the quadrants of A and B.
+  ** Outer loop walks the rows (jumps in memory by RowWidth => bad locality)
+  ** (but we want the best locality on the innermost loop)
+  ***************************************************************************/
+  for (Row = 0; Row < QuadrantSize; Row++) {
+    
+    /*************************************************************************
+    ** Step through each row horizontally (addressing elements in each column)
+    ** (jumps linearly through memory => good locality)
+    *************************************************************************/
+    for (Column = 0; Column < QuadrantSize; Column++) {
+      
+      /***********************************************************
+      ** Within this loop, the following holds for each offset:
+      ** offset = ((Row * RowWidth) + Column) * sizeof(REAL)
+      ** (note: offsets are in bytes and are added to PTR casts)
+      ***********************************************************/
+      /* Element of Global Matrix, such as A, B, C */
+      #define E(Matrix)   (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) )
+      #define EA(Matrix)  (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) )
+      #define EB(Matrix)  (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) )
+
+      /* FIXME - may pay to expand these out - got higher speed-ups below */
+      /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */
+      E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) );
+
+      /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */
+      E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21);
+
+      /* S3 = A11 - A21 */
+      E(S3) = EA(A11) - EA(A21);
+      
+      /* S7 = B22 - B12 */
+      E(S7) = EB(B22) - EB(B12);
+
+      TempMatrixOffset += sizeof(REAL);
+      MatrixOffsetA += sizeof(REAL);
+      MatrixOffsetB += sizeof(REAL);
+    } /* end column loop */
+
+    MatrixOffsetA += RowIncrementA;
+    MatrixOffsetB += RowIncrementB;
+  } /* end row loop */
+
+  if (Depth < bots_cutoff_value)
+  {
+    /* M2 = A11 x B11 */
+    #pragma omp task untied
+    OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1);
+
+    /* M5 = S1 * S5 */
+    #pragma omp task untied
+    OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1);
+
+    /* Step 1 of T1 = S2 x S6 + M2 */
+    #pragma omp task untied
+    OptimizedStrassenMultiply_par(T1sMULT, S2, S6,  QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1);
+
+    /* Step 1 of T2 = T1 + S3 x S7 */
+    #pragma omp task untied
+    OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1);
+
+    /* Step 1 of C11 = M2 + A12 * B21 */
+    #pragma omp task untied
+    OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1);
+  
+    /* Step 1 of C12 = S4 x B22 + T1 + M5 */
+    #pragma omp task untied
+    OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1);
+
+    /* Step 1 of C21 = T2 - A22 * S8 */
+    #pragma omp task untied
+    OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1);
+
+    /**********************************************
+    ** Synchronization Point
+    **********************************************/
+    #pragma omp taskwait
+  }
+  else
+  {
+    /* M2 = A11 x B11 */
+    OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1);
+    /* M5 = S1 * S5 */
+    OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1);
+    /* Step 1 of T1 = S2 x S6 + M2 */
+    OptimizedStrassenMultiply_par(T1sMULT, S2, S6,  QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1);
+    /* Step 1 of T2 = T1 + S3 x S7 */
+    OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1);
+    /* Step 1 of C11 = M2 + A12 * B21 */
+    OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1);
+    /* Step 1 of C12 = S4 x B22 + T1 + M5 */
+    OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1);
+    /* Step 1 of C21 = T2 - A22 * S8 */
+    OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1);
+  }
+
+  /***************************************************************************
+  ** Post-additions: fold M2, M5 and T1sMULT into the four quadrants of C.
+  ** Outer loop walks the rows (jumps in memory by RowWidth => bad locality)
+  ** (but we want the best locality on the innermost loop)
+  ***************************************************************************/
+  for (Row = 0; Row < QuadrantSize; Row++) {
+    /*************************************************************************
+    ** Step through each row horizontally, four elements per iteration, so
+    ** QuadrantSize is assumed to be a multiple of 4 (linear => good locality)
+    *************************************************************************/
+    for (Column = 0; Column < QuadrantSize; Column += 4) {
+      REAL LocalM5_0 = *(M5);
+      REAL LocalM5_1 = *(M5+1);
+      REAL LocalM5_2 = *(M5+2);
+      REAL LocalM5_3 = *(M5+3);
+      REAL LocalM2_0 = *(M2);
+      REAL LocalM2_1 = *(M2+1);
+      REAL LocalM2_2 = *(M2+2);
+      REAL LocalM2_3 = *(M2+3);
+      REAL T1_0 = *(T1sMULT) + LocalM2_0;
+      REAL T1_1 = *(T1sMULT+1) + LocalM2_1;
+      REAL T1_2 = *(T1sMULT+2) + LocalM2_2;
+      REAL T1_3 = *(T1sMULT+3) + LocalM2_3;
+      REAL T2_0 = *(C22) + T1_0;
+      REAL T2_1 = *(C22+1) + T1_1;
+      REAL T2_2 = *(C22+2) + T1_2;
+      REAL T2_3 = *(C22+3) + T1_3;
+      (*(C11))   += LocalM2_0;
+      (*(C11+1)) += LocalM2_1;
+      (*(C11+2)) += LocalM2_2;
+      (*(C11+3)) += LocalM2_3;
+      (*(C12))   += LocalM5_0 + T1_0;
+      (*(C12+1)) += LocalM5_1 + T1_1;
+      (*(C12+2)) += LocalM5_2 + T1_2;
+      (*(C12+3)) += LocalM5_3 + T1_3;
+      (*(C22))   = LocalM5_0 + T2_0;
+      (*(C22+1)) = LocalM5_1 + T2_1;
+      (*(C22+2)) = LocalM5_2 + T2_2;
+      (*(C22+3)) = LocalM5_3 + T2_3;
+      (*(C21  )) = (- *(C21  )) + T2_0;
+      (*(C21+1)) = (- *(C21+1)) + T2_1;
+      (*(C21+2)) = (- *(C21+2)) + T2_2;
+      (*(C21+3)) = (- *(C21+3)) + T2_3;
+      M5 += 4;
+      M2 += 4;
+      T1sMULT += 4;
+      C11 += 4;
+      C12 += 4;
+      C21 += 4;
+      C22 += 4;
+    }
+    C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC);
+    C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC);
+    C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC);
+    C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC);
+  }
+  free(StartHeap);
+}
+#else
+void OptimizedStrassenMultiply_par(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
+     unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth)
+{
+  unsigned QuadrantSize = MatrixSize >> 1; /* MatrixSize / 2 */
+  size_t QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * QuadrantSize
+                                 + 32; /* size_t: 11 of these must not wrap at 32 bits */
+  unsigned Column, Row;
+  /* Default variant (no cut-off macro): each of the seven half-size products is spawned as an untied task */
+  /************************************************************************
+  ** Computes C = A x B for MatrixSize x MatrixSize operands; RowWidthX is
+  ** the row stride of X in elements. Each matrix is split into quadrants:
+  **  --        --
+  **  | A11  A12 |
+  **  |          |
+  **  | A21  A22 |
+  **  --        --
+  ************************************************************************/
+  REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12,
+       *A21, *B21, *C21, *A22, *B22, *C22;
+
+  REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT;
+  #define T2sMULT C22
+  #define NumberOfVariables 11
+
+  PTR TempMatrixOffset = 0;
+  PTR MatrixOffsetA = 0;
+  PTR MatrixOffsetB = 0;
+
+  char *Heap;
+  void *StartHeap;
+
+  /* Byte distance from the end of a quadrant row to the start of the next row (<< 3 hard-codes 8-byte REAL) */
+  PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3;
+  PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3;
+  PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3;
+
+  if (MatrixSize <= bots_app_cutoff_value) { /* base case: hand off to divide-and-conquer */
+    MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0);
+    return;
+  }
+
+  /* Initialize quadrant matrices */
+  #define A11 A
+  #define B11 B
+  #define C11 C
+  A12 = A11 + QuadrantSize;
+  B12 = B11 + QuadrantSize;
+  C12 = C11 + QuadrantSize;
+  A21 = A + (RowWidthA * QuadrantSize);
+  B21 = B + (RowWidthB * QuadrantSize);
+  C21 = C + (RowWidthC * QuadrantSize);
+  A22 = A21 + QuadrantSize;
+  B22 = B21 + QuadrantSize;
+  C22 = C21 + QuadrantSize;
+
+  /* One allocation holds all eleven temporaries; fail loudly instead of writing through NULL */
+  StartHeap = Heap = malloc(QuadrantSizeInBytes * NumberOfVariables);
+  if (StartHeap == NULL) { bots_message("Strassen: out of memory\n"); exit(1); }
+  /* round Heap up to the next 32-byte boundary (the +32 slack above leaves room for this) */
+  if ( ((PTR) Heap) & 31)
+     Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) );
+  /* Distribute the heap space over the variables */
+  S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S5 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  M2 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes;
+  
+  /***************************************************************************
+  ** Pre-additions: build S1..S8 from the quadrants of A and B.
+  ** Outer loop walks the rows (jumps in memory by RowWidth => bad locality)
+  ** (but we want the best locality on the innermost loop)
+  ***************************************************************************/
+  for (Row = 0; Row < QuadrantSize; Row++) {
+    
+    /*************************************************************************
+    ** Step through each row horizontally (addressing elements in each column)
+    ** (jumps linearly through memory => good locality)
+    *************************************************************************/
+    for (Column = 0; Column < QuadrantSize; Column++) {
+      
+      /***********************************************************
+      ** Within this loop, the following holds for each offset:
+      ** offset = ((Row * RowWidth) + Column) * sizeof(REAL)
+      ** (note: offsets are in bytes and are added to PTR casts)
+      ***********************************************************/
+      /* Element of Global Matrix, such as A, B, C */
+      #define E(Matrix)   (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) )
+      #define EA(Matrix)  (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) )
+      #define EB(Matrix)  (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) )
+
+      /* FIXME - may pay to expand these out - got higher speed-ups below */
+      /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */
+      E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) );
+
+      /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */
+      E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21);
+
+      /* S3 = A11 - A21 */
+      E(S3) = EA(A11) - EA(A21);
+      
+      /* S7 = B22 - B12 */
+      E(S7) = EB(B22) - EB(B12);
+
+      TempMatrixOffset += sizeof(REAL);
+      MatrixOffsetA += sizeof(REAL);
+      MatrixOffsetB += sizeof(REAL);
+    } /* end column loop */
+
+    MatrixOffsetA += RowIncrementA;
+    MatrixOffsetB += RowIncrementB;
+  } /* end row loop */
+
+  /* M2 = A11 x B11 */
+  #pragma omp task untied
+  OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1);
+
+  /* M5 = S1 * S5 */
+  #pragma omp task untied
+  OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1);
+
+  /* Step 1 of T1 = S2 x S6 + M2 */
+  #pragma omp task untied
+  OptimizedStrassenMultiply_par(T1sMULT, S2, S6,  QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1);
+
+  /* Step 1 of T2 = T1 + S3 x S7 */
+  #pragma omp task untied
+  OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1);
+
+  /* Step 1 of C11 = M2 + A12 * B21 */
+  #pragma omp task untied
+  OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1);
+  
+  /* Step 1 of C12 = S4 x B22 + T1 + M5 */
+  #pragma omp task untied
+  OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1);
+
+  /* Step 1 of C21 = T2 - A22 * S8 */
+  #pragma omp task untied
+  OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1);
+
+  /**********************************************
+  ** Synchronization Point
+  **********************************************/
+  #pragma omp taskwait
+  /***************************************************************************
+  ** Post-additions: fold M2, M5 and T1sMULT into the four quadrants of C.
+  ** Outer loop walks the rows (jumps in memory by RowWidth => bad locality)
+  ** (but we want the best locality on the innermost loop)
+  ***************************************************************************/
+  for (Row = 0; Row < QuadrantSize; Row++) {
+    /*************************************************************************
+    ** Step through each row horizontally, four elements per iteration, so
+    ** QuadrantSize is assumed to be a multiple of 4 (linear => good locality)
+    *************************************************************************/
+    for (Column = 0; Column < QuadrantSize; Column += 4) {
+      REAL LocalM5_0 = *(M5);
+      REAL LocalM5_1 = *(M5+1);
+      REAL LocalM5_2 = *(M5+2);
+      REAL LocalM5_3 = *(M5+3);
+      REAL LocalM2_0 = *(M2);
+      REAL LocalM2_1 = *(M2+1);
+      REAL LocalM2_2 = *(M2+2);
+      REAL LocalM2_3 = *(M2+3);
+      REAL T1_0 = *(T1sMULT) + LocalM2_0;
+      REAL T1_1 = *(T1sMULT+1) + LocalM2_1;
+      REAL T1_2 = *(T1sMULT+2) + LocalM2_2;
+      REAL T1_3 = *(T1sMULT+3) + LocalM2_3;
+      REAL T2_0 = *(C22) + T1_0;
+      REAL T2_1 = *(C22+1) + T1_1;
+      REAL T2_2 = *(C22+2) + T1_2;
+      REAL T2_3 = *(C22+3) + T1_3;
+      (*(C11))   += LocalM2_0;
+      (*(C11+1)) += LocalM2_1;
+      (*(C11+2)) += LocalM2_2;
+      (*(C11+3)) += LocalM2_3;
+      (*(C12))   += LocalM5_0 + T1_0;
+      (*(C12+1)) += LocalM5_1 + T1_1;
+      (*(C12+2)) += LocalM5_2 + T1_2;
+      (*(C12+3)) += LocalM5_3 + T1_3;
+      (*(C22))   = LocalM5_0 + T2_0;
+      (*(C22+1)) = LocalM5_1 + T2_1;
+      (*(C22+2)) = LocalM5_2 + T2_2;
+      (*(C22+3)) = LocalM5_3 + T2_3;
+      (*(C21  )) = (- *(C21  )) + T2_0;
+      (*(C21+1)) = (- *(C21+1)) + T2_1;
+      (*(C21+2)) = (- *(C21+2)) + T2_2;
+      (*(C21+3)) = (- *(C21+3)) + T2_3;
+      M5 += 4;
+      M2 += 4;
+      T1sMULT += 4;
+      C11 += 4;
+      C12 += 4;
+      C21 += 4;
+      C22 += 4;
+    }
+    C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC);
+    C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC);
+    C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC);
+    C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC);
+  }
+  free(StartHeap);
+}
+#endif
+/*
+ * Fill the n x n matrix A (row stride an, in elements) with rand() / RAND_MAX.
+ */
+void init_matrix(int n, REAL *A, int an)
+{
+     int row, col;
+     for (row = 0; row < n; row++) {
+	  REAL *dst = A + row * an;   /* first element of the current row */
+	  for (col = 0; col < n; col++)
+	       dst[col] = (double) rand() / (double) RAND_MAX;
+     }
+}
+
+/*
+ * Compare the n x n matrices A and B (row strides an, bn) element by element
+ * using the relative error |A - B| / |A|; report and fail if it exceeds EPSILON.
+ */
+int compare_matrix(int n, REAL *A, int an, REAL *B, int bn)
+{
+     int i, j;
+     REAL c, ref;
+
+     for (i = 0; i < n; ++i)
+	  for (j = 0; j < n; ++j) {
+	       /* relative error c; dividing by |ref| keeps c >= 0 when A[i][j] is negative */
+	       ref = ELEM(A, an, i, j);
+	       c = ref - ELEM(B, bn, i, j);
+	       if (c < 0.0) c = -c;
+	       if (ref < 0.0) ref = -ref;
+	       c = c / ref;
+	       if (c > EPSILON) {
+		    bots_message("Strassen: Wrong answer!\n");
+		    return BOTS_RESULT_UNSUCCESSFUL;
+	       }
+	  }
+
+     return BOTS_RESULT_SUCCESSFUL;
+}
+	       
+/*
+ * Allocate an uninitialised n x n matrix of REAL (NULL on failure); size_t math avoids int overflow of n * n
+ */
+REAL *alloc_matrix(int n) 
+{
+     return malloc((size_t) n * (size_t) n * sizeof(REAL));
+}
+
+void strassen_main_par(REAL *A, REAL *B, REAL *C, int n)
+{  /* Parallel driver: logs progress and runs OptimizedStrassenMultiply_par on A and B, writing into C. */
+   bots_message("Computing parallel Strassen algorithm (n=%d) ", n);
+   OptimizedStrassenMultiply_par(C, A, B, n, n, n, n, 1); /* n is the size and all three row widths; Depth starts at 1 */
+   bots_message(" completed!\n");
+}
+void strassen_main_seq(REAL *A, REAL *B, REAL *C, int n)
+{  /* Sequential driver: logs progress and runs OptimizedStrassenMultiply_seq on A and B, writing into C. */
+	bots_message("Computing sequential Strassen algorithm (n=%d) ", n);
+	OptimizedStrassenMultiply_seq(C, A, B, n, n, n, n, 1); /* n is the size and all three row widths; Depth starts at 1 */
+	bots_message(" completed!\n");
+}
+
diff --git a/ompss/strassen/strassen.h b/ompss/strassen/strassen.h
new file mode 100644
index 0000000..7944f77
--- /dev/null
+++ b/ompss/strassen/strassen.h
@@ -0,0 +1,66 @@
+/**********************************************************************************************/
+/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
+/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
+/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
+/*                                                                                            */
+/*  This program is free software; you can redistribute it and/or modify                      */
+/*  it under the terms of the GNU General Public License as published by                      */
+/*  the Free Software Foundation; either version 2 of the License, or                         */
+/*  (at your option) any later version.                                                       */
+/*                                                                                            */
+/*  This program is distributed in the hope that it will be useful,                           */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of                            */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             */
+/*  GNU General Public License for more details.                                              */
+/*                                                                                            */
+/*  You should have received a copy of the GNU General Public License                         */
+/*  along with this program; if not, write to the Free Software                               */
+/*  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            */
+/**********************************************************************************************/
+#ifndef _STRASSEN_H
+#define _STRASSEN_H
+/* ******************************************************************* */
+/* STRASSEN APPLICATION CUT OFF's                                      */
+/* ******************************************************************* */
+/* Strassen uses three different functions to compute Matrix Multiply. */
+/* Each of them is related to an application cut off value:            */
+/*  - Initial algorithm: OptimizedStrassenMultiply()                   */
+/*  - bots_app_cutoff_value: MultiplyByDivideAndConquer()              */
+/*  - SizeAtWhichNaiveAlgorithmIsMoreEfficient: FastAdditiveNaiveMatrixMultiply() */
+/* ******************************************************************* */
+
+/*FIXME: at the moment we use a constant value, change to parameter ???*/
+/* Below this cut off  strassen uses FastAdditiveNaiveMatrixMultiply algorithm */
+#define SizeAtWhichNaiveAlgorithmIsMoreEfficient 16
+
+/***********************************************************************
+ * maximum tolerable relative error (for the checking routine)
+ **********************************************************************/
+#define EPSILON (1.0E-6)
+/***********************************************************************
+ * Matrices are stored in row-major order; A is a pointer to
+ * the first element of the matrix, and an is the number of elements
+ * between two rows. This macro produces the lvalue A[i,j] given A, an,
+ * i and j; every argument is parenthesised, so expressions are safe.
+ **********************************************************************/
+#define ELEM(A, an, i, j) ((A)[(i)*(an)+(j)])
+
+void matrixmul(int n, REAL *A, int an, REAL *B, int bn, REAL *C, int cn);
+void FastNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
+     unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB);
+void FastAdditiveNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
+     unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB);
+void MultiplyByDivideAndConquer(REAL *C, REAL *A, REAL *B,
+				     unsigned MatrixSize,
+				     unsigned RowWidthC,
+				     unsigned RowWidthA,
+				     unsigned RowWidthB,
+				     int AdditiveMode
+				    );
+void OptimizedStrassenMultiply_par(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
+     unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth);
+void OptimizedStrassenMultiply_seq(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
+     unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth);
+REAL *alloc_matrix(int n);
+#endif
+
diff --git a/ompss/uts/Makefile b/ompss/uts/Makefile
new file mode 100644
index 0000000..f464268
--- /dev/null
+++ b/ompss/uts/Makefile
@@ -0,0 +1,36 @@
+##############################################################################################
+#  This program is part of the Barcelona OpenMP Tasks Suite                                  #
+#  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  #
+#  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   #
+#                                                                                            #
+#  This program is free software; you can redistribute it and/or modify                      #
+#  it under the terms of the GNU General Public License as published by                      #
+#  the Free Software Foundation; either version 2 of the License, or                         #
+#  (at your option) any later version.                                                       #
+#                                                                                            #
+#  This program is distributed in the hope that it will be useful,                           #
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of                            #
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             #
+#  GNU General Public License for more details.                                              #
+#                                                                                            #
+#  You should have received a copy of the GNU General Public License                         #
+#  along with this program; if not, write to the Free Software                               #
+#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            #
+##############################################################################################
+
+LIBS = -lm
+PROGRAM_OBJS=uts.o brg_sha1.o
+# Build variants: Makefile.common tests these with ifdef, so any non-empty value enables them.
+#CUTOFF_VERSIONS = manual if_clause
+TIED_VERSIONS = YES
+# Relative path from this directory (ompss/uts) to the repository root.
+BASE_DIR = ../../
+
+#
+# Don't change below here 
+#
+# NOTE(review): presumably Makefile.version sets VERSION, which Makefile.common tests - confirm.
+include ../Makefile.version
+include $(BASE_DIR)/common/Makefile.common
+
+
diff --git a/ompss/uts/app-desc.h b/ompss/uts/app-desc.h
new file mode 100644
index 0000000..9cfe6a6
--- /dev/null
+++ b/ompss/uts/app-desc.h
@@ -0,0 +1,45 @@
+/**********************************************************************************************/
+/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
+/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
+/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
+/*                                                                                            */
+/*  This program is free software; you can redistribute it and/or modify                      */
+/*  it under the terms of the GNU General Public License as published by                      */
+/*  the Free Software Foundation; either version 2 of the License, or                         */
+/*  (at your option) any later version.                                                       */
+/*                                                                                            */
+/*  This program is distributed in the hope that it will be useful,                           */
+/*  but WITHOUT ANY WARRANTY; without even the implied warranty of                            */
+/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             */
+/*  GNU General Public License for more details.                                              */
+/*                                                                                            */
+/*  You should have received a copy of the GNU General Public License                         */
+/*  along with this program; if not, write to the Free Software                               */
+/*  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            */
+/**********************************************************************************************/
+#include "ompss-app.h"
+#include "uts.h"
+
+#define BOTS_APP_NAME "Unbalance Tree Search"
+#define BOTS_APP_PARAMETERS_DESC "%s"
+#define BOTS_APP_PARAMETERS_LIST ,bots_arg_file /* leading comma: presumably spliced into an argument list after the format above - confirm */
+/* NOTE(review): presumably the flag, default label and help text of the input-file option - confirm against the harness. */
+#define BOTS_APP_USES_ARG_FILE
+#define BOTS_APP_DEF_ARG_FILE "Input filename"
+#define BOTS_APP_DESC_ARG_FILE "UTS input file (mandatory)"
+/* Declares the local `root` that the KERNEL_* macros below refer to, then calls uts_read_file on bots_arg_file. */
+#define BOTS_APP_INIT \
+  Node root; \
+  uts_read_file(bots_arg_file);
+/* Passes &root to uts_initRoot; must expand in the scope where BOTS_APP_INIT declared root. */
+#define KERNEL_INIT uts_initRoot(&root);
+/* Kernel entry point called by KERNEL_CALL; defined outside this header. */
+unsigned long long parallel_uts ( Node *);
+/* Runs parallel_uts on root and stores its return value in bots_number_of_tasks. */
+#define KERNEL_CALL bots_number_of_tasks = parallel_uts(&root);
+/* KERNEL_FINI expands to a call of uts_show_stats(). */
+#define KERNEL_FINI uts_show_stats();
+/* KERNEL_CHECK expands to a call of uts_check_result(). */
+#define KERNEL_CHECK uts_check_result();
+
+
diff --git a/ompss/uts/brg_endian.h b/ompss/uts/brg_endian.h
new file mode 100644
index 0000000..302112f
--- /dev/null
+++ b/ompss/uts/brg_endian.h
@@ -0,0 +1,141 @@
+/**********************************************************************************************/
+/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
+/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
+/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
+/**********************************************************************************************/
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.   All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software in both source and binary
+ form is allowed (with or without changes) provided that:
+
+   1. distributions of this source code include the above copyright
+      notice, this list of conditions and the following disclaimer;
+
+   2. distributions in binary form include the above copyright
+      notice, this list of conditions and the following disclaimer
+      in the documentation and/or other associated materials;
+
+   3. the copyright holder's name is not used to endorse products
+      built using this software without specific written permission.
+
+ ALTERNATIVELY, provided that this notice is retained in full, this product
+ may be distributed under the terms of the GNU General Public License (GPL),
+ in which case the provisions of the GPL apply INSTEAD OF those given above.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue 20/10/2006
+*/
+
+#ifndef BRG_ENDIAN_H
+#define BRG_ENDIAN_H
+
+#define IS_BIG_ENDIAN      4321 /* byte 0 is most significant (mc68k) */
+#define IS_LITTLE_ENDIAN   1234 /* byte 0 is least significant (i386) */
+/* PLATFORM_BYTE_ORDER, set further down, is always one of the two values above */
+/* Include files where endian defines and byteswap functions may reside */
+#if defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
+#  include <sys/endian.h>
+#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
+      defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
+#  include <machine/endian.h>
+#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
+#  if !defined( __MINGW32__ )   /* nothing is included for MinGW */
+#    include <endian.h>
+#    if !defined( __BEOS__ )    /* BeOS: <endian.h> only, no <byteswap.h> */
+#      include <byteswap.h>
+#    endif
+#  endif
+#endif
+
+/* Derive PLATFORM_BYTE_ORDER from the endian symbols the system provides,  */
+/* trying each of the four spellings SYMBOL, _SYMBOL, __SYMBOL, __SYMBOL__. */
+/* When both symbols of a spelling exist, the matching BYTE_ORDER decides.  */
+
+#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN )
+#  if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif /* SYMBOL spelling */
+
+#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN )
+#  if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( _BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( _LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif /* _SYMBOL spelling */
+
+#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN )
+#  if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( __BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif /* __SYMBOL spelling */
+
+#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ )
+#  if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( __BIG_ENDIAN__ )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN__ )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif /* __SYMBOL__ spelling */
+
+/*  if the platform byte order could not be determined, then try to */
+/*  set this define using common machine defines                    */
+#if !defined(PLATFORM_BYTE_ORDER)
+
+#if   defined( __alpha__ ) || defined( __alpha ) || defined( i386 )       || \
+      defined( __i386__ )  || defined( _M_I86 )  || defined( _M_IX86 )    || \
+      defined( __OS2__ )   || defined( sun386 )  || defined( __TURBOC__ ) || \
+      defined( vax )       || defined( vms )     || defined( VMS )        || \
+      defined( __VMS )     || defined( _M_X64 )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+
+#elif defined( AMIGA )   || defined( applec )    || defined( __AS400__ )  || \
+      defined( _CRAY )   || defined( __hppa )    || defined( __hp9000 )   || \
+      defined( ibm370 )  || defined( mc68000 )   || defined( m68k )       || \
+      defined( __MRC__ ) || defined( __MVS__ )   || defined( __MWERKS__ ) || \
+      defined( sparc )   || defined( __sparc)    || defined( SYMANTEC_C ) || \
+      defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM )   || \
+      defined( THINK_C ) || defined( __VMCMS__ )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+
+#elif 0     /* **** EDIT HERE IF NECESSARY **** */
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#elif 0     /* **** EDIT HERE IF NECESSARY **** */
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#else
+#  error Please edit lines 126 or 128 in brg_endian.h to set the platform byte order
+#endif
+
+#endif
+
+#endif
diff --git a/ompss/uts/brg_sha1.c b/ompss/uts/brg_sha1.c
new file mode 100644
index 0000000..bef3d14
--- /dev/null
+++ b/ompss/uts/brg_sha1.c
@@ -0,0 +1,341 @@
+/**********************************************************************************************/
+/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
+/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
+/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
+/**********************************************************************************************/
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 2002, Dr Brian Gladman, Worcester, UK.   All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software in both source and binary
+ form is allowed (with or without changes) provided that:
+
+   1. distributions of this source code include the above copyright
+      notice, this list of conditions and the following disclaimer;
+
+   2. distributions in binary form include the above copyright
+      notice, this list of conditions and the following disclaimer
+      in the documentation and/or other associated materials;
+
+   3. the copyright holder's name is not used to endorse products
+      built using this software without specific written permission.
+
+ ALTERNATIVELY, provided that this notice is retained in full, this product
+ may be distributed under the terms of the GNU General Public License (GPL),
+ in which case the provisions of the GPL apply INSTEAD OF those given above.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 01/08/2005
+
+ This is a byte oriented version of SHA1 that operates on arrays of bytes
+ stored in memory.
+*/
+
+#include <string.h>     /* for memcpy() etc.        */
+#include <stdio.h>
+
+#include "brg_sha1.h"
+#include "brg_endian.h"
+#include "bots.h"
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+/** BEGIN: UTS RNG Harness **/
+
+void rng_init(RNG_state *newstate, int seed)
+{
+  /* Seed block: 16 zero bytes followed by the seed, most significant byte first. */
+  struct state_t seed_block;
+  struct sha1_context hasher;
+  int at;
+  int shift = 24;
+
+  memset(seed_block.state, 0, 16);
+  for (at = 16; at < 20; at++, shift -= 8)
+    seed_block.state[at] = (u_int8_t) (0xFF & (seed >> shift));
+
+  /* The initial state written to newstate is the 20-byte SHA-1 digest of that block. */
+  sha1_begin(&hasher);
+  sha1_hash(seed_block.state, 20, &hasher);
+  sha1_end(newstate, &hasher);
+}
+
+void rng_spawn(RNG_state *mystate, RNG_state *newstate, int spawnnumber)
+{
+	/* newstate = SHA-1(20 bytes of mystate, then spawnnumber as 4 bytes, high byte first) */
+	u_int8_t  child_index[4];
+	struct sha1_context hasher;
+	int k;
+
+	for (k = 0; k < 4; k++)
+		child_index[k] = (u_int8_t) (0xFF & (spawnnumber >> (24 - 8 * k)));
+
+	sha1_begin(&hasher);
+	sha1_hash(mystate, 20, &hasher);
+	sha1_hash(child_index, 4, &hasher);
+	sha1_end(newstate, &hasher);
+}
+
+int rng_rand(RNG_state *mystate){
+	/* Value of state bytes 16..19 (big-endian) with the top bit cleared; state is unchanged. */
+	/* Bytes are widened first: a promoted byte >= 0x80 shifted left by 24 overflows int.     */
+	uint32 b = (((uint32) mystate[16] << 24) | ((uint32) mystate[17] << 16)
+		| ((uint32) mystate[18] << 8) | (uint32) mystate[19]) & POS_MASK;
+	int r = (int) b;	/* fits: POS_MASK leaves only the low 31 bits */
+
+	bots_debug("b: %d\t, r: %d\n", (int) b, r);	/* %d takes int, not uint32 */
+	return r;
+}
+
+int rng_nextrand(RNG_state *mystate){
+	/* Advance the state in place (new state = SHA-1 of the old 20-byte state), then */
+	/* return bytes 16..19 of the new state, big-endian, with the top bit cleared.   */
+	struct sha1_context ctx;
+	uint32 b;
+
+	sha1_begin(&ctx);
+	sha1_hash(mystate, 20, &ctx);
+	sha1_end(mystate, &ctx);
+	/* Widen each byte first: a promoted byte >= 0x80 shifted left by 24 overflows int. */
+	b = ((uint32) mystate[16] << 24) | ((uint32) mystate[17] << 16)
+		| ((uint32) mystate[18] << 8) | (uint32) mystate[19];
+
+	return (int) (b & POS_MASK);
+}
+
+/* Write the first two state bytes as hex followed by "..." into s (needs >= 8 bytes); returns s. */
+char * rng_showstate(RNG_state *state, char *s){
+  sprintf(s,"%.2X%.2X...", (unsigned) state[0], (unsigned) state[1]);  /* %X takes unsigned */
+  return s;
+}
+
+/* Report generator type and state size via bots_message(); sizeof is cast because %lu needs unsigned long. */
+void rng_showtype( void ) {
+  bots_message("SHA-1 (state size = %luB)\n", (unsigned long) sizeof(struct state_t));
+}
+
+/** END: UTS RNG Harness **/
+
+#if defined( _MSC_VER ) && ( _MSC_VER > 800 )
+#pragma intrinsic(memcpy)
+#endif
+
+#if 0 && defined(_MSC_VER)	/* MSVC rotate intrinsics, switched off by the leading 0 */
+#define rotl32  _lrotl
+#define rotr32  _lrotr
+#else	/* portable rotates of a 32-bit unsigned value x; valid for 0 < n < 32 */
+#define rotl32(x,n)   (((x) << (n)) | ((x) >> (32 - (n))))
+#define rotr32(x,n)   (((x) >> (n)) | ((x) << (32 - (n))))
+#endif
+/* Byte-reverse a 32-bit word; only defined here when no bswap_32 macro already exists. */
+#if !defined(bswap_32)
+#define bswap_32(x) ((rotr32((x), 24) & 0x00ff00ff) | (rotr32((x), 8) & 0xff00ff00))
+#endif
+
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+#define SWAP_BYTES
+#else
+#undef  SWAP_BYTES
+#endif
+/* bsw_32(p,n): byte-swap the first n 32-bit words at p in place; expands to nothing without SWAP_BYTES. */
+#if defined(SWAP_BYTES)
+#define bsw_32(p,n) \
+    { int _i = (n); while(_i--) ((uint_32t*)(p))[_i] = bswap_32(((uint_32t*)(p))[_i]); }
+#else
+#define bsw_32(p,n)
+#endif
+/* Mask that reduces a byte count to an offset inside one SHA1_BLOCK_SIZE-byte block. */
+#define SHA1_MASK   (SHA1_BLOCK_SIZE - 1)
+
+#if 0   /* disabled alternative formulations, kept for reference */
+/* Three-input bitwise functions passed to one_cycle()/five_cycle() as the f argument. */
+#define ch(x,y,z)       (((x) & (y)) ^ (~(x) & (z)))
+#define parity(x,y,z)   ((x) ^ (y) ^ (z))
+#define maj(x,y,z)      (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
+
+#else   /* Discovered by Rich Schroeppel and Colin Plumb   */
+/* Active definitions: ch and maj are written with one operation fewer than above. */
+#define ch(x,y,z)       ((z) ^ ((x) & ((y) ^ (z))))
+#define parity(x,y,z)   ((x) ^ (y) ^ (z))
+#define maj(x,y,z)      (((x) & (y)) | ((z) & ((x) ^ (y))))
+
+#endif
+
+/* Compile 64 bytes of hash data into SHA1 context. Note    */
+/* that this routine assumes that the byte order in the     */
+/* ctx->wbuf[] at this point is in such an order that low   */
+/* address bytes in the ORIGINAL byte stream will go in     */
+/* this buffer to the high end of 32-bit words on BOTH big  */
+/* and little endian systems                                */
+/* q(v,n): working variable n, either array element v[n] (ARRAY) or scalar vn. */
+#ifdef ARRAY
+#define q(v,n)  v[n]
+#else
+#define q(v,n)  v##n
+#endif
+/* one_cycle: one round: e += rotr32(a,27) + f(b,c,d) + k + h, then b = rotr32(b,2). */
+#define one_cycle(v,a,b,c,d,e,f,k,h)            \
+    q(v,e) += rotr32(q(v,a),27) +               \
+              f(q(v,b),q(v,c),q(v,d)) + k + h;  \
+    q(v,b)  = rotr32(q(v,b), 2)
+/* five_cycle: rounds i..i+4 with the variable roles rotated; hf(i) supplies each word. */
+#define five_cycle(v,f,k,i)                 \
+    one_cycle(v, 0,1,2,3,4, f,k,hf(i  ));   \
+    one_cycle(v, 4,0,1,2,3, f,k,hf(i+1));   \
+    one_cycle(v, 3,4,0,1,2, f,k,hf(i+2));   \
+    one_cycle(v, 2,3,4,0,1, f,k,hf(i+3));   \
+    one_cycle(v, 1,2,3,4,0, f,k,hf(i+4))
+
+VOID_RETURN sha1_compile(sha1_ctx ctx[1])
+{   uint_32t    *w = ctx->wbuf;
+    /* Fold the 16-word block in ctx->wbuf into ctx->hash; wbuf is overwritten on the way. */
+#ifdef ARRAY
+    uint_32t    v[5];
+    memcpy(v, ctx->hash, 5 * sizeof(uint_32t));
+#else
+    uint_32t    v0, v1, v2, v3, v4;
+    v0 = ctx->hash[0]; v1 = ctx->hash[1];
+    v2 = ctx->hash[2]; v3 = ctx->hash[3];
+    v4 = ctx->hash[4];
+#endif
+    /* Rounds 0..15 take their word straight from the block. */
+#define hf(i)   w[i]
+
+    five_cycle(v, ch, 0x5a827999,  0);
+    five_cycle(v, ch, 0x5a827999,  5);
+    five_cycle(v, ch, 0x5a827999, 10);
+    one_cycle(v,0,1,2,3,4, ch, 0x5a827999, hf(15));
+    /* From round 16 on, hf(i) computes the next schedule word in place in the 16-word w[]. */
+#undef  hf
+#define hf(i) (w[(i) & 15] = rotl32(                    \
+                 w[((i) + 13) & 15] ^ w[((i) + 8) & 15] \
+               ^ w[((i) +  2) & 15] ^ w[(i) & 15], 1))
+
+    one_cycle(v,4,0,1,2,3, ch, 0x5a827999, hf(16));
+    one_cycle(v,3,4,0,1,2, ch, 0x5a827999, hf(17));
+    one_cycle(v,2,3,4,0,1, ch, 0x5a827999, hf(18));
+    one_cycle(v,1,2,3,4,0, ch, 0x5a827999, hf(19));
+
+    five_cycle(v, parity, 0x6ed9eba1,  20);
+    five_cycle(v, parity, 0x6ed9eba1,  25);
+    five_cycle(v, parity, 0x6ed9eba1,  30);
+    five_cycle(v, parity, 0x6ed9eba1,  35);
+
+    five_cycle(v, maj, 0x8f1bbcdc,  40);
+    five_cycle(v, maj, 0x8f1bbcdc,  45);
+    five_cycle(v, maj, 0x8f1bbcdc,  50);
+    five_cycle(v, maj, 0x8f1bbcdc,  55);
+
+    five_cycle(v, parity, 0xca62c1d6,  60);
+    five_cycle(v, parity, 0xca62c1d6,  65);
+    five_cycle(v, parity, 0xca62c1d6,  70);
+    five_cycle(v, parity, 0xca62c1d6,  75);
+    /* Add the working variables back into the hash state. */
+#ifdef ARRAY
+    ctx->hash[0] += v[0]; ctx->hash[1] += v[1];
+    ctx->hash[2] += v[2]; ctx->hash[3] += v[3];
+    ctx->hash[4] += v[4];
+#else
+    ctx->hash[0] += v0; ctx->hash[1] += v1;
+    ctx->hash[2] += v2; ctx->hash[3] += v3;
+    ctx->hash[4] += v4;
+#endif
+}
+
+VOID_RETURN sha1_begin(sha1_ctx ctx[1])
+{   /* Reset a context: load the five initial hash words and zero the byte counter. */
+    static const uint_32t initial_hash[5] =
+        { 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0 };
+    int word;
+
+    for(word = 0; word < 5; ++word) ctx->hash[word] = initial_hash[word];
+    ctx->count[1] = ctx->count[0] = 0;
+}
+
+/* Absorb len bytes of data into ctx: each 64-byte block is compiled as it */
+/* fills, and any remainder stays buffered in ctx->wbuf for the next call. */
+/* ctx->count[0] and [1] are the low and high words of the total byte count. */
+VOID_RETURN sha1_hash(const unsigned char data[], unsigned long len, sha1_ctx ctx[1])
+{   uint_32t pos = (uint_32t)(ctx->count[0] & SHA1_MASK),
+            space = SHA1_BLOCK_SIZE - pos;
+    const unsigned char *sp = data;
+    uint_32t len_lo = (uint_32t)len;    /* low 32 bits of len */
+
+    if((ctx->count[0] += len_lo) < len_lo)  /* carry out of the low word */
+        ++(ctx->count[1]);
+    ctx->count[1] += (uint_32t)((len >> 16) >> 16);  /* bits 32 and up of len; 0 if long is 32-bit */
+    while(len >= space)     /* transfer whole blocks if possible  */
+    {
+        memcpy(((unsigned char*)ctx->wbuf) + pos, sp, space);
+        sp += space; len -= space; space = SHA1_BLOCK_SIZE; pos = 0;
+        bsw_32(ctx->wbuf, SHA1_BLOCK_SIZE >> 2);
+        sha1_compile(ctx);
+    }
+    memcpy(((unsigned char*)ctx->wbuf) + pos, sp, len);  /* keep the partial block */
+}
+
+/* SHA1 final padding and digest calculation: pads the buffered data, appends */
+/* the message length in bits, and writes SHA1_DIGEST_SIZE bytes to hval.     */
+VOID_RETURN sha1_end(unsigned char hval[], sha1_ctx ctx[1])
+{   uint_32t    i = (uint_32t)(ctx->count[0] & SHA1_MASK);
+
+    /* put bytes in the buffer in an order in which references to   */
+    /* 32-bit words will put bytes with lower addresses into the    */
+    /* top of 32 bit words on BOTH big and little endian machines   */
+    bsw_32(ctx->wbuf, (i + 3) >> 2);
+
+    /* we now need to mask valid bytes and add the padding which is */
+    /* a single 1 bit and as many zero bits as necessary. Note that */
+    /* we can always add the first padding byte here because the    */
+    /* buffer always has at least one empty slot                    */
+    ctx->wbuf[i >> 2] &= 0xffffff80 << 8 * (~i & 3);
+    ctx->wbuf[i >> 2] |= 0x00000080 << 8 * (~i & 3);
+
+    /* we need 9 or more empty positions, one for the padding byte  */
+    /* (above) and eight for the length count. If there is not      */
+    /* enough space, pad and empty the buffer                       */
+    if(i > SHA1_BLOCK_SIZE - 9)
+    {
+        if(i < 60) ctx->wbuf[15] = 0;   /* word 15 holds neither data nor the padding byte */
+        sha1_compile(ctx);
+        i = 0;
+    }
+    else    /* compute a word index for the empty buffer positions  */
+        i = (i >> 2) + 1;
+
+    while(i < 14) /* and zero pad all but last two positions        */
+        ctx->wbuf[i++] = 0;
+
+    /* the following 32-bit length fields are assembled in the      */
+    /* wrong byte order on little endian machines but this is       */
+    /* corrected later since they are only ever used as 32-bit      */
+    /* word values.                                                 */
+    ctx->wbuf[14] = (ctx->count[1] << 3) | (ctx->count[0] >> 29);
+    ctx->wbuf[15] = ctx->count[0] << 3;
+    sha1_compile(ctx);
+
+    /* extract the hash value as bytes in case the hash buffer is   */
+    /* misaligned for 32-bit words                                  */
+    for(i = 0; i < SHA1_DIGEST_SIZE; ++i)
+        hval[i] = (unsigned char)(ctx->hash[i >> 2] >> (8 * (~i & 3)));
+}
+
+VOID_RETURN sha1(unsigned char hval[], const unsigned char data[], unsigned long len)
+{   sha1_ctx one_shot;    /* throwaway context: the digest of data[0..len) goes to hval */
+    sha1_begin(&one_shot);
+    sha1_hash(data, len, &one_shot); sha1_end(hval, &one_shot);
+}
+
+#if defined(__cplusplus)
+}
+#endif
diff --git a/ompss/uts/brg_sha1.h b/ompss/uts/brg_sha1.h
new file mode 100644
index 0000000..31e2aa5
--- /dev/null
+++ b/ompss/uts/brg_sha1.h
@@ -0,0 +1,109 @@
+/**********************************************************************************************/
+/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
+/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
+/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
+/**********************************************************************************************/
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 2002, Dr Brian Gladman, Worcester, UK.   All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software in both source and binary
+ form is allowed (with or without changes) provided that:
+
+   1. distributions of this source code include the above copyright
+      notice, this list of conditions and the following disclaimer;
+
+   2. distributions in binary form include the above copyright
+      notice, this list of conditions and the following disclaimer
+      in the documentation and/or other associated materials;
+
+   3. the copyright holder's name is not used to endorse products
+      built using this software without specific written permission.
+
+ ALTERNATIVELY, provided that this notice is retained in full, this product
+ may be distributed under the terms of the GNU General Public License (GPL),
+ in which case the provisions of the GPL apply INSTEAD OF those given above.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 01/08/2005
+*/
+
+#ifndef _SHA1_H
+#define _SHA1_H
+
+#include <stdlib.h>
+#include "brg_types.h"
+
+#define SHA1_BLOCK_SIZE  64
+#define SHA1_DIGEST_SIZE 20
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+/** BEGIN: UTS RNG Harness **/
+
+#define POS_MASK    0x7fffffff   /* keeps the low 31 bits (clears the top bit) */
+#define HIGH_BITS   0x80000000   /* the top bit only */
+
+#define sha1_context sha1_ctx_s  /* lets struct sha1_context name struct sha1_ctx_s */
+typedef u_int8_t RNG_state;      /* one byte of generator state; used through RNG_state* */
+typedef u_int32_t  uint32;
+//typedef char *   caddr_t;
+
+/**********************************/
+/* random number generator state  */
+/**********************************/
+struct state_t {
+  u_int8_t state[20];            /* 20 bytes, the same as SHA1_DIGEST_SIZE */
+};
+
+
+/***************************************/
+/* random number generator operations  */
+/***************************************/
+void   rng_init(RNG_state *state, int seed);               /* state = SHA-1 of a block built from seed */
+void   rng_spawn(RNG_state *mystate, RNG_state *newstate, int spawnNumber); /* newstate from mystate + number */
+int    rng_rand(RNG_state *mystate);                       /* current value, state unchanged */
+int    rng_nextrand(RNG_state *mystate);                   /* advances the state, then returns its value */
+char * rng_showstate(RNG_state *state, char *s);           /* short hex form of state written to s */
+void   rng_showtype( void );                               /* reports the generator type */
+
+/** END: UTS RNG Harness **/
+/* type to hold the SHA1 context  */
+
+struct sha1_ctx_s
+{   uint_32t count[2];   /* bytes hashed so far: [0] low word, [1] high word */
+    uint_32t hash[5];    /* running hash value, five 32-bit words */
+    uint_32t wbuf[16];   /* 64-byte block buffer, accessed as 32-bit words */
+};
+
+typedef struct sha1_ctx_s sha1_ctx;
+
+/* Note that these prototypes are the same for both bit and */
+/* byte oriented implementations. However the length fields */
+/* are in bytes or bits as appropriate for the version used */
+/* and bit sequences are input as arrays of bytes in which  */
+/* bit sequences run from the most to the least significant */
+/* end of each byte                                         */
+/* Process the block currently held in ctx->wbuf into ctx->hash. */
+VOID_RETURN sha1_compile(sha1_ctx ctx[1]);
+/* Streaming use: sha1_begin, sha1_hash (repeatable), sha1_end; sha1 does all three. */
+VOID_RETURN sha1_begin(sha1_ctx ctx[1]);
+VOID_RETURN sha1_hash(const unsigned char data[], unsigned long len, sha1_ctx ctx[1]);
+VOID_RETURN sha1_end(unsigned char hval[], sha1_ctx ctx[1]);
+VOID_RETURN sha1(unsigned char hval[], const unsigned char data[], unsigned long len);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/ompss/uts/brg_types.h b/ompss/uts/brg_types.h
new file mode 100644
index 0000000..ae9f717
--- /dev/null
+++ b/ompss/uts/brg_types.h
@@ -0,0 +1,205 @@
+/**********************************************************************************************/
+/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
+/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
+/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
+/**********************************************************************************************/
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2006, Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software in both source and binary
+ form is allowed (with or without changes) provided that:
+
+   1. distributions of this source code include the above copyright
+      notice, this list of conditions and the following disclaimer;
+
+   2. distributions in binary form include the above copyright
+      notice, this list of conditions and the following disclaimer
+      in the documentation and/or other associated materials;
+
+   3. the copyright holder's name is not used to endorse products
+      built using this software without specific written permission.
+
+ ALTERNATIVELY, provided that this notice is retained in full, this product
+ may be distributed under the terms of the GNU General Public License (GPL),
+ in which case the provisions of the GPL apply INSTEAD OF those given above.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue 09/09/2006
+
+ The unsigned integer types defined here are of the form uint_<nn>t where
+ <nn> is the length of the type; for example, the unsigned 32-bit type is
+ 'uint_32t'.  These are NOT the same as the 'C99 integer types' that are
+ defined in the inttypes.h and stdint.h headers since attempts to use these
+ types have shown that support for them is still highly variable.  However,
+ since the latter are of the form uint<nn>_t, a regular expression search
+ and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t')
+ can be used to convert the types used here to the C99 standard types.
+*/
+
+#ifndef BRG_TYPES_H
+#define BRG_TYPES_H
+
+#include <sys/types.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#include <limits.h>
+
+/* Define BRG_STD_TYPES if you get an error from one of the typedefs below; it uses the u_intNN_t types instead */
+#ifdef BRG_STD_TYPES
+#  define BRG_UI8
+     typedef u_int8_t uint_8t;
+#  define BRG_UI16
+     typedef u_int16_t uint_16t;
+#  define BRG_UI32
+#    define li_32(h) 0x##h##u
+     typedef u_int32_t uint_32t;
+#  define BRG_UI64
+#    define li_64(h) 0x##h##u
+     typedef u_int64_t uint_64t;
+#endif /* BRG_STD_TYPES */
+
+#ifndef BRG_UI8   /* skipped when the BRG_STD_TYPES branch already supplied uint_8t */
+#  define BRG_UI8
+#  if UCHAR_MAX == 255u
+     typedef unsigned char uint_8t;
+#  else
+#    error Please define uint_8t as an 8-bit unsigned integer type in brg_types.h
+#  endif
+#endif /* BRG_UI8 */
+
+#ifndef BRG_UI16
+#  define BRG_UI16
+#  if USHRT_MAX == 65535u
+     typedef unsigned short uint_16t;
+#  else
+#    error Please define uint_16t as a 16-bit unsigned short type in brg_types.h
+#  endif
+#endif /* BRG_UI16 */
+
+#ifndef BRG_UI32  /* picks unsigned int or unsigned long, whichever is exactly 32 bits */
+#  define BRG_UI32
+#  if UINT_MAX == 4294967295u
+#    define li_32(h) 0x##h##u
+     typedef unsigned int uint_32t;
+#  elif ULONG_MAX == 4294967295u
+#    define li_32(h) 0x##h##ul
+     typedef unsigned long uint_32t;
+#  elif defined( _CRAY )
+#    error This code needs 32-bit data types, which Cray machines do not provide
+#  else
+#    error Please define uint_32t as a 32-bit unsigned integer type in brg_types.h
+#  endif
+#endif /* BRG_UI32 */
+
+#ifndef BRG_UI64  /* optional type: BRG_UI64 is only defined when a 64-bit type is found */
+#  if defined( __BORLANDC__ ) && !defined( __MSDOS__ )
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ull
+     typedef unsigned __int64 uint_64t;
+#  elif defined( _MSC_VER ) && ( _MSC_VER < 1300 )    /* 1300 == VC++ 7.0 */
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ui64
+     typedef unsigned __int64 uint_64t;
+#  elif defined( __sun ) && defined(ULONG_MAX) && ULONG_MAX == 0xfffffffful
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ull
+     typedef unsigned long long uint_64t;
+#  elif defined( UINT_MAX ) && UINT_MAX > 4294967295u
+#    if UINT_MAX == 18446744073709551615u
+#      define BRG_UI64
+#      define li_64(h) 0x##h##u
+       typedef unsigned int uint_64t;
+#    endif
+#  elif defined( ULONG_MAX ) && ULONG_MAX > 4294967295u
+#    if ULONG_MAX == 18446744073709551615ul
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ul
+       typedef unsigned long uint_64t;
+#    endif
+#  elif defined( ULLONG_MAX ) && ULLONG_MAX > 4294967295u
+#    if ULLONG_MAX == 18446744073709551615ull
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ull
+       typedef unsigned long long uint_64t;
+#    endif
+#  elif defined( ULONG_LONG_MAX ) && ULONG_LONG_MAX > 4294967295u
+#    if ULONG_LONG_MAX == 18446744073709551615ull
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ull
+       typedef unsigned long long uint_64t;
+#    endif
+#  endif
+#endif /* BRG_UI64 */
+/* A missing 64-bit type is an error only for code that defines NEED_UINT_64T. */
+#if defined( NEED_UINT_64T ) && !defined( BRG_UI64 )
+#  error Please define uint_64t as an unsigned 64 bit type in brg_types.h
+#endif
+
+#ifndef RETURN_VALUES  /* VOID_RETURN / INT_RETURN: return type plus any export or calling-convention decoration */
+#  define RETURN_VALUES
+#  if defined( DLL_EXPORT )
+#    if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
+#      define VOID_RETURN    __declspec( dllexport ) void __stdcall
+#      define INT_RETURN     __declspec( dllexport ) int  __stdcall
+#    elif defined( __GNUC__ )
+#      define VOID_RETURN    __declspec( __dllexport__ ) void
+#      define INT_RETURN     __declspec( __dllexport__ ) int
+#    else
+#      error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+#    endif
+#  elif defined( DLL_IMPORT )
+#    if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
+#      define VOID_RETURN    __declspec( dllimport ) void __stdcall
+#      define INT_RETURN     __declspec( dllimport ) int  __stdcall
+#    elif defined( __GNUC__ )
+#      define VOID_RETURN    __declspec( __dllimport__ ) void
+#      define INT_RETURN     __declspec( __dllimport__ ) int
+#    else
+#      error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+#    endif
+#  elif defined( __WATCOMC__ )
+#    define VOID_RETURN  void __cdecl
+#    define INT_RETURN   int  __cdecl
+#  else                  /* neither DLL_EXPORT nor DLL_IMPORT, not Watcom: plain types */
+#    define VOID_RETURN  void
+#    define INT_RETURN   int
+#  endif
+#endif /* RETURN_VALUES */
+
+/*  These defines are used to declare buffers in a way that allows
+    faster operations on longer variables to be used.  In all these
+    defines 'size' must be a literal power of 2 and >= 8, because it
+    is pasted into the type name uint_<size>t
+    dec_unit_type(size,x)       declares a type 'x' of length
+                                'size' bits
+
+    dec_bufr_type(size,bsize,x) declares a buffer type 'x' of length
+                                'bsize' bytes defined as an array of
+                                elements each of 'size' bits (bsize
+                                must be a multiple of size / 8)
+
+    ptr_cast(x,size)            casts a pointer to a pointer to a
+                                variable of length 'size' bits
+*/
+
+#define ui_type(size)               uint_##size##t
+#define dec_unit_type(size,x)       typedef ui_type(size) x
+#define dec_bufr_type(size,bsize,x) typedef ui_type(size) x[(bsize) / ((size) >> 3)]
+#define ptr_cast(x,size)            ((ui_type(size)*)(x))
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/ompss/uts/uts.c b/ompss/uts/uts.c
new file mode 100644
index 0000000..a380315
--- /dev/null
+++ b/ompss/uts/uts.c
@@ -0,0 +1,279 @@
+/**********************************************************************************************/
+/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
+/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
+/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
+/**********************************************************************************************/
+/*
+ * Copyright (c) 2007 The Unbalanced Tree Search (UTS) Project Team:
+ * -----------------------------------------------------------------
+ *  
+ *  This file is part of the unbalanced tree search benchmark.  This
+ *  project is licensed under the MIT Open Source license.  See the LICENSE
+ *  file for copyright and licensing information.
+ *
+ *  UTS is a collaborative project between researchers at the University of
+ *  Maryland, the University of North Carolina at Chapel Hill, and the Ohio
+ *  State University.
+ *
+ * University of Maryland:
+ *   Chau-Wen Tseng(1)  <tseng at cs.umd.edu>
+ *
+ * University of North Carolina, Chapel Hill:
+ *   Jun Huan         <huan,
+ *   Jinze Liu         liu,
+ *   Stephen Olivier   olivier,
+ *   Jan Prins*        prins at cs.umd.edu>
+ * 
+ * The Ohio State University:
+ *   James Dinan      <dinan,
+ *   Gerald Sabin      sabin,
+ *   P. Sadayappan*    saday at cse.ohio-state.edu>
+ *
+ * Supercomputing Research Center
+ *   D. Pryor
+ *
+ * (1) - indicates project PI
+ *
+ * UTS Recursive Depth-First Search (DFS) version developed by James Dinan
+ *
+ * Adapted for OpenMP 3.0 Task-based version by Stephen Olivier
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <omp.h>
+#include <sys/time.h>
+
+#include "app-desc.h"
+#include "bots.h"
+#include "uts.h"
+
+/***********************************************************
+ *  Global state                                           *
+ ***********************************************************/
+unsigned long long nLeaves = 0;   // NOTE(review): only read (by uts_show_stats) in this file, never updated
+int maxTreeDepth = 0;             // NOTE(review): only read (by uts_show_stats) in this file, never updated
+/***********************************************************
+ * Tree generation strategy is controlled via various      *
+ * parameters set from the command line.  The parameters   *
+ * and their default values are given below.               *
+ * Trees are generated using a Galton-Watson process, in   *
+ * which the branching factor of each node is a random     *
+ * variable.                                               *
+ *                                                         *
+ * The random variable follows a binomial distribution.    *
+ ***********************************************************/
+double b_0   = 4.0; // default branching factor at the root (overwritten by uts_read_file)
+int   rootId = 0;   // default seed for RNG state at root (passed to rng_init in uts_initRoot)
+/***********************************************************
+ *  The branching factor at the root is specified by b_0.
+ *  The branching factor below the root follows an
+ *     identical binomial distribution at all nodes.
+ *  A node has m children with prob q, or no children with
+ *     prob (1-q).  The expected branching factor is q * m.
+ *
+ *  Default parameter values
+ ***********************************************************/
+int    nonLeafBF   = 4;            // m: number of children of a non-leaf node
+double nonLeafProb = 15.0 / 64.0;  // q: probability that a node below the root is a non-leaf
+/***********************************************************
+ * compute granularity - number of rng evaluations per
+ * tree node (uts_read_file clamps it to at least 1)
+ ***********************************************************/
+int computeGranularity = 1;
+/***********************************************************
+ * expected results for execution (read from the input file)
+ ***********************************************************/
+unsigned long long  exp_tree_size = 0;    // compared with bots_number_of_tasks in uts_check_result
+int        exp_tree_depth = 0;            // read by uts_read_file; not checked anywhere in this file
+unsigned long long  exp_num_leaves = 0;   // read by uts_read_file; not checked anywhere in this file
+/***********************************************************
+ *  FUNCTIONS                                              *
+ ***********************************************************/
+
+// Map a non-negative value n onto [0,1) as n / 2^31; a negative n is reported and mapped to 0.0
+double rng_toProb(int n)
+{
+  if (n >= 0) return ((double) n) / 2147483648.0;
+  // negative input: complain on stdout and fall back to probability zero
+  printf("*** toProb: rand n = %d out of range\n",n);
+  return 0.0;
+}
+
+void uts_initRoot(Node * root)
+{
+   // Seed the root's RNG state from rootId, then reset its tree bookkeeping.
+   rng_init(root->state.state, rootId);
+   root->numChildren = -1;      // -1 marks "child count not computed yet"
+   root->height = 0;            // the root sits at height zero
+   bots_message("Root node at %p\n", root);
+}
+
+
+int uts_numChildren_bin(Node * parent)
+{
+  // Draw one value from the node's RNG state and turn it into a probability;
+  // the same binomial rule applies to every node below the root.
+  double draw = rng_toProb(rng_rand(parent->state.state));
+  if (draw < nonLeafProb) return nonLeafBF;   // non-leaf: nonLeafBF children
+  return 0;                                   // leaf
+}
+
+int uts_numChildren(Node *parent)
+{
+  int count;   // raw number of children before any cap is applied
+  int cap;     // upper bound that applies to this node
+
+  if (parent->height == 0) {
+    // Root: the count is b_0 rounded down and the cap is b_0 rounded up,
+    // so MAXNUMCHILDREN does not limit the root.
+    count = (int) floor(b_0);
+    cap   = (int) ceil(b_0);
+    if (count > cap) {
+      bots_debug("*** Number of children of root truncated from %d to %d\n", count, cap);
+      count = cap;
+    }
+    return count;
+  }
+
+  // Below the root: binomial draw, capped at MAXNUMCHILDREN.
+  count = uts_numChildren_bin(parent);
+  cap   = MAXNUMCHILDREN;
+  if (count > cap) {
+    bots_debug("*** Number of children truncated from %d to %d\n", count, MAXNUMCHILDREN);
+    count = cap;
+  }
+  return count;
+}
+
+/***********************************************************
+ * Recursive depth-first implementation                    *
+ ***********************************************************/
+
+unsigned long long parallel_uts ( Node *root )
+{
+   // Entry point of the search: fix the root's child count, then run the
+   // recursive task-based traversal and hand back the node count it returns.
+   unsigned long long total;
+   int rootChildren = uts_numChildren(root);
+
+   root->numChildren = rootChildren;
+   bots_message("Computing Unbalance Tree Search algorithm ");
+   total = parTreeSearch( 0, root, rootChildren );
+   bots_message(" completed!");
+   return total;
+}
+
+unsigned long long parTreeSearch(int depth, Node *parent, int numChildren) 
+{
+  // Counts the nodes of the subtree rooted at 'parent' (the node itself plus
+  // all descendants), spawning one task per child; 'depth' is only passed on.
+  // A leaf has no children: return before declaring the arrays below, because
+  // a variable-length array whose size is not greater than zero is undefined.
+  if (numChildren <= 0) return 1;
+
+  Node n[numChildren], *nodePtr;
+  int i, j;
+  unsigned long long subtreesize = 1, partialCount[numChildren];
+
+  // Recurse on the children
+  for (i = 0; i < numChildren; i++) {
+     nodePtr = &n[i];
+     nodePtr->height = parent->height + 1;
+     // The following line is the work (one or more SHA-1 ops)
+     for (j = 0; j < computeGranularity; j++) {
+        rng_spawn(parent->state.state, nodePtr->state.state, i);
+     }
+     nodePtr->numChildren = uts_numChildren(nodePtr);
+
+     // n[] and partialCount[] belong to this call; the taskwait below keeps it from returning before the child tasks finish
+     #pragma omp task untied firstprivate(i, nodePtr) shared(partialCount)
+        partialCount[i] = parTreeSearch(depth+1, nodePtr, nodePtr->numChildren);
+  }
+  #pragma omp taskwait
+
+  for (i = 0; i < numChildren; i++) subtreesize += partialCount[i];
+  return subtreesize;
+}
+
+void uts_read_file ( char *filename )
+{
+   // Loads the tree parameters and the expected results from 'filename' into
+   // the file-level globals, then echoes the parameters. Exits if the file
+   // cannot be opened or does not hold all eight whitespace-separated values.
+   FILE *fin;
+   int nread;
+
+   if ((fin = fopen(filename, "r")) == NULL) {
+      bots_message("Could not open input file (%s)\n", filename);
+      exit (-1);
+   }
+   nread = fscanf(fin,"%lf %lf %d %d %d %llu %d %llu",
+             &b_0, &nonLeafProb, &nonLeafBF, &rootId, &computeGranularity,
+             &exp_tree_size, &exp_tree_depth, &exp_num_leaves);
+   fclose(fin);
+   if (nread != 8) {   // a short or malformed file would otherwise silently keep defaults
+      bots_message("Could not read parameters from input file (%s)\n", filename);
+      exit (-1);
+   }
+   computeGranularity = max(1,computeGranularity);
+
+   // Printing input data
+   bots_message("\n");
+   bots_message("Root branching factor                = %f\n", b_0);
+   bots_message("Root seed (0 <= 2^31)                = %d\n", rootId);
+   bots_message("Probability of non-leaf node         = %f\n", nonLeafProb);
+   bots_message("Number of children for non-leaf node = %d\n", nonLeafBF);
+   bots_message("E(n)                                 = %f\n", (double) ( nonLeafProb * nonLeafBF ) );
+   bots_message("E(s)                                 = %f\n", (double) ( 1.0 / (1.0 - nonLeafProb * nonLeafBF) ) );
+   bots_message("Compute granularity                  = %d\n", computeGranularity);
+   bots_message("Random number generator              = "); rng_showtype();
+}
+
+void uts_show_stats( void )
+{
+   int nPes = atoi(bots_resources);   // bots_resources converted with atoi
+   int chunkSize = 0;                 // always reported as 0
+   // Prints the run summary. NOTE(review): nLeaves and maxTreeDepth are not updated anywhere in this file.
+   bots_message("\n");
+   bots_message("Tree size                            = %llu\n", (unsigned long long)  bots_number_of_tasks );
+   bots_message("Maximum tree depth                   = %d\n", maxTreeDepth );
+   bots_message("Chunk size                           = %d\n", chunkSize );
+   bots_message("Number of leaves                     = %llu (%.2f%%)\n", nLeaves, nLeaves/(float)bots_number_of_tasks*100.0 ); 
+   bots_message("Number of PE's                       = %.4d threads\n", nPes );
+   bots_message("Wallclock time                       = %.3f sec\n", bots_time_program );
+   bots_message("Overall performance                  = %.0f nodes/sec\n", (bots_number_of_tasks / bots_time_program) );
+   bots_message("Performance per PE                   = %.0f nodes/sec\n", (bots_number_of_tasks / bots_time_program / nPes) );   // NOTE(review): nPes == 0 is not guarded here
+}
+
+int uts_check_result ( void )
+{
+   // Verification compares only the node count held in bots_number_of_tasks
+   // against the expected tree size (exp_tree_size).
+   if ( bots_number_of_tasks == exp_tree_size ) {
+      return BOTS_RESULT_SUCCESSFUL;
+   }
+
+   bots_message("Incorrect tree size result (%llu instead of %llu).\n", bots_number_of_tasks, exp_tree_size);
+   return BOTS_RESULT_UNSUCCESSFUL;
+}
diff --git a/ompss/uts/uts.h b/ompss/uts/uts.h
new file mode 100644
index 0000000..c913b24
--- /dev/null
+++ b/ompss/uts/uts.h
@@ -0,0 +1,81 @@
+/*
+ *         ---- The Unbalanced Tree Search (UTS) Benchmark ----
+ *  
+ *  This file is part of the unbalanced tree search benchmark.  This
+ *  project is licensed under the MIT Open Source license.  See the LICENSE
+ *  file for copyright and licensing information.
+ *
+ *  UTS is a collaborative project between researchers at the University of
+ *  Maryland, the University of North Carolina at Chapel Hill, and the Ohio
+ *  State University.  See AUTHORS file for more information.
+ *
+ *  ** THIS IS A PRE-RELEASE VERSION OF UTS. **
+ */
+
+#ifndef _UTS_H
+#define _UTS_H
+
+#include "brg_sha1.h"
+
+#define UTS_VERSION "2.1"
+
+/***********************************************************
+ *  Tree node descriptor and statistics                    *
+ ***********************************************************/
+
+#define MAXNUMCHILDREN    100  // upper bound on children per node (BIN root is exempt)
+
+/* One tree node: its position in the tree plus the RNG state
+ * associated with it. */
+typedef struct node_t Node;
+
+struct node_t {
+  int height;             // depth of this node in the tree
+  int numChildren;        // child count; -1 while not yet determined
+  struct state_t state;   // RNG state associated with this node
+};
+
+/* Tree type
+ *   Trees are generated using a Galton-Watson process, in 
+ *   which the branching factor of each node is a random 
+ *   variable.
+ *   
+ *   The random variable can follow a binomial distribution
+ *   or a geometric distribution.  Hybrid trees are
+ *   generated with geometric distributions near the
+ *   root and binomial distributions towards the leaves.
+ */
+/* Tree  parameters (defined in uts.c) */
+extern double     b_0;          /* branching factor at the root */
+extern int        rootId;       /* seed for the RNG state at the root */
+extern int        nonLeafBF;    /* number of children of a non-leaf node (m) */
+extern double     nonLeafProb;  /* probability that a node is a non-leaf (q) */
+
+/* Benchmark parameters */
+extern int    computeGranularity;  /* RNG evaluations per tree node */
+extern int    debug;               /* NOTE(review): no definition in uts.c - confirm it is provided elsewhere */
+extern int    verbose;             /* NOTE(review): no definition in uts.c - confirm it is provided elsewhere */
+
+/* Utility Functions (the macros below may evaluate an argument twice) */
+#define max(a,b) ((a) > (b) ? (a) : (b))
+#define min(a,b) ((a) < (b) ? (a) : (b))
+
+unsigned long long parTreeSearch(int depth, Node *parent, int numChildren);  /* node count of the subtree rooted at 'parent' */
+
+int    uts_paramsToStr(char *strBuf, int ind);   /* NOTE(review): no definition in uts.c - confirm it exists elsewhere */
+void   uts_read_file(char *file);                /* loads tree parameters and expected results from 'file' */
+void   uts_print_params(void);                   /* '(void)' makes this a prototype; NOTE(review): no definition in uts.c */
+
+double rng_toProb(int n);                        /* maps a non-negative n to n / 2^31 */
+
+/* Common tree routines */
+void   uts_initRoot(Node * root);
+int    uts_numChildren(Node *parent);
+int    uts_numChildren_bin(Node * parent);
+int    uts_numChildren_geo(Node * parent);       /* NOTE(review): no definition in uts.c - confirm it exists elsewhere */
+int    uts_childType(Node *parent);              /* NOTE(review): no definition in uts.c - confirm it exists elsewhere */
+
+void uts_show_stats( void );
+int uts_check_result ( void );
+
+#endif /* _UTS_H */