diff --git a/common/Makefile.common b/common/Makefile.common
index 1c73f0f..4edd2e3 100644
--- a/common/Makefile.common
+++ b/common/Makefile.common
@@ -37,11 +37,13 @@ TODAY := $(shell date "+%Y/%m/%d;%H:%M")
 
 # Compile commands
 
+OMPSSC_ALL_FLAGS=-c -I$(COMMON_DIR) $(OPT_FLAGS) $(OMPSSC_FLAGS) $(APP_FLAGS)
 OMPC_ALL_FLAGS=-c -I$(COMMON_DIR) $(OPT_FLAGS) $(OMPC_FLAGS) $(APP_FLAGS)
 CC_ALL_FLAGS=-c -I$(COMMON_DIR) $(OPT_FLAGS) $(CC_FLAGS) $(APP_FLAGS)
 
 # Link commands
 
+OMPSSLINK_ALL_FLAGS=$(OPT_FLAGS) $(OMPSSLINK_FLAGS) $(APP_FLAGS)
 OMPLINK_ALL_FLAGS=$(OPT_FLAGS) $(OMPLINK_FLAGS) $(APP_FLAGS)
 CLINK_ALL_FLAGS=$(OPT_FLAGS) $(CC_FLAGS) $(APP_FLAGS)
 
@@ -216,6 +218,145 @@ endif
 endif
 
 
+#ifeq ( $(shell [[ $(VERSION) == ompss && $(OMPSSC) == mcc ]] && echo true ), true )
+# OmpSs build rules: everything up to the matching endif pair is active only
+# when VERSION is ompss and ENABLE_OMPSS is yes.
+ifeq ($(VERSION),ompss)
+ifeq ($(ENABLE_OMPSS),yes)
+
+TARGETS = $(BIN_DIR)/$(PROGRAM).$(LABEL).$(SUB)$(VERSION)
+
+ifdef CUTOFF_VERSIONS
+   TARGETS += $(CUTOFF_VERSIONS:%=$(BIN_DIR)/$(PROGRAM).$(LABEL).$(SUB)$(VERSION)-%)
+
+   MANUAL_PROGRAM_OBJS := $(PROGRAM_OBJS:%.o=%-manual.o)
+   IF_PROGRAM_OBJS := $(PROGRAM_OBJS:%.o=%-if.o)
+   FINAL_PROGRAM_OBJS := $(PROGRAM_OBJS:%.o=%-final.o)
+
+   MANUAL_FLAGS = -DMANUAL_CUTOFF
+   IF_FLAGS = -DIF_CUTOFF
+   FINAL_FLAGS = -DFINAL_CUTOFF $(OMPSSC_FINAL_FLAGS)
+endif
+
+ifdef TIED_VERSIONS
+   TIED_TARGETS := $(TARGETS:%=%-tied)
+   TARGETS += $(TIED_TARGETS)
+
+   TIED_PROGRAM_OBJS := $(PROGRAM_OBJS:%.o=%-tied.o)
+   TIED_MANUAL_PROGRAM_OBJS := $(MANUAL_PROGRAM_OBJS:%.o=%-tied.o)
+   TIED_IF_PROGRAM_OBJS := $(IF_PROGRAM_OBJS:%.o=%-tied.o)
+   TIED_FINAL_PROGRAM_OBJS := $(FINAL_PROGRAM_OBJS:%.o=%-tied.o)
+
+   TIED_FLAGS = -DFORCE_TIED_TASKS
+endif
+
+
+all: $(TARGETS)
+
+# Pattern rule instead of the suffix rule ".c.o": a suffix rule cannot carry
+# prerequisites, so the Makefile dependencies listed here would not be honoured.
+%.o: %.c Makefile $(COMMON_DIR)/Makefile.common
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) -o $@ $<
+
+%-if.o: %.c Makefile $(COMMON_DIR)/Makefile.common
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(IF_FLAGS) -o $@ $<
+
+%-final.o: %.c Makefile $(COMMON_DIR)/Makefile.common
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(FINAL_FLAGS) -o $@ $<
+
+%-manual.o: %.c Makefile $(COMMON_DIR)/Makefile.common
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(MANUAL_FLAGS) -o $@ $<
+
+# Tied variants: the "untied" clause is stripped with sed into a temporary
+# copy of the source, and that copy (not the original) is what gets compiled.
+# For this to work "untied" must be the first clause of the task directive.
+# Ugly... but there is no easy solution because it is a pragma.
+# The temporary is named after the target so parallel builds do not collide,
+# and the compiler's exit status is kept so a failed compile fails the rule.
+
+%-tied.o: %.c Makefile $(COMMON_DIR)/Makefile.common
+	sed -e "s/task \{1,\}untied/task/g" $< > tied-$(@:.o=.c) && \
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(TIED_FLAGS) -o $@ tied-$(@:.o=.c); \
+	status=$$?; rm -f tied-$(@:.o=.c); exit $$status
+
+%-if-tied.o: %.c Makefile $(COMMON_DIR)/Makefile.common
+	sed -e "s/task \{1,\}untied/task/g" $< > tied-$(@:.o=.c) && \
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(IF_FLAGS) $(TIED_FLAGS) -o $@ tied-$(@:.o=.c); \
+	status=$$?; rm -f tied-$(@:.o=.c); exit $$status
+
+%-manual-tied.o: %.c Makefile $(COMMON_DIR)/Makefile.common
+	sed -e "s/task \{1,\}untied/task/g" $< > tied-$(@:.o=.c) && \
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(MANUAL_FLAGS) $(TIED_FLAGS) -o $@ tied-$(@:.o=.c); \
+	status=$$?; rm -f tied-$(@:.o=.c); exit $$status
+
+%-final-tied.o: %.c Makefile $(COMMON_DIR)/Makefile.common
+	sed -e "s/task \{1,\}untied/task/g" $< > tied-$(@:.o=.c) && \
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(FINAL_FLAGS) $(TIED_FLAGS) -o $@ tied-$(@:.o=.c); \
+	status=$$?; rm -f tied-$(@:.o=.c); exit $$status
+
+INFO_FLAGS_OMPSS=-DCDATE="\"$(TODAY)\"" -DCC="\"$(OMPSSC)\"" -DLD="\"$(OMPSSLINK)\"" -DCMESSAGE="\"$(CMESSAGE)\"" \
+	-DLDFLAGS="\"$(OMPSSLINK_ALL_FLAGS) $(LIBS)\""
+
+main.o: $(COMMON_DIR)/bots_main.c app-desc.h Makefile $(COMMON_DIR)/Makefile.common
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) -I. -o $@ $< $(INFO_FLAGS_OMPSS) -DCFLAGS="\"$(OMPSSC_ALL_FLAGS) -I.\""
+
+main-if.o: $(COMMON_DIR)/bots_main.c app-desc.h Makefile $(COMMON_DIR)/Makefile.common
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(IF_FLAGS) -I. -o $@ $< $(INFO_FLAGS_OMPSS) -DCFLAGS="\"$(OMPSSC_ALL_FLAGS) $(IF_FLAGS) -I.\""
+
+main-final.o: $(COMMON_DIR)/bots_main.c app-desc.h Makefile $(COMMON_DIR)/Makefile.common
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(FINAL_FLAGS) -I. -o $@ $< $(INFO_FLAGS_OMPSS) -DCFLAGS="\"$(OMPSSC_ALL_FLAGS) $(FINAL_FLAGS) -I.\""
+
+main-manual.o: $(COMMON_DIR)/bots_main.c app-desc.h Makefile $(COMMON_DIR)/Makefile.common
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(MANUAL_FLAGS) -I. -o $@ $< $(INFO_FLAGS_OMPSS) -DCFLAGS="\"$(OMPSSC_ALL_FLAGS) $(MANUAL_FLAGS) -I.\""
+
+main-tied.o: $(COMMON_DIR)/bots_main.c app-desc.h Makefile $(COMMON_DIR)/Makefile.common
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(TIED_FLAGS) -I. -o $@ $< $(INFO_FLAGS_OMPSS) -DCFLAGS="\"$(OMPSSC_ALL_FLAGS) $(TIED_FLAGS) -I.\""
+
+main-if-tied.o: $(COMMON_DIR)/bots_main.c app-desc.h Makefile $(COMMON_DIR)/Makefile.common
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(TIED_FLAGS) $(IF_FLAGS) -I. -o $@ $< $(INFO_FLAGS_OMPSS) -DCFLAGS="\"$(OMPSSC_ALL_FLAGS) $(TIED_FLAGS) $(IF_FLAGS) -I.\""
+
+main-manual-tied.o: $(COMMON_DIR)/bots_main.c app-desc.h Makefile $(COMMON_DIR)/Makefile.common
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(TIED_FLAGS) $(MANUAL_FLAGS) -I. -o $@ $< $(INFO_FLAGS_OMPSS) -DCFLAGS="\"$(OMPSSC_ALL_FLAGS) $(TIED_FLAGS) $(MANUAL_FLAGS) -I.\""
+
+main-final-tied.o: $(COMMON_DIR)/bots_main.c app-desc.h Makefile $(COMMON_DIR)/Makefile.common
+	$(OMPSSC) $(OMPSSC_ALL_FLAGS) $(TIED_FLAGS) $(FINAL_FLAGS) -I. -o $@ $< $(INFO_FLAGS_OMPSS) -DCFLAGS="\"$(OMPSSC_ALL_FLAGS) $(TIED_FLAGS) $(FINAL_FLAGS) -I.\""
+
+$(BIN_DIR)/$(PROGRAM).$(LABEL).$(SUB)$(VERSION): main.o $(PROGRAM_OBJS) Makefile $(COMMON_DIR)/Makefile.common $(COMMON_OBJS)
+	$(OMPSSLINK) $(OMPSSLINK_ALL_FLAGS) -o $@ main.o $(PROGRAM_OBJS) $(LIBS) $(COMMON_OBJS)
+
+$(BIN_DIR)/$(PROGRAM).$(LABEL).$(SUB)$(VERSION)-manual: main-manual.o $(MANUAL_PROGRAM_OBJS) Makefile $(COMMON_DIR)/Makefile.common $(COMMON_OBJS)
+	$(OMPSSLINK) $(OMPSSLINK_ALL_FLAGS) -o $@ main-manual.o $(MANUAL_PROGRAM_OBJS) $(LIBS) $(COMMON_OBJS)
+
+$(BIN_DIR)/$(PROGRAM).$(LABEL).$(SUB)$(VERSION)-if_clause: main-if.o $(IF_PROGRAM_OBJS) Makefile $(COMMON_DIR)/Makefile.common $(COMMON_OBJS)
+	$(OMPSSLINK) $(OMPSSLINK_ALL_FLAGS) -o $@ main-if.o $(IF_PROGRAM_OBJS) $(LIBS) $(COMMON_OBJS)
+
+$(BIN_DIR)/$(PROGRAM).$(LABEL).$(SUB)$(VERSION)-tied: main-tied.o $(TIED_PROGRAM_OBJS) Makefile $(COMMON_DIR)/Makefile.common $(COMMON_OBJS)
+	$(OMPSSLINK) $(OMPSSLINK_ALL_FLAGS) -o $@ main-tied.o $(TIED_PROGRAM_OBJS) $(LIBS) $(COMMON_OBJS)
+
+$(BIN_DIR)/$(PROGRAM).$(LABEL).$(SUB)$(VERSION)-manual-tied: main-manual-tied.o $(TIED_MANUAL_PROGRAM_OBJS) Makefile $(COMMON_DIR)/Makefile.common $(COMMON_OBJS)
+	$(OMPSSLINK) $(OMPSSLINK_ALL_FLAGS) -o $@ main-manual-tied.o $(TIED_MANUAL_PROGRAM_OBJS) $(LIBS) $(COMMON_OBJS)
+
+$(BIN_DIR)/$(PROGRAM).$(LABEL).$(SUB)$(VERSION)-if_clause-tied: main-if-tied.o $(TIED_IF_PROGRAM_OBJS) Makefile $(COMMON_DIR)/Makefile.common $(COMMON_OBJS)
+	$(OMPSSLINK) $(OMPSSLINK_ALL_FLAGS) -o $@ main-if-tied.o $(TIED_IF_PROGRAM_OBJS) $(LIBS) $(COMMON_OBJS)
+
+ifdef USE_FINAL_CLAUSE
+
+$(BIN_DIR)/$(PROGRAM).$(LABEL).$(SUB)$(VERSION)-final: main-final.o $(FINAL_PROGRAM_OBJS) Makefile $(COMMON_DIR)/Makefile.common $(COMMON_OBJS)
+	$(OMPSSLINK) $(OMPSSLINK_ALL_FLAGS) -o $@ main-final.o $(FINAL_PROGRAM_OBJS) $(LIBS) $(COMMON_OBJS)
+
+$(BIN_DIR)/$(PROGRAM).$(LABEL).$(SUB)$(VERSION)-final-tied: main-final-tied.o $(TIED_FINAL_PROGRAM_OBJS) Makefile $(COMMON_DIR)/Makefile.common $(COMMON_OBJS)
+	$(OMPSSLINK) $(OMPSSLINK_ALL_FLAGS) -o $@ main-final-tied.o $(TIED_FINAL_PROGRAM_OBJS) $(LIBS) $(COMMON_OBJS)
+
+else
+
+# USE_FINAL_CLAUSE is not defined: recipe-less rules keep both targets known to make without building anything.
+$(BIN_DIR)/$(PROGRAM).$(LABEL).$(SUB)$(VERSION)-final:
+$(BIN_DIR)/$(PROGRAM).$(LABEL).$(SUB)$(VERSION)-final-tied:
+
+endif
+
+endif
+endif
 
 clean:
 	rm -fr *.o
diff --git a/common/ompss-app.h b/common/ompss-app.h
new file mode 100644
index 0000000..3d2190b
--- /dev/null
+++ b/common/ompss-app.h
@@ -0,0 +1,31 @@
+/**********************************************************************************************/
+/* This program is part of the Barcelona OpenMP Tasks Suite */
+/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */
+/* Copyright (C) 2009 Universitat Politecnica de Catalunya */
+/* */
+/* This program is free software; you can redistribute it and/or modify */
+/* it under the terms of the GNU General Public License as published by */
+/* the Free Software Foundation; either version 2 of the License, or */
+/* (at your option) any later version. */
+/* */
+/* This program is distributed in the hope that it will be useful, */
+/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
+/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
+/* GNU General Public License for more details.
*/
+/* */
+/* You should have received a copy of the GNU General Public License */
+/* along with this program; if not, write to the Free Software */
+/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */
+/**********************************************************************************************/
+
+#include <omp.h> /* NOTE(review): the header name was lost in the mangled patch; <omp.h> is assumed - confirm */
+
+#define MODEL OMPSS
+
+#ifdef FORCE_TIED_TASKS
+#define BOTS_MODEL_DESC "OmpSs (using tied tasks)"
+#else
+#define BOTS_MODEL_DESC "OmpSs (using tasks)"
+#endif
+
+
diff --git a/configure b/configure
index 2e5445d..31efa43 100755
--- a/configure
+++ b/configure
@@ -13,21 +13,21 @@ show_help ()
 }
 
 while [ "$#" -gt 0 ]; do
-	case $1 in
-		--debug) debug=yes
-		;;
-		--warnings) warnings=yes
-		;;
-		--compiler) shift; COMPILER=$1
-		;;
-		--help)
-			show_help
-			exit
-		;;
-		*) echo "Unknown option $1 (skipping)"
-		;;
-	esac
-	shift
+    case $1 in
+        --debug) debug=yes
+        ;;
+        --warnings) warnings=yes
+        ;;
+        --compiler) shift; COMPILER=$1
+        ;;
+        --help)
+            show_help
+            exit
+        ;;
+        *) echo "Unknown option $1 (skipping)"
+        ;;
+    esac
+    shift
 done
 
 GCC=`gcc -x c -c -fopenmp /dev/null -o /dev/null &>/dev/null && echo "yes"`
@@ -42,49 +42,48 @@ nc=0
 
 if [ -z "$COMPILER" ]; then
   if [ "$GCC" = "yes" ]; then
-	let nc=nc+1
-	COMPILERS[nc]="gcc"
+    let nc=nc+1
+    COMPILERS[nc]="gcc"
   fi
 
   if [ "$MCC" = "yes" ]; then
-	let nc=nc+1
-	COMPILERS[nc]="mcc"
+    let nc=nc+1
+    COMPILERS[nc]="mcc"
   fi
 
   if [ "$ICC" = "yes" ]; then
-	let nc=nc+1
-	COMPILERS[nc]="icc"
+    let nc=nc+1
+    COMPILERS[nc]="icc"
   fi
 
   if [ "$XLC" = "yes" ]; then
-	let nc=nc+1
-	COMPILERS[nc]="xlc"
+    let nc=nc+1
+    COMPILERS[nc]="xlc"
   fi
 
   if [ "$PGI" = "yes" ]; then
-	let nc=nc+1
-	COMPILERS[nc]="pgi"
+    let nc=nc+1
+    COMPILERS[nc]="pgi"
   fi
 
   if [ "$SUN" = "yes" ]; then
-	let nc=nc+1
-	COMPILERS[nc]="sunstudio"
+    let nc=nc+1
+    COMPILERS[nc]="sunstudio"
   fi
 
   if [ "$nc" -gt "0" ]; then
-	echo "The following compilers are recognized: "
-	n=1
-	for comp in ${COMPILERS[*]}; do
-		echo " $n. $comp"
-		let n=n+1
-	done
-	echo -n "Choose one to use:"
-	read
-	
-	COMPILER=${COMPILERS[$REPLY]}
+    echo "The following compilers are recognized: "
+    n=1
+    for comp in ${COMPILERS[*]}; do
+        echo " $n. $comp"
+        let n=n+1
+    done
+    echo -n "Choose one to use:"
+    read
+    COMPILER=${COMPILERS[$REPLY]}
   else
-	echo "No suitable compiler was detected"
-	echo "An empty $OUTPUT will be generated"
+    echo "No suitable compiler was detected"
+    echo "An empty $OUTPUT will be generated"
   fi
 fi
 
@@ -93,125 +92,132 @@ fi
 [ "$warnings" = "yes" ] && WARNINGS=
 
 if [ "$COMPILER" = "gcc" -a "$GCC" = "yes" ]; then
-	CC=gcc
-	CLINK=$CC
-	OMPC="$CC -fopenmp"
-	OMPLINK="$CC -fopenmp"
-	LABEL=gcc
-	[ "$debug" = "yes" ] && DEBUG=$DEBUG -g
-	[ "$warnings" = "yes" ] && WARNINGS="$WARNINGS -Wall -Werror"
-
-	OPT_FLAGS=-O3
-	CC_FLAGS="$DEBUG $WARNINGS"
-	OMPC_FLAGS="$DEBUG $WARNINGS"
-	CLINK_FLAGS=$DEBUG
-	OMPLINK_FLAGS=$DEBUG
+    CC=gcc
+    CLINK=$CC
+    OMPC="$CC -fopenmp"
+    OMPLINK="$CC -fopenmp"
+    LABEL=gcc
+    [ "$debug" = "yes" ] && DEBUG="$DEBUG -g"
+    [ "$warnings" = "yes" ] && WARNINGS="$WARNINGS -Wall -Werror"
+
+    OPT_FLAGS=-O3
+    CC_FLAGS="$DEBUG $WARNINGS"
+    OMPC_FLAGS="$DEBUG $WARNINGS"
+    CLINK_FLAGS=$DEBUG
+    OMPLINK_FLAGS=$DEBUG
 fi
 
 if [ "$COMPILER" = "icc" -a "$ICC" = "yes" ]; then
-	CC=icc
-	CLINK=$CC
-	OMPC="$CC -openmp"
-	OMPLINK="$CC -openmp"
-	LABEL=icc
-	[ "$debug" = "yes" ] && DEBUG=$DEBUG -g
-	[ "$warnings" = "yes" ] && WARNINGS="$WARNINGS -Wall -Werror"
-
-	OPT_FLAGS=-O2
-	CC_FLAGS="$DEBUG $WARNINGS"
-	OMPC_FLAGS="$DEBUG $WARNINGS"
-	CLINK_FLAGS=$DEBUG
-	OMPLINK_FLAGS=$DEBUG
+    CC=icc
+    CLINK=$CC
+    OMPC="$CC -openmp"
+    OMPLINK="$CC -openmp"
+    LABEL=icc
+    [ "$debug" = "yes" ] && DEBUG="$DEBUG -g"
+    [ "$warnings" = "yes" ] && WARNINGS="$WARNINGS -Wall -Werror"
+
+    OPT_FLAGS=-O2
+    CC_FLAGS="$DEBUG $WARNINGS"
+    OMPC_FLAGS="$DEBUG $WARNINGS"
+    CLINK_FLAGS=$DEBUG
+    OMPLINK_FLAGS=$DEBUG
 fi
 
 if [ "$COMPILER" = "mcc" -a "$MCC" = "yes" ]; then
-	case $(uname -i) in
-		x86_64) extras="-m32"
-		;;
-	esac
-
-	CC="mcc $extras"
-	CLINK=$CC
-	OMPC=$CC
-	OMPLINK=$CC
-	LABEL=mcc
-	[ "$debug" = "yes" ] && DEBUG=$DEBUG -g
-	[ "$warnings" = "yes" ] && WARNINGS="$WARNINGS -Wall -Werror"
-
-	OPT_FLAGS=-O3
-	CC_FLAGS="$DEBUG $WARNINGS --no-openmp"
-	OMPC_FLAGS="$DEBUG $WARNINGS"
-	CLINK_FLAGS="$DEBUG --no-openmp"
-	OMPLINK_FLAGS=$DEBUG
-	OMPC_FINAL_FLAGS="--serialize"
-
-	supports_final_clause=yes
+    case $(uname -i) in
+        x86_64) extras="-m32"
+        ;;
+    esac
+
+    CC="mcc $extras"
+    CLINK=$CC
+    OMPC=$CC
+    OMPLINK=$CC
+    LABEL=mcc
+    [ "$debug" = "yes" ] && DEBUG="$DEBUG -g"
+    [ "$warnings" = "yes" ] && WARNINGS="$WARNINGS -Wall -Werror"
+
+    OPT_FLAGS=-O3
+    CC_FLAGS="$DEBUG $WARNINGS --no-openmp"
+    OMPC_FLAGS="$DEBUG $WARNINGS"
+    CLINK_FLAGS="$DEBUG --no-openmp"
+    OMPLINK_FLAGS=$DEBUG
+    OMPC_FINAL_FLAGS="--serialize"
+
+    ENABLE_OMPSS=yes
+
+    OMPSSC=mcc
+    OMPSSLINK=mcc
+    OMPSSC_FLAGS=--ompss
+    OMPSSLINK_FLAGS=--ompss
+
+    supports_final_clause=yes
 fi
 
 if [ "$COMPILER" = "xlc" -a "$XLC" = "yes" ]; then
-	CC=xlc_r
-	CLINK=$CC
-	OMPC="$CC -qsmp=omp"
-	OMPLINK=$OMPC
-	LABEL=xlc
-	[ "$debug" = "yes" ] && DEBUG=$DEBUG -g
-	[ "$warnings" = "yes" ] && WARNINGS="$WARNINGS -qflag=w:w -qhalt=w"
-
-	OPT_FLAGS=-O3
-	CC_FLAGS="$DEBUG $WARNINGS"
-	OMPC_FLAGS="-qthreaded $DEBUG $WARNINGS"
-	CLINK_FLAGS=$DEBUG
-	OMPLINK_FLAGS="-qthreaded $DEBUG"
+    CC=xlc_r
+    CLINK=$CC
+    OMPC="$CC -qsmp=omp"
+    OMPLINK=$OMPC
+    LABEL=xlc
+    [ "$debug" = "yes" ] && DEBUG="$DEBUG -g"
+    [ "$warnings" = "yes" ] && WARNINGS="$WARNINGS -qflag=w:w -qhalt=w"
+
+    OPT_FLAGS=-O3
+    CC_FLAGS="$DEBUG $WARNINGS"
+    OMPC_FLAGS="-qthreaded $DEBUG $WARNINGS"
+    CLINK_FLAGS=$DEBUG
+    OMPLINK_FLAGS="-qthreaded $DEBUG"
 fi
 
 if [ "$COMPILER" = "pgi" -a "$PGI" = "yes" ]; then
-	CC=pgcc
-	CLINK=$CC
-	OMPC="$CC -mp -Minfo=mp"
-	OMPLINK=$OMPC
-	LABEL=pgi
-	[ "$debug" = "yes" ] && DEBUG="$DEBUG -g"
-	if [ "$warnings" = "yes" ]; then
-		echo "The pgi compiler doesn't support '--warnings' option (skipping it)"
-		echo "Press (Enter) to continue..."
-		read
-	fi
-
-	OPT_FLAGS=-fast
-	CC_FLAGS="$DEBUG $WARNINGS"
-	OMPC_FLAGS="$DEBUG $WARNINGS"
-	CLINK_FLAGS=$DEBUG
-	OMPLINK_FLAGS=$DEBUG
+    CC=pgcc
+    CLINK=$CC
+    OMPC="$CC -mp -Minfo=mp"
+    OMPLINK=$OMPC
+    LABEL=pgi
+    [ "$debug" = "yes" ] && DEBUG="$DEBUG -g"
+    if [ "$warnings" = "yes" ]; then
+        echo "The pgi compiler doesn't support '--warnings' option (skipping it)"
+        echo "Press (Enter) to continue..."
+        read
+    fi
+
+    OPT_FLAGS=-fast
+    CC_FLAGS="$DEBUG $WARNINGS"
+    OMPC_FLAGS="$DEBUG $WARNINGS"
+    CLINK_FLAGS=$DEBUG
+    OMPLINK_FLAGS=$DEBUG
 fi
 
 if [ "$COMPILER" = "sunstudio" -a "$SUN" = "yes" ]; then
-	CC=cc
-	CLINK=$CC
-	OMPC="$CC -xopenmp"
-	OMPLINK=$OMPC
-	LABEL=suns
-	[ "$debug" = "yes" ] && DEBUG=$DEBUG -g
-	if [ "$warnings" = "yes" ]; then
-		echo "The sunstudio compiler doesn't support '--warnings' option (skipping it)"
-		echo "Press (Enter) to continue..."
-		read
-	fi
-
-	OPT_FLAGS=-fast
-	CC_FLAGS="$DEBUG $WARNINGS"
-	OMPC_FLAGS="$DEBUG $WARNINGS"
-	CLINK_FLAGS=$DEBUG
-	OMPLINK_FLAGS=$DEBUG
+    CC=cc
+    CLINK=$CC
+    OMPC="$CC -xopenmp"
+    OMPLINK=$OMPC
+    LABEL=suns
+    [ "$debug" = "yes" ] && DEBUG="$DEBUG -g"
+    if [ "$warnings" = "yes" ]; then
+        echo "The sunstudio compiler doesn't support '--warnings' option (skipping it)"
+        echo "Press (Enter) to continue..."
+        read
+    fi
+
+    OPT_FLAGS=-fast
+    CC_FLAGS="$DEBUG $WARNINGS"
+    OMPC_FLAGS="$DEBUG $WARNINGS"
+    CLINK_FLAGS=$DEBUG
+    OMPLINK_FLAGS=$DEBUG
 fi
 
 
 if [ -z "$CC" ]; then
-	echo "Wrong compiler configuration"
-	exit 1
+    echo "Wrong compiler configuration"
+    exit 1
 fi
 
 if [ "$debug" = "yes" ]; then
-	LABEL="$LABEL-debug"
+    LABEL="$LABEL-debug"
 fi
 
 [ -f $OUTPUT ] && replacing_config=true
@@ -223,9 +229,13 @@ cat > $OUTPUT << EOF
 #config name
 LABEL=$LABEL
 
+ENABLE_OMPSS=$ENABLE_OMPSS
+
 #compilers
+OMPSSC=$OMPSSC
 OMPC=$OMPC
 CC=$CC
+OMPSSLINK=$OMPSSLINK
 OMPLINK=$OMPLINK
 CLINK=$CLINK
 
@@ -235,10 +245,13 @@ OPT_FLAGS=$OPT_FLAGS
 
 CC_FLAGS=$CC_FLAGS
 OMPC_FLAGS=$OMPC_FLAGS
+OMPSSC_FLAGS=$OMPSSC_FLAGS
 OMPC_FINAL_FLAGS=$OMPC_FINAL_FLAGS
+OMPSSC_FINAL_FLAGS=$OMPSSC_FINAL_FLAGS
 
 CLINK_FLAGS=$CLINK_FLAGS
 OMPLINK_FLAGS=$OMPLINK_FLAGS
+OMPSSLINK_FLAGS=$OMPSSLINK_FLAGS
 
 EOF
 
@@ -248,18 +261,18 @@ EOF
 echo "make.config generated"
 
 if [ "$replacing_config" ]; then
-	echo "Configuration was changed. Cleaning up"
-	make clean
+    echo "Configuration was changed. Cleaning up"
+    make clean
 fi
 
 [ -d bin ] || mkdir -p bin
 
 if make -v | grep GNU &> /dev/null; then
-	echo "Run make to compile the benchmarks"
+    echo "Run make to compile the benchmarks"
 elif gmake -v | grep GNU &> /dev/null; then
-	echo "Run gmake to compile the benchmarks"
+    echo "Run gmake to compile the benchmarks"
 else
-	echo "I didn't find a GNU-compatible make. You'll need it to compile the benchmarks"
+    echo "I didn't find a GNU-compatible make. You'll need it to compile the benchmarks"
 fi
 
 echo "You can further refine your configuration in config/make.config"
diff --git a/ompss/Makefile b/ompss/Makefile
new file mode 100644
index 0000000..cffe95c
--- /dev/null
+++ b/ompss/Makefile
@@ -0,0 +1,40 @@
+##############################################################################################
+# This program is part of the Barcelona OpenMP Tasks Suite #
+# Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion #
+# Copyright (C) 2009 Universitat Politecnica de Catalunya #
+# #
+# This program is free software; you can redistribute it and/or modify #
+# it under the terms of the GNU General Public License as published by #
+# the Free Software Foundation; either version 2 of the License, or #
+# (at your option) any later version. #
+# #
+# This program is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
+# GNU General Public License for more details.
#
+# #
+# You should have received a copy of the GNU General Public License #
+# along with this program; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+##############################################################################################
+
+#DIRS=fib alignment nqueens sort strassen sparselu fft floorplan health uts
+DIRS=fib
+
+RECURSIVE=all-recursive clean-recursive dist-clean-recursive
+.PHONY: all clean dist-clean dist $(RECURSIVE)
+all: all-recursive
+clean: clean-recursive
+dist-clean: dist-clean-recursive
+
+$(RECURSIVE):
+	@failcom='exit 1';\
+	target=`echo $@ | sed s/-recursive//`; \
+	for subdir in $(DIRS); do \
+		(cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$target) \
+		|| eval $$failcom; \
+	done;
+
+dist: dist-clean
+	echo "TODO"
+
diff --git a/ompss/Makefile.version b/ompss/Makefile.version
new file mode 100644
index 0000000..c06a272
--- /dev/null
+++ b/ompss/Makefile.version
@@ -0,0 +1,21 @@
+##############################################################################################
+# This program is part of the Barcelona OpenMP Tasks Suite #
+# Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion #
+# Copyright (C) 2009 Universitat Politecnica de Catalunya #
+# #
+# This program is free software; you can redistribute it and/or modify #
+# it under the terms of the GNU General Public License as published by #
+# the Free Software Foundation; either version 2 of the License, or #
+# (at your option) any later version. #
+# #
+# This program is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
+# GNU General Public License for more details. #
+# #
+# You should have received a copy of the GNU General Public License #
+# along with this program; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+##############################################################################################
+
+VERSION=ompss
diff --git a/ompss/fft/Makefile b/ompss/fft/Makefile
new file mode 100644
index 0000000..c19750a
--- /dev/null
+++ b/ompss/fft/Makefile
@@ -0,0 +1,35 @@
+##############################################################################################
+# This program is part of the Barcelona OpenMP Tasks Suite #
+# Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion #
+# Copyright (C) 2009 Universitat Politecnica de Catalunya #
+# #
+# This program is free software; you can redistribute it and/or modify #
+# it under the terms of the GNU General Public License as published by #
+# the Free Software Foundation; either version 2 of the License, or #
+# (at your option) any later version. #
+# #
+# This program is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
+# GNU General Public License for more details. #
+# #
+# You should have received a copy of the GNU General Public License #
+# along with this program; if not, write to the Free Software #
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #
+##############################################################################################
+
+LIBS = -lm
+#PROGRAM_OBJS=
+
+TIED_VERSIONS = yes
+
+BASE_DIR = ../../
+
+#
+# Don't change below here
+#
+
+include ../Makefile.version
+include $(BASE_DIR)/common/Makefile.common
+
+
diff --git a/ompss/fft/app-desc.h b/ompss/fft/app-desc.h
new file mode 100644
index 0000000..071c202
--- /dev/null
+++ b/ompss/fft/app-desc.h
@@ -0,0 +1,56 @@
+/**********************************************************************************************/
+/* This program is part of the Barcelona OpenMP Tasks Suite */
+/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */
+/* Copyright (C) 2009 Universitat Politecnica de Catalunya */
+/* */
+/* This program is free software; you can redistribute it and/or modify */
+/* it under the terms of the GNU General Public License as published by */
+/* the Free Software Foundation; either version 2 of the License, or */
+/* (at your option) any later version. */
+/* */
+/* This program is distributed in the hope that it will be useful, */
+/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
+/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
+/* GNU General Public License for more details. */
+/* */
+/* You should have received a copy of the GNU General Public License */
+/* along with this program; if not, write to the Free Software */
+/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */
+/**********************************************************************************************/
+
+#include "ompss-app.h"
+#include "fft.h"
+
+#define BOTS_APP_NAME "FFT"
+#define BOTS_APP_PARAMETERS_DESC "Size=%d"
+#define BOTS_APP_PARAMETERS_LIST ,bots_arg_size
+
+#define BOTS_APP_USES_ARG_SIZE
+#define BOTS_APP_DEF_ARG_SIZE (32*1024*1024) /* parenthesized so the product survives expansion inside a larger expression */
+#define BOTS_APP_DESC_ARG_SIZE "Matrix Size"
+
+#define BOTS_APP_INIT int i;\
+     COMPLEX *in, *out1=NULL, *out2=NULL;\
+     in = malloc(bots_arg_size * sizeof(COMPLEX));\
+
+#define KERNEL_INIT\
+     out1 = malloc(bots_arg_size * sizeof(COMPLEX));\
+     for (i = 0; i < bots_arg_size; ++i) {\
+          c_re(in[i]) = 1.0;\
+          c_im(in[i]) = 1.0;\
+     }
+#define KERNEL_CALL fft(bots_arg_size, in, out1);
+#define KERNEL_FINI
+
+#define KERNEL_SEQ_INIT\
+     out2 = malloc(bots_arg_size * sizeof(COMPLEX));\
+     for (i = 0; i < bots_arg_size; ++i) {\
+          c_re(in[i]) = 1.0;\
+          c_im(in[i]) = 1.0;\
+     }
+#define KERNEL_SEQ_CALL fft_seq(bots_arg_size, in, out2);
+#define KERNEL_SEQ_FINI
+
+#define BOTS_APP_CHECK_USES_SEQ_RESULT
+#define KERNEL_CHECK test_correctness(bots_arg_size, out1, out2)
+
diff --git a/ompss/fft/fft.c b/ompss/fft/fft.c
new file mode 100644
index 0000000..341d5ef
--- /dev/null
+++ b/ompss/fft/fft.c
@@ -0,0 +1,4854 @@
+/**********************************************************************************************/
+/* This program is part of the Barcelona OpenMP Tasks Suite */
+/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */
+/* Copyright (C) 2009 Universitat Politecnica de Catalunya */
+/* */
+/* This program is free software; you can redistribute it and/or modify */
+/* it under the terms of the GNU General Public License as published by */
+/* the Free Software
Foundation; either version 2 of the License, or */
+/* (at your option) any later version. */
+/* */
+/* This program is distributed in the hope that it will be useful, */
+/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
+/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
+/* GNU General Public License for more details. */
+/* */
+/* You should have received a copy of the GNU General Public License */
+/* along with this program; if not, write to the Free Software */
+/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */
+/**********************************************************************************************/
+
+/*
+ * Original code from the Cilk project
+ *
+ * Copyright (c) 2000 Massachusetts Institute of Technology
+ * Copyright (c) 2000 Matteo Frigo
+ */
+
+#include <stdio.h> /* NOTE(review): the four system header names were lost in the mangled patch; these are assumed - confirm */
+#include <math.h> /* cos() and sin() are used below */
+#include <stdlib.h>
+#include <string.h>
+#include "bots.h"
+#include "app-desc.h"
+
+/* Definitions and operations for complex numbers */
+
+/*
+ * compute the W coefficients (that is, powers of the root of 1)
+ * and store them into an array.
+ */
+void compute_w_coefficients(int n, int a, int b, COMPLEX * W)
+{
+     register double twoPiOverN;
+     register int k;
+     register REAL s, c;
+
+     if (b - a < 128) {
+          twoPiOverN = 2.0 * 3.1415926535897932384626434 / n;
+          for (k = a; k <= b; ++k) {
+               c = cos(twoPiOverN * k);
+               c_re(W[k]) = c_re(W[n - k]) = c;
+               s = sin(twoPiOverN * k);
+               c_im(W[k]) = -s;
+               c_im(W[n - k]) = s;
+          }
+     } else {
+          int ab = (a + b) / 2; /* split [a, b] in two halves, one task each */
+          #pragma omp task untied
+          compute_w_coefficients(n, a, ab, W);
+          #pragma omp task untied
+          compute_w_coefficients(n, ab + 1, b, W);
+          #pragma omp taskwait
+     }
+}
+void compute_w_coefficients_seq(int n, int a, int b, COMPLEX * W) /* same recursion as compute_w_coefficients, without task pragmas */
+{
+     register double twoPiOverN;
+     register int k;
+     register REAL s, c;
+
+     if (b - a < 128) {
+          twoPiOverN = 2.0 * 3.1415926535897932384626434 / n;
+          for (k = a; k <= b; ++k) {
+               c = cos(twoPiOverN * k);
+               c_re(W[k]) = c_re(W[n - k]) = c;
+               s = sin(twoPiOverN * k);
+               c_im(W[k]) = -s;
+               c_im(W[n - k]) = s;
+          }
+     } else {
+          int ab = (a + b) / 2;
+          compute_w_coefficients_seq(n, a, ab, W);
+          compute_w_coefficients_seq(n, ab + 1, b, W);
+     }
+}
+/*
+ * Determine (in a stupid way) if n is divisible by eight, then by four, else
+ * find the smallest prime factor of n.
+ */
+int factor(int n)
+{
+     int r;
+
+     if (n < 2) return 1;
+     if (n == 64 || n == 128 || n == 256 || n == 1024 || n == 2048 || n == 4096) return 8;
+     if ((n & 15) == 0) return 16;
+     if ((n & 7) == 0) return 8;
+     if ((n & 3) == 0) return 4;
+     if ((n & 1) == 0) return 2;
+
+     /* try odd numbers up to n (computing the sqrt may be slower) */
+     for (r = 3; r < n; r += 2) if (n % r == 0) return r;
+
+     /* n is prime */
+     return n;
+}
+
+void unshuffle(int a, int b, COMPLEX * in, COMPLEX * out, int r, int m) /* for i in [a, b) and j in [0, r): out[i + j * m] = in[i * r + j] */
+{
+     int i, j;
+     int r4 = r & (~0x3); /* r rounded down to a multiple of 4, for the unrolled loop */
+     const COMPLEX *ip;
+     COMPLEX *jp;
+
+     if (b - a < 16) {
+          ip = in + a * r;
+          for (i = a; i < b; ++i) {
+               jp = out + i;
+               for (j = 0; j < r4; j += 4) {
+                    jp[0] = ip[0];
+                    jp[m] = ip[1];
+                    jp[2 * m] = ip[2];
+                    jp[3 * m] = ip[3];
+                    jp += 4 * m;
+                    ip += 4;
+               }
+               for (; j < r; ++j) {
+                    *jp = *ip;
+                    ip++;
+                    jp += m;
+               }
+          }
+     } else {
+          int ab = (a + b) / 2;
+          #pragma omp task untied
+          unshuffle(a, ab, in, out, r, m);
+          #pragma omp task untied
+          unshuffle(ab, b, in, out, r, m);
+          #pragma omp taskwait
+     }
+}
+void unshuffle_seq(int a, int b, COMPLEX * in, COMPLEX * out, int r, int m) /* same recursion as unshuffle, without task pragmas */
+{
+     int i, j;
+     int r4 = r & (~0x3);
+     const COMPLEX *ip;
+     COMPLEX *jp;
+
+     if (b - a < 16) {
+          ip = in + a * r;
+          for (i = a; i < b; ++i) {
+               jp = out + i;
+               for (j = 0; j < r4; j += 4) {
+                    jp[0] = ip[0];
+                    jp[m] = ip[1];
+                    jp[2 * m] = ip[2];
+                    jp[3 * m] = ip[3];
+                    jp += 4 * m;
+                    ip += 4;
+               }
+               for (; j < r; ++j) {
+                    *jp = *ip;
+                    ip++;
+                    jp += m;
+               }
+          }
+     } else {
+          int ab = (a + b) / 2;
+          unshuffle_seq(a, ab, in, out, r, m);
+          unshuffle_seq(ab, b, in, out, r, m);
+     }
+}
+void fft_twiddle_gen1(COMPLEX * in, COMPLEX * out,
+                      COMPLEX * W, int r, int m,
+                      int nW, int nWdnti, int nWdntm)
+{
+     int j, k;
+     COMPLEX *jp, *kp;
+
+     for (k = 0, kp = out; k < r; ++k, kp += m) {
+          REAL r0, i0, rt, it, rw, iw;
+          int l1 = nWdnti + nWdntm * k;
+          int l0;
+
+          r0 = i0 = 0.0;
+          for (j = 0, jp = in, l0 = 0; j < r; ++j, jp += m) {
+               rw = c_re(W[l0]);
+               iw = c_im(W[l0]);
+               rt = c_re(*jp);
+               it = c_im(*jp);
+               r0 += rt * rw - it * iw; /* real part of (*jp) * W[l0] */
+               i0 += rt * iw + it * rw; /* imaginary part of (*jp) * W[l0] */
+               l0 += l1;
+               if (l0 > nW)
+                    l0 -= nW;
+          }
+          c_re(*kp) = r0;
+          c_im(*kp) = i0;
+     }
+}
+
+void fft_twiddle_gen(int i, int i1, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int r, int m)
+{
+     if (i == i1 - 1) {
+          #pragma omp task untied
+          fft_twiddle_gen1(in + i, out + i, W,
+                           r, m, nW, nWdn * i, nWdn * m);
+     } else {
+          int i2 = (i + i1) / 2;
+          #pragma omp task untied
+          fft_twiddle_gen(i, i2, in, out, W, nW,
+                          nWdn, r, m);
+          #pragma omp task untied
+          fft_twiddle_gen(i2, i1, in, out, W, nW,
+                          nWdn, r, m);
+     }
+     #pragma omp taskwait
+}
+void fft_twiddle_gen_seq(int i, int i1, COMPLEX * in, COMPLEX * out, COMPLEX * W,
+                         int nW, int nWdn, int r, int m)
+{
+     if (i == i1 - 1) {
+          fft_twiddle_gen1(in + i, out + i, W,
+                           r, m, nW, nWdn * i, nWdn * m);
+     } else {
+          int i2 = (i + i1) / 2;
+          fft_twiddle_gen_seq(i, i2, in, out, W, nW,
+                              nWdn, r, m);
+          fft_twiddle_gen_seq(i2, i1, in, out, W, nW,
+                              nWdn, r, m);
+     }
+}
+/* machine-generated code begins here */
+void fft_base_2(COMPLEX * in, COMPLEX * out)
+{
+     REAL r1_0, i1_0;
+     REAL r1_1, i1_1;
+     r1_0 = c_re(in[0]);
+     i1_0 = c_im(in[0]);
+     r1_1 = c_re(in[1]);
+     i1_1 = c_im(in[1]);
+     c_re(out[0]) = (r1_0 + r1_1);
+     c_im(out[0]) = (i1_0 + i1_1);
+     c_re(out[1]) = (r1_0 - r1_1);
+     c_im(out[1]) = (i1_0 - i1_1);
+}
+void fft_twiddle_2(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m)
+{
+     int l1, i;
+     COMPLEX *jp, *kp;
+     REAL tmpr, tmpi, wr, wi;
+     if ((b - a) < 128) {
+          for (i = a, l1 = nWdn * i, kp = out + i; i < b;
+               i++, l1 += nWdn, kp++) {
+               jp = in + i;
+               {
+                    REAL r1_0, i1_0;
+                    REAL r1_1, i1_1;
+                    r1_0 = c_re(jp[0 * m]);
+                    i1_0 = c_im(jp[0 * m]);
+                    wr = c_re(W[1 * l1]);
+                    wi = c_im(W[1 * l1]);
+                    tmpr = c_re(jp[1 * m]);
+                    tmpi = c_im(jp[1 * m]);
+                    r1_1 = ((wr * tmpr) - (wi * tmpi));
+                    i1_1 = ((wi * tmpr) + (wr * tmpi));
+                    c_re(kp[0 * m]) = (r1_0 + r1_1);
+                    c_im(kp[0 * m]) = (i1_0 + i1_1);
+                    c_re(kp[1 * m]) = (r1_0 - r1_1);
+                    c_im(kp[1 * m]) = (i1_0 - i1_1);
+               }
+          }
+     } else {
+          int ab = (a + b) / 2;
+          #pragma omp task untied
+          fft_twiddle_2(a, ab, in, out, W, nW, nWdn, m);
+          #pragma omp task untied
+          fft_twiddle_2(ab, b, in, out, W, nW, nWdn, m);
+          #pragma omp taskwait
+     }
+}
+void fft_twiddle_2_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m)
+{
+     int l1, i;
+     COMPLEX *jp, *kp;
+     REAL tmpr, tmpi, wr, wi;
+     if ((b - a) < 128) {
+          for (i = a, l1 = nWdn * i, kp = out + i; i < b;
+               i++, l1 += nWdn, kp++) {
+               jp = in + i;
+               {
+                    REAL r1_0, i1_0;
+                    REAL r1_1, i1_1;
+                    r1_0 = c_re(jp[0 * m]);
+                    i1_0 = c_im(jp[0 * m]);
+                    wr = c_re(W[1 * l1]);
+                    wi = c_im(W[1 * l1]);
+                    tmpr = c_re(jp[1 * m]);
+                    tmpi = c_im(jp[1 * m]);
+                    r1_1 = ((wr * tmpr) - (wi * tmpi));
+                    i1_1 = ((wi * tmpr) + (wr * tmpi));
+                    c_re(kp[0 * m]) = (r1_0 + r1_1);
+                    c_im(kp[0 * m]) = (i1_0 + i1_1);
+                    c_re(kp[1 * m]) = (r1_0 - r1_1);
+                    c_im(kp[1 * m]) = (i1_0 - i1_1);
+               }
+          }
+     } else {
+          int ab = (a + b) / 2;
+          fft_twiddle_2_seq(a, ab, in, out, W, nW, nWdn, m);
+          fft_twiddle_2_seq(ab, b, in, out, W, nW, nWdn, m);
+     }
+}
+void fft_unshuffle_2(int a, int b, COMPLEX * in, COMPLEX * out, int m)
+{
+     int i;
+     const COMPLEX *ip;
+     COMPLEX *jp;
+     if ((b - a) < 128) {
+          ip = in + a * 2;
+          for (i = a; i < b; ++i) {
+               jp = out + i;
+               jp[0] = ip[0];
+               jp[m] = ip[1];
+               ip += 2;
+          }
+     } else {
+          int ab = (a + b) / 2;
+          #pragma omp task untied
+          fft_unshuffle_2(a, ab, in, out, m);
+          #pragma omp task untied
+          fft_unshuffle_2(ab, b, in, out, m);
+          #pragma omp taskwait
+     }
+}
+void fft_unshuffle_2_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m)
+{
+     int i;
+     const COMPLEX *ip;
+     COMPLEX *jp;
+     if ((b - a) < 128) {
+          ip = in + a * 2;
+          for (i = a; i < b; ++i) {
+               jp = out + i;
+               jp[0] = ip[0];
+               jp[m] = ip[1];
+               ip += 2;
+          }
+     } else {
+          int ab = (a + b) / 2;
+          fft_unshuffle_2_seq(a, ab, in, out, m);
+          fft_unshuffle_2_seq(ab, b, in, out, m);
+     }
+}
+void fft_base_4(COMPLEX * in, COMPLEX *
out) +{ + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + r2_0 = c_re(in[0]); + i2_0 = c_im(in[0]); + r2_2 = c_re(in[2]); + i2_2 = c_im(in[2]); + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_2 = (r2_0 - r2_2); + i1_2 = (i2_0 - i2_2); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + r2_1 = c_re(in[1]); + i2_1 = c_im(in[1]); + r2_3 = c_re(in[3]); + i2_3 = c_im(in[3]); + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_3 = (r2_1 - r2_3); + i1_3 = (i2_1 - i2_3); + } + c_re(out[0]) = (r1_0 + r1_1); + c_im(out[0]) = (i1_0 + i1_1); + c_re(out[2]) = (r1_0 - r1_1); + c_im(out[2]) = (i1_0 - i1_1); + c_re(out[1]) = (r1_2 + i1_3); + c_im(out[1]) = (i1_2 - r1_3); + c_re(out[3]) = (r1_2 - i1_3); + c_im(out[3]) = (i1_2 + r1_3); +} +void fft_twiddle_4(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + r2_0 = c_re(jp[0 * m]); + i2_0 = c_im(jp[0 * m]); + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r2_2 = ((wr * tmpr) - (wi * tmpi)); + i2_2 = ((wi * tmpr) + (wr * tmpi)); + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_2 = (r2_0 - r2_2); + i1_2 = (i2_0 - i2_2); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r2_1 = ((wr * tmpr) - (wi * tmpi)); + i2_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r2_3 = ((wr * tmpr) - (wi * tmpi)); + i2_3 = ((wi * tmpr) + (wr * tmpi)); + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_3 = (r2_1 - r2_3); + 
i1_3 = (i2_1 - i2_3); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[2 * m]) = (r1_0 - r1_1); + c_im(kp[2 * m]) = (i1_0 - i1_1); + c_re(kp[1 * m]) = (r1_2 + i1_3); + c_im(kp[1 * m]) = (i1_2 - r1_3); + c_re(kp[3 * m]) = (r1_2 - i1_3); + c_im(kp[3 * m]) = (i1_2 + r1_3); + } + } + } else { + int ab = (a + b) / 2; + #pragma omp task untied + fft_twiddle_4(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task untied + fft_twiddle_4(ab, b, in, out, W, nW, nWdn, m); + #pragma omp taskwait + } +} +void fft_twiddle_4_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + r2_0 = c_re(jp[0 * m]); + i2_0 = c_im(jp[0 * m]); + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r2_2 = ((wr * tmpr) - (wi * tmpi)); + i2_2 = ((wi * tmpr) + (wr * tmpi)); + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_2 = (r2_0 - r2_2); + i1_2 = (i2_0 - i2_2); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r2_1 = ((wr * tmpr) - (wi * tmpi)); + i2_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r2_3 = ((wr * tmpr) - (wi * tmpi)); + i2_3 = ((wi * tmpr) + (wr * tmpi)); + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_3 = (r2_1 - r2_3); + i1_3 = (i2_1 - i2_3); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[2 * m]) = (r1_0 - r1_1); + c_im(kp[2 * m]) = (i1_0 - i1_1); + c_re(kp[1 * m]) = (r1_2 + i1_3); + c_im(kp[1 * m]) = (i1_2 - r1_3); + c_re(kp[3 * m]) = (r1_2 - 
i1_3); + c_im(kp[3 * m]) = (i1_2 + r1_3); + } + } + } else { + int ab = (a + b) / 2; + fft_twiddle_4_seq(a, ab, in, out, W, nW, nWdn, m); + fft_twiddle_4_seq(ab, b, in, out, W, nW, nWdn, m); + } +} +void fft_unshuffle_4(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 4; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; + #pragma omp task untied + fft_unshuffle_4(a, ab, in, out, m); + #pragma omp task untied + fft_unshuffle_4(ab, b, in, out, m); + #pragma omp taskwait + } +} +void fft_unshuffle_4_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 4; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; + fft_unshuffle_4_seq(a, ab, in, out, m); + fft_unshuffle_4_seq(ab, b, in, out, m); + } +} +void fft_base_8(COMPLEX * in, COMPLEX * out) +{ + REAL tmpr, tmpi; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + r3_0 = c_re(in[0]); + i3_0 = c_im(in[0]); + r3_4 = c_re(in[4]); + i3_4 = c_im(in[4]); + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_4 = (r3_0 - r3_4); + i2_4 = (i3_0 - i3_4); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + r3_2 = c_re(in[2]); + i3_2 = c_im(in[2]); + r3_6 = c_re(in[6]); + i3_6 = c_im(in[6]); + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_6 = (r3_2 - r3_6); + i2_6 = (i3_2 - i3_6); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_4 = (r2_0 - r2_2); + i1_4 = 
(i2_0 - i2_2); + r1_2 = (r2_4 + i2_6); + i1_2 = (i2_4 - r2_6); + r1_6 = (r2_4 - i2_6); + i1_6 = (i2_4 + r2_6); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + r3_1 = c_re(in[1]); + i3_1 = c_im(in[1]); + r3_5 = c_re(in[5]); + i3_5 = c_im(in[5]); + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_5 = (r3_1 - r3_5); + i2_5 = (i3_1 - i3_5); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + r3_3 = c_re(in[3]); + i3_3 = c_im(in[3]); + r3_7 = c_re(in[7]); + i3_7 = c_im(in[7]); + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_7 = (r3_3 - r3_7); + i2_7 = (i3_3 - i3_7); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_5 = (r2_1 - r2_3); + i1_5 = (i2_1 - i2_3); + r1_3 = (r2_5 + i2_7); + i1_3 = (i2_5 - r2_7); + r1_7 = (r2_5 - i2_7); + i1_7 = (i2_5 + r2_7); + } + c_re(out[0]) = (r1_0 + r1_1); + c_im(out[0]) = (i1_0 + i1_1); + c_re(out[4]) = (r1_0 - r1_1); + c_im(out[4]) = (i1_0 - i1_1); + tmpr = (0.707106781187 * (r1_3 + i1_3)); + tmpi = (0.707106781187 * (i1_3 - r1_3)); + c_re(out[1]) = (r1_2 + tmpr); + c_im(out[1]) = (i1_2 + tmpi); + c_re(out[5]) = (r1_2 - tmpr); + c_im(out[5]) = (i1_2 - tmpi); + c_re(out[2]) = (r1_4 + i1_5); + c_im(out[2]) = (i1_4 - r1_5); + c_re(out[6]) = (r1_4 - i1_5); + c_im(out[6]) = (i1_4 + r1_5); + tmpr = (0.707106781187 * (i1_7 - r1_7)); + tmpi = (0.707106781187 * (r1_7 + i1_7)); + c_re(out[3]) = (r1_6 + tmpr); + c_im(out[3]) = (i1_6 - tmpi); + c_re(out[7]) = (r1_6 - tmpr); + c_im(out[7]) = (i1_6 + tmpi); + } +} +void fft_twiddle_8(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + { + REAL r2_0, i2_0; 
+ REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + r3_0 = c_re(jp[0 * m]); + i3_0 = c_im(jp[0 * m]); + wr = c_re(W[4 * l1]); + wi = c_im(W[4 * l1]); + tmpr = c_re(jp[4 * m]); + tmpi = c_im(jp[4 * m]); + r3_4 = ((wr * tmpr) - (wi * tmpi)); + i3_4 = ((wi * tmpr) + (wr * tmpi)); + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_4 = (r3_0 - r3_4); + i2_4 = (i3_0 - i3_4); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r3_2 = ((wr * tmpr) - (wi * tmpi)); + i3_2 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[6 * l1]); + wi = c_im(W[6 * l1]); + tmpr = c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r3_6 = ((wr * tmpr) - (wi * tmpi)); + i3_6 = ((wi * tmpr) + (wr * tmpi)); + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_6 = (r3_2 - r3_6); + i2_6 = (i3_2 - i3_6); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_4 = (r2_0 - r2_2); + i1_4 = (i2_0 - i2_2); + r1_2 = (r2_4 + i2_6); + i1_2 = (i2_4 - r2_6); + r1_6 = (r2_4 - i2_6); + i1_6 = (i2_4 + r2_6); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r3_1 = ((wr * tmpr) - (wi * tmpi)); + i3_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r3_5 = ((wr * tmpr) - (wi * tmpi)); + i3_5 = ((wi * tmpr) + (wr * tmpi)); + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_5 = (r3_1 - r3_5); + i2_5 = (i3_1 - i3_5); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r3_3 = ((wr * tmpr) - (wi * tmpi)); + i3_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = 
c_im(jp[7 * m]); + r3_7 = ((wr * tmpr) - (wi * tmpi)); + i3_7 = ((wi * tmpr) + (wr * tmpi)); + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_7 = (r3_3 - r3_7); + i2_7 = (i3_3 - i3_7); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_5 = (r2_1 - r2_3); + i1_5 = (i2_1 - i2_3); + r1_3 = (r2_5 + i2_7); + i1_3 = (i2_5 - r2_7); + r1_7 = (r2_5 - i2_7); + i1_7 = (i2_5 + r2_7); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[4 * m]) = (r1_0 - r1_1); + c_im(kp[4 * m]) = (i1_0 - i1_1); + tmpr = (0.707106781187 * (r1_3 + i1_3)); + tmpi = (0.707106781187 * (i1_3 - r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[5 * m]) = (r1_2 - tmpr); + c_im(kp[5 * m]) = (i1_2 - tmpi); + c_re(kp[2 * m]) = (r1_4 + i1_5); + c_im(kp[2 * m]) = (i1_4 - r1_5); + c_re(kp[6 * m]) = (r1_4 - i1_5); + c_im(kp[6 * m]) = (i1_4 + r1_5); + tmpr = (0.707106781187 * (i1_7 - r1_7)); + tmpi = (0.707106781187 * (r1_7 + i1_7)); + c_re(kp[3 * m]) = (r1_6 + tmpr); + c_im(kp[3 * m]) = (i1_6 - tmpi); + c_re(kp[7 * m]) = (r1_6 - tmpr); + c_im(kp[7 * m]) = (i1_6 + tmpi); + } + } + } else { + int ab = (a + b) / 2; + #pragma omp task untied + fft_twiddle_8(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task untied + fft_twiddle_8(ab, b, in, out, W, nW, nWdn, m); + #pragma omp taskwait + } +} +void fft_twiddle_8_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + r3_0 = c_re(jp[0 * m]); + i3_0 = c_im(jp[0 * m]); + wr = c_re(W[4 * l1]); + wi = c_im(W[4 * 
l1]); + tmpr = c_re(jp[4 * m]); + tmpi = c_im(jp[4 * m]); + r3_4 = ((wr * tmpr) - (wi * tmpi)); + i3_4 = ((wi * tmpr) + (wr * tmpi)); + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_4 = (r3_0 - r3_4); + i2_4 = (i3_0 - i3_4); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r3_2 = ((wr * tmpr) - (wi * tmpi)); + i3_2 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[6 * l1]); + wi = c_im(W[6 * l1]); + tmpr = c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r3_6 = ((wr * tmpr) - (wi * tmpi)); + i3_6 = ((wi * tmpr) + (wr * tmpi)); + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_6 = (r3_2 - r3_6); + i2_6 = (i3_2 - i3_6); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_4 = (r2_0 - r2_2); + i1_4 = (i2_0 - i2_2); + r1_2 = (r2_4 + i2_6); + i1_2 = (i2_4 - r2_6); + r1_6 = (r2_4 - i2_6); + i1_6 = (i2_4 + r2_6); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r3_1 = ((wr * tmpr) - (wi * tmpi)); + i3_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r3_5 = ((wr * tmpr) - (wi * tmpi)); + i3_5 = ((wi * tmpr) + (wr * tmpi)); + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_5 = (r3_1 - r3_5); + i2_5 = (i3_1 - i3_5); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r3_3 = ((wr * tmpr) - (wi * tmpi)); + i3_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = c_im(jp[7 * m]); + r3_7 = ((wr * tmpr) - (wi * tmpi)); + i3_7 = ((wi * tmpr) + (wr * tmpi)); + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_7 = (r3_3 - r3_7); + i2_7 = (i3_3 - i3_7); + } + r1_1 
= (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_5 = (r2_1 - r2_3); + i1_5 = (i2_1 - i2_3); + r1_3 = (r2_5 + i2_7); + i1_3 = (i2_5 - r2_7); + r1_7 = (r2_5 - i2_7); + i1_7 = (i2_5 + r2_7); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[4 * m]) = (r1_0 - r1_1); + c_im(kp[4 * m]) = (i1_0 - i1_1); + tmpr = (0.707106781187 * (r1_3 + i1_3)); + tmpi = (0.707106781187 * (i1_3 - r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[5 * m]) = (r1_2 - tmpr); + c_im(kp[5 * m]) = (i1_2 - tmpi); + c_re(kp[2 * m]) = (r1_4 + i1_5); + c_im(kp[2 * m]) = (i1_4 - r1_5); + c_re(kp[6 * m]) = (r1_4 - i1_5); + c_im(kp[6 * m]) = (i1_4 + r1_5); + tmpr = (0.707106781187 * (i1_7 - r1_7)); + tmpi = (0.707106781187 * (r1_7 + i1_7)); + c_re(kp[3 * m]) = (r1_6 + tmpr); + c_im(kp[3 * m]) = (i1_6 - tmpi); + c_re(kp[7 * m]) = (r1_6 - tmpr); + c_im(kp[7 * m]) = (i1_6 + tmpi); + } + } + } else { + int ab = (a + b) / 2; + fft_twiddle_8_seq(a, ab, in, out, W, nW, nWdn, m); + fft_twiddle_8_seq(ab, b, in, out, W, nW, nWdn, m); + } +} +void fft_unshuffle_8(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 8; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; + #pragma omp task untied + fft_unshuffle_8(a, ab, in, out, m); + #pragma omp task untied + fft_unshuffle_8(ab, b, in, out, m); + #pragma omp taskwait + } +} +void fft_unshuffle_8_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 8; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] 
= ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; + fft_unshuffle_8_seq(a, ab, in, out, m); + fft_unshuffle_8_seq(ab, b, in, out, m); + } +} +void fft_base_16(COMPLEX * in, COMPLEX * out) +{ + REAL tmpr, tmpi; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + r4_0 = c_re(in[0]); + i4_0 = c_im(in[0]); + r4_8 = c_re(in[8]); + i4_8 = c_im(in[8]); + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_8 = (r4_0 - r4_8); + i3_8 = (i4_0 - i4_8); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + r4_4 = c_re(in[4]); + i4_4 = c_im(in[4]); + r4_12 = c_re(in[12]); + i4_12 = c_im(in[12]); + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_12 = (r4_4 - r4_12); + i3_12 = (i4_4 - i4_12); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_8 = (r3_0 - r3_4); + i2_8 = (i3_0 - i3_4); + r2_4 = (r3_8 + i3_12); + i2_4 = (i3_8 - r3_12); + r2_12 = (r3_8 - i3_12); + i2_12 = (i3_8 + r3_12); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + r4_2 = c_re(in[2]); + i4_2 = c_im(in[2]); + r4_10 = c_re(in[10]); + i4_10 = c_im(in[10]); + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_10 = (r4_2 - r4_10); + i3_10 = (i4_2 - i4_10); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + r4_6 = c_re(in[6]); + i4_6 = c_im(in[6]); + r4_14 
= c_re(in[14]); + i4_14 = c_im(in[14]); + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_14 = (r4_6 - r4_14); + i3_14 = (i4_6 - i4_14); + } + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_10 = (r3_2 - r3_6); + i2_10 = (i3_2 - i3_6); + r2_6 = (r3_10 + i3_14); + i2_6 = (i3_10 - r3_14); + r2_14 = (r3_10 - i3_14); + i2_14 = (i3_10 + r3_14); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_8 = (r2_0 - r2_2); + i1_8 = (i2_0 - i2_2); + tmpr = (0.707106781187 * (r2_6 + i2_6)); + tmpi = (0.707106781187 * (i2_6 - r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_10 = (r2_4 - tmpr); + i1_10 = (i2_4 - tmpi); + r1_4 = (r2_8 + i2_10); + i1_4 = (i2_8 - r2_10); + r1_12 = (r2_8 - i2_10); + i1_12 = (i2_8 + r2_10); + tmpr = (0.707106781187 * (i2_14 - r2_14)); + tmpi = (0.707106781187 * (r2_14 + i2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 - tmpi); + r1_14 = (r2_12 - tmpr); + i1_14 = (i2_12 + tmpi); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + r4_1 = c_re(in[1]); + i4_1 = c_im(in[1]); + r4_9 = c_re(in[9]); + i4_9 = c_im(in[9]); + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_9 = (r4_1 - r4_9); + i3_9 = (i4_1 - i4_9); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + r4_5 = c_re(in[5]); + i4_5 = c_im(in[5]); + r4_13 = c_re(in[13]); + i4_13 = c_im(in[13]); + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 + i4_13); + r3_13 = (r4_5 - r4_13); + i3_13 = (i4_5 - i4_13); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_9 = (r3_1 - r3_5); + i2_9 = (i3_1 - i3_5); + r2_5 = (r3_9 + i3_13); + i2_5 = (i3_9 - r3_13); + r2_13 = (r3_9 - i3_13); + i2_13 = (i3_9 + r3_13); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + r4_3 = c_re(in[3]); 
+ i4_3 = c_im(in[3]); + r4_11 = c_re(in[11]); + i4_11 = c_im(in[11]); + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_11 = (r4_3 - r4_11); + i3_11 = (i4_3 - i4_11); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + r4_7 = c_re(in[7]); + i4_7 = c_im(in[7]); + r4_15 = c_re(in[15]); + i4_15 = c_im(in[15]); + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_15 = (r4_7 - r4_15); + i3_15 = (i4_7 - i4_15); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_11 = (r3_3 - r3_7); + i2_11 = (i3_3 - i3_7); + r2_7 = (r3_11 + i3_15); + i2_7 = (i3_11 - r3_15); + r2_15 = (r3_11 - i3_15); + i2_15 = (i3_11 + r3_15); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_9 = (r2_1 - r2_3); + i1_9 = (i2_1 - i2_3); + tmpr = (0.707106781187 * (r2_7 + i2_7)); + tmpi = (0.707106781187 * (i2_7 - r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_11 = (r2_5 - tmpr); + i1_11 = (i2_5 - tmpi); + r1_5 = (r2_9 + i2_11); + i1_5 = (i2_9 - r2_11); + r1_13 = (r2_9 - i2_11); + i1_13 = (i2_9 + r2_11); + tmpr = (0.707106781187 * (i2_15 - r2_15)); + tmpi = (0.707106781187 * (r2_15 + i2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 - tmpi); + r1_15 = (r2_13 - tmpr); + i1_15 = (i2_13 + tmpi); + } + c_re(out[0]) = (r1_0 + r1_1); + c_im(out[0]) = (i1_0 + i1_1); + c_re(out[8]) = (r1_0 - r1_1); + c_im(out[8]) = (i1_0 - i1_1); + tmpr = ((0.923879532511 * r1_3) + (0.382683432365 * i1_3)); + tmpi = ((0.923879532511 * i1_3) - (0.382683432365 * r1_3)); + c_re(out[1]) = (r1_2 + tmpr); + c_im(out[1]) = (i1_2 + tmpi); + c_re(out[9]) = (r1_2 - tmpr); + c_im(out[9]) = (i1_2 - tmpi); + tmpr = (0.707106781187 * (r1_5 + i1_5)); + tmpi = (0.707106781187 * (i1_5 - r1_5)); + c_re(out[2]) = (r1_4 + tmpr); + c_im(out[2]) = (i1_4 + tmpi); + c_re(out[10]) = (r1_4 - tmpr); + c_im(out[10]) = (i1_4 - tmpi); + tmpr = ((0.382683432365 * r1_7) + (0.923879532511 * i1_7)); + tmpi = ((0.382683432365 * i1_7) - (0.923879532511 * r1_7)); + c_re(out[3]) = (r1_6 + tmpr); + c_im(out[3]) = (i1_6 + tmpi); + 
c_re(out[11]) = (r1_6 - tmpr); + c_im(out[11]) = (i1_6 - tmpi); + c_re(out[4]) = (r1_8 + i1_9); + c_im(out[4]) = (i1_8 - r1_9); + c_re(out[12]) = (r1_8 - i1_9); + c_im(out[12]) = (i1_8 + r1_9); + tmpr = ((0.923879532511 * i1_11) - (0.382683432365 * r1_11)); + tmpi = ((0.923879532511 * r1_11) + (0.382683432365 * i1_11)); + c_re(out[5]) = (r1_10 + tmpr); + c_im(out[5]) = (i1_10 - tmpi); + c_re(out[13]) = (r1_10 - tmpr); + c_im(out[13]) = (i1_10 + tmpi); + tmpr = (0.707106781187 * (i1_13 - r1_13)); + tmpi = (0.707106781187 * (r1_13 + i1_13)); + c_re(out[6]) = (r1_12 + tmpr); + c_im(out[6]) = (i1_12 - tmpi); + c_re(out[14]) = (r1_12 - tmpr); + c_im(out[14]) = (i1_12 + tmpi); + tmpr = ((0.382683432365 * i1_15) - (0.923879532511 * r1_15)); + tmpi = ((0.382683432365 * r1_15) + (0.923879532511 * i1_15)); + c_re(out[7]) = (r1_14 + tmpr); + c_im(out[7]) = (i1_14 - tmpi); + c_re(out[15]) = (r1_14 - tmpr); + c_im(out[15]) = (i1_14 + tmpi); + } +} +void fft_twiddle_16(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + r4_0 = c_re(jp[0 * m]); + i4_0 = c_im(jp[0 * m]); + wr = c_re(W[8 * l1]); + wi = c_im(W[8 * l1]); + tmpr = c_re(jp[8 * m]); + tmpi = c_im(jp[8 * m]); + r4_8 = 
((wr * tmpr) - (wi * tmpi)); + i4_8 = ((wi * tmpr) + (wr * tmpi)); + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_8 = (r4_0 - r4_8); + i3_8 = (i4_0 - i4_8); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + wr = c_re(W[4 * l1]); + wi = c_im(W[4 * l1]); + tmpr = c_re(jp[4 * m]); + tmpi = c_im(jp[4 * m]); + r4_4 = ((wr * tmpr) - (wi * tmpi)); + i4_4 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[12 * l1]); + wi = c_im(W[12 * l1]); + tmpr = c_re(jp[12 * m]); + tmpi = c_im(jp[12 * m]); + r4_12 = ((wr * tmpr) - (wi * tmpi)); + i4_12 = ((wi * tmpr) + (wr * tmpi)); + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_12 = (r4_4 - r4_12); + i3_12 = (i4_4 - i4_12); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_8 = (r3_0 - r3_4); + i2_8 = (i3_0 - i3_4); + r2_4 = (r3_8 + i3_12); + i2_4 = (i3_8 - r3_12); + r2_12 = (r3_8 - i3_12); + i2_12 = (i3_8 + r3_12); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r4_2 = ((wr * tmpr) - (wi * tmpi)); + i4_2 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[10 * l1]); + wi = c_im(W[10 * l1]); + tmpr = c_re(jp[10 * m]); + tmpi = c_im(jp[10 * m]); + r4_10 = ((wr * tmpr) - (wi * tmpi)); + i4_10 = ((wi * tmpr) + (wr * tmpi)); + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_10 = (r4_2 - r4_10); + i3_10 = (i4_2 - i4_10); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + wr = c_re(W[6 * l1]); + wi = c_im(W[6 * l1]); + tmpr = c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r4_6 = ((wr * tmpr) - (wi * tmpi)); + i4_6 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[14 * l1]); + wi = c_im(W[14 * l1]); + tmpr = c_re(jp[14 * m]); + tmpi = c_im(jp[14 * m]); + r4_14 = ((wr * tmpr) - (wi * tmpi)); + i4_14 = ((wi * tmpr) + (wr * tmpi)); + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_14 = (r4_6 - r4_14); + i3_14 = (i4_6 - i4_14); + } + r2_2 = (r3_2 + 
r3_6); + i2_2 = (i3_2 + i3_6); + r2_10 = (r3_2 - r3_6); + i2_10 = (i3_2 - i3_6); + r2_6 = (r3_10 + i3_14); + i2_6 = (i3_10 - r3_14); + r2_14 = (r3_10 - i3_14); + i2_14 = (i3_10 + r3_14); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_8 = (r2_0 - r2_2); + i1_8 = (i2_0 - i2_2); + tmpr = (0.707106781187 * (r2_6 + i2_6)); + tmpi = (0.707106781187 * (i2_6 - r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_10 = (r2_4 - tmpr); + i1_10 = (i2_4 - tmpi); + r1_4 = (r2_8 + i2_10); + i1_4 = (i2_8 - r2_10); + r1_12 = (r2_8 - i2_10); + i1_12 = (i2_8 + r2_10); + tmpr = (0.707106781187 * (i2_14 - r2_14)); + tmpi = (0.707106781187 * (r2_14 + i2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 - tmpi); + r1_14 = (r2_12 - tmpr); + i1_14 = (i2_12 + tmpi); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r4_1 = ((wr * tmpr) - (wi * tmpi)); + i4_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[9 * l1]); + wi = c_im(W[9 * l1]); + tmpr = c_re(jp[9 * m]); + tmpi = c_im(jp[9 * m]); + r4_9 = ((wr * tmpr) - (wi * tmpi)); + i4_9 = ((wi * tmpr) + (wr * tmpi)); + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_9 = (r4_1 - r4_9); + i3_9 = (i4_1 - i4_9); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r4_5 = ((wr * tmpr) - (wi * tmpi)); + i4_5 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[13 * l1]); + wi = c_im(W[13 * l1]); + tmpr = c_re(jp[13 * m]); + tmpi = c_im(jp[13 * m]); + r4_13 = ((wr * tmpr) - (wi * tmpi)); + i4_13 = ((wi * tmpr) + (wr * tmpi)); + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 + i4_13); + r3_13 = (r4_5 - r4_13); + i3_13 = 
(i4_5 - i4_13); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_9 = (r3_1 - r3_5); + i2_9 = (i3_1 - i3_5); + r2_5 = (r3_9 + i3_13); + i2_5 = (i3_9 - r3_13); + r2_13 = (r3_9 - i3_13); + i2_13 = (i3_9 + r3_13); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r4_3 = ((wr * tmpr) - (wi * tmpi)); + i4_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[11 * l1]); + wi = c_im(W[11 * l1]); + tmpr = c_re(jp[11 * m]); + tmpi = c_im(jp[11 * m]); + r4_11 = ((wr * tmpr) - (wi * tmpi)); + i4_11 = ((wi * tmpr) + (wr * tmpi)); + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_11 = (r4_3 - r4_11); + i3_11 = (i4_3 - i4_11); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = c_im(jp[7 * m]); + r4_7 = ((wr * tmpr) - (wi * tmpi)); + i4_7 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[15 * l1]); + wi = c_im(W[15 * l1]); + tmpr = c_re(jp[15 * m]); + tmpi = c_im(jp[15 * m]); + r4_15 = ((wr * tmpr) - (wi * tmpi)); + i4_15 = ((wi * tmpr) + (wr * tmpi)); + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_15 = (r4_7 - r4_15); + i3_15 = (i4_7 - i4_15); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_11 = (r3_3 - r3_7); + i2_11 = (i3_3 - i3_7); + r2_7 = (r3_11 + i3_15); + i2_7 = (i3_11 - r3_15); + r2_15 = (r3_11 - i3_15); + i2_15 = (i3_11 + r3_15); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_9 = (r2_1 - r2_3); + i1_9 = (i2_1 - i2_3); + tmpr = (0.707106781187 * (r2_7 + i2_7)); + tmpi = (0.707106781187 * (i2_7 - r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_11 = (r2_5 - tmpr); + i1_11 = (i2_5 - tmpi); + r1_5 = (r2_9 + i2_11); + i1_5 = (i2_9 - r2_11); + r1_13 = (r2_9 - i2_11); + i1_13 = (i2_9 + r2_11); + tmpr = (0.707106781187 * (i2_15 - r2_15)); + tmpi = (0.707106781187 * (r2_15 + 
i2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 - tmpi); + r1_15 = (r2_13 - tmpr); + i1_15 = (i2_13 + tmpi); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[8 * m]) = (r1_0 - r1_1); + c_im(kp[8 * m]) = (i1_0 - i1_1); + tmpr = ((0.923879532511 * r1_3) + (0.382683432365 * i1_3)); + tmpi = ((0.923879532511 * i1_3) - (0.382683432365 * r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[9 * m]) = (r1_2 - tmpr); + c_im(kp[9 * m]) = (i1_2 - tmpi); + tmpr = (0.707106781187 * (r1_5 + i1_5)); + tmpi = (0.707106781187 * (i1_5 - r1_5)); + c_re(kp[2 * m]) = (r1_4 + tmpr); + c_im(kp[2 * m]) = (i1_4 + tmpi); + c_re(kp[10 * m]) = (r1_4 - tmpr); + c_im(kp[10 * m]) = (i1_4 - tmpi); + tmpr = ((0.382683432365 * r1_7) + (0.923879532511 * i1_7)); + tmpi = ((0.382683432365 * i1_7) - (0.923879532511 * r1_7)); + c_re(kp[3 * m]) = (r1_6 + tmpr); + c_im(kp[3 * m]) = (i1_6 + tmpi); + c_re(kp[11 * m]) = (r1_6 - tmpr); + c_im(kp[11 * m]) = (i1_6 - tmpi); + c_re(kp[4 * m]) = (r1_8 + i1_9); + c_im(kp[4 * m]) = (i1_8 - r1_9); + c_re(kp[12 * m]) = (r1_8 - i1_9); + c_im(kp[12 * m]) = (i1_8 + r1_9); + tmpr = ((0.923879532511 * i1_11) - (0.382683432365 * r1_11)); + tmpi = ((0.923879532511 * r1_11) + (0.382683432365 * i1_11)); + c_re(kp[5 * m]) = (r1_10 + tmpr); + c_im(kp[5 * m]) = (i1_10 - tmpi); + c_re(kp[13 * m]) = (r1_10 - tmpr); + c_im(kp[13 * m]) = (i1_10 + tmpi); + tmpr = (0.707106781187 * (i1_13 - r1_13)); + tmpi = (0.707106781187 * (r1_13 + i1_13)); + c_re(kp[6 * m]) = (r1_12 + tmpr); + c_im(kp[6 * m]) = (i1_12 - tmpi); + c_re(kp[14 * m]) = (r1_12 - tmpr); + c_im(kp[14 * m]) = (i1_12 + tmpi); + tmpr = ((0.382683432365 * i1_15) - (0.923879532511 * r1_15)); + tmpi = ((0.382683432365 * r1_15) + (0.923879532511 * i1_15)); + c_re(kp[7 * m]) = (r1_14 + tmpr); + c_im(kp[7 * m]) = (i1_14 - tmpi); + c_re(kp[15 * m]) = (r1_14 - tmpr); + c_im(kp[15 * m]) = (i1_14 + tmpi); + } + } + } else { + int ab = (a + b) / 2; + #pragma omp 
task untied + fft_twiddle_16(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task untied + fft_twiddle_16(ab, b, in, out, W, nW, nWdn, m); + #pragma omp taskwait + } +} +void fft_twiddle_16_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + r4_0 = c_re(jp[0 * m]); + i4_0 = c_im(jp[0 * m]); + wr = c_re(W[8 * l1]); + wi = c_im(W[8 * l1]); + tmpr = c_re(jp[8 * m]); + tmpi = c_im(jp[8 * m]); + r4_8 = ((wr * tmpr) - (wi * tmpi)); + i4_8 = ((wi * tmpr) + (wr * tmpi)); + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_8 = (r4_0 - r4_8); + i3_8 = (i4_0 - i4_8); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + wr = c_re(W[4 * l1]); + wi = c_im(W[4 * l1]); + tmpr = c_re(jp[4 * m]); + tmpi = c_im(jp[4 * m]); + r4_4 = ((wr * tmpr) - (wi * tmpi)); + i4_4 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[12 * l1]); + wi = c_im(W[12 * l1]); + tmpr = c_re(jp[12 * m]); + tmpi = c_im(jp[12 * m]); + r4_12 = ((wr * tmpr) - (wi * tmpi)); + i4_12 = ((wi * tmpr) + (wr * tmpi)); + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_12 = (r4_4 - r4_12); + i3_12 = (i4_4 - i4_12); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_8 = (r3_0 - r3_4); + i2_8 = (i3_0 - i3_4); + r2_4 = 
(r3_8 + i3_12); + i2_4 = (i3_8 - r3_12); + r2_12 = (r3_8 - i3_12); + i2_12 = (i3_8 + r3_12); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r4_2 = ((wr * tmpr) - (wi * tmpi)); + i4_2 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[10 * l1]); + wi = c_im(W[10 * l1]); + tmpr = c_re(jp[10 * m]); + tmpi = c_im(jp[10 * m]); + r4_10 = ((wr * tmpr) - (wi * tmpi)); + i4_10 = ((wi * tmpr) + (wr * tmpi)); + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_10 = (r4_2 - r4_10); + i3_10 = (i4_2 - i4_10); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + wr = c_re(W[6 * l1]); + wi = c_im(W[6 * l1]); + tmpr = c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r4_6 = ((wr * tmpr) - (wi * tmpi)); + i4_6 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[14 * l1]); + wi = c_im(W[14 * l1]); + tmpr = c_re(jp[14 * m]); + tmpi = c_im(jp[14 * m]); + r4_14 = ((wr * tmpr) - (wi * tmpi)); + i4_14 = ((wi * tmpr) + (wr * tmpi)); + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_14 = (r4_6 - r4_14); + i3_14 = (i4_6 - i4_14); + } + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_10 = (r3_2 - r3_6); + i2_10 = (i3_2 - i3_6); + r2_6 = (r3_10 + i3_14); + i2_6 = (i3_10 - r3_14); + r2_14 = (r3_10 - i3_14); + i2_14 = (i3_10 + r3_14); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_8 = (r2_0 - r2_2); + i1_8 = (i2_0 - i2_2); + tmpr = (0.707106781187 * (r2_6 + i2_6)); + tmpi = (0.707106781187 * (i2_6 - r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_10 = (r2_4 - tmpr); + i1_10 = (i2_4 - tmpi); + r1_4 = (r2_8 + i2_10); + i1_4 = (i2_8 - r2_10); + r1_12 = (r2_8 - i2_10); + i1_12 = (i2_8 + r2_10); + tmpr = (0.707106781187 * (i2_14 - r2_14)); + tmpi = (0.707106781187 * (r2_14 + i2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 - tmpi); + r1_14 = (r2_12 - tmpr); + i1_14 = (i2_12 + tmpi); + } + { + REAL r2_1, 
i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r4_1 = ((wr * tmpr) - (wi * tmpi)); + i4_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[9 * l1]); + wi = c_im(W[9 * l1]); + tmpr = c_re(jp[9 * m]); + tmpi = c_im(jp[9 * m]); + r4_9 = ((wr * tmpr) - (wi * tmpi)); + i4_9 = ((wi * tmpr) + (wr * tmpi)); + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_9 = (r4_1 - r4_9); + i3_9 = (i4_1 - i4_9); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r4_5 = ((wr * tmpr) - (wi * tmpi)); + i4_5 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[13 * l1]); + wi = c_im(W[13 * l1]); + tmpr = c_re(jp[13 * m]); + tmpi = c_im(jp[13 * m]); + r4_13 = ((wr * tmpr) - (wi * tmpi)); + i4_13 = ((wi * tmpr) + (wr * tmpi)); + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 + i4_13); + r3_13 = (r4_5 - r4_13); + i3_13 = (i4_5 - i4_13); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_9 = (r3_1 - r3_5); + i2_9 = (i3_1 - i3_5); + r2_5 = (r3_9 + i3_13); + i2_5 = (i3_9 - r3_13); + r2_13 = (r3_9 - i3_13); + i2_13 = (i3_9 + r3_13); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r4_3 = ((wr * tmpr) - (wi * tmpi)); + i4_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[11 * l1]); + wi = c_im(W[11 * l1]); + tmpr = c_re(jp[11 * m]); + tmpi = c_im(jp[11 * m]); + r4_11 = ((wr * tmpr) - (wi * tmpi)); + i4_11 = ((wi * tmpr) + (wr * tmpi)); + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_11 = (r4_3 - r4_11); + 
i3_11 = (i4_3 - i4_11); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = c_im(jp[7 * m]); + r4_7 = ((wr * tmpr) - (wi * tmpi)); + i4_7 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[15 * l1]); + wi = c_im(W[15 * l1]); + tmpr = c_re(jp[15 * m]); + tmpi = c_im(jp[15 * m]); + r4_15 = ((wr * tmpr) - (wi * tmpi)); + i4_15 = ((wi * tmpr) + (wr * tmpi)); + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_15 = (r4_7 - r4_15); + i3_15 = (i4_7 - i4_15); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_11 = (r3_3 - r3_7); + i2_11 = (i3_3 - i3_7); + r2_7 = (r3_11 + i3_15); + i2_7 = (i3_11 - r3_15); + r2_15 = (r3_11 - i3_15); + i2_15 = (i3_11 + r3_15); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_9 = (r2_1 - r2_3); + i1_9 = (i2_1 - i2_3); + tmpr = (0.707106781187 * (r2_7 + i2_7)); + tmpi = (0.707106781187 * (i2_7 - r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_11 = (r2_5 - tmpr); + i1_11 = (i2_5 - tmpi); + r1_5 = (r2_9 + i2_11); + i1_5 = (i2_9 - r2_11); + r1_13 = (r2_9 - i2_11); + i1_13 = (i2_9 + r2_11); + tmpr = (0.707106781187 * (i2_15 - r2_15)); + tmpi = (0.707106781187 * (r2_15 + i2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 - tmpi); + r1_15 = (r2_13 - tmpr); + i1_15 = (i2_13 + tmpi); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[8 * m]) = (r1_0 - r1_1); + c_im(kp[8 * m]) = (i1_0 - i1_1); + tmpr = ((0.923879532511 * r1_3) + (0.382683432365 * i1_3)); + tmpi = ((0.923879532511 * i1_3) - (0.382683432365 * r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[9 * m]) = (r1_2 - tmpr); + c_im(kp[9 * m]) = (i1_2 - tmpi); + tmpr = (0.707106781187 * (r1_5 + i1_5)); + tmpi = (0.707106781187 * (i1_5 - r1_5)); + c_re(kp[2 * m]) = (r1_4 + tmpr); + c_im(kp[2 * m]) = (i1_4 + tmpi); + c_re(kp[10 * m]) = (r1_4 - tmpr); + c_im(kp[10 * m]) = (i1_4 - tmpi); + tmpr = ((0.382683432365 * r1_7) + 
(0.923879532511 * i1_7)); + tmpi = ((0.382683432365 * i1_7) - (0.923879532511 * r1_7)); + c_re(kp[3 * m]) = (r1_6 + tmpr); + c_im(kp[3 * m]) = (i1_6 + tmpi); + c_re(kp[11 * m]) = (r1_6 - tmpr); + c_im(kp[11 * m]) = (i1_6 - tmpi); + c_re(kp[4 * m]) = (r1_8 + i1_9); + c_im(kp[4 * m]) = (i1_8 - r1_9); + c_re(kp[12 * m]) = (r1_8 - i1_9); + c_im(kp[12 * m]) = (i1_8 + r1_9); + tmpr = ((0.923879532511 * i1_11) - (0.382683432365 * r1_11)); + tmpi = ((0.923879532511 * r1_11) + (0.382683432365 * i1_11)); + c_re(kp[5 * m]) = (r1_10 + tmpr); + c_im(kp[5 * m]) = (i1_10 - tmpi); + c_re(kp[13 * m]) = (r1_10 - tmpr); + c_im(kp[13 * m]) = (i1_10 + tmpi); + tmpr = (0.707106781187 * (i1_13 - r1_13)); + tmpi = (0.707106781187 * (r1_13 + i1_13)); + c_re(kp[6 * m]) = (r1_12 + tmpr); + c_im(kp[6 * m]) = (i1_12 - tmpi); + c_re(kp[14 * m]) = (r1_12 - tmpr); + c_im(kp[14 * m]) = (i1_12 + tmpi); + tmpr = ((0.382683432365 * i1_15) - (0.923879532511 * r1_15)); + tmpi = ((0.382683432365 * r1_15) + (0.923879532511 * i1_15)); + c_re(kp[7 * m]) = (r1_14 + tmpr); + c_im(kp[7 * m]) = (i1_14 - tmpi); + c_re(kp[15 * m]) = (r1_14 - tmpr); + c_im(kp[15 * m]) = (i1_14 + tmpi); + } + } + } else { + int ab = (a + b) / 2; + fft_twiddle_16_seq(a, ab, in, out, W, nW, nWdn, m); + fft_twiddle_16_seq(ab, b, in, out, W, nW, nWdn, m); + } +} +void fft_unshuffle_16(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 16; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else 
{ + int ab = (a + b) / 2; + #pragma omp task untied + fft_unshuffle_16(a, ab, in, out, m); + #pragma omp task untied + fft_unshuffle_16(ab, b, in, out, m); + #pragma omp taskwait + } +} +void fft_unshuffle_16_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 16; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; + fft_unshuffle_16_seq(a, ab, in, out, m); + fft_unshuffle_16_seq(ab, b, in, out, m); + } +} +void fft_base_32(COMPLEX * in, COMPLEX * out) +{ + REAL tmpr, tmpi; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + REAL r1_16, i1_16; + REAL r1_17, i1_17; + REAL r1_18, i1_18; + REAL r1_19, i1_19; + REAL r1_20, i1_20; + REAL r1_21, i1_21; + REAL r1_22, i1_22; + REAL r1_23, i1_23; + REAL r1_24, i1_24; + REAL r1_25, i1_25; + REAL r1_26, i1_26; + REAL r1_27, i1_27; + REAL r1_28, i1_28; + REAL r1_29, i1_29; + REAL r1_30, i1_30; + REAL r1_31, i1_31; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + REAL r2_16, i2_16; + REAL r2_18, i2_18; + REAL r2_20, i2_20; + REAL r2_22, i2_22; + REAL r2_24, i2_24; + REAL 
r2_26, i2_26; + REAL r2_28, i2_28; + REAL r2_30, i2_30; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + REAL r3_16, i3_16; + REAL r3_20, i3_20; + REAL r3_24, i3_24; + REAL r3_28, i3_28; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + REAL r4_16, i4_16; + REAL r4_24, i4_24; + { + REAL r5_0, i5_0; + REAL r5_16, i5_16; + r5_0 = c_re(in[0]); + i5_0 = c_im(in[0]); + r5_16 = c_re(in[16]); + i5_16 = c_im(in[16]); + r4_0 = (r5_0 + r5_16); + i4_0 = (i5_0 + i5_16); + r4_16 = (r5_0 - r5_16); + i4_16 = (i5_0 - i5_16); + } + { + REAL r5_8, i5_8; + REAL r5_24, i5_24; + r5_8 = c_re(in[8]); + i5_8 = c_im(in[8]); + r5_24 = c_re(in[24]); + i5_24 = c_im(in[24]); + r4_8 = (r5_8 + r5_24); + i4_8 = (i5_8 + i5_24); + r4_24 = (r5_8 - r5_24); + i4_24 = (i5_8 - i5_24); + } + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_16 = (r4_0 - r4_8); + i3_16 = (i4_0 - i4_8); + r3_8 = (r4_16 + i4_24); + i3_8 = (i4_16 - r4_24); + r3_24 = (r4_16 - i4_24); + i3_24 = (i4_16 + r4_24); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + REAL r4_20, i4_20; + REAL r4_28, i4_28; + { + REAL r5_4, i5_4; + REAL r5_20, i5_20; + r5_4 = c_re(in[4]); + i5_4 = c_im(in[4]); + r5_20 = c_re(in[20]); + i5_20 = c_im(in[20]); + r4_4 = (r5_4 + r5_20); + i4_4 = (i5_4 + i5_20); + r4_20 = (r5_4 - r5_20); + i4_20 = (i5_4 - i5_20); + } + { + REAL r5_12, i5_12; + REAL r5_28, i5_28; + r5_12 = c_re(in[12]); + i5_12 = c_im(in[12]); + r5_28 = c_re(in[28]); + i5_28 = c_im(in[28]); + r4_12 = (r5_12 + r5_28); + i4_12 = (i5_12 + i5_28); + r4_28 = (r5_12 - r5_28); + i4_28 = (i5_12 - i5_28); + } + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_20 = (r4_4 - r4_12); + i3_20 = (i4_4 - i4_12); + r3_12 = (r4_20 + i4_28); + i3_12 = (i4_20 - r4_28); + r3_28 = (r4_20 - i4_28); + i3_28 = (i4_20 + r4_28); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_16 = (r3_0 - r3_4); + i2_16 = (i3_0 - i3_4); + tmpr = (0.707106781187 * (r3_12 + i3_12)); + tmpi = (0.707106781187 * (i3_12 - r3_12)); + r2_4 = (r3_8 + 
tmpr); + i2_4 = (i3_8 + tmpi); + r2_20 = (r3_8 - tmpr); + i2_20 = (i3_8 - tmpi); + r2_8 = (r3_16 + i3_20); + i2_8 = (i3_16 - r3_20); + r2_24 = (r3_16 - i3_20); + i2_24 = (i3_16 + r3_20); + tmpr = (0.707106781187 * (i3_28 - r3_28)); + tmpi = (0.707106781187 * (r3_28 + i3_28)); + r2_12 = (r3_24 + tmpr); + i2_12 = (i3_24 - tmpi); + r2_28 = (r3_24 - tmpr); + i2_28 = (i3_24 + tmpi); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + REAL r3_18, i3_18; + REAL r3_22, i3_22; + REAL r3_26, i3_26; + REAL r3_30, i3_30; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + REAL r4_18, i4_18; + REAL r4_26, i4_26; + { + REAL r5_2, i5_2; + REAL r5_18, i5_18; + r5_2 = c_re(in[2]); + i5_2 = c_im(in[2]); + r5_18 = c_re(in[18]); + i5_18 = c_im(in[18]); + r4_2 = (r5_2 + r5_18); + i4_2 = (i5_2 + i5_18); + r4_18 = (r5_2 - r5_18); + i4_18 = (i5_2 - i5_18); + } + { + REAL r5_10, i5_10; + REAL r5_26, i5_26; + r5_10 = c_re(in[10]); + i5_10 = c_im(in[10]); + r5_26 = c_re(in[26]); + i5_26 = c_im(in[26]); + r4_10 = (r5_10 + r5_26); + i4_10 = (i5_10 + i5_26); + r4_26 = (r5_10 - r5_26); + i4_26 = (i5_10 - i5_26); + } + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_18 = (r4_2 - r4_10); + i3_18 = (i4_2 - i4_10); + r3_10 = (r4_18 + i4_26); + i3_10 = (i4_18 - r4_26); + r3_26 = (r4_18 - i4_26); + i3_26 = (i4_18 + r4_26); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + REAL r4_22, i4_22; + REAL r4_30, i4_30; + { + REAL r5_6, i5_6; + REAL r5_22, i5_22; + r5_6 = c_re(in[6]); + i5_6 = c_im(in[6]); + r5_22 = c_re(in[22]); + i5_22 = c_im(in[22]); + r4_6 = (r5_6 + r5_22); + i4_6 = (i5_6 + i5_22); + r4_22 = (r5_6 - r5_22); + i4_22 = (i5_6 - i5_22); + } + { + REAL r5_14, i5_14; + REAL r5_30, i5_30; + r5_14 = c_re(in[14]); + i5_14 = c_im(in[14]); + r5_30 = c_re(in[30]); + i5_30 = c_im(in[30]); + r4_14 = (r5_14 + r5_30); + i4_14 = (i5_14 + i5_30); + r4_30 = (r5_14 - r5_30); + i4_30 = (i5_14 - i5_30); + } + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_22 = (r4_6 - 
r4_14); + i3_22 = (i4_6 - i4_14); + r3_14 = (r4_22 + i4_30); + i3_14 = (i4_22 - r4_30); + r3_30 = (r4_22 - i4_30); + i3_30 = (i4_22 + r4_30); + } + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_18 = (r3_2 - r3_6); + i2_18 = (i3_2 - i3_6); + tmpr = (0.707106781187 * (r3_14 + i3_14)); + tmpi = (0.707106781187 * (i3_14 - r3_14)); + r2_6 = (r3_10 + tmpr); + i2_6 = (i3_10 + tmpi); + r2_22 = (r3_10 - tmpr); + i2_22 = (i3_10 - tmpi); + r2_10 = (r3_18 + i3_22); + i2_10 = (i3_18 - r3_22); + r2_26 = (r3_18 - i3_22); + i2_26 = (i3_18 + r3_22); + tmpr = (0.707106781187 * (i3_30 - r3_30)); + tmpi = (0.707106781187 * (r3_30 + i3_30)); + r2_14 = (r3_26 + tmpr); + i2_14 = (i3_26 - tmpi); + r2_30 = (r3_26 - tmpr); + i2_30 = (i3_26 + tmpi); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_16 = (r2_0 - r2_2); + i1_16 = (i2_0 - i2_2); + tmpr = ((0.923879532511 * r2_6) + (0.382683432365 * i2_6)); + tmpi = ((0.923879532511 * i2_6) - (0.382683432365 * r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_18 = (r2_4 - tmpr); + i1_18 = (i2_4 - tmpi); + tmpr = (0.707106781187 * (r2_10 + i2_10)); + tmpi = (0.707106781187 * (i2_10 - r2_10)); + r1_4 = (r2_8 + tmpr); + i1_4 = (i2_8 + tmpi); + r1_20 = (r2_8 - tmpr); + i1_20 = (i2_8 - tmpi); + tmpr = ((0.382683432365 * r2_14) + (0.923879532511 * i2_14)); + tmpi = ((0.382683432365 * i2_14) - (0.923879532511 * r2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 + tmpi); + r1_22 = (r2_12 - tmpr); + i1_22 = (i2_12 - tmpi); + r1_8 = (r2_16 + i2_18); + i1_8 = (i2_16 - r2_18); + r1_24 = (r2_16 - i2_18); + i1_24 = (i2_16 + r2_18); + tmpr = ((0.923879532511 * i2_22) - (0.382683432365 * r2_22)); + tmpi = ((0.923879532511 * r2_22) + (0.382683432365 * i2_22)); + r1_10 = (r2_20 + tmpr); + i1_10 = (i2_20 - tmpi); + r1_26 = (r2_20 - tmpr); + i1_26 = (i2_20 + tmpi); + tmpr = (0.707106781187 * (i2_26 - r2_26)); + tmpi = (0.707106781187 * (r2_26 + i2_26)); + r1_12 = (r2_24 + tmpr); + i1_12 = (i2_24 - tmpi); + r1_28 = (r2_24 - tmpr); + i1_28 = 
(i2_24 + tmpi); + tmpr = ((0.382683432365 * i2_30) - (0.923879532511 * r2_30)); + tmpi = ((0.382683432365 * r2_30) + (0.923879532511 * i2_30)); + r1_14 = (r2_28 + tmpr); + i1_14 = (i2_28 - tmpi); + r1_30 = (r2_28 - tmpr); + i1_30 = (i2_28 + tmpi); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + REAL r2_17, i2_17; + REAL r2_19, i2_19; + REAL r2_21, i2_21; + REAL r2_23, i2_23; + REAL r2_25, i2_25; + REAL r2_27, i2_27; + REAL r2_29, i2_29; + REAL r2_31, i2_31; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + REAL r3_17, i3_17; + REAL r3_21, i3_21; + REAL r3_25, i3_25; + REAL r3_29, i3_29; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + REAL r4_17, i4_17; + REAL r4_25, i4_25; + { + REAL r5_1, i5_1; + REAL r5_17, i5_17; + r5_1 = c_re(in[1]); + i5_1 = c_im(in[1]); + r5_17 = c_re(in[17]); + i5_17 = c_im(in[17]); + r4_1 = (r5_1 + r5_17); + i4_1 = (i5_1 + i5_17); + r4_17 = (r5_1 - r5_17); + i4_17 = (i5_1 - i5_17); + } + { + REAL r5_9, i5_9; + REAL r5_25, i5_25; + r5_9 = c_re(in[9]); + i5_9 = c_im(in[9]); + r5_25 = c_re(in[25]); + i5_25 = c_im(in[25]); + r4_9 = (r5_9 + r5_25); + i4_9 = (i5_9 + i5_25); + r4_25 = (r5_9 - r5_25); + i4_25 = (i5_9 - i5_25); + } + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_17 = (r4_1 - r4_9); + i3_17 = (i4_1 - i4_9); + r3_9 = (r4_17 + i4_25); + i3_9 = (i4_17 - r4_25); + r3_25 = (r4_17 - i4_25); + i3_25 = (i4_17 + r4_25); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + REAL r4_21, i4_21; + REAL r4_29, i4_29; + { + REAL r5_5, i5_5; + REAL r5_21, i5_21; + r5_5 = c_re(in[5]); + i5_5 = c_im(in[5]); + r5_21 = c_re(in[21]); + i5_21 = c_im(in[21]); + r4_5 = (r5_5 + r5_21); + i4_5 = (i5_5 + i5_21); + r4_21 = (r5_5 - r5_21); + i4_21 = (i5_5 - i5_21); + } + { + REAL r5_13, i5_13; + REAL r5_29, i5_29; + r5_13 = c_re(in[13]); + i5_13 = c_im(in[13]); + r5_29 = c_re(in[29]); + i5_29 = c_im(in[29]); + 
r4_13 = (r5_13 + r5_29); + i4_13 = (i5_13 + i5_29); + r4_29 = (r5_13 - r5_29); + i4_29 = (i5_13 - i5_29); + } + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 + i4_13); + r3_21 = (r4_5 - r4_13); + i3_21 = (i4_5 - i4_13); + r3_13 = (r4_21 + i4_29); + i3_13 = (i4_21 - r4_29); + r3_29 = (r4_21 - i4_29); + i3_29 = (i4_21 + r4_29); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_17 = (r3_1 - r3_5); + i2_17 = (i3_1 - i3_5); + tmpr = (0.707106781187 * (r3_13 + i3_13)); + tmpi = (0.707106781187 * (i3_13 - r3_13)); + r2_5 = (r3_9 + tmpr); + i2_5 = (i3_9 + tmpi); + r2_21 = (r3_9 - tmpr); + i2_21 = (i3_9 - tmpi); + r2_9 = (r3_17 + i3_21); + i2_9 = (i3_17 - r3_21); + r2_25 = (r3_17 - i3_21); + i2_25 = (i3_17 + r3_21); + tmpr = (0.707106781187 * (i3_29 - r3_29)); + tmpi = (0.707106781187 * (r3_29 + i3_29)); + r2_13 = (r3_25 + tmpr); + i2_13 = (i3_25 - tmpi); + r2_29 = (r3_25 - tmpr); + i2_29 = (i3_25 + tmpi); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + REAL r3_19, i3_19; + REAL r3_23, i3_23; + REAL r3_27, i3_27; + REAL r3_31, i3_31; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + REAL r4_19, i4_19; + REAL r4_27, i4_27; + { + REAL r5_3, i5_3; + REAL r5_19, i5_19; + r5_3 = c_re(in[3]); + i5_3 = c_im(in[3]); + r5_19 = c_re(in[19]); + i5_19 = c_im(in[19]); + r4_3 = (r5_3 + r5_19); + i4_3 = (i5_3 + i5_19); + r4_19 = (r5_3 - r5_19); + i4_19 = (i5_3 - i5_19); + } + { + REAL r5_11, i5_11; + REAL r5_27, i5_27; + r5_11 = c_re(in[11]); + i5_11 = c_im(in[11]); + r5_27 = c_re(in[27]); + i5_27 = c_im(in[27]); + r4_11 = (r5_11 + r5_27); + i4_11 = (i5_11 + i5_27); + r4_27 = (r5_11 - r5_27); + i4_27 = (i5_11 - i5_27); + } + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_19 = (r4_3 - r4_11); + i3_19 = (i4_3 - i4_11); + r3_11 = (r4_19 + i4_27); + i3_11 = (i4_19 - r4_27); + r3_27 = (r4_19 - i4_27); + i3_27 = (i4_19 + r4_27); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + REAL r4_23, i4_23; + REAL r4_31, i4_31; + { + REAL r5_7, i5_7; + REAL 
r5_23, i5_23; + r5_7 = c_re(in[7]); + i5_7 = c_im(in[7]); + r5_23 = c_re(in[23]); + i5_23 = c_im(in[23]); + r4_7 = (r5_7 + r5_23); + i4_7 = (i5_7 + i5_23); + r4_23 = (r5_7 - r5_23); + i4_23 = (i5_7 - i5_23); + } + { + REAL r5_15, i5_15; + REAL r5_31, i5_31; + r5_15 = c_re(in[15]); + i5_15 = c_im(in[15]); + r5_31 = c_re(in[31]); + i5_31 = c_im(in[31]); + r4_15 = (r5_15 + r5_31); + i4_15 = (i5_15 + i5_31); + r4_31 = (r5_15 - r5_31); + i4_31 = (i5_15 - i5_31); + } + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_23 = (r4_7 - r4_15); + i3_23 = (i4_7 - i4_15); + r3_15 = (r4_23 + i4_31); + i3_15 = (i4_23 - r4_31); + r3_31 = (r4_23 - i4_31); + i3_31 = (i4_23 + r4_31); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_19 = (r3_3 - r3_7); + i2_19 = (i3_3 - i3_7); + tmpr = (0.707106781187 * (r3_15 + i3_15)); + tmpi = (0.707106781187 * (i3_15 - r3_15)); + r2_7 = (r3_11 + tmpr); + i2_7 = (i3_11 + tmpi); + r2_23 = (r3_11 - tmpr); + i2_23 = (i3_11 - tmpi); + r2_11 = (r3_19 + i3_23); + i2_11 = (i3_19 - r3_23); + r2_27 = (r3_19 - i3_23); + i2_27 = (i3_19 + r3_23); + tmpr = (0.707106781187 * (i3_31 - r3_31)); + tmpi = (0.707106781187 * (r3_31 + i3_31)); + r2_15 = (r3_27 + tmpr); + i2_15 = (i3_27 - tmpi); + r2_31 = (r3_27 - tmpr); + i2_31 = (i3_27 + tmpi); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_17 = (r2_1 - r2_3); + i1_17 = (i2_1 - i2_3); + tmpr = ((0.923879532511 * r2_7) + (0.382683432365 * i2_7)); + tmpi = ((0.923879532511 * i2_7) - (0.382683432365 * r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_19 = (r2_5 - tmpr); + i1_19 = (i2_5 - tmpi); + tmpr = (0.707106781187 * (r2_11 + i2_11)); + tmpi = (0.707106781187 * (i2_11 - r2_11)); + r1_5 = (r2_9 + tmpr); + i1_5 = (i2_9 + tmpi); + r1_21 = (r2_9 - tmpr); + i1_21 = (i2_9 - tmpi); + tmpr = ((0.382683432365 * r2_15) + (0.923879532511 * i2_15)); + tmpi = ((0.382683432365 * i2_15) - (0.923879532511 * r2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 + tmpi); + r1_23 = (r2_13 - tmpr); + i1_23 = 
(i2_13 - tmpi); + r1_9 = (r2_17 + i2_19); + i1_9 = (i2_17 - r2_19); + r1_25 = (r2_17 - i2_19); + i1_25 = (i2_17 + r2_19); + tmpr = ((0.923879532511 * i2_23) - (0.382683432365 * r2_23)); + tmpi = ((0.923879532511 * r2_23) + (0.382683432365 * i2_23)); + r1_11 = (r2_21 + tmpr); + i1_11 = (i2_21 - tmpi); + r1_27 = (r2_21 - tmpr); + i1_27 = (i2_21 + tmpi); + tmpr = (0.707106781187 * (i2_27 - r2_27)); + tmpi = (0.707106781187 * (r2_27 + i2_27)); + r1_13 = (r2_25 + tmpr); + i1_13 = (i2_25 - tmpi); + r1_29 = (r2_25 - tmpr); + i1_29 = (i2_25 + tmpi); + tmpr = ((0.382683432365 * i2_31) - (0.923879532511 * r2_31)); + tmpi = ((0.382683432365 * r2_31) + (0.923879532511 * i2_31)); + r1_15 = (r2_29 + tmpr); + i1_15 = (i2_29 - tmpi); + r1_31 = (r2_29 - tmpr); + i1_31 = (i2_29 + tmpi); + } + c_re(out[0]) = (r1_0 + r1_1); + c_im(out[0]) = (i1_0 + i1_1); + c_re(out[16]) = (r1_0 - r1_1); + c_im(out[16]) = (i1_0 - i1_1); + tmpr = ((0.980785280403 * r1_3) + (0.195090322016 * i1_3)); + tmpi = ((0.980785280403 * i1_3) - (0.195090322016 * r1_3)); + c_re(out[1]) = (r1_2 + tmpr); + c_im(out[1]) = (i1_2 + tmpi); + c_re(out[17]) = (r1_2 - tmpr); + c_im(out[17]) = (i1_2 - tmpi); + tmpr = ((0.923879532511 * r1_5) + (0.382683432365 * i1_5)); + tmpi = ((0.923879532511 * i1_5) - (0.382683432365 * r1_5)); + c_re(out[2]) = (r1_4 + tmpr); + c_im(out[2]) = (i1_4 + tmpi); + c_re(out[18]) = (r1_4 - tmpr); + c_im(out[18]) = (i1_4 - tmpi); + tmpr = ((0.831469612303 * r1_7) + (0.55557023302 * i1_7)); + tmpi = ((0.831469612303 * i1_7) - (0.55557023302 * r1_7)); + c_re(out[3]) = (r1_6 + tmpr); + c_im(out[3]) = (i1_6 + tmpi); + c_re(out[19]) = (r1_6 - tmpr); + c_im(out[19]) = (i1_6 - tmpi); + tmpr = (0.707106781187 * (r1_9 + i1_9)); + tmpi = (0.707106781187 * (i1_9 - r1_9)); + c_re(out[4]) = (r1_8 + tmpr); + c_im(out[4]) = (i1_8 + tmpi); + c_re(out[20]) = (r1_8 - tmpr); + c_im(out[20]) = (i1_8 - tmpi); + tmpr = ((0.55557023302 * r1_11) + (0.831469612303 * i1_11)); + tmpi = ((0.55557023302 * i1_11) - 
(0.831469612303 * r1_11)); + c_re(out[5]) = (r1_10 + tmpr); + c_im(out[5]) = (i1_10 + tmpi); + c_re(out[21]) = (r1_10 - tmpr); + c_im(out[21]) = (i1_10 - tmpi); + tmpr = ((0.382683432365 * r1_13) + (0.923879532511 * i1_13)); + tmpi = ((0.382683432365 * i1_13) - (0.923879532511 * r1_13)); + c_re(out[6]) = (r1_12 + tmpr); + c_im(out[6]) = (i1_12 + tmpi); + c_re(out[22]) = (r1_12 - tmpr); + c_im(out[22]) = (i1_12 - tmpi); + tmpr = ((0.195090322016 * r1_15) + (0.980785280403 * i1_15)); + tmpi = ((0.195090322016 * i1_15) - (0.980785280403 * r1_15)); + c_re(out[7]) = (r1_14 + tmpr); + c_im(out[7]) = (i1_14 + tmpi); + c_re(out[23]) = (r1_14 - tmpr); + c_im(out[23]) = (i1_14 - tmpi); + c_re(out[8]) = (r1_16 + i1_17); + c_im(out[8]) = (i1_16 - r1_17); + c_re(out[24]) = (r1_16 - i1_17); + c_im(out[24]) = (i1_16 + r1_17); + tmpr = ((0.980785280403 * i1_19) - (0.195090322016 * r1_19)); + tmpi = ((0.980785280403 * r1_19) + (0.195090322016 * i1_19)); + c_re(out[9]) = (r1_18 + tmpr); + c_im(out[9]) = (i1_18 - tmpi); + c_re(out[25]) = (r1_18 - tmpr); + c_im(out[25]) = (i1_18 + tmpi); + tmpr = ((0.923879532511 * i1_21) - (0.382683432365 * r1_21)); + tmpi = ((0.923879532511 * r1_21) + (0.382683432365 * i1_21)); + c_re(out[10]) = (r1_20 + tmpr); + c_im(out[10]) = (i1_20 - tmpi); + c_re(out[26]) = (r1_20 - tmpr); + c_im(out[26]) = (i1_20 + tmpi); + tmpr = ((0.831469612303 * i1_23) - (0.55557023302 * r1_23)); + tmpi = ((0.831469612303 * r1_23) + (0.55557023302 * i1_23)); + c_re(out[11]) = (r1_22 + tmpr); + c_im(out[11]) = (i1_22 - tmpi); + c_re(out[27]) = (r1_22 - tmpr); + c_im(out[27]) = (i1_22 + tmpi); + tmpr = (0.707106781187 * (i1_25 - r1_25)); + tmpi = (0.707106781187 * (r1_25 + i1_25)); + c_re(out[12]) = (r1_24 + tmpr); + c_im(out[12]) = (i1_24 - tmpi); + c_re(out[28]) = (r1_24 - tmpr); + c_im(out[28]) = (i1_24 + tmpi); + tmpr = ((0.55557023302 * i1_27) - (0.831469612303 * r1_27)); + tmpi = ((0.55557023302 * r1_27) + (0.831469612303 * i1_27)); + c_re(out[13]) = (r1_26 + tmpr); + 
c_im(out[13]) = (i1_26 - tmpi); + c_re(out[29]) = (r1_26 - tmpr); + c_im(out[29]) = (i1_26 + tmpi); + tmpr = ((0.382683432365 * i1_29) - (0.923879532511 * r1_29)); + tmpi = ((0.382683432365 * r1_29) + (0.923879532511 * i1_29)); + c_re(out[14]) = (r1_28 + tmpr); + c_im(out[14]) = (i1_28 - tmpi); + c_re(out[30]) = (r1_28 - tmpr); + c_im(out[30]) = (i1_28 + tmpi); + tmpr = ((0.195090322016 * i1_31) - (0.980785280403 * r1_31)); + tmpi = ((0.195090322016 * r1_31) + (0.980785280403 * i1_31)); + c_re(out[15]) = (r1_30 + tmpr); + c_im(out[15]) = (i1_30 - tmpi); + c_re(out[31]) = (r1_30 - tmpr); + c_im(out[31]) = (i1_30 + tmpi); + } +} +void fft_twiddle_32(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + REAL r1_16, i1_16; + REAL r1_17, i1_17; + REAL r1_18, i1_18; + REAL r1_19, i1_19; + REAL r1_20, i1_20; + REAL r1_21, i1_21; + REAL r1_22, i1_22; + REAL r1_23, i1_23; + REAL r1_24, i1_24; + REAL r1_25, i1_25; + REAL r1_26, i1_26; + REAL r1_27, i1_27; + REAL r1_28, i1_28; + REAL r1_29, i1_29; + REAL r1_30, i1_30; + REAL r1_31, i1_31; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + REAL r2_16, i2_16; + REAL r2_18, i2_18; + REAL r2_20, i2_20; + REAL r2_22, i2_22; + REAL r2_24, i2_24; + REAL r2_26, i2_26; + REAL r2_28, i2_28; + REAL r2_30, i2_30; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + REAL r3_16, 
i3_16; + REAL r3_20, i3_20; + REAL r3_24, i3_24; + REAL r3_28, i3_28; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + REAL r4_16, i4_16; + REAL r4_24, i4_24; + { + REAL r5_0, i5_0; + REAL r5_16, i5_16; + r5_0 = c_re(jp[0 * m]); + i5_0 = c_im(jp[0 * m]); + wr = c_re(W[16 * l1]); + wi = c_im(W[16 * l1]); + tmpr = c_re(jp[16 * m]); + tmpi = c_im(jp[16 * m]); + r5_16 = ((wr * tmpr) - (wi * tmpi)); + i5_16 = ((wi * tmpr) + (wr * tmpi)); + r4_0 = (r5_0 + r5_16); + i4_0 = (i5_0 + i5_16); + r4_16 = (r5_0 - r5_16); + i4_16 = (i5_0 - i5_16); + } + { + REAL r5_8, i5_8; + REAL r5_24, i5_24; + wr = c_re(W[8 * l1]); + wi = c_im(W[8 * l1]); + tmpr = c_re(jp[8 * m]); + tmpi = c_im(jp[8 * m]); + r5_8 = ((wr * tmpr) - (wi * tmpi)); + i5_8 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[24 * l1]); + wi = c_im(W[24 * l1]); + tmpr = c_re(jp[24 * m]); + tmpi = c_im(jp[24 * m]); + r5_24 = ((wr * tmpr) - (wi * tmpi)); + i5_24 = ((wi * tmpr) + (wr * tmpi)); + r4_8 = (r5_8 + r5_24); + i4_8 = (i5_8 + i5_24); + r4_24 = (r5_8 - r5_24); + i4_24 = (i5_8 - i5_24); + } + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_16 = (r4_0 - r4_8); + i3_16 = (i4_0 - i4_8); + r3_8 = (r4_16 + i4_24); + i3_8 = (i4_16 - r4_24); + r3_24 = (r4_16 - i4_24); + i3_24 = (i4_16 + r4_24); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + REAL r4_20, i4_20; + REAL r4_28, i4_28; + { + REAL r5_4, i5_4; + REAL r5_20, i5_20; + wr = c_re(W[4 * l1]); + wi = c_im(W[4 * l1]); + tmpr = c_re(jp[4 * m]); + tmpi = c_im(jp[4 * m]); + r5_4 = ((wr * tmpr) - (wi * tmpi)); + i5_4 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[20 * l1]); + wi = c_im(W[20 * l1]); + tmpr = c_re(jp[20 * m]); + tmpi = c_im(jp[20 * m]); + r5_20 = ((wr * tmpr) - (wi * tmpi)); + i5_20 = ((wi * tmpr) + (wr * tmpi)); + r4_4 = (r5_4 + r5_20); + i4_4 = (i5_4 + i5_20); + r4_20 = (r5_4 - r5_20); + i4_20 = (i5_4 - i5_20); + } + { + REAL r5_12, i5_12; + REAL r5_28, i5_28; + wr = c_re(W[12 * l1]); + wi = c_im(W[12 * l1]); + tmpr = c_re(jp[12 * m]); + tmpi = c_im(jp[12 * m]); + 
r5_12 = ((wr * tmpr) - (wi * tmpi)); + i5_12 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[28 * l1]); + wi = c_im(W[28 * l1]); + tmpr = c_re(jp[28 * m]); + tmpi = c_im(jp[28 * m]); + r5_28 = ((wr * tmpr) - (wi * tmpi)); + i5_28 = ((wi * tmpr) + (wr * tmpi)); + r4_12 = (r5_12 + r5_28); + i4_12 = (i5_12 + i5_28); + r4_28 = (r5_12 - r5_28); + i4_28 = (i5_12 - i5_28); + } + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_20 = (r4_4 - r4_12); + i3_20 = (i4_4 - i4_12); + r3_12 = (r4_20 + i4_28); + i3_12 = (i4_20 - r4_28); + r3_28 = (r4_20 - i4_28); + i3_28 = (i4_20 + r4_28); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_16 = (r3_0 - r3_4); + i2_16 = (i3_0 - i3_4); + tmpr = (0.707106781187 * (r3_12 + i3_12)); + tmpi = (0.707106781187 * (i3_12 - r3_12)); + r2_4 = (r3_8 + tmpr); + i2_4 = (i3_8 + tmpi); + r2_20 = (r3_8 - tmpr); + i2_20 = (i3_8 - tmpi); + r2_8 = (r3_16 + i3_20); + i2_8 = (i3_16 - r3_20); + r2_24 = (r3_16 - i3_20); + i2_24 = (i3_16 + r3_20); + tmpr = (0.707106781187 * (i3_28 - r3_28)); + tmpi = (0.707106781187 * (r3_28 + i3_28)); + r2_12 = (r3_24 + tmpr); + i2_12 = (i3_24 - tmpi); + r2_28 = (r3_24 - tmpr); + i2_28 = (i3_24 + tmpi); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + REAL r3_18, i3_18; + REAL r3_22, i3_22; + REAL r3_26, i3_26; + REAL r3_30, i3_30; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + REAL r4_18, i4_18; + REAL r4_26, i4_26; + { + REAL r5_2, i5_2; + REAL r5_18, i5_18; + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r5_2 = ((wr * tmpr) - (wi * tmpi)); + i5_2 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[18 * l1]); + wi = c_im(W[18 * l1]); + tmpr = c_re(jp[18 * m]); + tmpi = c_im(jp[18 * m]); + r5_18 = ((wr * tmpr) - (wi * tmpi)); + i5_18 = ((wi * tmpr) + (wr * tmpi)); + r4_2 = (r5_2 + r5_18); + i4_2 = (i5_2 + i5_18); + r4_18 = (r5_2 - r5_18); + i4_18 = (i5_2 - i5_18); + } + { + REAL r5_10, i5_10; + REAL r5_26, i5_26; + wr = c_re(W[10 
* l1]); + wi = c_im(W[10 * l1]); + tmpr = c_re(jp[10 * m]); + tmpi = c_im(jp[10 * m]); + r5_10 = ((wr * tmpr) - (wi * tmpi)); + i5_10 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[26 * l1]); + wi = c_im(W[26 * l1]); + tmpr = c_re(jp[26 * m]); + tmpi = c_im(jp[26 * m]); + r5_26 = ((wr * tmpr) - (wi * tmpi)); + i5_26 = ((wi * tmpr) + (wr * tmpi)); + r4_10 = (r5_10 + r5_26); + i4_10 = (i5_10 + i5_26); + r4_26 = (r5_10 - r5_26); + i4_26 = (i5_10 - i5_26); + } + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_18 = (r4_2 - r4_10); + i3_18 = (i4_2 - i4_10); + r3_10 = (r4_18 + i4_26); + i3_10 = (i4_18 - r4_26); + r3_26 = (r4_18 - i4_26); + i3_26 = (i4_18 + r4_26); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + REAL r4_22, i4_22; + REAL r4_30, i4_30; + { + REAL r5_6, i5_6; + REAL r5_22, i5_22; + wr = c_re(W[6 * l1]); + wi = c_im(W[6 * l1]); + tmpr = c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r5_6 = ((wr * tmpr) - (wi * tmpi)); + i5_6 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[22 * l1]); + wi = c_im(W[22 * l1]); + tmpr = c_re(jp[22 * m]); + tmpi = c_im(jp[22 * m]); + r5_22 = ((wr * tmpr) - (wi * tmpi)); + i5_22 = ((wi * tmpr) + (wr * tmpi)); + r4_6 = (r5_6 + r5_22); + i4_6 = (i5_6 + i5_22); + r4_22 = (r5_6 - r5_22); + i4_22 = (i5_6 - i5_22); + } + { + REAL r5_14, i5_14; + REAL r5_30, i5_30; + wr = c_re(W[14 * l1]); + wi = c_im(W[14 * l1]); + tmpr = c_re(jp[14 * m]); + tmpi = c_im(jp[14 * m]); + r5_14 = ((wr * tmpr) - (wi * tmpi)); + i5_14 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[30 * l1]); + wi = c_im(W[30 * l1]); + tmpr = c_re(jp[30 * m]); + tmpi = c_im(jp[30 * m]); + r5_30 = ((wr * tmpr) - (wi * tmpi)); + i5_30 = ((wi * tmpr) + (wr * tmpi)); + r4_14 = (r5_14 + r5_30); + i4_14 = (i5_14 + i5_30); + r4_30 = (r5_14 - r5_30); + i4_30 = (i5_14 - i5_30); + } + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_22 = (r4_6 - r4_14); + i3_22 = (i4_6 - i4_14); + r3_14 = (r4_22 + i4_30); + i3_14 = (i4_22 - r4_30); + r3_30 = (r4_22 - i4_30); + i3_30 = (i4_22 + r4_30); 
+ } + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_18 = (r3_2 - r3_6); + i2_18 = (i3_2 - i3_6); + tmpr = (0.707106781187 * (r3_14 + i3_14)); + tmpi = (0.707106781187 * (i3_14 - r3_14)); + r2_6 = (r3_10 + tmpr); + i2_6 = (i3_10 + tmpi); + r2_22 = (r3_10 - tmpr); + i2_22 = (i3_10 - tmpi); + r2_10 = (r3_18 + i3_22); + i2_10 = (i3_18 - r3_22); + r2_26 = (r3_18 - i3_22); + i2_26 = (i3_18 + r3_22); + tmpr = (0.707106781187 * (i3_30 - r3_30)); + tmpi = (0.707106781187 * (r3_30 + i3_30)); + r2_14 = (r3_26 + tmpr); + i2_14 = (i3_26 - tmpi); + r2_30 = (r3_26 - tmpr); + i2_30 = (i3_26 + tmpi); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_16 = (r2_0 - r2_2); + i1_16 = (i2_0 - i2_2); + tmpr = ((0.923879532511 * r2_6) + (0.382683432365 * i2_6)); + tmpi = ((0.923879532511 * i2_6) - (0.382683432365 * r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_18 = (r2_4 - tmpr); + i1_18 = (i2_4 - tmpi); + tmpr = (0.707106781187 * (r2_10 + i2_10)); + tmpi = (0.707106781187 * (i2_10 - r2_10)); + r1_4 = (r2_8 + tmpr); + i1_4 = (i2_8 + tmpi); + r1_20 = (r2_8 - tmpr); + i1_20 = (i2_8 - tmpi); + tmpr = ((0.382683432365 * r2_14) + (0.923879532511 * i2_14)); + tmpi = ((0.382683432365 * i2_14) - (0.923879532511 * r2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 + tmpi); + r1_22 = (r2_12 - tmpr); + i1_22 = (i2_12 - tmpi); + r1_8 = (r2_16 + i2_18); + i1_8 = (i2_16 - r2_18); + r1_24 = (r2_16 - i2_18); + i1_24 = (i2_16 + r2_18); + tmpr = ((0.923879532511 * i2_22) - (0.382683432365 * r2_22)); + tmpi = ((0.923879532511 * r2_22) + (0.382683432365 * i2_22)); + r1_10 = (r2_20 + tmpr); + i1_10 = (i2_20 - tmpi); + r1_26 = (r2_20 - tmpr); + i1_26 = (i2_20 + tmpi); + tmpr = (0.707106781187 * (i2_26 - r2_26)); + tmpi = (0.707106781187 * (r2_26 + i2_26)); + r1_12 = (r2_24 + tmpr); + i1_12 = (i2_24 - tmpi); + r1_28 = (r2_24 - tmpr); + i1_28 = (i2_24 + tmpi); + tmpr = ((0.382683432365 * i2_30) - (0.923879532511 * r2_30)); + tmpi = ((0.382683432365 * r2_30) + (0.923879532511 * i2_30)); + 
r1_14 = (r2_28 + tmpr); + i1_14 = (i2_28 - tmpi); + r1_30 = (r2_28 - tmpr); + i1_30 = (i2_28 + tmpi); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + REAL r2_17, i2_17; + REAL r2_19, i2_19; + REAL r2_21, i2_21; + REAL r2_23, i2_23; + REAL r2_25, i2_25; + REAL r2_27, i2_27; + REAL r2_29, i2_29; + REAL r2_31, i2_31; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + REAL r3_17, i3_17; + REAL r3_21, i3_21; + REAL r3_25, i3_25; + REAL r3_29, i3_29; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + REAL r4_17, i4_17; + REAL r4_25, i4_25; + { + REAL r5_1, i5_1; + REAL r5_17, i5_17; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r5_1 = ((wr * tmpr) - (wi * tmpi)); + i5_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[17 * l1]); + wi = c_im(W[17 * l1]); + tmpr = c_re(jp[17 * m]); + tmpi = c_im(jp[17 * m]); + r5_17 = ((wr * tmpr) - (wi * tmpi)); + i5_17 = ((wi * tmpr) + (wr * tmpi)); + r4_1 = (r5_1 + r5_17); + i4_1 = (i5_1 + i5_17); + r4_17 = (r5_1 - r5_17); + i4_17 = (i5_1 - i5_17); + } + { + REAL r5_9, i5_9; + REAL r5_25, i5_25; + wr = c_re(W[9 * l1]); + wi = c_im(W[9 * l1]); + tmpr = c_re(jp[9 * m]); + tmpi = c_im(jp[9 * m]); + r5_9 = ((wr * tmpr) - (wi * tmpi)); + i5_9 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[25 * l1]); + wi = c_im(W[25 * l1]); + tmpr = c_re(jp[25 * m]); + tmpi = c_im(jp[25 * m]); + r5_25 = ((wr * tmpr) - (wi * tmpi)); + i5_25 = ((wi * tmpr) + (wr * tmpi)); + r4_9 = (r5_9 + r5_25); + i4_9 = (i5_9 + i5_25); + r4_25 = (r5_9 - r5_25); + i4_25 = (i5_9 - i5_25); + } + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_17 = (r4_1 - r4_9); + i3_17 = (i4_1 - i4_9); + r3_9 = (r4_17 + i4_25); + i3_9 = (i4_17 - r4_25); + r3_25 = (r4_17 - i4_25); + i3_25 = (i4_17 + r4_25); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + REAL r4_21, i4_21; + REAL r4_29, i4_29; + { 
+ REAL r5_5, i5_5; + REAL r5_21, i5_21; + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r5_5 = ((wr * tmpr) - (wi * tmpi)); + i5_5 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[21 * l1]); + wi = c_im(W[21 * l1]); + tmpr = c_re(jp[21 * m]); + tmpi = c_im(jp[21 * m]); + r5_21 = ((wr * tmpr) - (wi * tmpi)); + i5_21 = ((wi * tmpr) + (wr * tmpi)); + r4_5 = (r5_5 + r5_21); + i4_5 = (i5_5 + i5_21); + r4_21 = (r5_5 - r5_21); + i4_21 = (i5_5 - i5_21); + } + { + REAL r5_13, i5_13; + REAL r5_29, i5_29; + wr = c_re(W[13 * l1]); + wi = c_im(W[13 * l1]); + tmpr = c_re(jp[13 * m]); + tmpi = c_im(jp[13 * m]); + r5_13 = ((wr * tmpr) - (wi * tmpi)); + i5_13 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[29 * l1]); + wi = c_im(W[29 * l1]); + tmpr = c_re(jp[29 * m]); + tmpi = c_im(jp[29 * m]); + r5_29 = ((wr * tmpr) - (wi * tmpi)); + i5_29 = ((wi * tmpr) + (wr * tmpi)); + r4_13 = (r5_13 + r5_29); + i4_13 = (i5_13 + i5_29); + r4_29 = (r5_13 - r5_29); + i4_29 = (i5_13 - i5_29); + } + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 + i4_13); + r3_21 = (r4_5 - r4_13); + i3_21 = (i4_5 - i4_13); + r3_13 = (r4_21 + i4_29); + i3_13 = (i4_21 - r4_29); + r3_29 = (r4_21 - i4_29); + i3_29 = (i4_21 + r4_29); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_17 = (r3_1 - r3_5); + i2_17 = (i3_1 - i3_5); + tmpr = (0.707106781187 * (r3_13 + i3_13)); + tmpi = (0.707106781187 * (i3_13 - r3_13)); + r2_5 = (r3_9 + tmpr); + i2_5 = (i3_9 + tmpi); + r2_21 = (r3_9 - tmpr); + i2_21 = (i3_9 - tmpi); + r2_9 = (r3_17 + i3_21); + i2_9 = (i3_17 - r3_21); + r2_25 = (r3_17 - i3_21); + i2_25 = (i3_17 + r3_21); + tmpr = (0.707106781187 * (i3_29 - r3_29)); + tmpi = (0.707106781187 * (r3_29 + i3_29)); + r2_13 = (r3_25 + tmpr); + i2_13 = (i3_25 - tmpi); + r2_29 = (r3_25 - tmpr); + i2_29 = (i3_25 + tmpi); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + REAL r3_19, i3_19; + REAL r3_23, i3_23; + REAL r3_27, i3_27; + REAL r3_31, i3_31; + 
{ + REAL r4_3, i4_3; + REAL r4_11, i4_11; + REAL r4_19, i4_19; + REAL r4_27, i4_27; + { + REAL r5_3, i5_3; + REAL r5_19, i5_19; + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r5_3 = ((wr * tmpr) - (wi * tmpi)); + i5_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[19 * l1]); + wi = c_im(W[19 * l1]); + tmpr = c_re(jp[19 * m]); + tmpi = c_im(jp[19 * m]); + r5_19 = ((wr * tmpr) - (wi * tmpi)); + i5_19 = ((wi * tmpr) + (wr * tmpi)); + r4_3 = (r5_3 + r5_19); + i4_3 = (i5_3 + i5_19); + r4_19 = (r5_3 - r5_19); + i4_19 = (i5_3 - i5_19); + } + { + REAL r5_11, i5_11; + REAL r5_27, i5_27; + wr = c_re(W[11 * l1]); + wi = c_im(W[11 * l1]); + tmpr = c_re(jp[11 * m]); + tmpi = c_im(jp[11 * m]); + r5_11 = ((wr * tmpr) - (wi * tmpi)); + i5_11 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[27 * l1]); + wi = c_im(W[27 * l1]); + tmpr = c_re(jp[27 * m]); + tmpi = c_im(jp[27 * m]); + r5_27 = ((wr * tmpr) - (wi * tmpi)); + i5_27 = ((wi * tmpr) + (wr * tmpi)); + r4_11 = (r5_11 + r5_27); + i4_11 = (i5_11 + i5_27); + r4_27 = (r5_11 - r5_27); + i4_27 = (i5_11 - i5_27); + } + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_19 = (r4_3 - r4_11); + i3_19 = (i4_3 - i4_11); + r3_11 = (r4_19 + i4_27); + i3_11 = (i4_19 - r4_27); + r3_27 = (r4_19 - i4_27); + i3_27 = (i4_19 + r4_27); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + REAL r4_23, i4_23; + REAL r4_31, i4_31; + { + REAL r5_7, i5_7; + REAL r5_23, i5_23; + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = c_im(jp[7 * m]); + r5_7 = ((wr * tmpr) - (wi * tmpi)); + i5_7 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[23 * l1]); + wi = c_im(W[23 * l1]); + tmpr = c_re(jp[23 * m]); + tmpi = c_im(jp[23 * m]); + r5_23 = ((wr * tmpr) - (wi * tmpi)); + i5_23 = ((wi * tmpr) + (wr * tmpi)); + r4_7 = (r5_7 + r5_23); + i4_7 = (i5_7 + i5_23); + r4_23 = (r5_7 - r5_23); + i4_23 = (i5_7 - i5_23); + } + { + REAL r5_15, i5_15; + REAL r5_31, i5_31; + wr = c_re(W[15 * l1]); + wi = 
c_im(W[15 * l1]); + tmpr = c_re(jp[15 * m]); + tmpi = c_im(jp[15 * m]); + r5_15 = ((wr * tmpr) - (wi * tmpi)); + i5_15 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[31 * l1]); + wi = c_im(W[31 * l1]); + tmpr = c_re(jp[31 * m]); + tmpi = c_im(jp[31 * m]); + r5_31 = ((wr * tmpr) - (wi * tmpi)); + i5_31 = ((wi * tmpr) + (wr * tmpi)); + r4_15 = (r5_15 + r5_31); + i4_15 = (i5_15 + i5_31); + r4_31 = (r5_15 - r5_31); + i4_31 = (i5_15 - i5_31); + } + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_23 = (r4_7 - r4_15); + i3_23 = (i4_7 - i4_15); + r3_15 = (r4_23 + i4_31); + i3_15 = (i4_23 - r4_31); + r3_31 = (r4_23 - i4_31); + i3_31 = (i4_23 + r4_31); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_19 = (r3_3 - r3_7); + i2_19 = (i3_3 - i3_7); + tmpr = (0.707106781187 * (r3_15 + i3_15)); + tmpi = (0.707106781187 * (i3_15 - r3_15)); + r2_7 = (r3_11 + tmpr); + i2_7 = (i3_11 + tmpi); + r2_23 = (r3_11 - tmpr); + i2_23 = (i3_11 - tmpi); + r2_11 = (r3_19 + i3_23); + i2_11 = (i3_19 - r3_23); + r2_27 = (r3_19 - i3_23); + i2_27 = (i3_19 + r3_23); + tmpr = (0.707106781187 * (i3_31 - r3_31)); + tmpi = (0.707106781187 * (r3_31 + i3_31)); + r2_15 = (r3_27 + tmpr); + i2_15 = (i3_27 - tmpi); + r2_31 = (r3_27 - tmpr); + i2_31 = (i3_27 + tmpi); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_17 = (r2_1 - r2_3); + i1_17 = (i2_1 - i2_3); + tmpr = ((0.923879532511 * r2_7) + (0.382683432365 * i2_7)); + tmpi = ((0.923879532511 * i2_7) - (0.382683432365 * r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_19 = (r2_5 - tmpr); + i1_19 = (i2_5 - tmpi); + tmpr = (0.707106781187 * (r2_11 + i2_11)); + tmpi = (0.707106781187 * (i2_11 - r2_11)); + r1_5 = (r2_9 + tmpr); + i1_5 = (i2_9 + tmpi); + r1_21 = (r2_9 - tmpr); + i1_21 = (i2_9 - tmpi); + tmpr = ((0.382683432365 * r2_15) + (0.923879532511 * i2_15)); + tmpi = ((0.382683432365 * i2_15) - (0.923879532511 * r2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 + tmpi); + r1_23 = (r2_13 - tmpr); + i1_23 = (i2_13 - tmpi); + r1_9 
= (r2_17 + i2_19); + i1_9 = (i2_17 - r2_19); + r1_25 = (r2_17 - i2_19); + i1_25 = (i2_17 + r2_19); + tmpr = ((0.923879532511 * i2_23) - (0.382683432365 * r2_23)); + tmpi = ((0.923879532511 * r2_23) + (0.382683432365 * i2_23)); + r1_11 = (r2_21 + tmpr); + i1_11 = (i2_21 - tmpi); + r1_27 = (r2_21 - tmpr); + i1_27 = (i2_21 + tmpi); + tmpr = (0.707106781187 * (i2_27 - r2_27)); + tmpi = (0.707106781187 * (r2_27 + i2_27)); + r1_13 = (r2_25 + tmpr); + i1_13 = (i2_25 - tmpi); + r1_29 = (r2_25 - tmpr); + i1_29 = (i2_25 + tmpi); + tmpr = ((0.382683432365 * i2_31) - (0.923879532511 * r2_31)); + tmpi = ((0.382683432365 * r2_31) + (0.923879532511 * i2_31)); + r1_15 = (r2_29 + tmpr); + i1_15 = (i2_29 - tmpi); + r1_31 = (r2_29 - tmpr); + i1_31 = (i2_29 + tmpi); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[16 * m]) = (r1_0 - r1_1); + c_im(kp[16 * m]) = (i1_0 - i1_1); + tmpr = ((0.980785280403 * r1_3) + (0.195090322016 * i1_3)); + tmpi = ((0.980785280403 * i1_3) - (0.195090322016 * r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[17 * m]) = (r1_2 - tmpr); + c_im(kp[17 * m]) = (i1_2 - tmpi); + tmpr = ((0.923879532511 * r1_5) + (0.382683432365 * i1_5)); + tmpi = ((0.923879532511 * i1_5) - (0.382683432365 * r1_5)); + c_re(kp[2 * m]) = (r1_4 + tmpr); + c_im(kp[2 * m]) = (i1_4 + tmpi); + c_re(kp[18 * m]) = (r1_4 - tmpr); + c_im(kp[18 * m]) = (i1_4 - tmpi); + tmpr = ((0.831469612303 * r1_7) + (0.55557023302 * i1_7)); + tmpi = ((0.831469612303 * i1_7) - (0.55557023302 * r1_7)); + c_re(kp[3 * m]) = (r1_6 + tmpr); + c_im(kp[3 * m]) = (i1_6 + tmpi); + c_re(kp[19 * m]) = (r1_6 - tmpr); + c_im(kp[19 * m]) = (i1_6 - tmpi); + tmpr = (0.707106781187 * (r1_9 + i1_9)); + tmpi = (0.707106781187 * (i1_9 - r1_9)); + c_re(kp[4 * m]) = (r1_8 + tmpr); + c_im(kp[4 * m]) = (i1_8 + tmpi); + c_re(kp[20 * m]) = (r1_8 - tmpr); + c_im(kp[20 * m]) = (i1_8 - tmpi); + tmpr = ((0.55557023302 * r1_11) + (0.831469612303 * i1_11)); + tmpi = 
((0.55557023302 * i1_11) - (0.831469612303 * r1_11)); + c_re(kp[5 * m]) = (r1_10 + tmpr); + c_im(kp[5 * m]) = (i1_10 + tmpi); + c_re(kp[21 * m]) = (r1_10 - tmpr); + c_im(kp[21 * m]) = (i1_10 - tmpi); + tmpr = ((0.382683432365 * r1_13) + (0.923879532511 * i1_13)); + tmpi = ((0.382683432365 * i1_13) - (0.923879532511 * r1_13)); + c_re(kp[6 * m]) = (r1_12 + tmpr); + c_im(kp[6 * m]) = (i1_12 + tmpi); + c_re(kp[22 * m]) = (r1_12 - tmpr); + c_im(kp[22 * m]) = (i1_12 - tmpi); + tmpr = ((0.195090322016 * r1_15) + (0.980785280403 * i1_15)); + tmpi = ((0.195090322016 * i1_15) - (0.980785280403 * r1_15)); + c_re(kp[7 * m]) = (r1_14 + tmpr); + c_im(kp[7 * m]) = (i1_14 + tmpi); + c_re(kp[23 * m]) = (r1_14 - tmpr); + c_im(kp[23 * m]) = (i1_14 - tmpi); + c_re(kp[8 * m]) = (r1_16 + i1_17); + c_im(kp[8 * m]) = (i1_16 - r1_17); + c_re(kp[24 * m]) = (r1_16 - i1_17); + c_im(kp[24 * m]) = (i1_16 + r1_17); + tmpr = ((0.980785280403 * i1_19) - (0.195090322016 * r1_19)); + tmpi = ((0.980785280403 * r1_19) + (0.195090322016 * i1_19)); + c_re(kp[9 * m]) = (r1_18 + tmpr); + c_im(kp[9 * m]) = (i1_18 - tmpi); + c_re(kp[25 * m]) = (r1_18 - tmpr); + c_im(kp[25 * m]) = (i1_18 + tmpi); + tmpr = ((0.923879532511 * i1_21) - (0.382683432365 * r1_21)); + tmpi = ((0.923879532511 * r1_21) + (0.382683432365 * i1_21)); + c_re(kp[10 * m]) = (r1_20 + tmpr); + c_im(kp[10 * m]) = (i1_20 - tmpi); + c_re(kp[26 * m]) = (r1_20 - tmpr); + c_im(kp[26 * m]) = (i1_20 + tmpi); + tmpr = ((0.831469612303 * i1_23) - (0.55557023302 * r1_23)); + tmpi = ((0.831469612303 * r1_23) + (0.55557023302 * i1_23)); + c_re(kp[11 * m]) = (r1_22 + tmpr); + c_im(kp[11 * m]) = (i1_22 - tmpi); + c_re(kp[27 * m]) = (r1_22 - tmpr); + c_im(kp[27 * m]) = (i1_22 + tmpi); + tmpr = (0.707106781187 * (i1_25 - r1_25)); + tmpi = (0.707106781187 * (r1_25 + i1_25)); + c_re(kp[12 * m]) = (r1_24 + tmpr); + c_im(kp[12 * m]) = (i1_24 - tmpi); + c_re(kp[28 * m]) = (r1_24 - tmpr); + c_im(kp[28 * m]) = (i1_24 + tmpi); + tmpr = ((0.55557023302 * i1_27) - 
(0.831469612303 * r1_27)); + tmpi = ((0.55557023302 * r1_27) + (0.831469612303 * i1_27)); + c_re(kp[13 * m]) = (r1_26 + tmpr); + c_im(kp[13 * m]) = (i1_26 - tmpi); + c_re(kp[29 * m]) = (r1_26 - tmpr); + c_im(kp[29 * m]) = (i1_26 + tmpi); + tmpr = ((0.382683432365 * i1_29) - (0.923879532511 * r1_29)); + tmpi = ((0.382683432365 * r1_29) + (0.923879532511 * i1_29)); + c_re(kp[14 * m]) = (r1_28 + tmpr); + c_im(kp[14 * m]) = (i1_28 - tmpi); + c_re(kp[30 * m]) = (r1_28 - tmpr); + c_im(kp[30 * m]) = (i1_28 + tmpi); + tmpr = ((0.195090322016 * i1_31) - (0.980785280403 * r1_31)); + tmpi = ((0.195090322016 * r1_31) + (0.980785280403 * i1_31)); + c_re(kp[15 * m]) = (r1_30 + tmpr); + c_im(kp[15 * m]) = (i1_30 - tmpi); + c_re(kp[31 * m]) = (r1_30 - tmpr); + c_im(kp[31 * m]) = (i1_30 + tmpi); + } + } + } else { + int ab = (a + b) / 2; + #pragma omp task untied + fft_twiddle_32(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task untied + fft_twiddle_32(ab, b, in, out, W, nW, nWdn, m); + #pragma omp taskwait + } +} +void fft_twiddle_32_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + REAL r1_16, i1_16; + REAL r1_17, i1_17; + REAL r1_18, i1_18; + REAL r1_19, i1_19; + REAL r1_20, i1_20; + REAL r1_21, i1_21; + REAL r1_22, i1_22; + REAL r1_23, i1_23; + REAL r1_24, i1_24; + REAL r1_25, i1_25; + REAL r1_26, i1_26; + REAL r1_27, i1_27; + REAL r1_28, i1_28; + REAL r1_29, i1_29; + REAL r1_30, i1_30; + REAL r1_31, i1_31; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + 
REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + REAL r2_16, i2_16; + REAL r2_18, i2_18; + REAL r2_20, i2_20; + REAL r2_22, i2_22; + REAL r2_24, i2_24; + REAL r2_26, i2_26; + REAL r2_28, i2_28; + REAL r2_30, i2_30; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + REAL r3_16, i3_16; + REAL r3_20, i3_20; + REAL r3_24, i3_24; + REAL r3_28, i3_28; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + REAL r4_16, i4_16; + REAL r4_24, i4_24; + { + REAL r5_0, i5_0; + REAL r5_16, i5_16; + r5_0 = c_re(jp[0 * m]); + i5_0 = c_im(jp[0 * m]); + wr = c_re(W[16 * l1]); + wi = c_im(W[16 * l1]); + tmpr = c_re(jp[16 * m]); + tmpi = c_im(jp[16 * m]); + r5_16 = ((wr * tmpr) - (wi * tmpi)); + i5_16 = ((wi * tmpr) + (wr * tmpi)); + r4_0 = (r5_0 + r5_16); + i4_0 = (i5_0 + i5_16); + r4_16 = (r5_0 - r5_16); + i4_16 = (i5_0 - i5_16); + } + { + REAL r5_8, i5_8; + REAL r5_24, i5_24; + wr = c_re(W[8 * l1]); + wi = c_im(W[8 * l1]); + tmpr = c_re(jp[8 * m]); + tmpi = c_im(jp[8 * m]); + r5_8 = ((wr * tmpr) - (wi * tmpi)); + i5_8 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[24 * l1]); + wi = c_im(W[24 * l1]); + tmpr = c_re(jp[24 * m]); + tmpi = c_im(jp[24 * m]); + r5_24 = ((wr * tmpr) - (wi * tmpi)); + i5_24 = ((wi * tmpr) + (wr * tmpi)); + r4_8 = (r5_8 + r5_24); + i4_8 = (i5_8 + i5_24); + r4_24 = (r5_8 - r5_24); + i4_24 = (i5_8 - i5_24); + } + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_16 = (r4_0 - r4_8); + i3_16 = (i4_0 - i4_8); + r3_8 = (r4_16 + i4_24); + i3_8 = (i4_16 - r4_24); + r3_24 = (r4_16 - i4_24); + i3_24 = (i4_16 + r4_24); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + REAL r4_20, i4_20; + REAL r4_28, i4_28; + { + REAL r5_4, i5_4; + REAL r5_20, i5_20; + wr = c_re(W[4 * l1]); + wi = c_im(W[4 * l1]); + tmpr = c_re(jp[4 * m]); + tmpi = c_im(jp[4 * m]); + r5_4 = ((wr * tmpr) - (wi * tmpi)); + i5_4 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[20 * l1]); + wi = c_im(W[20 * l1]); + tmpr 
= c_re(jp[20 * m]); + tmpi = c_im(jp[20 * m]); + r5_20 = ((wr * tmpr) - (wi * tmpi)); + i5_20 = ((wi * tmpr) + (wr * tmpi)); + r4_4 = (r5_4 + r5_20); + i4_4 = (i5_4 + i5_20); + r4_20 = (r5_4 - r5_20); + i4_20 = (i5_4 - i5_20); + } + { + REAL r5_12, i5_12; + REAL r5_28, i5_28; + wr = c_re(W[12 * l1]); + wi = c_im(W[12 * l1]); + tmpr = c_re(jp[12 * m]); + tmpi = c_im(jp[12 * m]); + r5_12 = ((wr * tmpr) - (wi * tmpi)); + i5_12 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[28 * l1]); + wi = c_im(W[28 * l1]); + tmpr = c_re(jp[28 * m]); + tmpi = c_im(jp[28 * m]); + r5_28 = ((wr * tmpr) - (wi * tmpi)); + i5_28 = ((wi * tmpr) + (wr * tmpi)); + r4_12 = (r5_12 + r5_28); + i4_12 = (i5_12 + i5_28); + r4_28 = (r5_12 - r5_28); + i4_28 = (i5_12 - i5_28); + } + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_20 = (r4_4 - r4_12); + i3_20 = (i4_4 - i4_12); + r3_12 = (r4_20 + i4_28); + i3_12 = (i4_20 - r4_28); + r3_28 = (r4_20 - i4_28); + i3_28 = (i4_20 + r4_28); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_16 = (r3_0 - r3_4); + i2_16 = (i3_0 - i3_4); + tmpr = (0.707106781187 * (r3_12 + i3_12)); + tmpi = (0.707106781187 * (i3_12 - r3_12)); + r2_4 = (r3_8 + tmpr); + i2_4 = (i3_8 + tmpi); + r2_20 = (r3_8 - tmpr); + i2_20 = (i3_8 - tmpi); + r2_8 = (r3_16 + i3_20); + i2_8 = (i3_16 - r3_20); + r2_24 = (r3_16 - i3_20); + i2_24 = (i3_16 + r3_20); + tmpr = (0.707106781187 * (i3_28 - r3_28)); + tmpi = (0.707106781187 * (r3_28 + i3_28)); + r2_12 = (r3_24 + tmpr); + i2_12 = (i3_24 - tmpi); + r2_28 = (r3_24 - tmpr); + i2_28 = (i3_24 + tmpi); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + REAL r3_18, i3_18; + REAL r3_22, i3_22; + REAL r3_26, i3_26; + REAL r3_30, i3_30; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + REAL r4_18, i4_18; + REAL r4_26, i4_26; + { + REAL r5_2, i5_2; + REAL r5_18, i5_18; + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r5_2 = ((wr * tmpr) - (wi * tmpi)); + i5_2 
= ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[18 * l1]); + wi = c_im(W[18 * l1]); + tmpr = c_re(jp[18 * m]); + tmpi = c_im(jp[18 * m]); + r5_18 = ((wr * tmpr) - (wi * tmpi)); + i5_18 = ((wi * tmpr) + (wr * tmpi)); + r4_2 = (r5_2 + r5_18); + i4_2 = (i5_2 + i5_18); + r4_18 = (r5_2 - r5_18); + i4_18 = (i5_2 - i5_18); + } + { + REAL r5_10, i5_10; + REAL r5_26, i5_26; + wr = c_re(W[10 * l1]); + wi = c_im(W[10 * l1]); + tmpr = c_re(jp[10 * m]); + tmpi = c_im(jp[10 * m]); + r5_10 = ((wr * tmpr) - (wi * tmpi)); + i5_10 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[26 * l1]); + wi = c_im(W[26 * l1]); + tmpr = c_re(jp[26 * m]); + tmpi = c_im(jp[26 * m]); + r5_26 = ((wr * tmpr) - (wi * tmpi)); + i5_26 = ((wi * tmpr) + (wr * tmpi)); + r4_10 = (r5_10 + r5_26); + i4_10 = (i5_10 + i5_26); + r4_26 = (r5_10 - r5_26); + i4_26 = (i5_10 - i5_26); + } + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_18 = (r4_2 - r4_10); + i3_18 = (i4_2 - i4_10); + r3_10 = (r4_18 + i4_26); + i3_10 = (i4_18 - r4_26); + r3_26 = (r4_18 - i4_26); + i3_26 = (i4_18 + r4_26); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + REAL r4_22, i4_22; + REAL r4_30, i4_30; + { + REAL r5_6, i5_6; + REAL r5_22, i5_22; + wr = c_re(W[6 * l1]); + wi = c_im(W[6 * l1]); + tmpr = c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r5_6 = ((wr * tmpr) - (wi * tmpi)); + i5_6 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[22 * l1]); + wi = c_im(W[22 * l1]); + tmpr = c_re(jp[22 * m]); + tmpi = c_im(jp[22 * m]); + r5_22 = ((wr * tmpr) - (wi * tmpi)); + i5_22 = ((wi * tmpr) + (wr * tmpi)); + r4_6 = (r5_6 + r5_22); + i4_6 = (i5_6 + i5_22); + r4_22 = (r5_6 - r5_22); + i4_22 = (i5_6 - i5_22); + } + { + REAL r5_14, i5_14; + REAL r5_30, i5_30; + wr = c_re(W[14 * l1]); + wi = c_im(W[14 * l1]); + tmpr = c_re(jp[14 * m]); + tmpi = c_im(jp[14 * m]); + r5_14 = ((wr * tmpr) - (wi * tmpi)); + i5_14 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[30 * l1]); + wi = c_im(W[30 * l1]); + tmpr = c_re(jp[30 * m]); + tmpi = c_im(jp[30 * m]); + r5_30 = ((wr * 
tmpr) - (wi * tmpi)); + i5_30 = ((wi * tmpr) + (wr * tmpi)); + r4_14 = (r5_14 + r5_30); + i4_14 = (i5_14 + i5_30); + r4_30 = (r5_14 - r5_30); + i4_30 = (i5_14 - i5_30); + } + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_22 = (r4_6 - r4_14); + i3_22 = (i4_6 - i4_14); + r3_14 = (r4_22 + i4_30); + i3_14 = (i4_22 - r4_30); + r3_30 = (r4_22 - i4_30); + i3_30 = (i4_22 + r4_30); + } + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_18 = (r3_2 - r3_6); + i2_18 = (i3_2 - i3_6); + tmpr = (0.707106781187 * (r3_14 + i3_14)); + tmpi = (0.707106781187 * (i3_14 - r3_14)); + r2_6 = (r3_10 + tmpr); + i2_6 = (i3_10 + tmpi); + r2_22 = (r3_10 - tmpr); + i2_22 = (i3_10 - tmpi); + r2_10 = (r3_18 + i3_22); + i2_10 = (i3_18 - r3_22); + r2_26 = (r3_18 - i3_22); + i2_26 = (i3_18 + r3_22); + tmpr = (0.707106781187 * (i3_30 - r3_30)); + tmpi = (0.707106781187 * (r3_30 + i3_30)); + r2_14 = (r3_26 + tmpr); + i2_14 = (i3_26 - tmpi); + r2_30 = (r3_26 - tmpr); + i2_30 = (i3_26 + tmpi); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_16 = (r2_0 - r2_2); + i1_16 = (i2_0 - i2_2); + tmpr = ((0.923879532511 * r2_6) + (0.382683432365 * i2_6)); + tmpi = ((0.923879532511 * i2_6) - (0.382683432365 * r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_18 = (r2_4 - tmpr); + i1_18 = (i2_4 - tmpi); + tmpr = (0.707106781187 * (r2_10 + i2_10)); + tmpi = (0.707106781187 * (i2_10 - r2_10)); + r1_4 = (r2_8 + tmpr); + i1_4 = (i2_8 + tmpi); + r1_20 = (r2_8 - tmpr); + i1_20 = (i2_8 - tmpi); + tmpr = ((0.382683432365 * r2_14) + (0.923879532511 * i2_14)); + tmpi = ((0.382683432365 * i2_14) - (0.923879532511 * r2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 + tmpi); + r1_22 = (r2_12 - tmpr); + i1_22 = (i2_12 - tmpi); + r1_8 = (r2_16 + i2_18); + i1_8 = (i2_16 - r2_18); + r1_24 = (r2_16 - i2_18); + i1_24 = (i2_16 + r2_18); + tmpr = ((0.923879532511 * i2_22) - (0.382683432365 * r2_22)); + tmpi = ((0.923879532511 * r2_22) + (0.382683432365 * i2_22)); + r1_10 = (r2_20 + tmpr); + i1_10 = (i2_20 - 
tmpi); + r1_26 = (r2_20 - tmpr); + i1_26 = (i2_20 + tmpi); + tmpr = (0.707106781187 * (i2_26 - r2_26)); + tmpi = (0.707106781187 * (r2_26 + i2_26)); + r1_12 = (r2_24 + tmpr); + i1_12 = (i2_24 - tmpi); + r1_28 = (r2_24 - tmpr); + i1_28 = (i2_24 + tmpi); + tmpr = ((0.382683432365 * i2_30) - (0.923879532511 * r2_30)); + tmpi = ((0.382683432365 * r2_30) + (0.923879532511 * i2_30)); + r1_14 = (r2_28 + tmpr); + i1_14 = (i2_28 - tmpi); + r1_30 = (r2_28 - tmpr); + i1_30 = (i2_28 + tmpi); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + REAL r2_17, i2_17; + REAL r2_19, i2_19; + REAL r2_21, i2_21; + REAL r2_23, i2_23; + REAL r2_25, i2_25; + REAL r2_27, i2_27; + REAL r2_29, i2_29; + REAL r2_31, i2_31; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + REAL r3_17, i3_17; + REAL r3_21, i3_21; + REAL r3_25, i3_25; + REAL r3_29, i3_29; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + REAL r4_17, i4_17; + REAL r4_25, i4_25; + { + REAL r5_1, i5_1; + REAL r5_17, i5_17; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r5_1 = ((wr * tmpr) - (wi * tmpi)); + i5_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[17 * l1]); + wi = c_im(W[17 * l1]); + tmpr = c_re(jp[17 * m]); + tmpi = c_im(jp[17 * m]); + r5_17 = ((wr * tmpr) - (wi * tmpi)); + i5_17 = ((wi * tmpr) + (wr * tmpi)); + r4_1 = (r5_1 + r5_17); + i4_1 = (i5_1 + i5_17); + r4_17 = (r5_1 - r5_17); + i4_17 = (i5_1 - i5_17); + } + { + REAL r5_9, i5_9; + REAL r5_25, i5_25; + wr = c_re(W[9 * l1]); + wi = c_im(W[9 * l1]); + tmpr = c_re(jp[9 * m]); + tmpi = c_im(jp[9 * m]); + r5_9 = ((wr * tmpr) - (wi * tmpi)); + i5_9 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[25 * l1]); + wi = c_im(W[25 * l1]); + tmpr = c_re(jp[25 * m]); + tmpi = c_im(jp[25 * m]); + r5_25 = ((wr * tmpr) - (wi * tmpi)); + i5_25 = ((wi * tmpr) + (wr * tmpi)); + r4_9 = (r5_9 + 
r5_25); + i4_9 = (i5_9 + i5_25); + r4_25 = (r5_9 - r5_25); + i4_25 = (i5_9 - i5_25); + } + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_17 = (r4_1 - r4_9); + i3_17 = (i4_1 - i4_9); + r3_9 = (r4_17 + i4_25); + i3_9 = (i4_17 - r4_25); + r3_25 = (r4_17 - i4_25); + i3_25 = (i4_17 + r4_25); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + REAL r4_21, i4_21; + REAL r4_29, i4_29; + { + REAL r5_5, i5_5; + REAL r5_21, i5_21; + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r5_5 = ((wr * tmpr) - (wi * tmpi)); + i5_5 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[21 * l1]); + wi = c_im(W[21 * l1]); + tmpr = c_re(jp[21 * m]); + tmpi = c_im(jp[21 * m]); + r5_21 = ((wr * tmpr) - (wi * tmpi)); + i5_21 = ((wi * tmpr) + (wr * tmpi)); + r4_5 = (r5_5 + r5_21); + i4_5 = (i5_5 + i5_21); + r4_21 = (r5_5 - r5_21); + i4_21 = (i5_5 - i5_21); + } + { + REAL r5_13, i5_13; + REAL r5_29, i5_29; + wr = c_re(W[13 * l1]); + wi = c_im(W[13 * l1]); + tmpr = c_re(jp[13 * m]); + tmpi = c_im(jp[13 * m]); + r5_13 = ((wr * tmpr) - (wi * tmpi)); + i5_13 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[29 * l1]); + wi = c_im(W[29 * l1]); + tmpr = c_re(jp[29 * m]); + tmpi = c_im(jp[29 * m]); + r5_29 = ((wr * tmpr) - (wi * tmpi)); + i5_29 = ((wi * tmpr) + (wr * tmpi)); + r4_13 = (r5_13 + r5_29); + i4_13 = (i5_13 + i5_29); + r4_29 = (r5_13 - r5_29); + i4_29 = (i5_13 - i5_29); + } + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 + i4_13); + r3_21 = (r4_5 - r4_13); + i3_21 = (i4_5 - i4_13); + r3_13 = (r4_21 + i4_29); + i3_13 = (i4_21 - r4_29); + r3_29 = (r4_21 - i4_29); + i3_29 = (i4_21 + r4_29); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_17 = (r3_1 - r3_5); + i2_17 = (i3_1 - i3_5); + tmpr = (0.707106781187 * (r3_13 + i3_13)); + tmpi = (0.707106781187 * (i3_13 - r3_13)); + r2_5 = (r3_9 + tmpr); + i2_5 = (i3_9 + tmpi); + r2_21 = (r3_9 - tmpr); + i2_21 = (i3_9 - tmpi); + r2_9 = (r3_17 + i3_21); + i2_9 = (i3_17 - r3_21); + r2_25 = (r3_17 - i3_21); + i2_25 
= (i3_17 + r3_21); + tmpr = (0.707106781187 * (i3_29 - r3_29)); + tmpi = (0.707106781187 * (r3_29 + i3_29)); + r2_13 = (r3_25 + tmpr); + i2_13 = (i3_25 - tmpi); + r2_29 = (r3_25 - tmpr); + i2_29 = (i3_25 + tmpi); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + REAL r3_19, i3_19; + REAL r3_23, i3_23; + REAL r3_27, i3_27; + REAL r3_31, i3_31; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + REAL r4_19, i4_19; + REAL r4_27, i4_27; + { + REAL r5_3, i5_3; + REAL r5_19, i5_19; + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r5_3 = ((wr * tmpr) - (wi * tmpi)); + i5_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[19 * l1]); + wi = c_im(W[19 * l1]); + tmpr = c_re(jp[19 * m]); + tmpi = c_im(jp[19 * m]); + r5_19 = ((wr * tmpr) - (wi * tmpi)); + i5_19 = ((wi * tmpr) + (wr * tmpi)); + r4_3 = (r5_3 + r5_19); + i4_3 = (i5_3 + i5_19); + r4_19 = (r5_3 - r5_19); + i4_19 = (i5_3 - i5_19); + } + { + REAL r5_11, i5_11; + REAL r5_27, i5_27; + wr = c_re(W[11 * l1]); + wi = c_im(W[11 * l1]); + tmpr = c_re(jp[11 * m]); + tmpi = c_im(jp[11 * m]); + r5_11 = ((wr * tmpr) - (wi * tmpi)); + i5_11 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[27 * l1]); + wi = c_im(W[27 * l1]); + tmpr = c_re(jp[27 * m]); + tmpi = c_im(jp[27 * m]); + r5_27 = ((wr * tmpr) - (wi * tmpi)); + i5_27 = ((wi * tmpr) + (wr * tmpi)); + r4_11 = (r5_11 + r5_27); + i4_11 = (i5_11 + i5_27); + r4_27 = (r5_11 - r5_27); + i4_27 = (i5_11 - i5_27); + } + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_19 = (r4_3 - r4_11); + i3_19 = (i4_3 - i4_11); + r3_11 = (r4_19 + i4_27); + i3_11 = (i4_19 - r4_27); + r3_27 = (r4_19 - i4_27); + i3_27 = (i4_19 + r4_27); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + REAL r4_23, i4_23; + REAL r4_31, i4_31; + { + REAL r5_7, i5_7; + REAL r5_23, i5_23; + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = c_im(jp[7 * m]); + r5_7 = ((wr * tmpr) - (wi * tmpi)); + i5_7 = ((wi * 
tmpr) + (wr * tmpi)); + wr = c_re(W[23 * l1]); + wi = c_im(W[23 * l1]); + tmpr = c_re(jp[23 * m]); + tmpi = c_im(jp[23 * m]); + r5_23 = ((wr * tmpr) - (wi * tmpi)); + i5_23 = ((wi * tmpr) + (wr * tmpi)); + r4_7 = (r5_7 + r5_23); + i4_7 = (i5_7 + i5_23); + r4_23 = (r5_7 - r5_23); + i4_23 = (i5_7 - i5_23); + } + { + REAL r5_15, i5_15; + REAL r5_31, i5_31; + wr = c_re(W[15 * l1]); + wi = c_im(W[15 * l1]); + tmpr = c_re(jp[15 * m]); + tmpi = c_im(jp[15 * m]); + r5_15 = ((wr * tmpr) - (wi * tmpi)); + i5_15 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[31 * l1]); + wi = c_im(W[31 * l1]); + tmpr = c_re(jp[31 * m]); + tmpi = c_im(jp[31 * m]); + r5_31 = ((wr * tmpr) - (wi * tmpi)); + i5_31 = ((wi * tmpr) + (wr * tmpi)); + r4_15 = (r5_15 + r5_31); + i4_15 = (i5_15 + i5_31); + r4_31 = (r5_15 - r5_31); + i4_31 = (i5_15 - i5_31); + } + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_23 = (r4_7 - r4_15); + i3_23 = (i4_7 - i4_15); + r3_15 = (r4_23 + i4_31); + i3_15 = (i4_23 - r4_31); + r3_31 = (r4_23 - i4_31); + i3_31 = (i4_23 + r4_31); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_19 = (r3_3 - r3_7); + i2_19 = (i3_3 - i3_7); + tmpr = (0.707106781187 * (r3_15 + i3_15)); + tmpi = (0.707106781187 * (i3_15 - r3_15)); + r2_7 = (r3_11 + tmpr); + i2_7 = (i3_11 + tmpi); + r2_23 = (r3_11 - tmpr); + i2_23 = (i3_11 - tmpi); + r2_11 = (r3_19 + i3_23); + i2_11 = (i3_19 - r3_23); + r2_27 = (r3_19 - i3_23); + i2_27 = (i3_19 + r3_23); + tmpr = (0.707106781187 * (i3_31 - r3_31)); + tmpi = (0.707106781187 * (r3_31 + i3_31)); + r2_15 = (r3_27 + tmpr); + i2_15 = (i3_27 - tmpi); + r2_31 = (r3_27 - tmpr); + i2_31 = (i3_27 + tmpi); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_17 = (r2_1 - r2_3); + i1_17 = (i2_1 - i2_3); + tmpr = ((0.923879532511 * r2_7) + (0.382683432365 * i2_7)); + tmpi = ((0.923879532511 * i2_7) - (0.382683432365 * r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_19 = (r2_5 - tmpr); + i1_19 = (i2_5 - tmpi); + tmpr = (0.707106781187 * (r2_11 + 
i2_11)); + tmpi = (0.707106781187 * (i2_11 - r2_11)); + r1_5 = (r2_9 + tmpr); + i1_5 = (i2_9 + tmpi); + r1_21 = (r2_9 - tmpr); + i1_21 = (i2_9 - tmpi); + tmpr = ((0.382683432365 * r2_15) + (0.923879532511 * i2_15)); + tmpi = ((0.382683432365 * i2_15) - (0.923879532511 * r2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 + tmpi); + r1_23 = (r2_13 - tmpr); + i1_23 = (i2_13 - tmpi); + r1_9 = (r2_17 + i2_19); + i1_9 = (i2_17 - r2_19); + r1_25 = (r2_17 - i2_19); + i1_25 = (i2_17 + r2_19); + tmpr = ((0.923879532511 * i2_23) - (0.382683432365 * r2_23)); + tmpi = ((0.923879532511 * r2_23) + (0.382683432365 * i2_23)); + r1_11 = (r2_21 + tmpr); + i1_11 = (i2_21 - tmpi); + r1_27 = (r2_21 - tmpr); + i1_27 = (i2_21 + tmpi); + tmpr = (0.707106781187 * (i2_27 - r2_27)); + tmpi = (0.707106781187 * (r2_27 + i2_27)); + r1_13 = (r2_25 + tmpr); + i1_13 = (i2_25 - tmpi); + r1_29 = (r2_25 - tmpr); + i1_29 = (i2_25 + tmpi); + tmpr = ((0.382683432365 * i2_31) - (0.923879532511 * r2_31)); + tmpi = ((0.382683432365 * r2_31) + (0.923879532511 * i2_31)); + r1_15 = (r2_29 + tmpr); + i1_15 = (i2_29 - tmpi); + r1_31 = (r2_29 - tmpr); + i1_31 = (i2_29 + tmpi); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[16 * m]) = (r1_0 - r1_1); + c_im(kp[16 * m]) = (i1_0 - i1_1); + tmpr = ((0.980785280403 * r1_3) + (0.195090322016 * i1_3)); + tmpi = ((0.980785280403 * i1_3) - (0.195090322016 * r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[17 * m]) = (r1_2 - tmpr); + c_im(kp[17 * m]) = (i1_2 - tmpi); + tmpr = ((0.923879532511 * r1_5) + (0.382683432365 * i1_5)); + tmpi = ((0.923879532511 * i1_5) - (0.382683432365 * r1_5)); + c_re(kp[2 * m]) = (r1_4 + tmpr); + c_im(kp[2 * m]) = (i1_4 + tmpi); + c_re(kp[18 * m]) = (r1_4 - tmpr); + c_im(kp[18 * m]) = (i1_4 - tmpi); + tmpr = ((0.831469612303 * r1_7) + (0.55557023302 * i1_7)); + tmpi = ((0.831469612303 * i1_7) - (0.55557023302 * r1_7)); + c_re(kp[3 * m]) = (r1_6 + tmpr); + c_im(kp[3 * m]) 
= (i1_6 + tmpi); + c_re(kp[19 * m]) = (r1_6 - tmpr); + c_im(kp[19 * m]) = (i1_6 - tmpi); + tmpr = (0.707106781187 * (r1_9 + i1_9)); + tmpi = (0.707106781187 * (i1_9 - r1_9)); + c_re(kp[4 * m]) = (r1_8 + tmpr); + c_im(kp[4 * m]) = (i1_8 + tmpi); + c_re(kp[20 * m]) = (r1_8 - tmpr); + c_im(kp[20 * m]) = (i1_8 - tmpi); + tmpr = ((0.55557023302 * r1_11) + (0.831469612303 * i1_11)); + tmpi = ((0.55557023302 * i1_11) - (0.831469612303 * r1_11)); + c_re(kp[5 * m]) = (r1_10 + tmpr); + c_im(kp[5 * m]) = (i1_10 + tmpi); + c_re(kp[21 * m]) = (r1_10 - tmpr); + c_im(kp[21 * m]) = (i1_10 - tmpi); + tmpr = ((0.382683432365 * r1_13) + (0.923879532511 * i1_13)); + tmpi = ((0.382683432365 * i1_13) - (0.923879532511 * r1_13)); + c_re(kp[6 * m]) = (r1_12 + tmpr); + c_im(kp[6 * m]) = (i1_12 + tmpi); + c_re(kp[22 * m]) = (r1_12 - tmpr); + c_im(kp[22 * m]) = (i1_12 - tmpi); + tmpr = ((0.195090322016 * r1_15) + (0.980785280403 * i1_15)); + tmpi = ((0.195090322016 * i1_15) - (0.980785280403 * r1_15)); + c_re(kp[7 * m]) = (r1_14 + tmpr); + c_im(kp[7 * m]) = (i1_14 + tmpi); + c_re(kp[23 * m]) = (r1_14 - tmpr); + c_im(kp[23 * m]) = (i1_14 - tmpi); + c_re(kp[8 * m]) = (r1_16 + i1_17); + c_im(kp[8 * m]) = (i1_16 - r1_17); + c_re(kp[24 * m]) = (r1_16 - i1_17); + c_im(kp[24 * m]) = (i1_16 + r1_17); + tmpr = ((0.980785280403 * i1_19) - (0.195090322016 * r1_19)); + tmpi = ((0.980785280403 * r1_19) + (0.195090322016 * i1_19)); + c_re(kp[9 * m]) = (r1_18 + tmpr); + c_im(kp[9 * m]) = (i1_18 - tmpi); + c_re(kp[25 * m]) = (r1_18 - tmpr); + c_im(kp[25 * m]) = (i1_18 + tmpi); + tmpr = ((0.923879532511 * i1_21) - (0.382683432365 * r1_21)); + tmpi = ((0.923879532511 * r1_21) + (0.382683432365 * i1_21)); + c_re(kp[10 * m]) = (r1_20 + tmpr); + c_im(kp[10 * m]) = (i1_20 - tmpi); + c_re(kp[26 * m]) = (r1_20 - tmpr); + c_im(kp[26 * m]) = (i1_20 + tmpi); + tmpr = ((0.831469612303 * i1_23) - (0.55557023302 * r1_23)); + tmpi = ((0.831469612303 * r1_23) + (0.55557023302 * i1_23)); + c_re(kp[11 * m]) = (r1_22 + tmpr); 
+ c_im(kp[11 * m]) = (i1_22 - tmpi); + c_re(kp[27 * m]) = (r1_22 - tmpr); + c_im(kp[27 * m]) = (i1_22 + tmpi); + tmpr = (0.707106781187 * (i1_25 - r1_25)); + tmpi = (0.707106781187 * (r1_25 + i1_25)); + c_re(kp[12 * m]) = (r1_24 + tmpr); + c_im(kp[12 * m]) = (i1_24 - tmpi); + c_re(kp[28 * m]) = (r1_24 - tmpr); + c_im(kp[28 * m]) = (i1_24 + tmpi); + tmpr = ((0.55557023302 * i1_27) - (0.831469612303 * r1_27)); + tmpi = ((0.55557023302 * r1_27) + (0.831469612303 * i1_27)); + c_re(kp[13 * m]) = (r1_26 + tmpr); + c_im(kp[13 * m]) = (i1_26 - tmpi); + c_re(kp[29 * m]) = (r1_26 - tmpr); + c_im(kp[29 * m]) = (i1_26 + tmpi); + tmpr = ((0.382683432365 * i1_29) - (0.923879532511 * r1_29)); + tmpi = ((0.382683432365 * r1_29) + (0.923879532511 * i1_29)); + c_re(kp[14 * m]) = (r1_28 + tmpr); + c_im(kp[14 * m]) = (i1_28 - tmpi); + c_re(kp[30 * m]) = (r1_28 - tmpr); + c_im(kp[30 * m]) = (i1_28 + tmpi); + tmpr = ((0.195090322016 * i1_31) - (0.980785280403 * r1_31)); + tmpi = ((0.195090322016 * r1_31) + (0.980785280403 * i1_31)); + c_re(kp[15 * m]) = (r1_30 + tmpr); + c_im(kp[15 * m]) = (i1_30 - tmpi); + c_re(kp[31 * m]) = (r1_30 - tmpr); + c_im(kp[31 * m]) = (i1_30 + tmpi); + } + } + } else { + int ab = (a + b) / 2; + fft_twiddle_32_seq(a, ab, in, out, W, nW, nWdn, m); + fft_twiddle_32_seq(ab, b, in, out, W, nW, nWdn, m); + } +} +void fft_unshuffle_32(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 32; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip 
+= 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; + #pragma omp task untied + fft_unshuffle_32(a, ab, in, out, m); + #pragma omp task untied + fft_unshuffle_32(ab, b, in, out, m); + #pragma omp taskwait + } +} +void fft_unshuffle_32_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 32; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; + fft_unshuffle_32_seq(a, ab, in, out, m); + fft_unshuffle_32_seq(ab, b, in, out, m); + } +} +/* end of 
machine-generated code */ + +/* + * Recursive complex FFT on the n complex components of the array in: + * basic Cooley-Tukey algorithm, with some improvements for + * n power of two. The result is placed in the array out. n is arbitrary. + * The algorithm runs in time O(n*(r1 + ... + rk)) where r1, ..., rk + * are prime numbers, and r1 * r2 * ... * rk = n. + * + * n: size of the input + * in: pointer to input + * out: pointer to output + * factors: list of factors of n, precomputed + * W: twiddle factors + * nW: size of W, that is, size of the original transform + * + */ +void fft_aux(int n, COMPLEX * in, COMPLEX * out, int *factors, COMPLEX * W, int nW) +{ + int r, m; + int k; + + /* special cases */ + if (n == 32) { + fft_base_32(in, out); + return; + } + if (n == 16) { + fft_base_16(in, out); + return; + } + if (n == 8) { + fft_base_8(in, out); + return; + } + if (n == 4) { + fft_base_4(in, out); + return; + } + if (n == 2) { + fft_base_2(in, out); + return; + } + /* + * the cases n == 3, n == 5, and maybe 7 should be implemented as well + */ + + r = *factors; + m = n / r; + + if (r < n) { + /* + * split the DFT of length n into r DFTs of length n/r, and + * recurse + */ + if (r == 32) { + #pragma omp task untied + fft_unshuffle_32(0, m, in, out, m); + } else if (r == 16) { + #pragma omp task untied + fft_unshuffle_16(0, m, in, out, m); + } else if (r == 8) { + #pragma omp task untied + fft_unshuffle_8(0, m, in, out, m); + } else if (r == 4) { + #pragma omp task untied + fft_unshuffle_4(0, m, in, out, m); + } else if (r == 2) { + #pragma omp task untied + fft_unshuffle_2(0, m, in, out, m); + } else + unshuffle(0, m, in, out, r, m); + + #pragma omp taskwait + + for (k = 0; k < n; k += m) { + #pragma omp task untied + fft_aux(m, out + k, in + k, factors + 1, W, nW); + } + #pragma omp taskwait + } + /* + * now multiply by the twiddle factors, and perform m FFTs + * of length r + */ + if (r == 2) { + #pragma omp task untied + fft_twiddle_2(0, m, in, out, W, nW, nW 
/ n, m); + } else if (r == 4) { + #pragma omp task untied + fft_twiddle_4(0, m, in, out, W, nW, nW / n, m); + } else if (r == 8) { + #pragma omp task untied + fft_twiddle_8(0, m, in, out, W, nW, nW / n, m); + } else if (r == 16) { + #pragma omp task untied + fft_twiddle_16(0, m, in, out, W, nW, nW / n, m); + } else if (r == 32) { + #pragma omp task untied + fft_twiddle_32(0, m, in, out, W, nW, nW / n, m); + } else { + #pragma omp task untied + fft_twiddle_gen(0, m, in, out, W, nW, nW / n, r, m); + } + + #pragma omp taskwait + + return; +} + +void fft_aux_seq(int n, COMPLEX * in, COMPLEX * out, int *factors, COMPLEX * W, int nW) +{ + int r, m; + int k; + + /* special cases */ + if (n == 32) { + fft_base_32(in, out); + return; + } + if (n == 16) { + fft_base_16(in, out); + return; + } + if (n == 8) { + fft_base_8(in, out); + return; + } + if (n == 4) { + fft_base_4(in, out); + return; + } + if (n == 2) { + fft_base_2(in, out); + return; + } + /* + * the cases n == 3, n == 5, and maybe 7 should be implemented as well + */ + + r = *factors; + m = n / r; + + if (r < n) { + /* + * split the DFT of length n into r DFTs of length n/r, and + * recurse + */ + if (r == 32) fft_unshuffle_32_seq(0, m, in, out, m); + else if (r == 16) fft_unshuffle_16_seq(0, m, in, out, m); + else if (r == 8) fft_unshuffle_8_seq(0, m, in, out, m); + else if (r == 4) fft_unshuffle_4_seq(0, m, in, out, m); + else if (r == 2) fft_unshuffle_2_seq(0, m, in, out, m); + else unshuffle_seq(0, m, in, out, r, m); + + for (k = 0; k < n; k += m) { + fft_aux_seq(m, out + k, in + k, factors + 1, W, nW); + } + } + /* + * now multiply by the twiddle factors, and perform m FFTs + * of length r + */ + if (r == 2) fft_twiddle_2_seq(0, m, in, out, W, nW, nW / n, m); + else if (r == 4) fft_twiddle_4_seq(0, m, in, out, W, nW, nW / n, m); + else if (r == 8) fft_twiddle_8_seq(0, m, in, out, W, nW, nW / n, m); + else if (r == 16) fft_twiddle_16_seq(0, m, in, out, W, nW, nW / n, m); + else if (r == 32) 
fft_twiddle_32_seq(0, m, in, out, W, nW, nW / n, m); + else fft_twiddle_gen_seq(0, m, in, out, W, nW, nW / n, r, m); + + return; +} +/* + * user interface for fft_aux + */ +void fft(int n, COMPLEX * in, COMPLEX * out) +{ + int factors[40]; /* allows FFTs up to at least 3^40 */ + int *p = factors; + int l = n; + int r; + COMPLEX *W; + + bots_message("Computing coefficients "); + W = (COMPLEX *) malloc((n + 1) * sizeof(COMPLEX)); + compute_w_coefficients(n, 0, n / 2, W); + bots_message(" completed!\n"); + + /* + * find factors of n, first 8, then 4 and then primes in ascending + * order + */ + do { + r = factor(l); + *p++ = r; + l /= r; + } while (l > 1); + + bots_message("Computing FFT "); + fft_aux(n, in, out, factors, W, n); + bots_message(" completed!\n"); + + free(W); + return; +} +void fft_seq(int n, COMPLEX * in, COMPLEX * out) +{ + int factors[40]; /* allows FFTs up to at least 3^40 */ + int *p = factors; + int l = n; + int r; + COMPLEX *W; + + W = (COMPLEX *) malloc((n + 1) * sizeof(COMPLEX)); + compute_w_coefficients_seq(n, 0, n / 2, W); + + /* + * find factors of n, first 8, then 4 and then primes in ascending + * order + */ + do { + r = factor(l); + *p++ = r; + l /= r; + } while (l > 1); + + fft_aux_seq(n, in, out, factors, W, n); + + free(W); + return; +} +int test_correctness(int n, COMPLEX *out1, COMPLEX *out2) +{ + int i; + double a,d,error = 0.0; + + for (i = 0; i < n; ++i) { + a = sqrt((c_re(out1[i]) - c_re(out2[i])) * + (c_re(out1[i]) - c_re(out2[i])) + + (c_im(out1[i]) - c_im(out2[i])) * + (c_im(out1[i]) - c_im(out2[i]))); + d = sqrt(c_re(out2[i]) * c_re(out2[i]) + + c_im(out2[i]) * c_im(out2[i])); + if (d < -1.0e-10 || d > 1.0e-10) a /= d; + if (a > error) error = a; + } + bots_message("relative error=%e\n", error); + if (error > 1e-3) return BOTS_RESULT_UNSUCCESSFUL; + else return BOTS_RESULT_SUCCESSFUL; +} + diff --git a/ompss/fft/fft.h b/ompss/fft/fft.h new file mode 100644 index 0000000..ebafa9f --- /dev/null +++ b/ompss/fft/fft.h @@ -0,0 
+1,55 @@ +#ifndef FFT_H +#define FFT_H + +/* our real numbers */ +typedef double REAL; + +/* Complex numbers and operations */ +typedef struct { + REAL re, im; +} COMPLEX; + +#define c_re(c) ((c).re) +#define c_im(c) ((c).im) + +void compute_w_coefficients(int n, int a, int b, COMPLEX * W); +void compute_w_coefficients_seq(int n, int a, int b, COMPLEX * W); +int factor(int n); +void unshuffle(int a, int b, COMPLEX * in, COMPLEX * out, int r, int m); +void unshuffle_seq(int a, int b, COMPLEX * in, COMPLEX * out, int r, int m); +void fft_twiddle_gen1(COMPLEX * in, COMPLEX * out, COMPLEX * W, int r, int m, int nW, int nWdnti, int nWdntm); +void fft_twiddle_gen(int i, int i1, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int r, int m); +void fft_twiddle_gen_seq(int i, int i1, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int r, int m); +void fft_base_2(COMPLEX * in, COMPLEX * out); +void fft_twiddle_2(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_twiddle_2_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_unshuffle_2(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_unshuffle_2_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_base_4(COMPLEX * in, COMPLEX * out); +void fft_twiddle_4(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_twiddle_4_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_unshuffle_4(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_unshuffle_4_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_base_8(COMPLEX * in, COMPLEX * out); +void fft_twiddle_8(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_twiddle_8_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_unshuffle_8(int a, int b, COMPLEX * in, COMPLEX * 
out, int m); +void fft_unshuffle_8_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_base_16(COMPLEX * in, COMPLEX * out); +void fft_twiddle_16(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_twiddle_16_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_unshuffle_16(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_unshuffle_16_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_base_32(COMPLEX * in, COMPLEX * out); +void fft_twiddle_32(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_twiddle_32_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_unshuffle_32(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_unshuffle_32_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_aux(int n, COMPLEX * in, COMPLEX * out, int *factors, COMPLEX * W, int nW); +void fft_aux_seq(int n, COMPLEX * in, COMPLEX * out, int *factors, COMPLEX * W, int nW); +void fft(int n, COMPLEX * in, COMPLEX * out); +void fft_seq(int n, COMPLEX * in, COMPLEX * out); +int test_correctness(int n, COMPLEX *out1, COMPLEX *out2); + +#endif + diff --git a/ompss/fib/Makefile b/ompss/fib/Makefile new file mode 100644 index 0000000..c26481b --- /dev/null +++ b/ompss/fib/Makefile @@ -0,0 +1,36 @@ +############################################################################################## +# This program is part of the Barcelona OpenMP Tasks Suite # +# Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion # +# Copyright (C) 2009 Universitat Politecnica de Catalunya # +# # +# This program is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 2 of the License, or # +# (at your option) any later version. 
# +# # +# This program is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with this program; if not, write to the Free Software # +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # +############################################################################################## + +#LIBS = +#PROGRAM_OBJS= + +CUTOFF_VERSIONS = manual if_clause final +TIED_VERSIONS = yes + +BASE_DIR = ../../ + +# +# Don't change below here +# + +include ../Makefile.version +include $(BASE_DIR)/common/Makefile.common + + diff --git a/ompss/fib/app-desc.h b/ompss/fib/app-desc.h new file mode 100644 index 0000000..9b9f71b --- /dev/null +++ b/ompss/fib/app-desc.h @@ -0,0 +1,47 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. 
*/ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include "ompss-app.h" + +#define BOTS_APP_NAME "Fibonacci" +#define BOTS_APP_PARAMETERS_DESC "N=%d" +#define BOTS_APP_PARAMETERS_LIST ,bots_arg_size + +#define BOTS_APP_USES_ARG_SIZE +#define BOTS_APP_DEF_ARG_SIZE 10 +#define BOTS_APP_DESC_ARG_SIZE "Number to compute" + +int fib_verify(int); +void fib0 (int); +void fib0_seq (int); + +//#define KERNEL_INIT +#define KERNEL_CALL fib0(bots_arg_size) +//#define KERNEL_FINI + +//#define KERNEL_SEQ_INIT +#define KERNEL_SEQ_CALL fib0_seq(bots_arg_size) +//#define KERNEL_SEQ_FINI + + +#define KERNEL_CHECK fib_verify(bots_arg_size) + +#define BOTS_CUTOFF_DEF_VALUE 10 + diff --git a/ompss/fib/fib.c b/ompss/fib/fib.c new file mode 100644 index 0000000..828778c --- /dev/null +++ b/ompss/fib/fib.c @@ -0,0 +1,155 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. 
*/ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include "bots.h" +#include "fib.h" + +#define FIB_RESULTS_PRE 41 +long long fib_results[FIB_RESULTS_PRE] = {0,1,1,2,3,5,8,13,21,34,55,89,144,233,377,610,987,1597,2584,4181,6765,10946,17711,28657,46368,75025,121393,196418,317811,514229,832040,1346269,2178309,3524578,5702887,9227465,14930352,24157817,39088169,63245986,102334155}; + +long long fib_seq (int n) +{ + int x, y; + if (n < 2) return n; + + x = fib_seq(n - 1); + y = fib_seq(n - 2); + + return x + y; +} + +#if defined(IF_CUTOFF) + +long long fib (int n,int d) +{ + long long x, y; + if (n < 2) return n; + + #pragma omp task untied shared(x) firstprivate(n) if(d < bots_cutoff_value) + x = fib(n - 1,d+1); + + #pragma omp task untied shared(y) firstprivate(n) if(d < bots_cutoff_value) + y = fib(n - 2,d+1); + + #pragma omp taskwait + return x + y; +} + +#elif defined(FINAL_CUTOFF) + +long long fib (int n,int d) +{ + long long x, y; + if (n < 2) return n; + + #pragma omp task untied shared(x) firstprivate(n) final(d+1 >= bots_cutoff_value) mergeable + x = fib(n - 1,d+1); + + #pragma omp task untied shared(y) firstprivate(n) final(d+1 >= bots_cutoff_value) mergeable + y = fib(n - 2,d+1); + + #pragma omp taskwait + return x + y; +} + +#elif defined(MANUAL_CUTOFF) + +long long fib (int n, int d) +{ + long long x, y; + if (n < 2) return n; + + if ( d < bots_cutoff_value ) { + #pragma omp task untied shared(x) firstprivate(n) + x = fib(n - 1,d+1); + + #pragma omp task untied shared(y) firstprivate(n) + y = fib(n - 2,d+1); + + #pragma omp taskwait + } else { + x = fib_seq(n-1); + y = fib_seq(n-2); + } + + return x + y; +} + +#else + +long long fib (int n) +{ + long long x, y; + if (n < 2) 
return n; + + #pragma omp task untied shared(x) firstprivate(n) + x = fib(n - 1); + #pragma omp task untied shared(y) firstprivate(n) + y = fib(n - 2); + + #pragma omp taskwait + return x + y; +} + +#endif + +static long long par_res, seq_res; + +void fib0 (int n) +{ +#if defined(MANUAL_CUTOFF) || defined(IF_CUTOFF) || defined(FINAL_CUTOFF) + par_res = fib(n,0); +#else + par_res = fib(n); +#endif + bots_message("Fibonacci result for %d is %lld\n",n,par_res); +} + +void fib0_seq (int n) +{ + seq_res = fib_seq(n); + bots_message("Fibonacci result for %d is %lld\n",n,seq_res); +} + +long long fib_verify_value(int n) +{ + if (n < FIB_RESULTS_PRE) return fib_results[n]; + return ( fib_verify_value(n-1) + fib_verify_value(n-2)); +} + +int fib_verify (int n) +{ + int result; + + if (bots_sequential_flag) + { + if (par_res == seq_res) result = BOTS_RESULT_SUCCESSFUL; + else result = BOTS_RESULT_UNSUCCESSFUL; + } + else + { + seq_res = fib_verify_value(n); + if (par_res == seq_res) result = BOTS_RESULT_SUCCESSFUL; + else result = BOTS_RESULT_UNSUCCESSFUL; + } + + return result; +} + diff --git a/ompss/fib/fib.h b/ompss/fib/fib.h new file mode 100644 index 0000000..e3d2983 --- /dev/null +++ b/ompss/fib/fib.h @@ -0,0 +1,40 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. 
*/ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ +#ifndef FIB_H +#define FIB_H +#if defined(IF_CUTOFF) +long long fib (int n,int d); +#elif defined(FINAL_CUTOFF) +long long fib (int n,int d); +#elif defined(MANUAL_CUTOFF) +long long fib (int n,int d); +#else +long long fib (int n); +#endif + +long long fib_seq (int n); + +void fib0 (int n); +void fib0_seq (int n); + +int fib_verify (int n); +long long fib_verify_value(int n); +#endif + diff --git a/ompss/floorplan/Makefile b/ompss/floorplan/Makefile new file mode 100644 index 0000000..c26481b --- /dev/null +++ b/ompss/floorplan/Makefile @@ -0,0 +1,36 @@ +############################################################################################## +# This program is part of the Barcelona OpenMP Tasks Suite # +# Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion # +# Copyright (C) 2009 Universitat Politecnica de Catalunya # +# # +# This program is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 2 of the License, or # +# (at your option) any later version. # +# # +# This program is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details. 
# +# # +# You should have received a copy of the GNU General Public License # +# along with this program; if not, write to the Free Software # +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # +############################################################################################## + +#LIBS = +#PROGRAM_OBJS= + +CUTOFF_VERSIONS = manual if_clause final +TIED_VERSIONS = yes + +BASE_DIR = ../../ + +# +# Don't change below here +# + +include ../Makefile.version +include $(BASE_DIR)/common/Makefile.common + + diff --git a/ompss/floorplan/app-desc.h b/ompss/floorplan/app-desc.h new file mode 100644 index 0000000..a3bba17 --- /dev/null +++ b/ompss/floorplan/app-desc.h @@ -0,0 +1,43 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. 
*/ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include "ompss-app.h" + +#define BOTS_APP_NAME "Floorplan" +#define BOTS_APP_PARAMETERS_DESC "%s" +#define BOTS_APP_PARAMETERS_LIST ,bots_arg_file + +#define BOTS_APP_USES_ARG_FILE +#define BOTS_APP_DESC_ARG_FILE "Cell description file (mandatory)" + +#define BOTS_CUTOFF_DEF_VALUE 5 + +void floorplan_init(char *); +void floorplan_end (void); +void compute_floorplan(void); +int floorplan_verify(void); + +#define KERNEL_INIT floorplan_init(bots_arg_file) +#define KERNEL_CALL compute_floorplan() +#define KERNEL_FINI floorplan_end() + +#define KERNEL_CHECK floorplan_verify() + + diff --git a/ompss/floorplan/floorplan.c b/ompss/floorplan/floorplan.c new file mode 100644 index 0000000..83bdda9 --- /dev/null +++ b/ompss/floorplan/floorplan.c @@ -0,0 +1,644 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. 
*/
+/* */
+/* You should have received a copy of the GNU General Public License */
+/* along with this program; if not, write to the Free Software */
+/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */
+/**********************************************************************************************/
+
+/* Original code from the Application Kernel Matrix by Cray */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "app-desc.h"
+#include "bots.h"
+
+#define ROWS 64
+#define COLS 64
+#define DMAX 64
+/* arguments are parenthesised; note that each one may still be evaluated twice */
+#define max(a, b) (((a) > (b)) ? (a) : (b))
+#define min(a, b) (((a) < (b)) ? (a) : (b))
+
+/* expected minimum area read from the input file; stays -1 when none was read */
+int solution = -1;
+
+typedef int  coor[2];
+typedef char ibrd[ROWS][COLS];
+typedef char (*pibrd)[COLS];
+
+FILE * inputFile;
+
+struct cell {
+  int   n;
+  coor *alt;
+  int   top;
+  int   bot;
+  int   lhs;
+  int   rhs;
+  int   left;
+  int   above;
+  int   next;
+};
+
+struct cell * gcells;
+
+int  MIN_AREA;
+ibrd BEST_BOARD;
+coor MIN_FOOTPRINT;
+
+int N;
+
+/* compute all possible locations for nw corner for cell `id` laid out with
+   alternative `shape`; the candidates are stored in NWS as (row, col) pairs
+   and their number is returned */
+static int starts(int id, int shape, coor *NWS, struct cell *cells) {
+  int i, n, top, bot, lhs, rhs;
+  int rows, cols, left, above;
+
+/* size of cell */
+  rows  = cells[id].alt[shape][0];
+  cols  = cells[id].alt[shape][1];
+
+/* the cells to the left and above */
+  left  = cells[id].left;
+  above = cells[id].above;
+
+/* if there is a vertical and horizontal dependence */
+  if ((left >= 0) && (above >= 0)) {
+
+     top = cells[above].bot + 1;
+     lhs = cells[left].rhs + 1;
+     bot = top + rows;
+     rhs = lhs + cols;
+
+/* if footprint of cell touches the cells to the left and above */
+     if ((top <= cells[left].bot) && (bot >= cells[left].top) &&
+         (lhs <= cells[above].rhs) && (rhs >= cells[above].lhs))
+          { n = 1; NWS[0][0] = top; NWS[0][1] = lhs; }
+     else { n = 0; }
+
+/* if there is only a horizontal dependence */
+  } else if (left >= 0) {
+
+/* highest initial row is top of cell to the left - rows */
+     top = max(cells[left].top - rows + 1, 0);
+/* lowest initial row is bottom of cell to the left */
+     bot = min(cells[left].bot, ROWS);
+     n   = bot - top + 1;
+
+     for (i = 0; i < n; i++) {
+         NWS[i][0] = i + top;
+         NWS[i][1] = cells[left].rhs + 1;
+     }
+
+  } else {
+
+/* leftmost initial col is lhs of cell above - cols */
+     lhs = max(cells[above].lhs - cols + 1, 0);
+/* rightmost initial col is rhs of cell above */
+     rhs = min(cells[above].rhs, COLS);
+     n   = rhs - lhs + 1;
+
+     for (i = 0; i < n; i++) {
+         NWS[i][0] = cells[above].bot + 1;
+         NWS[i][1] = i + lhs;
+     }
+  }
+
+  return (n);
+}
+
+
+
+/* lay the cell down on the board in the rectangular space defined
+   by the cells top, bottom, left, and right edges. If the cell can
+   not be laid down, return 0; else 1.
+*/
+static int lay_down(int id, ibrd board, struct cell *cells) {
+  int i, j, top, bot, lhs, rhs;
+
+  top = cells[id].top;
+  bot = cells[id].bot;
+  lhs = cells[id].lhs;
+  rhs = cells[id].rhs;
+
+/* a rectangle that sticks out of the ROWS x COLS board can never be laid
+   down; without this test the loops below would write outside `board` */
+  if ((top < 0) || (lhs < 0) || (bot >= ROWS) || (rhs >= COLS)) return (0);
+
+  for (i = top; i <= bot; i++) {
+      for (j = lhs; j <= rhs; j++) {
+          if (board[i][j] == 0) board[i][j] = (char)id;
+          else return(0);
+      }
+  }
+
+  return (1);
+}
+
+
+/* read one decimal integer into `var`; anything other than exactly one
+   successful conversion (EOF as well as a non-numeric token) is fatal.
+   Wrapped in do/while(0) so that it behaves as a single statement. */
+#define read_integer(file,var) \
+  do {\
+     if ( fscanf(file, "%d", &(var)) != 1 ) {\
+        bots_message(" Bogus input file\n");\
+        exit(-1);\
+     }\
+  } while (0)
+
+/* read the cell descriptions (and the optional expected solution) from
+   inputFile into the globals N, gcells and solution */
+static void read_inputs() {
+  int i, j, n;
+
+  read_integer(inputFile,n);
+  if (n < 1) {
+     bots_message(" Bogus input file\n");
+     exit(-1);
+  }
+  N = n;
+
+  gcells = (struct cell *) malloc((n + 1) * sizeof(struct cell));
+  if (gcells == NULL) {
+     bots_message(" Not enough memory\n");
+     exit(-1);
+  }
+
+/* cell 0 is a sentinel that the first real cell can depend on */
+  gcells[0].n     =  0;
+  gcells[0].alt   =  0;
+  gcells[0].top   =  0;
+  gcells[0].bot   =  0;
+  gcells[0].lhs   = -1;
+  gcells[0].rhs   = -1;
+  gcells[0].left  =  0;
+  gcells[0].above =  0;
+  gcells[0].next  =  0;
+
+  for (i = 1; i < n + 1; i++) {
+
+      read_integer(inputFile, gcells[i].n);
+      gcells[i].alt = (coor *) malloc(gcells[i].n * sizeof(coor));
+      if ((gcells[i].alt == NULL) && (gcells[i].n > 0)) {
+         bots_message(" Not enough memory\n");
+         exit(-1);
+      }
+
+      for (j = 0; j < gcells[i].n; j++) {
+          read_integer(inputFile, gcells[i].alt[j][0]);
+          read_integer(inputFile, gcells[i].alt[j][1]);
+      }
+
+      read_integer(inputFile, gcells[i].left);
+      read_integer(inputFile, gcells[i].above);
+      read_integer(inputFile, gcells[i].next);
+  }
+
+  if (!feof(inputFile)) {
+      read_integer(inputFile, solution);
+  }
+}
+
+
+/* print the minimum area and the best board found (one letter per cell) */
+static void write_outputs() {
+  int i, j;
+
+  bots_message("Minimum area = %d\n\n", MIN_AREA);
+
+  for (i = 0; i < MIN_FOOTPRINT[0]; i++) {
+      for (j = 0; j < MIN_FOOTPRINT[1]; j++) {
+          if (BEST_BOARD[i][j] == 0) {bots_message(" ");}
+          else bots_message("%c", 'A' + BEST_BOARD[i][j] - 1);
+      }
+      bots_message("\n");
+  }
+}
+
+#ifdef MANUAL_CUTOFF
+/* serial search used below the manual cut-off depth: no tasks are created and
+   CELLS is updated in place.  It is still called from inside tasks, so the
+   update of the global minimum keeps its critical section.  Returns the number
+   of candidate placements examined. */
+static int add_cell_ser (int id, coor FOOTPRINT, ibrd BOARD, struct cell *CELLS) {
+  int i, j, nn, nn2, area;
+
+  ibrd board;
+  coor footprint, NWS[DMAX];
+
+  nn2 = 0;
+
+/* for each possible shape */
+  for (i = 0; i < CELLS[id].n; i++) {
+/* compute all possible locations for nw corner */
+      nn = starts(id, i, NWS, CELLS);
+      nn2 += nn;
+/* for all possible locations */
+      for (j = 0; j < nn; j++) {
+          struct cell *cells = CELLS;
+/* extent of shape */
+          cells[id].top = NWS[j][0];
+          cells[id].bot = cells[id].top + cells[id].alt[i][0] - 1;
+          cells[id].lhs = NWS[j][1];
+          cells[id].rhs = cells[id].lhs + cells[id].alt[i][1] - 1;
+
+          memcpy(board, BOARD, sizeof(ibrd));
+
+/* if the cell cannot be laid down, prune search */
+          if (! lay_down(id, board, cells)) {
+             bots_debug("Chip %d, shape %d does not fit\n", id, i);
+             goto _end;
+          }
+
+/* calculate new footprint of board and area of footprint */
+          footprint[0] = max(FOOTPRINT[0], cells[id].bot+1);
+          footprint[1] = max(FOOTPRINT[1], cells[id].rhs+1);
+          area         = footprint[0] * footprint[1];
+
+/* if last cell */
+          if (cells[id].next == 0) {
+
+/* if area is minimum, update global values */
+             if (area < MIN_AREA) {
+#pragma omp critical
+                if (area < MIN_AREA) {
+                   MIN_AREA         = area;
+                   MIN_FOOTPRINT[0] = footprint[0];
+                   MIN_FOOTPRINT[1] = footprint[1];
+                   memcpy(BEST_BOARD, board, sizeof(ibrd));
+                   bots_debug("N %d\n", MIN_AREA);
+                }
+             }
+
+/* if area is less than best area */
+          } else if (area < MIN_AREA) {
+/* nn2 is a private local of this serial routine: no atomic is needed */
+             nn2 += add_cell_ser(cells[id].next, footprint, board,cells);
+
+/* if area is greater than or equal to best area, prune search */
+          } else {
+
+             bots_debug("T %d, %d\n", area, MIN_AREA);
+
+          }
+_end:;
+      }
+  }
+  return nn2;
+}
+#endif
+
+/* add_cell: branch-and-bound placement of cell `id` and its successors.
+   One task is spawned per candidate placement; each task works on its own
+   copy of the board (and normally of the cell table).  Returns the number of
+   candidate placements examined in this subtree.  The variants below differ
+   only in how task creation is cut off at depth `level`. */
+
+#if defined(IF_CUTOFF)
+
+/* NOTE(review): in the extracted text everything between
+   "if(level<bots_cutoff_value" here and "final(level >" in the FINAL_CUTOFF
+   variant had been stripped.  The task body below was rebuilt from the
+   variant without cut-off -- confirm against upstream BOTS. */
+static int add_cell(int id, coor FOOTPRINT, ibrd BOARD, struct cell *CELLS,int level) {
+  int i, j, nn, area, nnc, nnl;
+
+  ibrd board;
+  coor footprint, NWS[DMAX];
+
+  nnc = nnl = 0;
+
+/* for each possible shape */
+  for (i = 0; i < CELLS[id].n; i++) {
+/* compute all possible locations for nw corner */
+      nn = starts(id, i, NWS, CELLS);
+      nnl += nn;
+/* for all possible locations */
+      for (j = 0; j < nn; j++) {
+#pragma omp task untied private(board, footprint,area) \
+firstprivate(NWS,i,j,id,nn,level) \
+shared(FOOTPRINT,BOARD,CELLS,MIN_AREA,MIN_FOOTPRINT,N,BEST_BOARD,nnc,bots_verbose_mode) \
+if(level<bots_cutoff_value)
+{
+          struct cell cells[N+1];
+          memcpy(cells,CELLS,sizeof(struct cell)*(N+1));
+/* extent of shape */
+          cells[id].top = NWS[j][0];
+          cells[id].bot = cells[id].top + cells[id].alt[i][0] - 1;
+          cells[id].lhs = NWS[j][1];
+          cells[id].rhs = cells[id].lhs + cells[id].alt[i][1] - 1;
+
+          memcpy(board, BOARD, sizeof(ibrd));
+
+/* if the cell cannot be laid down, prune search */
+          if (! lay_down(id, board, cells)) {
+             bots_debug("Chip %d, shape %d does not fit\n", id, i);
+             goto _end;
+          }
+
+/* calculate new footprint of board and area of footprint */
+          footprint[0] = max(FOOTPRINT[0], cells[id].bot+1);
+          footprint[1] = max(FOOTPRINT[1], cells[id].rhs+1);
+          area         = footprint[0] * footprint[1];
+
+/* if last cell */
+          if (cells[id].next == 0) {
+
+/* if area is minimum, update global values */
+             if (area < MIN_AREA) {
+#pragma omp critical
+                if (area < MIN_AREA) {
+                   MIN_AREA         = area;
+                   MIN_FOOTPRINT[0] = footprint[0];
+                   MIN_FOOTPRINT[1] = footprint[1];
+                   memcpy(BEST_BOARD, board, sizeof(ibrd));
+                   bots_debug("N %d\n", MIN_AREA);
+                }
+             }
+
+/* if area is less than best area */
+          } else if (area < MIN_AREA) {
+             #pragma omp atomic
+             nnc += add_cell(cells[id].next, footprint, board,cells,level+1);
+/* if area is greater than or equal to best area, prune search */
+          } else {
+
+             bots_debug("T %d, %d\n", area, MIN_AREA);
+
+          }
+_end:;
+}
+      }
+}
+#pragma omp taskwait
+return nnc+nnl;
+}
+
+#elif defined(FINAL_CUTOFF)
+
+static int add_cell(int id, coor FOOTPRINT, ibrd BOARD, struct cell *CELLS,int level) {
+  int i, j, nn, area, nnc, nnl;
+
+  ibrd board;
+  coor footprint, NWS[DMAX];
+
+  nnc = nnl = 0;
+
+/* for each possible shape */
+  for (i = 0; i < CELLS[id].n; i++) {
+/* compute all possible locations for nw corner */
+      nn = starts(id, i, NWS, CELLS);
+      nnl += nn;
+/* for all possible locations */
+      for (j = 0; j < nn; j++) {
+#pragma omp task untied private(board, footprint,area) \
+firstprivate(NWS,i,j,id,nn,level) \
+shared(FOOTPRINT,BOARD,CELLS,MIN_AREA,MIN_FOOTPRINT,N,BEST_BOARD,nnc,bots_verbose_mode) \
+final(level >= bots_cutoff_value) mergeable
+{
+          ibrd board;
+          struct cell *cells;
+
+/* once inside a final region past the cut-off the cell table is reused
+   in place instead of being copied */
+          if ( omp_in_final() && level > bots_cutoff_value ) {
+             cells = CELLS;
+          } else {
+             cells = alloca(sizeof(struct cell)*(N+1));
+             memcpy(cells,CELLS,sizeof(struct cell)*(N+1));
+          }
+
+/* extent of shape */
+          cells[id].top = NWS[j][0];
+          cells[id].bot = cells[id].top + cells[id].alt[i][0] - 1;
+          cells[id].lhs = NWS[j][1];
+          cells[id].rhs = cells[id].lhs + cells[id].alt[i][1] - 1;
+
+          memcpy(board, BOARD, sizeof(ibrd));
+
+/* if the cell cannot be laid down, prune search */
+          if (! lay_down(id, board, cells)) {
+             bots_debug("Chip %d, shape %d does not fit\n", id, i);
+             goto _end;
+          }
+
+/* calculate new footprint of board and area of footprint */
+          footprint[0] = max(FOOTPRINT[0], cells[id].bot+1);
+          footprint[1] = max(FOOTPRINT[1], cells[id].rhs+1);
+          area         = footprint[0] * footprint[1];
+
+/* if last cell */
+          if (cells[id].next == 0) {
+
+/* if area is minimum, update global values */
+             if (area < MIN_AREA) {
+#pragma omp critical
+                if (area < MIN_AREA) {
+                   MIN_AREA         = area;
+                   MIN_FOOTPRINT[0] = footprint[0];
+                   MIN_FOOTPRINT[1] = footprint[1];
+                   memcpy(BEST_BOARD, board, sizeof(ibrd));
+                   bots_debug("N %d\n", MIN_AREA);
+                }
+             }
+
+/* if area is less than best area */
+          } else if (area < MIN_AREA) {
+             #pragma omp atomic
+             nnc += add_cell(cells[id].next, footprint, board,cells,level+1);
+/* if area is greater than or equal to best area, prune search */
+          } else {
+
+             bots_debug("T %d, %d\n", area, MIN_AREA);
+
+          }
+_end:;
+}
+      }
+}
+#pragma omp taskwait
+return nnc+nnl;
+}
+
+#elif defined(MANUAL_CUTOFF)
+
+static int add_cell(int id, coor FOOTPRINT, ibrd BOARD, struct cell *CELLS,int level) {
+  int i, j, nn, area, nnc, nnl;
+
+  ibrd board;
+  coor footprint, NWS[DMAX];
+
+  nnc = nnl = 0;
+
+/* for each possible shape */
+  for (i = 0; i < CELLS[id].n; i++) {
+/* compute all possible locations for nw corner */
+      nn = starts(id, i, NWS, CELLS);
+      nnl += nn;
+/* for all possible locations */
+      for (j = 0; j < nn; j++) {
+#pragma omp task untied private(board, footprint,area) \
+firstprivate(NWS,i,j,id,nn,level,bots_cutoff_value) shared(nnc) \
+shared(FOOTPRINT,BOARD,CELLS,MIN_AREA,MIN_FOOTPRINT,N,BEST_BOARD,bots_verbose_mode)
+{
+          struct cell *cells;
+
+          cells = alloca(sizeof(struct cell)*(N+1));
+          memcpy(cells,CELLS,sizeof(struct cell)*(N+1));
+
+/* extent of shape */
+          cells[id].top = NWS[j][0];
+          cells[id].bot = cells[id].top + cells[id].alt[i][0] - 1;
+          cells[id].lhs = NWS[j][1];
+          cells[id].rhs = cells[id].lhs + cells[id].alt[i][1] - 1;
+
+          memcpy(board, BOARD, sizeof(ibrd));
+
+/* if the cell cannot be laid down, prune search */
+          if (! lay_down(id, board, cells)) {
+             bots_debug("Chip %d, shape %d does not fit\n", id, i);
+             goto _end;
+          }
+
+/* calculate new footprint of board and area of footprint */
+          footprint[0] = max(FOOTPRINT[0], cells[id].bot+1);
+          footprint[1] = max(FOOTPRINT[1], cells[id].rhs+1);
+          area         = footprint[0] * footprint[1];
+
+/* if last cell */
+          if (cells[id].next == 0) {
+
+/* if area is minimum, update global values */
+             if (area < MIN_AREA) {
+#pragma omp critical
+                if (area < MIN_AREA) {
+                   MIN_AREA         = area;
+                   MIN_FOOTPRINT[0] = footprint[0];
+                   MIN_FOOTPRINT[1] = footprint[1];
+                   memcpy(BEST_BOARD, board, sizeof(ibrd));
+                   bots_debug("N %d\n", MIN_AREA);
+                }
+             }
+
+/* if area is less than best area */
+          } else if (area < MIN_AREA) {
+/* keep spawning tasks above the cut-off depth, go serial below it */
+             if(level+1 < bots_cutoff_value ) {
+                #pragma omp atomic
+                nnc += add_cell(cells[id].next, footprint, board,cells,level+1);
+             } else {
+                #pragma omp atomic
+                nnc += add_cell_ser(cells[id].next, footprint, board,cells);
+             }
+/* if area is greater than or equal to best area, prune search */
+          } else {
+             bots_debug("T %d, %d\n", area, MIN_AREA);
+          }
+_end:;
+}
+      }
+}
+#pragma omp taskwait
+
+return nnc+nnl;
+}
+
+#else
+
+static int add_cell(int id, coor FOOTPRINT, ibrd BOARD, struct cell *CELLS) {
+  int i, j, nn, area, nnc,nnl;
+
+  ibrd board;
+  coor footprint, NWS[DMAX];
+
+  nnc = nnl = 0;
+
+/* for each possible shape */
+  for (i = 0; i < CELLS[id].n; i++) {
+/* compute all possible locations for nw corner */
+      nn = starts(id, i, NWS, CELLS);
+      nnl += nn;
+/* for all possible locations */
+      for (j = 0; j < nn; j++) {
+#pragma omp task untied private(board, footprint,area) \
+firstprivate(NWS,i,j,id,nn) \
+shared(FOOTPRINT,BOARD,CELLS,MIN_AREA,MIN_FOOTPRINT,N,BEST_BOARD,nnc,bots_verbose_mode)
+{
+          struct cell cells[N+1];
+          memcpy(cells,CELLS,sizeof(struct cell)*(N+1));
+/* extent of shape */
+          cells[id].top = NWS[j][0];
+          cells[id].bot = cells[id].top + cells[id].alt[i][0] - 1;
+          cells[id].lhs = NWS[j][1];
+          cells[id].rhs = cells[id].lhs + cells[id].alt[i][1] - 1;
+
+          memcpy(board, BOARD, sizeof(ibrd));
+
+/* if the cell cannot be laid down, prune search */
+          if (! lay_down(id, board, cells)) {
+             bots_debug("Chip %d, shape %d does not fit\n", id, i);
+             goto _end;
+          }
+
+/* calculate new footprint of board and area of footprint */
+          footprint[0] = max(FOOTPRINT[0], cells[id].bot+1);
+          footprint[1] = max(FOOTPRINT[1], cells[id].rhs+1);
+          area         = footprint[0] * footprint[1];
+
+/* if last cell */
+          if (cells[id].next == 0) {
+
+/* if area is minimum, update global values */
+             if (area < MIN_AREA) {
+#pragma omp critical
+                if (area < MIN_AREA) {
+                   MIN_AREA         = area;
+                   MIN_FOOTPRINT[0] = footprint[0];
+                   MIN_FOOTPRINT[1] = footprint[1];
+                   memcpy(BEST_BOARD, board, sizeof(ibrd));
+                   bots_debug("N %d\n", MIN_AREA);
+                }
+             }
+
+/* if area is less than best area */
+          } else if (area < MIN_AREA) {
+             #pragma omp atomic
+             nnc += add_cell(cells[id].next, footprint, board,cells);
+/* if area is greater than or equal to best area, prune search */
+          } else {
+
+             bots_debug("T %d, %d\n", area, MIN_AREA);
+
+          }
+_end:;
+}
+      }
+}
+#pragma omp taskwait
+return nnc+nnl;
+}
+
+#endif
+
+/* the (empty) starting board handed to the first add_cell call */
+ibrd board;
+
+/* open and parse `filename`, then reset the search state */
+void floorplan_init (char *filename)
+{
+    int i,j;
+
+    inputFile = fopen(filename, "r");
+
+    if(NULL == inputFile) {
+        bots_message("Couldn't open %s file for reading\n", filename);
+        exit(1);
+    }
+
+    /* read input file and initialize global minimum area */
+    read_inputs();
+    /* the file is not needed after parsing: do not leak the handle */
+    fclose(inputFile);
+    inputFile = NULL;
+    MIN_AREA = ROWS * COLS;
+
+    /* initialize board is empty */
+    for (i = 0; i < ROWS; i++)
+    for (j = 0; j < COLS; j++) board[i][j] = 0;
+
+}
+
+/* run the search from cell 1 and record the number of placements examined */
+void compute_floorplan (void)
+{
+    coor footprint;
+    /* footprint of initial board is zero */
+    footprint[0] = 0;
+    footprint[1] = 0;
+
+    bots_message("Computing floorplan ");
+
+#if defined(MANUAL_CUTOFF) || defined(IF_CUTOFF) || defined(FINAL_CUTOFF)
+    bots_number_of_tasks = add_cell(1, footprint, board, gcells, 0);
+#else
+    bots_number_of_tasks = add_cell(1, footprint, board, gcells);
+#endif
+
+    bots_message(" completed!\n");
+
+}
+
+void floorplan_end (void)
+{
+    /* write results */
+    write_outputs();
+}
+
+/* compare the computed minimum area with the one given in the input file;
+   BOTS_RESULT_NA when the file carried no expected solution */
+int floorplan_verify (void)
+{
+    if (solution != -1 )
+      return MIN_AREA == solution ? BOTS_RESULT_SUCCESSFUL : BOTS_RESULT_UNSUCCESSFUL;
+    else
+      return BOTS_RESULT_NA;
+}
diff --git a/ompss/health/Makefile b/ompss/health/Makefile
new file mode 100644
index 0000000..85e92c5
--- /dev/null
+++ b/ompss/health/Makefile
@@ -0,0 +1,36 @@
+##############################################################################################
+# This program is part of the Barcelona OpenMP Tasks Suite #
+# Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion #
+# Copyright (C) 2009 Universitat Politecnica de Catalunya #
+# #
+# This program is free software; you can redistribute it and/or modify #
+# it under the terms of the GNU General Public License as published by #
+# the Free Software Foundation; either version 2 of the License, or #
+# (at your option) any later version. #
+# #
+# This program is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
+# GNU General Public License for more details.
# +# # +# You should have received a copy of the GNU General Public License # +# along with this program; if not, write to the Free Software # +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # +############################################################################################## + +LIBS = -lm +#PROGRAM_OBJS= + +CUTOFF_VERSIONS = manual if_clause +TIED_VERSIONS = YES + +BASE_DIR = ../../ + +# +# Don't change below here +# + +include ../Makefile.version +include $(BASE_DIR)/common/Makefile.common + + diff --git a/ompss/health/app-desc.h b/ompss/health/app-desc.h new file mode 100644 index 0000000..f654474 --- /dev/null +++ b/ompss/health/app-desc.h @@ -0,0 +1,52 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. 
*/ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include "ompss-app.h" +#include "health.h" + +#define BOTS_APP_NAME "Health" +#define BOTS_APP_PARAMETERS_DESC "%s" +#define BOTS_APP_PARAMETERS_LIST ,bots_arg_file + +//#define BOTS_APP_SELF_TIMING + +#define BOTS_APP_USES_ARG_FILE +#define BOTS_APP_DEF_ARG_FILE "Input filename" +#define BOTS_APP_DESC_ARG_FILE "Health input file (mandatory)" + +#define BOTS_CUTOFF_DEF_VALUE 2 + +#define BOTS_APP_INIT \ + struct Village *top;\ + read_input_data(bots_arg_file); + +#define KERNEL_INIT \ + allocate_village(&top, NULL, NULL, sim_level, 0); + +#define KERNEL_CALL sim_village_main_par(top); + +#define KERNEL_FINI + +//#define KERNEL_SEQ_INIT +//#define KERNEL_SEQ_CALL +//#define KERNEL_SEQ_FINI + +#define KERNEL_CHECK check_village(top); + diff --git a/ompss/health/health.c b/ompss/health/health.c new file mode 100644 index 0000000..7f670c2 --- /dev/null +++ b/ompss/health/health.c @@ -0,0 +1,637 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/**********************************************************************************************/ + +/* OLDEN parallel C for dynamic structures: compiler, runtime system + * and benchmarks + * + * Copyright (C) 1994-1996 by Anne Rogers (amr@cs.princeton.edu) and + * Martin Carlisle (mcc@cs.princeton.edu) + * ALL RIGHTS RESERVED. 
+ *
+ * OLDEN is distributed under the following conditions:
+ *
+ * You may make copies of OLDEN for your own use and modify those copies.
+ *
+ * All copies of OLDEN must retain our names and copyright notice.
+ *
+ * You may not sell OLDEN or distribute OLDEN in conjunction with a
+ * commercial product or service without the expressed written consent of
+ * Anne Rogers and Martin Carlisle.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE.
+ *
+ */
+
+
+/*******************************************************************
+ * Health.c : Model of the Colombian Health Care System *
+ *******************************************************************/
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <omp.h>
+#include "app-desc.h"
+#include "bots.h"
+#include "health.h"
+
+/* global variables */
+/* simulation parameters, filled in by read_input_data() */
+int sim_level;
+int sim_cities;
+int sim_population_ratio;
+int sim_time;
+int sim_assess_time;
+int sim_convalescence_time;
+int32_t sim_seed;
+float sim_get_sick_p;
+float sim_convalescence_p;
+float sim_realloc_p;
+int sim_pid = 0;
+
+/* expected results, read from the input file and used by check_village() */
+int res_population;
+int res_hospitals;
+int res_personnel;
+int res_checkin;
+int res_village;
+int res_waiting;
+int res_assess;
+int res_inside;
+float res_avg_stay;
+
+/**********************************************************
+ * Handles math routines for health.c *
+ **********************************************************/
+/* Advance the generator state in *seed and return the next sample.
+ * NOTE(review): "idum * IM" overflows a signed 32-bit integer; it is kept
+ * as is because the expected results in the input files were presumably
+ * produced with exactly this sequence -- confirm before changing. */
+float my_rand(int32_t *seed)
+{
+   int32_t k;
+   int32_t idum = *seed;
+
+   idum ^= MASK;
+   k = idum / IQ;
+   idum = IA * (idum - k * IQ) - IR * k;
+   idum ^= MASK;
+   if (idum < 0) idum += IM;
+   *seed = idum * IM;
+   return (float) AM * idum;
+}
+/********************************************************************
+ * Handles lists. *
+ ********************************************************************/
+/* Append patient at the tail of the doubly linked list *list. */
+void addList(struct Patient **list, struct Patient *patient)
+{
+   if (*list == NULL)
+   {
+      *list = patient;
+      patient->back = NULL;
+      patient->forward = NULL;
+   }
+   else
+   {
+      struct Patient *aux = *list;
+      while (aux->forward != NULL) aux = aux->forward;
+      aux->forward = patient;
+      patient->back = aux;
+      patient->forward = NULL;
+   }
+}
+/* Unlink patient from *list.  The patient must currently be a member of
+ * that list: its back/forward links are used directly, without searching. */
+void removeList(struct Patient **list, struct Patient *patient)
+{
+   if (patient->back != NULL) patient->back->forward = patient->forward;
+   else *list = patient->forward;
+   if (patient->forward != NULL) patient->forward->back = patient->back;
+}
+/**********************************************************************/
+/* Recursively build the village tree: a village of the given level with
+ * 2^level staff, its population, and sim_cities children of level-1.
+ * *capital receives the new village, or NULL when level is 0. */
+void allocate_village( struct Village **capital, struct Village *back,
+   struct Village *next, int level, int32_t vid)
+{
+   int i, population, personnel;
+   struct Village *current, *inext;
+   struct Patient *patient;
+
+   if (level == 0) *capital = NULL;
+   else
+   {
+      personnel = (int) pow(2, level);
+      population = personnel * sim_population_ratio;
+      /* Allocate Village */
+      *capital = (struct Village *) malloc(sizeof(struct Village));
+      if (*capital == NULL)
+      {
+         bots_message("Not enough memory to allocate a village\n");
+         exit(1);
+      }
+      /* Initialize Village */
+      (*capital)->back = back;
+      (*capital)->next = next;
+      (*capital)->level = level;
+      (*capital)->id = vid;
+      (*capital)->seed = vid * (IQ + sim_seed);
+      (*capital)->population = NULL;
+      for(i=0;i<population;i++)
+      {
+         patient = (struct Patient *)malloc(sizeof(struct Patient));
+         if (patient == NULL)
+         {
+            bots_message("Not enough memory to allocate a patient\n");
+            exit(1);
+         }
+         patient->id = sim_pid++;
+         patient->seed = (*capital)->seed;
+         // changes seed for capital:
+         my_rand(&((*capital)->seed));
+         patient->hosps_visited = 0;
+         patient->time = 0;
+         patient->time_left = 0;
+         patient->home_village = *capital;
+         addList(&((*capital)->population), patient);
+      }
+      /* Initialize Hospital */
+      (*capital)->hosp.personnel = personnel;
+      (*capital)->hosp.free_personnel = personnel;
+      (*capital)->hosp.assess = NULL;
+      (*capital)->hosp.waiting = NULL;
+      (*capital)->hosp.inside = NULL;
+      (*capital)->hosp.realloc = NULL;
+      omp_init_lock(&(*capital)->hosp.realloc_lock);
+      // Create Cities (lower level)
+      inext = NULL;
+      /* stays NULL when sim_cities <= 0, so forward is never left uninitialised */
+      current = NULL;
+      for (i = sim_cities; i>0; i--)
+      {
+         allocate_village(&current, *capital, inext, level-1, (vid * (int32_t) sim_cities)+ (int32_t) i);
+         inext = current;
+      }
+      (*capital)->forward = current;
+   }
+}
+/**********************************************************************/
+/* Walk one patient list, adding every patient to the grand total, to the
+ * list-specific counter *bucket, and to the visit/time sums.  Integer
+ * arithmetic throughout: the Results fields are long. */
+static void tally_patients(const struct Patient *p, long *bucket, struct Results *res)
+{
+   for (; p != NULL; p = p->forward)
+   {
+      res->total_patients += 1;
+      *bucket += 1;
+      res->total_hosps_v += p->hosps_visited;
+      res->total_time += p->time;
+   }
+}
+/* Aggregate the statistics of village and of every village below it. */
+struct Results get_results(struct Village *village)
+{
+   struct Village *vlist;
+   struct Results t_res, p_res;
+
+   t_res.hosps_number = 0;
+   t_res.hosps_personnel = 0;
+   t_res.total_patients = 0;
+   t_res.total_in_village = 0;
+   t_res.total_waiting = 0;
+   t_res.total_assess = 0;
+   t_res.total_inside = 0;
+   t_res.total_hosps_v = 0;
+   t_res.total_time = 0;
+
+   if (village == NULL) return t_res;
+
+   /* Traverse village hierarchy (lower level first)*/
+   vlist = village->forward;
+   while(vlist)
+   {
+      p_res = get_results(vlist);
+      t_res.hosps_number += p_res.hosps_number;
+      t_res.hosps_personnel += p_res.hosps_personnel;
+      t_res.total_patients += p_res.total_patients;
+      t_res.total_in_village += p_res.total_in_village;
+      t_res.total_waiting += p_res.total_waiting;
+      t_res.total_assess += p_res.total_assess;
+      t_res.total_inside += p_res.total_inside;
+      t_res.total_hosps_v += p_res.total_hosps_v;
+      t_res.total_time += p_res.total_time;
+      vlist = vlist->next;
+   }
+   t_res.hosps_number += 1;
+   t_res.hosps_personnel += village->hosp.personnel;
+
+   // Patients in the village
+   tally_patients(village->population, &t_res.total_in_village, &t_res);
+   // Patients in hospital: waiting
+   tally_patients(village->hosp.waiting, &t_res.total_waiting, &t_res);
+   // Patients in hospital: assess
+   tally_patients(village->hosp.assess, &t_res.total_assess, &t_res);
+   // Patients in hospital: inside
+   tally_patients(village->hosp.inside, &t_res.total_inside, &t_res);
+
+   return t_res;
+}
+/**********************************************************************/
+/**********************************************************************/
+/**********************************************************************/
+/* Patients in treatment: count down, and send home those who are done. */
+void check_patients_inside(struct Village *village)
+{
+   struct Patient *list = village->hosp.inside;
+   struct Patient *p;
+
+   while (list != NULL)
+   {
+      p = list;
+      list = list->forward;
+      p->time_left--;
+      if (p->time_left == 0)
+      {
+         village->hosp.free_personnel++;
+         removeList(&(village->hosp.inside), p);
+         addList(&(village->population), p);
+      }
+   }
+}
+/**********************************************************************/
+/* Patients being assessed: when the assessment ends they are admitted,
+ * sent home, or handed to the parent village's hospital. */
+void check_patients_assess_par(struct Village *village)
+{
+   struct Patient *list = village->hosp.assess;
+   float draw;
+   struct Patient *p;
+
+   while (list != NULL)
+   {
+      p = list;
+      list = list->forward;
+      p->time_left--;
+
+      if (p->time_left == 0)
+      {
+         draw = my_rand(&(p->seed));
+         /* sim_covalescense_p % */
+         if (draw < sim_convalescence_p)
+         {
+            draw = my_rand(&(p->seed));
+            /* !sim_realloc_p % or root hospital */
+            if (draw > sim_realloc_p || village->level == sim_level)
+            {
+               removeList(&(village->hosp.assess), p);
+               addList(&(village->hosp.inside), p);
+               p->time_left = sim_convalescence_time;
+               p->time += p->time_left;
+            }
+            else /* move to upper level hospital !!! */
+            {
+               village->hosp.free_personnel++;
+               removeList(&(village->hosp.assess), p);
+               /* The list being modified belongs to the parent and is shared
+                * by all sibling tasks, so it is the parent's lock that must
+                * be held.  village->back is non-NULL here because this
+                * branch is not taken for the root (level == sim_level). */
+               omp_set_lock(&(village->back->hosp.realloc_lock));
+               addList(&(village->back->hosp.realloc), p);
+               omp_unset_lock(&(village->back->hosp.realloc_lock));
+            }
+         }
+         else /* move to village */
+         {
+            village->hosp.free_personnel++;
+            removeList(&(village->hosp.assess), p);
+            addList(&(village->population), p);
+         }
+      }
+   }
+}
+/**********************************************************************/
+/* Waiting patients: start an assessment while staff is free, else wait. */
+void check_patients_waiting(struct Village *village)
+{
+   struct Patient *list = village->hosp.waiting;
+   struct Patient *p;
+
+   while (list != NULL)
+   {
+      p = list;
+      list = list->forward;
+      if (village->hosp.free_personnel > 0)
+      {
+         village->hosp.free_personnel--;
+         p->time_left = sim_assess_time;
+         p->time += p->time_left;
+         removeList(&(village->hosp.waiting), p);
+         addList(&(village->hosp.assess), p);
+      }
+      else
+      {
+         p->time++;
+      }
+   }
+}
+/**********************************************************************/
+/* Patients handed up by child villages: check them in, lowest id first,
+ * so the outcome does not depend on the order the children appended them. */
+void check_patients_realloc(struct Village *village)
+{
+   struct Patient *p, *s;
+
+   while (village->hosp.realloc != NULL)
+   {
+      p = s = village->hosp.realloc;
+      while (p != NULL)
+      {
+         if (p->id < s->id) s = p;
+         p = p->forward;
+      }
+      removeList(&(village->hosp.realloc), s);
+      put_in_hosp(&(village->hosp), s);
+   }
+}
+/**********************************************************************/
+/* Healthy population: each patient may fall sick and go to hospital. */
+void check_patients_population(struct Village *village)
+{
+   struct Patient *list = village->population;
+   struct Patient *p;
+   float draw;
+
+   while (list != NULL)
+   {
+      p = list;
+      list = list->forward;
+      /* randomize in patient */
+      draw = my_rand(&(p->seed));
+      if (draw < sim_get_sick_p)
+      {
+         removeList(&(village->population), p);
+         put_in_hosp(&(village->hosp), p);
+      }
+   }
+
+}
+/**********************************************************************/
+/* Check a patient in: straight to assessment when staff is free,
+ * otherwise onto the waiting list. */
+void put_in_hosp(struct Hosp *hosp, struct Patient *patient)
+{
+   (patient->hosps_visited)++;
+
+   if (hosp->free_personnel > 0)
+   {
+      hosp->free_personnel--;
+      addList(&(hosp->assess), patient);
+      patient->time_left = sim_assess_time;
+      patient->time += patient->time_left;
+   }
+   else
+   {
+      addList(&(hosp->waiting), patient);
+   }
+}
+/**********************************************************************/
+/* sim_village_par: simulate one time step for village and its subtree.
+ * The children are simulated in tasks; they must all have finished (taskwait)
+ * before this village processes the patients they handed up.
+ * The three variants differ only in how task creation is cut off. */
+#if defined (IF_CUTOFF)
+void sim_village_par(struct Village *village)
+{
+   struct Village *vlist;
+
+   // lowest level returns nothing
+   // only for sim_village first call with village = NULL
+   // recursive call cannot occurs
+   if (village == NULL) return;
+
+   /* Traverse village hierarchy (lower level first)*/
+   vlist = village->forward;
+   while(vlist)
+   {
+#pragma omp task untied if((sim_level - village->level) < bots_cutoff_value)
+      sim_village_par(vlist);
+      vlist = vlist->next;
+   }
+
+   /* Uses lists v->hosp->inside, and v->return */
+   check_patients_inside(village);
+
+   /* Uses lists v->hosp->assess, v->hosp->inside, v->population and (v->back->hosp->realloc) !!! */
+   check_patients_assess_par(village);
+
+   /* Uses lists v->hosp->waiting, and v->hosp->assess */
+   check_patients_waiting(village);
+
+#pragma omp taskwait
+
+   /* Uses lists v->hosp->realloc, v->hosp->asses and v->hosp->waiting */
+   check_patients_realloc(village);
+
+   /* Uses list v->population, v->hosp->asses and v->h->waiting */
+   check_patients_population(village);
+}
+#elif defined (MANUAL_CUTOFF)
+void sim_village_par(struct Village *village)
+{
+   struct Village *vlist;
+
+   // lowest level returns nothing
+   // only for sim_village first call with village = NULL
+   // recursive call cannot occurs
+   if (village == NULL) return;
+
+   /* Traverse village hierarchy (lower level first)*/
+   vlist = village->forward;
+   if ((sim_level-village->level) < bots_cutoff_value)
+   {
+      while(vlist)
+      {
+#pragma omp task untied
+         sim_village_par(vlist);
+         vlist = vlist->next;
+      }
+   }
+   else
+   {
+      while(vlist)
+      {
+         sim_village_par(vlist);
+         vlist = vlist->next;
+      }
+   }
+
+   /* Uses lists v->hosp->inside, and v->return */
+   check_patients_inside(village);
+
+   /* Uses lists v->hosp->assess, v->hosp->inside, v->population and (v->back->hosp->realloc) !!! */
+   check_patients_assess_par(village);
+
+   /* Uses lists v->hosp->waiting, and v->hosp->assess */
+   check_patients_waiting(village);
+
+   if ((sim_level-village->level) < bots_cutoff_value)
+   {
+#pragma omp taskwait
+   }
+
+   /* Uses lists v->hosp->realloc, v->hosp->asses and v->hosp->waiting */
+   check_patients_realloc(village);
+
+   /* Uses list v->population, v->hosp->asses and v->h->waiting */
+   check_patients_population(village);
+}
+#else
+void sim_village_par(struct Village *village)
+{
+   struct Village *vlist;
+
+   // lowest level returns nothing
+   // only for sim_village first call with village = NULL
+   // recursive call cannot occurs
+   if (village == NULL) return;
+
+   /* Traverse village hierarchy (lower level first)*/
+   vlist = village->forward;
+   while(vlist)
+   {
+#pragma omp task untied
+      sim_village_par(vlist);
+      vlist = vlist->next;
+   }
+
+   /* Uses lists v->hosp->inside, and v->return */
+   check_patients_inside(village);
+
+   /* Uses lists v->hosp->assess, v->hosp->inside, v->population and (v->back->hosp->realloc) !!! */
+   check_patients_assess_par(village);
+
+   /* Uses lists v->hosp->waiting, and v->hosp->assess */
+   check_patients_waiting(village);
+
+#pragma omp taskwait
+
+   /* Uses lists v->hosp->realloc, v->hosp->asses and v->hosp->waiting */
+   check_patients_realloc(village);
+
+   /* Uses list v->population, v->hosp->asses and v->h->waiting */
+   check_patients_population(village);
+}
+#endif
+/**********************************************************************/
+/* Debug dump: the ids of the healthy population of every village,
+ * children before their parent. */
+void my_print(struct Village *village)
+{
+   struct Village *vlist;
+   struct Patient *plist;
+
+   if (village == NULL) return;
+
+   /* Traverse village hierarchy (lower level first)*/
+   vlist = village->forward;
+   while(vlist) {
+      my_print(vlist);
+      vlist = vlist->next;
+   }
+
+   plist = village->population;
+
+   while (plist != NULL) {
+      bots_debug("[pid:%d]",plist->id);
+      plist = plist->forward;
+   }
+   bots_debug("[vid:%d]\n",village->id);
+
+}
+/**********************************************************************/
+/* Read the ten simulation parameters followed by the nine expected results
+ * from filename into the sim_* / res_* globals and print the parameters.
+ * Exits when the file cannot be opened or the parameters cannot be parsed. */
+void read_input_data(char *filename)
+{
+   FILE *fin;
+   int res;
+   /* "%ld" stores a long; scanning it directly into the int32_t sim_seed
+    * writes past the object wherever long is 64 bits wide */
+   long seed = 0;
+
+   if ((fin = fopen(filename, "r")) == NULL) {
+      bots_message("Could not open sequence file (%s)\n", filename);
+      exit (-1);
+   }
+   res = fscanf(fin,"%d %d %d %d %d %d %ld %f %f %f %d %d %d %d %d %d %d %d %f",
+             &sim_level,
+             &sim_cities,
+             &sim_population_ratio,
+             &sim_time,
+             &sim_assess_time,
+             &sim_convalescence_time,
+             &seed,
+             &sim_get_sick_p,
+             &sim_convalescence_p,
+             &sim_realloc_p,
+             &res_population,
+             &res_hospitals,
+             &res_personnel,
+             &res_checkin,
+             &res_village,
+             &res_waiting,
+             &res_assess,
+             &res_inside,
+             &res_avg_stay
+   );
+   fclose(fin);
+   /* fscanf returns the number of fields converted, which may be short
+    * without being EOF; the first ten fields drive the simulation */
+   if ( res == EOF || res < 10 ) {
+      bots_message("Bogus input file (%s)\n", filename);
+      exit(-1);
+   }
+   if ( res < 19 ) {
+      bots_message("Expected results are incomplete in input file (%s)\n", filename);
+   }
+   sim_seed = (int32_t) seed;
+
+   // Printing input data
+   bots_message("\n");
+   bots_message("Number of levels = %d\n", (int) sim_level);
+   bots_message("Cities per level = %d\n", (int) sim_cities);
+   bots_message("Population ratio = %d\n", (int) sim_population_ratio);
+   bots_message("Simulation time = %d\n", (int) sim_time);
+   bots_message("Assess time = %d\n", (int) sim_assess_time);
+   bots_message("Convalescence time = %d\n", (int) sim_convalescence_time);
+   bots_message("Initial seed = %d\n", (int) sim_seed);
+   bots_message("Get sick prob. = %f\n", (float) sim_get_sick_p);
+   bots_message("Convalescence prob. = %f\n", (float) sim_convalescence_p);
+   bots_message("Realloc prob. = %f\n", (float) sim_realloc_p);
+}
+/* Compare the simulated totals with the expected values from the input
+ * file, print both, and return BOTS_RESULT_SUCCESSFUL only if all the
+ * counters match (the average stay is printed but not checked). */
+int check_village(struct Village *top)
+{
+   struct Results result = get_results(top);
+   int answer = BOTS_RESULT_SUCCESSFUL;
+
+   if (res_population != result.total_patients) answer = BOTS_RESULT_UNSUCCESSFUL;
+   if (res_hospitals != result.hosps_number) answer = BOTS_RESULT_UNSUCCESSFUL;
+   if (res_personnel != result.hosps_personnel) answer = BOTS_RESULT_UNSUCCESSFUL;
+   if (res_checkin != result.total_hosps_v) answer = BOTS_RESULT_UNSUCCESSFUL;
+   if (res_village != result.total_in_village) answer = BOTS_RESULT_UNSUCCESSFUL;
+   if (res_waiting != result.total_waiting) answer = BOTS_RESULT_UNSUCCESSFUL;
+   if (res_assess != result.total_assess) answer = BOTS_RESULT_UNSUCCESSFUL;
+   if (res_inside != result.total_inside) answer = BOTS_RESULT_UNSUCCESSFUL;
+
+   bots_message("\n");
+   bots_message("Sim. Variables = expect / result\n");
+   bots_message("Total population = %6d / %6d people\n", (int) res_population, (int) result.total_patients);
+   bots_message("Hospitals = %6d / %6d people\n", (int) res_hospitals, (int) result.hosps_number);
+   bots_message("Personnel = %6d / %6d people\n", (int) res_personnel, (int) result.hosps_personnel);
+   bots_message("Check-in's = %6d / %6d people\n", (int) res_checkin, (int) result.total_hosps_v);
+   bots_message("In Villages = %6d / %6d people\n", (int) res_village, (int) result.total_in_village);
+   bots_message("In Waiting List = %6d / %6d people\n", (int) res_waiting, (int) result.total_waiting);
+   bots_message("In Assess = %6d / %6d people\n", (int) res_assess, (int) result.total_assess);
+   bots_message("Inside Hospital = %6d / %6d people\n", (int) res_inside, (int) result.total_inside);
+   bots_message("Average Stay = %6f / %6f u/time\n", (float) res_avg_stay,(float) result.total_time/result.total_patients);
+
+   my_print(top);
+
+   return answer;
+}
+/**********************************************************************/
+/* Run the whole simulation: sim_time steps over the tree rooted at top. */
+void sim_village_main_par(struct Village *top)
+{
+   long i;
+   for (i = 0; i < sim_time; i++) sim_village_par(top);
+}
+
diff --git a/ompss/health/health.h b/ompss/health/health.h
new file mode 100644
index 0000000..4fc293c
--- /dev/null
+++ b/ompss/health/health.h
@@ -0,0 +1,106 @@
+/**********************************************************************************************/
+/* This program is part of the Barcelona OpenMP Tasks Suite */
+/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */
+/* Copyright (C) 2009 Universitat Politecnica de Catalunya */
+/* */
+/* This program is free software; you can redistribute it and/or modify */
+/* it under the terms of the GNU General Public License as published by */
+/* the Free Software Foundation; either version 2 of the License, or */
+/* (at your option) any later version.
#ifndef _HEALTH_H
#define _HEALTH_H
/* random defines */
/* NOTE(review): presumably the constants of the generator behind my_rand() -- confirm in health.c */
#define IA 16807
#define IM 2147483647
#define AM (1.0 / IM)
#define IQ 127773
#define IR 2836
#define MASK 123459876

/*
 * Aggregated counters produced by get_results(). check_village() compares
 * every field except total_time against an expected res_* value;
 * total_time is only used to print the average stay.
 */
struct Results {
   long hosps_number;
   long hosps_personnel;
   long total_patients;
   long total_in_village;
   long total_waiting;
   long total_assess;
   long total_inside;
   long total_time;
   long total_hosps_v;
};

extern int sim_level;

/*
 * One simulated person. `back`/`forward` are the links used to chain
 * patients together (see addList()/removeList(), which take a list head).
 */
struct Patient {
   int id;
   int32_t seed;
   int time;
   int time_left;
   int hosps_visited;
   struct Village *home_village;
   struct Patient *back;
   struct Patient *forward;
};
/*
 * Hospital state of a village: personnel counters plus one patient list
 * per stage (waiting, assess, inside, realloc).
 */
struct Hosp {
   int personnel;
   int free_personnel;
   struct Patient *waiting;
   struct Patient *assess;
   struct Patient *inside;
   struct Patient *realloc;
   /* NOTE(review): presumably protects the `realloc` list -- confirm in health.c */
   omp_lock_t realloc_lock;
};
/*
 * Node of the village hierarchy, carrying its population list, its
 * hospital, its level and its own random seed.
 * NOTE(review): the exact roles of back/next/forward are not visible
 * from this header -- confirm against allocate_village().
 */
struct Village {
   int id;
   struct Village *back;
   struct Village *next;
   struct Village *forward;
   struct Patient *population;
   struct Hosp hosp;
   int level;
   int32_t seed;
};

/* random number helper; the seed is passed by pointer */
float my_rand(int32_t *seed);

struct Patient *generate_patient(struct Village *village);
void put_in_hosp(struct Hosp *hosp, struct Patient *patient);

/* patient list manipulation */
void addList(struct Patient **list, struct Patient *patient);
void removeList(struct Patient **list, struct Patient *patient);

/* per-village processing of each hospital stage */
void check_patients_inside(struct Village *village);
void check_patients_waiting(struct Village *village);
void check_patients_realloc(struct Village *village);

void check_patients_assess_par(struct Village *village);

float get_num_people(struct Village *village);
float get_total_time(struct Village *village);
float get_total_hosps(struct Village *village);

/* collects the counters of struct Results for the given village */
struct Results get_results(struct Village *village);

void read_input_data(char *filename);
void allocate_village( struct Village **capital, struct Village *back, struct Village *next, int level, int32_t vid);
/* runs sim_village_par() on `top` once per time step (sim_time iterations) */
void sim_village_main_par(struct Village *top);

void sim_village_par(struct Village *village);
/* compares the simulation totals with the expected values; returns a BOTS_RESULT_* code */
int check_village(struct Village *top);

void check_patients_assess(struct Village *village);
void check_patients_population(struct Village *village);
void sim_village(struct Village *village);
void my_print(struct Village *village);

#endif
# Optional per-application settings, left disabled for this benchmark.
#LIBS =
#PROGRAM_OBJS=

# Cut-off variants to build. When this variable is defined, Makefile.common
# adds one extra target per listed name, suffixed with "-<name>".
CUTOFF_VERSIONS = manual if_clause final
# When defined, Makefile.common also builds "-tied" variants of every target.
# Those are compiled with -DFORCE_TIED_TASKS from a copy of the source in
# which sed has removed the "untied" clause of the task pragmas.
TIED_VERSIONS = yes

# Root of the source tree; used below to locate common/Makefile.common.
BASE_DIR = ../../

#
# Don't change below here
#

include ../Makefile.version
include $(BASE_DIR)/common/Makefile.common
/* Application name and the printf-style description of its parameters */
#define BOTS_APP_NAME "N Queens"
#define BOTS_APP_PARAMETERS_DESC "N=%d"
#define BOTS_APP_PARAMETERS_LIST ,bots_arg_size

/*
 * The application takes a size argument (the board size).
 * The default of 14 is also the largest size for which nqueens.c
 * stores a reference solution count.
 */
#define BOTS_APP_USES_ARG_SIZE
#define BOTS_APP_DEF_ARG_SIZE 14
#define BOTS_APP_DESC_ARG_SIZE "Board size"

/* returns 1 when none of the first n queens in `a` conflict, 0 otherwise */
int ok(int n, char *a);

/* task-based solver: counts the solutions reachable from `a` with j queens placed */
void nqueens(int n, int j, char *a, int *solutions, int depth);

/* sequential solver with the same contract, minus the depth argument */
void nqueens_ser (int n, int j, char *a, int *solutions);

int verify_queens(int);
void find_queens (int);

/* entry points used by the common BOTS driver */
#define KERNEL_CALL find_queens(bots_arg_size)
#define KERNEL_CHECK verify_queens(bots_arg_size)

/* default task-creation cut-off depth */
#define BOTS_CUTOFF_DEF_VALUE 3
/*
 * <a> holds the row chosen for the queen of each of the first <n> columns.
 * Returns 1 when no two of those queens attack each other (same row or
 * same diagonal), and 0 as soon as one conflicting pair is found.
 */
int ok(int n, char *a)
{
     int first, second;

     for (first = 0; first < n; first++) {
	  for (second = first + 1; second < n; second++) {
	       int gap = second - first;           /* column distance */
	       int delta = a[second] - a[first];   /* row distance, signed */

	       /* same row, or row distance equal to column distance */
	       if (delta == 0 || delta == -gap || delta == gap)
		    return 0;
	  }
     }
     return 1;
}
/*
 * FINAL_CUTOFF variant of the task-based solver.
 *
 *   n         board size
 *   j         number of queens already placed in <a>
 *   a         positions of the queens placed so far
 *   solutions counter receiving the number of solutions found
 *   depth     current recursion depth, compared with bots_cutoff_value
 *
 * Children are created with a final() clause once depth+1 reaches the
 * cut-off. Below the cut-off the children stop copying the board and
 * accumulate straight into the caller's counter.
 */
void nqueens(int n, int j, char *a, int *solutions, int depth)
{
	int *csols;
	int i;


	if (n == j) {
		/* good solution, count it */
		/* "+=" rather than "=": in the final path several children share one counter */
		*solutions += 1;
		return;
	}


	/* remember whether this invocation itself runs inside a final task */
	char final = omp_in_final();
	if ( !final ) {
		/* regular path: one private, zeroed counter per candidate position */
		*solutions = 0;
		csols = alloca(n*sizeof(int));
		memset(csols,0,n*sizeof(int));
	}

     	/* try each possible position for queen */
	for (i = 0; i < n; i++) {
 		#pragma omp task untied final(depth+1 >= bots_cutoff_value) mergeable
		{
	   		char *b;
			int *sol;
			if ( omp_in_final() && depth+1 > bots_cutoff_value ) {
				/*
				 * Past the cut-off: work in place on the caller's board and
				 * counter (csols is not allocated on this path).
				 * NOTE(review): presumably safe because final tasks run
				 * immediately, one after another -- confirm against the
				 * runtime's final/mergeable semantics.
				 */
		   		b = a;
				sol = solutions;
			} else {
	  			/* allocate a temporary array and copy into it */
	  			b = alloca(n * sizeof(char));
	  			memcpy(b, a, j * sizeof(char));
				sol = &csols[i];
			}
	  		b[j] = i;
	  		if (ok(j + 1, b))
       				nqueens(n, j + 1, b,sol,depth+1);
		}
	}

	#pragma omp taskwait
	if ( !final ) {
		/* fold the per-position counters into the caller's result */
		for ( i = 0; i < n; i++) *solutions += csols[i];
	}
}
a,&csols[i]); + } + } + + #pragma omp taskwait + for ( i = 0; i < n; i++) *solutions += csols[i]; +} + + +#else + +void nqueens(int n, int j, char *a, int *solutions, int depth) +{ + int *csols; + int i; + + + if (n == j) { + /* good solution, count it */ + *solutions = 1; + return; + } + + *solutions = 0; + csols = alloca(n*sizeof(int)); + memset(csols,0,n*sizeof(int)); + + /* try each possible position for queen */ + for (i = 0; i < n; i++) { + #pragma omp task untied + { + /* allocate a temporary array and copy into it */ + char * b = alloca(n * sizeof(char)); + memcpy(b, a, j * sizeof(char)); + b[j] = (char) i; + if (ok(j + 1, b)) + nqueens(n, j + 1, b,&csols[i],depth); //FIXME: depth or depth+1 ??? + } + } + + #pragma omp taskwait + for ( i = 0; i < n; i++) *solutions += csols[i]; +} + +#endif + +void find_queens (int size) +{ + total_count=0; + + bots_message("Computing N-Queens algorithm (n=%d) ", size); + + char *a; + + a = alloca(size * sizeof(char)); + nqueens(size, 0, a, &total_count,0); + + bots_message(" completed!\n"); +} + + +int verify_queens (int size) +{ + if ( size > MAX_SOLUTIONS ) return BOTS_RESULT_NA; + if ( total_count == solutions[size-1]) return BOTS_RESULT_SUCCESSFUL; + return BOTS_RESULT_UNSUCCESSFUL; +} diff --git a/ompss/sort/Makefile b/ompss/sort/Makefile new file mode 100644 index 0000000..2e5be6a --- /dev/null +++ b/ompss/sort/Makefile @@ -0,0 +1,35 @@ +############################################################################################## +# This program is part of the Barcelona OpenMP Tasks Suite # +# Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion # +# Copyright (C) 2009 Universitat Politecnica de Catalunya # +# # +# This program is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 2 of the License, or # +# (at your option) any later version. 
# Optional per-application settings, left disabled for this benchmark.
#LIBS =
#PROGRAM_OBJS=

# When defined, Makefile.common also builds "-tied" variants of every target.
# Those are compiled with -DFORCE_TIED_TASKS from a copy of the source in
# which sed has removed the "untied" clause of the task pragmas.
# No CUTOFF_VERSIONS here: this benchmark builds no cut-off variants.
TIED_VERSIONS = yes

# Root of the source tree; used below to locate common/Makefile.common.
BASE_DIR = ../../

#
# Don't change below here
#

include ../Makefile.version
include $(BASE_DIR)/common/Makefile.common
/*
 * Application name and parameter summary. The summary fields map to:
 * N = array size, Q = quicksort cut-off (cutoff_value_1),
 * I = insertion-sort cut-off (cutoff_value_2), M = merge cut-off (cutoff_value).
 */
#define BOTS_APP_NAME "Sort"
#define BOTS_APP_PARAMETERS_DESC "N=%d:Q=%d:I=%d:M=%d"
#define BOTS_APP_PARAMETERS_LIST ,bots_arg_size,bots_app_cutoff_value_1,bots_app_cutoff_value_2,bots_app_cutoff_value

/* number of elements to sort */
#define BOTS_APP_USES_ARG_SIZE
#define BOTS_APP_DEF_ARG_SIZE (32*1024*1024)
#define BOTS_APP_DESC_ARG_SIZE "Array size"

/* cilkmerge_par() switches to seqmerge() when the smaller range is below this value */
#define BOTS_APP_USES_ARG_CUTOFF
#define BOTS_APP_DEF_ARG_CUTOFF (2*1024)
#define BOTS_APP_DESC_ARG_CUTOFF "Sequential Merge cutoff value"

/* cilksort_par() switches to seqquick() below this size */
#define BOTS_APP_USES_ARG_CUTOFF_1
#define BOTS_APP_DEF_ARG_CUTOFF_1 (2*1024)
#define BOTS_APP_DESC_ARG_CUTOFF_1 "Sequential Quicksort cutoff value"

/* seqquick() stops partitioning and runs insertion_sort() below this size */
#define BOTS_APP_USES_ARG_CUTOFF_2
#define BOTS_APP_DEF_ARG_CUTOFF_2 (20)
#define BOTS_APP_DESC_ARG_CUTOFF_2 "Sequential Insertion cutoff value"

/* element type of the arrays being sorted */
typedef long ELM;

void seqquick(ELM *low, ELM *high);
void seqmerge(ELM *low1, ELM *high1, ELM *low2, ELM *high2, ELM *lowdest);
ELM *binsplit(ELM val, ELM *low, ELM *high);
void cilkmerge(ELM *low1, ELM *high1, ELM *low2, ELM *high2, ELM *lowdest);
void cilkmerge_par(ELM *low1, ELM *high1, ELM *low2, ELM *high2, ELM *lowdest);
void cilksort(ELM *low, ELM *tmp, long size);
void cilksort_par(ELM *low, ELM *tmp, long size);
void scramble_array( ELM *array );
void fill_array( ELM *array );
void sort ( void );

void sort_par (void);
void sort_init (void);
int sort_verify (void);

/* hooks used by the common BOTS driver; KERNEL_INIT is intentionally empty */
#define BOTS_APP_INIT sort_init()

#define KERNEL_INIT
#define KERNEL_CALL sort_par()
#define KERNEL_CHECK sort_verify()
b/ompss/sort/sort.c @@ -0,0 +1,485 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +/* + * Original code from the Cilk project + * + * Copyright (c) 2000 Massachusetts Institute of Technology + * Copyright (c) 2000 Matteo Frigo + */ + +/* + * this program uses an algorithm that we call `cilksort'. + * The algorithm is essentially mergesort: + * + * cilksort(in[1..n]) = + * spawn cilksort(in[1..n/2], tmp[1..n/2]) + * spawn cilksort(in[n/2..n], tmp[n/2..n]) + * sync + * spawn cilkmerge(tmp[1..n/2], tmp[n/2..n], in[1..n]) + * + * + * The procedure cilkmerge does the following: + * + * cilkmerge(A[1..n], B[1..m], C[1..(n+m)]) = + * find the median of A \union B using binary + * search. 
The binary search gives a pair + * (ma, mb) such that ma + mb = (n + m)/2 + * and all elements in A[1..ma] are smaller than + * B[mb..m], and all the B[1..mb] are smaller + * than all elements in A[ma..n]. + * + * spawn cilkmerge(A[1..ma], B[1..mb], C[1..(n+m)/2]) + * spawn cilkmerge(A[ma..m], B[mb..n], C[(n+m)/2 .. (n+m)]) + * sync + * + * The algorithm appears for the first time (AFAIK) in S. G. Akl and + * N. Santoro, "Optimal Parallel Merging and Sorting Without Memory + * Conflicts", IEEE Trans. Comp., Vol. C-36 No. 11, Nov. 1987 . The + * paper does not express the algorithm using recursion, but the + * idea of finding the median is there. + * + * For cilksort of n elements, T_1 = O(n log n) and + * T_\infty = O(log^3 n). There is a way to shave a + * log factor in the critical path (left as homework). + */ + +#include +#include +#include +#include "bots.h" +#include "app-desc.h" + +ELM *array, *tmp; + +static unsigned long rand_nxt = 0; + +static inline unsigned long my_rand(void) +{ + rand_nxt = rand_nxt * 1103515245 + 12345; + return rand_nxt; +} + +static inline void my_srand(unsigned long seed) +{ + rand_nxt = seed; +} + +static inline ELM med3(ELM a, ELM b, ELM c) +{ + if (a < b) { + if (b < c) { + return b; + } else { + if (a < c) + return c; + else + return a; + } + } else { + if (b > c) { + return b; + } else { + if (a > c) + return c; + else + return a; + } + } +} + +/* + * simple approach for now; a better median-finding + * may be preferable + */ +static inline ELM choose_pivot(ELM *low, ELM *high) +{ + return med3(*low, *high, low[(high - low) / 2]); +} + +static ELM *seqpart(ELM *low, ELM *high) +{ + ELM pivot; + ELM h, l; + ELM *curr_low = low; + ELM *curr_high = high; + + pivot = choose_pivot(low, high); + + while (1) { + while ((h = *curr_high) > pivot) + curr_high--; + + while ((l = *curr_low) < pivot) + curr_low++; + + if (curr_low >= curr_high) + break; + + *curr_high-- = l; + *curr_low++ = h; + } + + /* + * I don't know if this is really 
necessary. + * The problem is that the pivot is not always the + * first element, and the partition may be trivial. + * However, if the partition is trivial, then + * *high is the largest element, whence the following + * code. + */ + if (curr_high < high) + return curr_high; + else + return curr_high - 1; +} + +#define swap(a, b) \ +{ \ + ELM tmp;\ + tmp = a;\ + a = b;\ + b = tmp;\ +} + +static void insertion_sort(ELM *low, ELM *high) +{ + ELM *p, *q; + ELM a, b; + + for (q = low + 1; q <= high; ++q) { + a = q[0]; + for (p = q - 1; p >= low && (b = p[0]) > a; p--) + p[1] = b; + p[1] = a; + } +} + +/* + * tail-recursive quicksort, almost unrecognizable :-) + */ +void seqquick(ELM *low, ELM *high) +{ + ELM *p; + + while (high - low >= bots_app_cutoff_value_2) { + p = seqpart(low, high); + seqquick(low, p); + low = p + 1; + } + + insertion_sort(low, high); +} + +void seqmerge(ELM *low1, ELM *high1, ELM *low2, ELM *high2, + ELM *lowdest) +{ + ELM a1, a2; + + /* + * The following 'if' statement is not necessary + * for the correctness of the algorithm, and is + * in fact subsumed by the rest of the function. + * However, it is a few percent faster. Here is why. + * + * The merging loop below has something like + * if (a1 < a2) { + * *dest++ = a1; + * ++low1; + * if (end of array) break; + * a1 = *low1; + * } + * + * Now, a1 is needed immediately in the next iteration + * and there is no way to mask the latency of the load. + * A better approach is to load a1 *before* the end-of-array + * check; the problem is that we may be speculatively + * loading an element out of range. While this is + * probably not a problem in practice, yet I don't feel + * comfortable with an incorrect algorithm. Therefore, + * I use the 'fast' loop on the array (except for the last + * element) and the 'slow' loop for the rest, saving both + * performance and correctness. 
+ */ + + if (low1 < high1 && low2 < high2) { + a1 = *low1; + a2 = *low2; + for (;;) { + if (a1 < a2) { + *lowdest++ = a1; + a1 = *++low1; + if (low1 >= high1) + break; + } else { + *lowdest++ = a2; + a2 = *++low2; + if (low2 >= high2) + break; + } + } + } + if (low1 <= high1 && low2 <= high2) { + a1 = *low1; + a2 = *low2; + for (;;) { + if (a1 < a2) { + *lowdest++ = a1; + ++low1; + if (low1 > high1) + break; + a1 = *low1; + } else { + *lowdest++ = a2; + ++low2; + if (low2 > high2) + break; + a2 = *low2; + } + } + } + if (low1 > high1) { + memcpy(lowdest, low2, sizeof(ELM) * (high2 - low2 + 1)); + } else { + memcpy(lowdest, low1, sizeof(ELM) * (high1 - low1 + 1)); + } +} + +#define swap_indices(a, b) \ +{ \ + ELM *tmp;\ + tmp = a;\ + a = b;\ + b = tmp;\ +} + +ELM *binsplit(ELM val, ELM *low, ELM *high) +{ + /* + * returns index which contains greatest element <= val. If val is + * less than all elements, returns low-1 + */ + ELM *mid; + + while (low != high) { + mid = low + ((high - low + 1) >> 1); + if (val <= *mid) + high = mid - 1; + else + low = mid; + } + + if (*low > val) + return low - 1; + else + return low; +} + + +void cilkmerge_par(ELM *low1, ELM *high1, ELM *low2, ELM *high2, ELM *lowdest) +{ + /* + * Cilkmerge: Merges range [low1, high1] with range [low2, high2] + * into the range [lowdest, ...] + */ + + ELM *split1, *split2; /* + * where each of the ranges are broken for + * recursive merge + */ + long int lowsize; /* + * total size of lower halves of two + * ranges - 2 + */ + + /* + * We want to take the middle element (indexed by split1) from the + * larger of the two arrays. The following code assumes that split1 + * is taken from range [low1, high1]. 
So if [low1, high1] is + * actually the smaller range, we should swap it with [low2, high2] + */ + + if (high2 - low2 > high1 - low1) { + swap_indices(low1, low2); + swap_indices(high1, high2); + } + if (high2 < low2) { + /* smaller range is empty */ + memcpy(lowdest, low1, sizeof(ELM) * (high1 - low1)); + return; + } + if (high2 - low2 < bots_app_cutoff_value ) { + seqmerge(low1, high1, low2, high2, lowdest); + return; + } + /* + * Basic approach: Find the middle element of one range (indexed by + * split1). Find where this element would fit in the other range + * (indexed by split 2). Then merge the two lower halves and the two + * upper halves. + */ + + split1 = ((high1 - low1 + 1) / 2) + low1; + split2 = binsplit(*split1, low2, high2); + lowsize = split1 - low1 + split2 - low2; + + /* + * directly put the splitting element into + * the appropriate location + */ + *(lowdest + lowsize + 1) = *split1; +#pragma omp task untied + cilkmerge_par(low1, split1 - 1, low2, split2, lowdest); +#pragma omp task untied + cilkmerge_par(split1 + 1, high1, split2 + 1, high2, + lowdest + lowsize + 2); +#pragma omp taskwait + + return; +} + +void cilksort_par(ELM *low, ELM *tmp, long size) +{ + /* + * divide the input in four parts of the same size (A, B, C, D) + * Then: + * 1) recursively sort A, B, C, and D (in parallel) + * 2) merge A and B into tmp1, and C and D into tmp2 (in parallel) + * 3) merge tmp1 and tmp2 into the original array + */ + long quarter = size / 4; + ELM *A, *B, *C, *D, *tmpA, *tmpB, *tmpC, *tmpD; + + if (size < bots_app_cutoff_value_1 ) { + /* quicksort when less than 1024 elements */ + seqquick(low, low + size - 1); + return; + } + A = low; + tmpA = tmp; + B = A + quarter; + tmpB = tmpA + quarter; + C = B + quarter; + tmpC = tmpB + quarter; + D = C + quarter; + tmpD = tmpC + quarter; + +#pragma omp task untied + cilksort_par(A, tmpA, quarter); +#pragma omp task untied + cilksort_par(B, tmpB, quarter); +#pragma omp task untied + cilksort_par(C, tmpC, 
quarter); +#pragma omp task untied + cilksort_par(D, tmpD, size - 3 * quarter); +#pragma omp taskwait + +#pragma omp task untied + cilkmerge_par(A, A + quarter - 1, B, B + quarter - 1, tmpA); +#pragma omp task untied + cilkmerge_par(C, C + quarter - 1, D, low + size - 1, tmpC); +#pragma omp taskwait + + cilkmerge_par(tmpA, tmpC - 1, tmpC, tmpA + size - 1, A); +} + +void scramble_array( ELM *array ) +{ + unsigned long i; + unsigned long j; + + for (i = 0; i < bots_arg_size; ++i) { + j = my_rand(); + j = j % bots_arg_size; + swap(array[i], array[j]); + } +} + +void fill_array( ELM *array ) +{ + unsigned long i; + + my_srand(1); + /* first, fill with integers 1..size */ + for (i = 0; i < bots_arg_size; ++i) { + array[i] = i; + } +} + +void sort_init ( void ) +{ + /* Checking arguments */ + if (bots_arg_size < 4) { + bots_message("%s can not be less than 4, using 4 as a parameter.\n", BOTS_APP_DESC_ARG_SIZE ); + bots_arg_size = 4; + } + + if (bots_app_cutoff_value < 2) { + bots_message("%s can not be less than 2, using 2 as a parameter.\n", BOTS_APP_DESC_ARG_CUTOFF); + bots_app_cutoff_value = 2; + } + else if (bots_app_cutoff_value > bots_arg_size ) { + bots_message("%s can not be greather than vector size, using %d as a parameter.\n", BOTS_APP_DESC_ARG_CUTOFF, bots_arg_size); + bots_app_cutoff_value = bots_arg_size; + } + + if (bots_app_cutoff_value_1 > bots_arg_size ) { + bots_message("%s can not be greather than vector size, using %d as a parameter.\n", BOTS_APP_DESC_ARG_CUTOFF_1, bots_arg_size); + bots_app_cutoff_value_1 = bots_arg_size; + } + if (bots_app_cutoff_value_2 > bots_arg_size ) { + bots_message("%s can not be greather than vector size, using %d as a parameter.\n", BOTS_APP_DESC_ARG_CUTOFF_2, bots_arg_size); + bots_app_cutoff_value_2 = bots_arg_size; + } + + if (bots_app_cutoff_value_2 > bots_app_cutoff_value_1) { + bots_message("%s can not be greather than %s, using %d as a parameter.\n", + BOTS_APP_DESC_ARG_CUTOFF_2, + BOTS_APP_DESC_ARG_CUTOFF_1, + 
bots_app_cutoff_value_1 + ); + bots_app_cutoff_value_2 = bots_app_cutoff_value_1; + } + + array = (ELM *) malloc(bots_arg_size * sizeof(ELM)); + tmp = (ELM *) malloc(bots_arg_size * sizeof(ELM)); + fill_array(array); + scramble_array(array); +} + +void sort_par ( void ) +{ + bots_message("Computing multisort algorithm (n=%d) - It might take a while...\n", bots_arg_size); + + cilksort_par(array, tmp, bots_arg_size); + + bots_message("Multisort execution has finished\n"); +} + +int sort_verify ( void ) +{ + int i, success = 1; + for (i = 0; i < bots_arg_size; ++i) + if (array[i] != i) + success = 0; + + return success ? BOTS_RESULT_SUCCESSFUL : BOTS_RESULT_UNSUCCESSFUL; +} + diff --git a/ompss/strassen/Makefile b/ompss/strassen/Makefile new file mode 100644 index 0000000..a4923b1 --- /dev/null +++ b/ompss/strassen/Makefile @@ -0,0 +1,36 @@ +############################################################################################## +# This program is part of the Barcelona OpenMP Tasks Suite # +# Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion # +# Copyright (C) 2009 Universitat Politecnica de Catalunya # +# # +# This program is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 2 of the License, or # +# (at your option) any later version. # +# # +# This program is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details. 
# Optional per-application settings, left disabled for this benchmark.
#LIBS =
#PROGRAM_OBJS=

# Cut-off variants to build. When this variable is defined, Makefile.common
# adds one extra target per listed name, suffixed with "-<name>".
CUTOFF_VERSIONS = manual if_clause
# When defined, Makefile.common also builds "-tied" variants of every target.
# Those are compiled with -DFORCE_TIED_TASKS from a copy of the source in
# which sed has removed the "untied" clause of the task pragmas.
TIED_VERSIONS = yes

# Root of the source tree; used below to locate common/Makefile.common.
BASE_DIR = ../../

#
# Don't change below here
#

include ../Makefile.version
include $(BASE_DIR)/common/Makefile.common
/* Application name and parameter summary: N = matrix size, Y = Strassen cut-off */
#define BOTS_APP_NAME "Strassen"
#define BOTS_APP_PARAMETERS_DESC "N=%d:Y=%d"
#define BOTS_APP_PARAMETERS_LIST ,bots_arg_size,bots_app_cutoff_value

/* BOTS_APP_INIT requires the size to be a power of 2 and a multiple of 16 */
#define BOTS_APP_USES_ARG_SIZE
#define BOTS_APP_DEF_ARG_SIZE 1024
#define BOTS_APP_DESC_ARG_SIZE "Matrix Size"

#define BOTS_APP_USES_ARG_BLOCK
#define BOTS_APP_DEF_ARG_BLOCK 32
#define BOTS_APP_DESC_ARG_BLOCK "Matrix Block Size"

/* Below this cut off strassen uses MultiplyByDivideAndConquer() algorithm */
#define BOTS_APP_USES_ARG_CUTOFF
#define BOTS_APP_DEF_ARG_CUTOFF 64
#define BOTS_APP_DESC_ARG_CUTOFF "Strassen Cutoff"

/* Task creation cut off */
#define BOTS_CUTOFF_DEF_VALUE 3

/***********************************************************************
 * The real numbers we are using --- either double or float
 **********************************************************************/
typedef double REAL;
typedef unsigned long PTR;
void init_matrix(int n, REAL *A, int an);
/*
 * NOTE(review): the prototypes name their parameters (A, B, C) while
 * KERNEL_CALL / KERNEL_SEQ_CALL below pass (C, A, B) and (D, A, B) --
 * presumably the first parameter is the output matrix; confirm in strassen.c.
 */
void strassen_main_par(REAL *A, REAL *B, REAL *C, int n);
void strassen_main_seq(REAL *A, REAL *B, REAL *C, int n);
int compare_matrix(int n, REAL *A, int an, REAL *B, int bn);

/*
 * Expanded by the common driver: declares the four matrices, rejects
 * unsupported sizes, allocates the matrices and initialises the two
 * inputs A and B. The allocations are not checked for failure.
 */
#define BOTS_APP_INIT\
    double *A, *B, *C, *D;\
    if ((bots_arg_size & (bots_arg_size - 1)) != 0 || (bots_arg_size % 16) != 0) {\
        bots_message("Error: matrix size (%d) must be a power of 2 and a multiple of %d\n", bots_arg_size, 16);\
        exit (1);\
    }\
    A = (double *) malloc (bots_arg_size * bots_arg_size * sizeof(double));\
    B = (double *) malloc (bots_arg_size * bots_arg_size * sizeof(double));\
    C = (double *) malloc (bots_arg_size * bots_arg_size * sizeof(double));\
    D = (double *) malloc (bots_arg_size * bots_arg_size * sizeof(double));\
    init_matrix(bots_arg_size,A,bots_arg_size);\
    init_matrix(bots_arg_size,B,bots_arg_size);

/* parallel kernel: result goes to C (note the trailing ';' inside the macro) */
//#define KERNEL_INIT
#define KERNEL_CALL strassen_main_par(C,A,B,bots_arg_size);
//#define KERNEL_FINI

/* sequential reference kernel: result goes to D */
//#define KERNEL_SEQ_INIT
#define KERNEL_SEQ_CALL strassen_main_seq(D,A,B,bots_arg_size);
//#define KERNEL_SEQ_FINI

/* the check compares the parallel result C with the sequential result D */
#define BOTS_APP_CHECK_USES_SEQ_RESULT
#define KERNEL_CHECK compare_matrix(bots_arg_size,C,bots_arg_size,D,bots_arg_size);
+ * IN NO EVENT SHALL THE MASSACHUSETTS INSTITUTE OF TECHNOLOGY BE LIABLE
+ * FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+ * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * /WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Except as contained in this notice, the name of the Massachusetts
+ * Institute of Technology shall not be used in advertising or otherwise
+ * to promote the sale, use or other dealings in this Software without
+ * prior written authorization from the Massachusetts Institute of
+ * Technology.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "app-desc.h"
+#include "bots.h"
+#include "strassen.h"
+
+/***********************************************************************
+ * Naive sequential algorithm, for comparison purposes: C = A x B for n x n matrices, accessed through ELEM (an/bn/cn are passed straight to ELEM -- presumably the row strides, confirm in strassen.h)
+ **********************************************************************/
+void matrixmul(int n, REAL *A, int an, REAL *B, int bn, REAL *C, int cn)
+{
+  int i, j, k;
+  REAL s; /* running dot product of row i of A with column j of B */
+
+  for (i = 0; i < n; ++i)
+  {
+    for (j = 0; j < n; ++j)
+    {
+      s = 0.0;
+      for (k = 0; k < n; ++k) s += ELEM(A, an, i, k) * ELEM(B, bn, k, j);
+      ELEM(C, cn, i, j) = s; /* C is only written, never read */
+    }
+  }
+}
+/*****************************************************************************
+**
+** FastNaiveMatrixMultiply
+**
+** For small to medium sized matrices A, B, and C of size
+** MatrixSize * MatrixSize this function performs the operation
+** C = A x B efficiently.
+**
+** Note MatrixSize must be divisible by 8.
+**
+** INPUT:
+** C = (*C WRITE) Address of top left element of matrix C.
+** A = (*A IS READ ONLY) Address of top left element of matrix A.
+** B = (*B IS READ ONLY) Address of top left element of matrix B.
+** MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
+** RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1]
+** RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1]
+** RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1]
+**
+** OUTPUT:
+** C = (*C WRITE) Matrix C contains A x B. (Initial value of *C undefined.)
+**
+*****************************************************************************/
+void FastNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
+     unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB)
+{
+  /* Assumes size of real is 8 bytes: "<< 3" turns element counts into byte offsets */
+  PTR RowWidthBInBytes = RowWidthB << 3;
+  PTR RowWidthAInBytes = RowWidthA << 3;
+  PTR MatrixWidthInBytes = MatrixSize << 3;
+  PTR RowIncrementC = ( RowWidthC - MatrixSize) << 3; /* bytes from the end of the MatrixSize elements written in a C row to the start of the next C row */
+  unsigned Horizontal, Vertical;
+
+  REAL *ARowStart = A;
+  for (Vertical = 0; Vertical < MatrixSize; Vertical++) { /* one row of A and C per iteration */
+    for (Horizontal = 0; Horizontal < MatrixSize; Horizontal += 8) { /* one strip of 8 adjacent C elements per iteration */
+      REAL *BColumnStart = B + Horizontal;
+      REAL FirstARowValue = *ARowStart++; /* the first product initialises the sums, so C is never read */
+
+      REAL Sum0 = FirstARowValue * (*BColumnStart);
+      REAL Sum1 = FirstARowValue * (*(BColumnStart+1));
+      REAL Sum2 = FirstARowValue * (*(BColumnStart+2));
+      REAL Sum3 = FirstARowValue * (*(BColumnStart+3));
+      REAL Sum4 = FirstARowValue * (*(BColumnStart+4));
+      REAL Sum5 = FirstARowValue * (*(BColumnStart+5));
+      REAL Sum6 = FirstARowValue * (*(BColumnStart+6));
+      REAL Sum7 = FirstARowValue * (*(BColumnStart+7));
+
+      unsigned Products;
+      for (Products = 1; Products < MatrixSize; Products++) { /* starts at 1: the first term was consumed above */
+        REAL ARowValue = *ARowStart++;
+        BColumnStart = (REAL*) (((PTR) BColumnStart) + RowWidthBInBytes); /* advance by RowWidthB elements of B */
+        Sum0 += ARowValue * (*BColumnStart);
+        Sum1 += ARowValue * (*(BColumnStart+1));
+        Sum2 += ARowValue * (*(BColumnStart+2));
+        Sum3 += ARowValue * (*(BColumnStart+3));
+        Sum4 += ARowValue * (*(BColumnStart+4));
+        Sum5 += ARowValue * (*(BColumnStart+5));
+        Sum6 += ARowValue * (*(BColumnStart+6));
+        Sum7 += ARowValue * (*(BColumnStart+7));
+      }
+      ARowStart = (REAL*) ( ((PTR) ARowStart) - MatrixWidthInBytes); /* rewind the MatrixSize elements just consumed, ready for the next strip */
+
+      *(C) = Sum0;
+      *(C+1) = Sum1;
+      *(C+2) = Sum2;
+      *(C+3) = Sum3;
+      *(C+4) = Sum4;
+      *(C+5) = Sum5;
+      *(C+6) = Sum6;
+      *(C+7) = Sum7;
+      C+=8;
+    }
+    ARowStart = (REAL*) ( ((PTR) ARowStart) + RowWidthAInBytes ); /* advance by RowWidthA elements to the next row of A */
+    C = (REAL*) ( ((PTR) C) + RowIncrementC ); /* skip the rest of the C row */
+  }
+}
+/*****************************************************************************
+**
+** FastAdditiveNaiveMatrixMultiply
+**
+** For small to medium sized matrices A, B, and C of size
+** MatrixSize * MatrixSize this function performs the operation
+** C += A x B efficiently.
+**
+** Note MatrixSize must be divisible by 8.
+**
+** INPUT:
+** C = (*C READ/WRITE) Address of top left element of matrix C.
+** A = (*A IS READ ONLY) Address of top left element of matrix A.
+** B = (*B IS READ ONLY) Address of top left element of matrix B.
+** MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
+** RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1]
+** RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1]
+** RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1]
+**
+** OUTPUT:
+** C = (*C READ/WRITE) Matrix C contains C + A x B.
+**
+*****************************************************************************/
+void FastAdditiveNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
+     unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB)
+{
+  /* Assumes size of real is 8 bytes: "<< 3" turns element counts into byte offsets */
+  PTR RowWidthBInBytes = RowWidthB << 3;
+  PTR RowWidthAInBytes = RowWidthA << 3;
+  PTR MatrixWidthInBytes = MatrixSize << 3;
+  PTR RowIncrementC = ( RowWidthC - MatrixSize) << 3; /* bytes from the end of the MatrixSize elements updated in a C row to the start of the next C row */
+  unsigned Horizontal, Vertical;
+
+  REAL *ARowStart = A;
+  for (Vertical = 0; Vertical < MatrixSize; Vertical++) { /* one row of A and C per iteration */
+    for (Horizontal = 0; Horizontal < MatrixSize; Horizontal += 8) { /* one strip of 8 adjacent C elements per iteration */
+      REAL *BColumnStart = B + Horizontal;
+      /* seed the accumulators with the current contents of C (this is the "+=" in C += A x B) */
+      REAL Sum0 = *C;
+      REAL Sum1 = *(C+1);
+      REAL Sum2 = *(C+2);
+      REAL Sum3 = *(C+3);
+      REAL Sum4 = *(C+4);
+      REAL Sum5 = *(C+5);
+      REAL Sum6 = *(C+6);
+      REAL Sum7 = *(C+7);
+
+      unsigned Products;
+      for (Products = 0; Products < MatrixSize; Products++) { /* starts at 0: every term is accumulated here */
+        REAL ARowValue = *ARowStart++;
+
+        Sum0 += ARowValue * (*BColumnStart);
+        Sum1 += ARowValue * (*(BColumnStart+1));
+        Sum2 += ARowValue * (*(BColumnStart+2));
+        Sum3 += ARowValue * (*(BColumnStart+3));
+        Sum4 += ARowValue * (*(BColumnStart+4));
+        Sum5 += ARowValue * (*(BColumnStart+5));
+        Sum6 += ARowValue * (*(BColumnStart+6));
+        Sum7 += ARowValue * (*(BColumnStart+7));
+
+        BColumnStart = (REAL*) (((PTR) BColumnStart) + RowWidthBInBytes); /* advance by RowWidthB elements of B, after use */
+
+      }
+      ARowStart = (REAL*) ( ((PTR) ARowStart) - MatrixWidthInBytes); /* rewind the MatrixSize elements just consumed, ready for the next strip */
+
+      *(C) = Sum0;
+      *(C+1) = Sum1;
+      *(C+2) = Sum2;
+      *(C+3) = Sum3;
+      *(C+4) = Sum4;
+      *(C+5) = Sum5;
+      *(C+6) = Sum6;
+      *(C+7) = Sum7;
+      C+=8;
+    }
+
+    ARowStart = (REAL*) ( ((PTR) ARowStart) + RowWidthAInBytes ); /* advance by RowWidthA elements to the next row of A */
+    C = (REAL*) ( ((PTR) C) + RowIncrementC ); /* skip the rest of the C row */
+  }
+}
+/*****************************************************************************
+**
+** MultiplyByDivideAndConquer
+**
+** For medium to medium-large (would you like fries with that) sized
+** matrices A, B, and C of size MatrixSize * MatrixSize this function
+** efficiently performs
the operation
+** C = A x B (if AdditiveMode == 0)
+** C += A x B (if AdditiveMode != 0)
+**
+** Note MatrixSize must be divisible by 16.
+**
+** INPUT:
+** C = (*C READ/WRITE) Address of top left element of matrix C.
+** A = (*A IS READ ONLY) Address of top left element of matrix A.
+** B = (*B IS READ ONLY) Address of top left element of matrix B.
+** MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
+** RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1]
+** RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1]
+** RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1]
+** AdditiveMode = 0 if we want C = A x B, otherwise we'll do C += A x B
+**
+** OUTPUT:
+** C (+)= A x B. (+ if AdditiveMode != 0)
+**
+*****************************************************************************/
+void MultiplyByDivideAndConquer(REAL *C, REAL *A, REAL *B,
+                                unsigned MatrixSize,
+                                unsigned RowWidthC,
+                                unsigned RowWidthA,
+                                unsigned RowWidthB,
+                                int AdditiveMode
+                               )
+{
+  #define A00 A /* the 00 quadrants start at the matrix base pointers; these macros are not #undef'd in this function */
+  #define B00 B
+  #define C00 C
+  REAL *A01, *A10, *A11, *B01, *B10, *B11, *C01, *C10, *C11;
+  unsigned QuadrantSize = MatrixSize >> 1; /* half the matrix dimension */
+
+  /* partition the matrix: x01 is QuadrantSize elements to the right, x10 is QuadrantSize rows down */
+  A01 = A00 + QuadrantSize;
+  A10 = A00 + RowWidthA * QuadrantSize;
+  A11 = A10 + QuadrantSize;
+
+  B01 = B00 + QuadrantSize;
+  B10 = B00 + RowWidthB * QuadrantSize;
+  B11 = B10 + QuadrantSize;
+
+  C01 = C00 + QuadrantSize;
+  C10 = C00 + RowWidthC * QuadrantSize;
+  C11 = C10 + QuadrantSize;
+
+  if (QuadrantSize > SizeAtWhichNaiveAlgorithmIsMoreEfficient) { /* threshold is defined outside this file section */
+    /* first product of every C quadrant: overwrites or accumulates according to AdditiveMode */
+    MultiplyByDivideAndConquer(C00, A00, B00, QuadrantSize,
+                               RowWidthC, RowWidthA, RowWidthB,
+                               AdditiveMode);
+
+    MultiplyByDivideAndConquer(C01, A00, B01, QuadrantSize,
+                               RowWidthC, RowWidthA, RowWidthB,
+                               AdditiveMode);
+
+    MultiplyByDivideAndConquer(C11, A10, B01, QuadrantSize,
+                               RowWidthC, RowWidthA, RowWidthB,
+                               AdditiveMode);
+
+    MultiplyByDivideAndConquer(C10, A10, B00, QuadrantSize,
+                               RowWidthC, RowWidthA, RowWidthB,
+                               AdditiveMode);
+    /* second product of every C quadrant: always accumulated (AdditiveMode = 1) on top of the first */
+    MultiplyByDivideAndConquer(C00, A01, B10, QuadrantSize,
+                               RowWidthC, RowWidthA, RowWidthB,
+                               1);
+
+    MultiplyByDivideAndConquer(C01, A01, B11, QuadrantSize,
+                               RowWidthC, RowWidthA, RowWidthB,
+                               1);
+
+    MultiplyByDivideAndConquer(C11, A11, B11, QuadrantSize,
+                               RowWidthC, RowWidthA, RowWidthB,
+                               1);
+
+    MultiplyByDivideAndConquer(C10, A11, B10, QuadrantSize,
+                               RowWidthC, RowWidthA, RowWidthB,
+                               1);
+
+  } else {
+    /* base case: same eight products, computed directly by the unrolled kernels */
+    if (AdditiveMode) {
+      FastAdditiveNaiveMatrixMultiply(C00, A00, B00, QuadrantSize,
+                                      RowWidthC, RowWidthA, RowWidthB);
+
+      FastAdditiveNaiveMatrixMultiply(C01, A00, B01, QuadrantSize,
+                                      RowWidthC, RowWidthA, RowWidthB);
+
+      FastAdditiveNaiveMatrixMultiply(C11, A10, B01, QuadrantSize,
+                                      RowWidthC, RowWidthA, RowWidthB);
+
+      FastAdditiveNaiveMatrixMultiply(C10, A10, B00, QuadrantSize,
+                                      RowWidthC, RowWidthA, RowWidthB);
+
+    } else {
+
+      FastNaiveMatrixMultiply(C00, A00, B00, QuadrantSize,
+                              RowWidthC, RowWidthA, RowWidthB);
+
+      FastNaiveMatrixMultiply(C01, A00, B01, QuadrantSize,
+                              RowWidthC, RowWidthA, RowWidthB);
+
+      FastNaiveMatrixMultiply(C11, A10, B01, QuadrantSize,
+                              RowWidthC, RowWidthA, RowWidthB);
+
+      FastNaiveMatrixMultiply(C10, A10, B00, QuadrantSize,
+                              RowWidthC, RowWidthA, RowWidthB);
+    }
+    /* second product of every C quadrant is always accumulated, whatever AdditiveMode is */
+    FastAdditiveNaiveMatrixMultiply(C00, A01, B10, QuadrantSize,
+                                    RowWidthC, RowWidthA, RowWidthB);
+
+    FastAdditiveNaiveMatrixMultiply(C01, A01, B11, QuadrantSize,
+                                    RowWidthC, RowWidthA, RowWidthB);
+
+    FastAdditiveNaiveMatrixMultiply(C11, A11, B11, QuadrantSize,
+                                    RowWidthC, RowWidthA, RowWidthB);
+
+    FastAdditiveNaiveMatrixMultiply(C10, A11, B10, QuadrantSize,
+                                    RowWidthC, RowWidthA, RowWidthB);
+  }
+  return;
+}
+/*****************************************************************************
+**
+** OptimizedStrassenMultiply
+**
+** For large matrices A, B, and C of size MatrixSize * MatrixSize this
+** function performs the operation C = A x B efficiently.
+** +** INPUT: +** C = (*C WRITE) Address of top left element of matrix C. +** A = (*A IS READ ONLY) Address of top left element of matrix A. +** B = (*B IS READ ONLY) Address of top left element of matrix B. +** MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n) +** RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1] +** RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1] +** RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1] +** +** OUTPUT: +** C = (*C WRITE) Matrix C contains A x B. (Initial value of *C undefined.) +** +*****************************************************************************/ +void OptimizedStrassenMultiply_seq(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth) +{ + unsigned QuadrantSize = MatrixSize >> 1; /* MatixSize / 2 */ + unsigned QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * QuadrantSize + + 32; + unsigned Column, Row; + + /************************************************************************ + ** For each matrix A, B, and C, we'll want pointers to each quandrant + ** in the matrix. 
These quandrants will be addressed as follows: + ** -- -- + ** | A11 A12 | + ** | | + ** | A21 A22 | + ** -- -- + ************************************************************************/ + REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12, + *A21, *B21, *C21, *A22, *B22, *C22; + + REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT; + #define T2sMULT C22 + #define NumberOfVariables 11 + + PTR TempMatrixOffset = 0; + PTR MatrixOffsetA = 0; + PTR MatrixOffsetB = 0; + + char *Heap; + void *StartHeap; + + /* Distance between the end of a matrix row and the start of the next row */ + PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3; + PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3; + PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3; + + if (MatrixSize <= bots_app_cutoff_value) { + MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0); + return; + } + + /* Initialize quandrant matrices */ + #define A11 A + #define B11 B + #define C11 C + A12 = A11 + QuadrantSize; + B12 = B11 + QuadrantSize; + C12 = C11 + QuadrantSize; + A21 = A + (RowWidthA * QuadrantSize); + B21 = B + (RowWidthB * QuadrantSize); + C21 = C + (RowWidthC * QuadrantSize); + A22 = A21 + QuadrantSize; + B22 = B21 + QuadrantSize; + C22 = C21 + QuadrantSize; + + /* Allocate Heap Space Here */ + StartHeap = Heap = malloc(QuadrantSizeInBytes * NumberOfVariables); + /* ensure that heap is on cache boundary */ + if ( ((PTR) Heap) & 31) + Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) ); + + /* Distribute the heap space over the variables */ + S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M2 = (REAL*) Heap; Heap 
+= QuadrantSizeInBytes; + M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes; + + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column++) { + + /*********************************************************** + ** Within this loop, the following holds for MatrixOffset: + ** MatrixOffset = (Row * RowWidth) + Column + ** (note: that the unit of the offset is number of reals) + ***********************************************************/ + /* Element of Global Matrix, such as A, B, C */ + #define E(Matrix) (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) ) + #define EA(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) ) + #define EB(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) ) + + /* FIXME - may pay to expand these out - got higher speed-ups below */ + /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */ + E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) ); + + /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */ + E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21); + + /* S3 = A11 - A21 */ + E(S3) = EA(A11) - EA(A21); + + /* S7 = B22 - B12 */ + E(S7) = EB(B22) - EB(B12); + + TempMatrixOffset += sizeof(REAL); + MatrixOffsetA += sizeof(REAL); + MatrixOffsetB += sizeof(REAL); + } /* end row loop*/ + + MatrixOffsetA += RowIncrementA; + 
MatrixOffsetB += RowIncrementB; + } /* end column loop */ + + /* M2 = A11 x B11 */ + OptimizedStrassenMultiply_seq(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + + /* M5 = S1 * S5 */ + OptimizedStrassenMultiply_seq(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T1 = S2 x S6 + M2 */ + OptimizedStrassenMultiply_seq(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T2 = T1 + S3 x S7 */ + OptimizedStrassenMultiply_seq(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of C11 = M2 + A12 * B21 */ + OptimizedStrassenMultiply_seq(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + OptimizedStrassenMultiply_seq(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + + /* Step 1 of C21 = T2 - A22 * S8 */ + OptimizedStrassenMultiply_seq(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); + + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column += 4) { + REAL LocalM5_0 = *(M5); + REAL LocalM5_1 = *(M5+1); + REAL LocalM5_2 = *(M5+2); + REAL LocalM5_3 = *(M5+3); + REAL LocalM2_0 = *(M2); + REAL LocalM2_1 = *(M2+1); + REAL LocalM2_2 = *(M2+2); + REAL LocalM2_3 = 
*(M2+3); + REAL T1_0 = *(T1sMULT) + LocalM2_0; + REAL T1_1 = *(T1sMULT+1) + LocalM2_1; + REAL T1_2 = *(T1sMULT+2) + LocalM2_2; + REAL T1_3 = *(T1sMULT+3) + LocalM2_3; + REAL T2_0 = *(C22) + T1_0; + REAL T2_1 = *(C22+1) + T1_1; + REAL T2_2 = *(C22+2) + T1_2; + REAL T2_3 = *(C22+3) + T1_3; + (*(C11)) += LocalM2_0; + (*(C11+1)) += LocalM2_1; + (*(C11+2)) += LocalM2_2; + (*(C11+3)) += LocalM2_3; + (*(C12)) += LocalM5_0 + T1_0; + (*(C12+1)) += LocalM5_1 + T1_1; + (*(C12+2)) += LocalM5_2 + T1_2; + (*(C12+3)) += LocalM5_3 + T1_3; + (*(C22)) = LocalM5_0 + T2_0; + (*(C22+1)) = LocalM5_1 + T2_1; + (*(C22+2)) = LocalM5_2 + T2_2; + (*(C22+3)) = LocalM5_3 + T2_3; + (*(C21 )) = (- *(C21 )) + T2_0; + (*(C21+1)) = (- *(C21+1)) + T2_1; + (*(C21+2)) = (- *(C21+2)) + T2_2; + (*(C21+3)) = (- *(C21+3)) + T2_3; + M5 += 4; + M2 += 4; + T1sMULT += 4; + C11 += 4; + C12 += 4; + C21 += 4; + C22 += 4; + } + C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC); + C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC); + C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC); + C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC); + } + free(StartHeap); +} +#if defined(IF_CUTOFF) +void OptimizedStrassenMultiply_par(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth) +{ + unsigned QuadrantSize = MatrixSize >> 1; /* MatixSize / 2 */ + unsigned QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * QuadrantSize + + 32; + unsigned Column, Row; + + /************************************************************************ + ** For each matrix A, B, and C, we'll want pointers to each quandrant + ** in the matrix. 
These quandrants will be addressed as follows: + ** -- -- + ** | A11 A12 | + ** | | + ** | A21 A22 | + ** -- -- + ************************************************************************/ + REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12, + *A21, *B21, *C21, *A22, *B22, *C22; + + REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT; + #define T2sMULT C22 + #define NumberOfVariables 11 + + PTR TempMatrixOffset = 0; + PTR MatrixOffsetA = 0; + PTR MatrixOffsetB = 0; + + char *Heap; + void *StartHeap; + + /* Distance between the end of a matrix row and the start of the next row */ + PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3; + PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3; + PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3; + + if (MatrixSize <= bots_app_cutoff_value) { + MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0); + return; + } + + /* Initialize quandrant matrices */ + #define A11 A + #define B11 B + #define C11 C + A12 = A11 + QuadrantSize; + B12 = B11 + QuadrantSize; + C12 = C11 + QuadrantSize; + A21 = A + (RowWidthA * QuadrantSize); + B21 = B + (RowWidthB * QuadrantSize); + C21 = C + (RowWidthC * QuadrantSize); + A22 = A21 + QuadrantSize; + B22 = B21 + QuadrantSize; + C22 = C21 + QuadrantSize; + + /* Allocate Heap Space Here */ + StartHeap = Heap = malloc(QuadrantSizeInBytes * NumberOfVariables); + /* ensure that heap is on cache boundary */ + if ( ((PTR) Heap) & 31) + Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) ); + + /* Distribute the heap space over the variables */ + S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M2 = (REAL*) Heap; Heap 
+= QuadrantSizeInBytes; + M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes; + + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column++) { + + /*********************************************************** + ** Within this loop, the following holds for MatrixOffset: + ** MatrixOffset = (Row * RowWidth) + Column + ** (note: that the unit of the offset is number of reals) + ***********************************************************/ + /* Element of Global Matrix, such as A, B, C */ + #define E(Matrix) (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) ) + #define EA(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) ) + #define EB(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) ) + + /* FIXME - may pay to expand these out - got higher speed-ups below */ + /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */ + E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) ); + + /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */ + E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21); + + /* S3 = A11 - A21 */ + E(S3) = EA(A11) - EA(A21); + + /* S7 = B22 - B12 */ + E(S7) = EB(B22) - EB(B12); + + TempMatrixOffset += sizeof(REAL); + MatrixOffsetA += sizeof(REAL); + MatrixOffsetB += sizeof(REAL); + } /* end row loop*/ + + MatrixOffsetA += RowIncrementA; + 
MatrixOffsetB += RowIncrementB; + } /* end column loop */ + + /* M2 = A11 x B11 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + + /* M5 = S1 * S5 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T1 = S2 x S6 + M2 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T2 = T1 + S3 x S7 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of C11 = M2 + A12 * B21 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + + /* Step 1 of C21 = T2 - A22 * S8 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); + + /********************************************** + ** Synchronization Point + **********************************************/ + #pragma omp taskwait + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + 
/************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column += 4) { + REAL LocalM5_0 = *(M5); + REAL LocalM5_1 = *(M5+1); + REAL LocalM5_2 = *(M5+2); + REAL LocalM5_3 = *(M5+3); + REAL LocalM2_0 = *(M2); + REAL LocalM2_1 = *(M2+1); + REAL LocalM2_2 = *(M2+2); + REAL LocalM2_3 = *(M2+3); + REAL T1_0 = *(T1sMULT) + LocalM2_0; + REAL T1_1 = *(T1sMULT+1) + LocalM2_1; + REAL T1_2 = *(T1sMULT+2) + LocalM2_2; + REAL T1_3 = *(T1sMULT+3) + LocalM2_3; + REAL T2_0 = *(C22) + T1_0; + REAL T2_1 = *(C22+1) + T1_1; + REAL T2_2 = *(C22+2) + T1_2; + REAL T2_3 = *(C22+3) + T1_3; + (*(C11)) += LocalM2_0; + (*(C11+1)) += LocalM2_1; + (*(C11+2)) += LocalM2_2; + (*(C11+3)) += LocalM2_3; + (*(C12)) += LocalM5_0 + T1_0; + (*(C12+1)) += LocalM5_1 + T1_1; + (*(C12+2)) += LocalM5_2 + T1_2; + (*(C12+3)) += LocalM5_3 + T1_3; + (*(C22)) = LocalM5_0 + T2_0; + (*(C22+1)) = LocalM5_1 + T2_1; + (*(C22+2)) = LocalM5_2 + T2_2; + (*(C22+3)) = LocalM5_3 + T2_3; + (*(C21 )) = (- *(C21 )) + T2_0; + (*(C21+1)) = (- *(C21+1)) + T2_1; + (*(C21+2)) = (- *(C21+2)) + T2_2; + (*(C21+3)) = (- *(C21+3)) + T2_3; + M5 += 4; + M2 += 4; + T1sMULT += 4; + C11 += 4; + C12 += 4; + C21 += 4; + C22 += 4; + } + C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC); + C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC); + C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC); + C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC); + } + free(StartHeap); +} +#elif defined(MANUAL_CUTOFF) +void OptimizedStrassenMultiply_par(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth) +{ + unsigned QuadrantSize = MatrixSize >> 1; /* MatixSize / 2 */ + unsigned QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * QuadrantSize + + 
32; + unsigned Column, Row; + + /************************************************************************ + ** For each matrix A, B, and C, we'll want pointers to each quandrant + ** in the matrix. These quandrants will be addressed as follows: + ** -- -- + ** | A11 A12 | + ** | | + ** | A21 A22 | + ** -- -- + ************************************************************************/ + REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12, + *A21, *B21, *C21, *A22, *B22, *C22; + + REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT; + #define T2sMULT C22 + #define NumberOfVariables 11 + + PTR TempMatrixOffset = 0; + PTR MatrixOffsetA = 0; + PTR MatrixOffsetB = 0; + + char *Heap; + void *StartHeap; + + /* Distance between the end of a matrix row and the start of the next row */ + PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3; + PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3; + PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3; + + if (MatrixSize <= bots_app_cutoff_value) { + MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0); + return; + } + + /* Initialize quandrant matrices */ + #define A11 A + #define B11 B + #define C11 C + A12 = A11 + QuadrantSize; + B12 = B11 + QuadrantSize; + C12 = C11 + QuadrantSize; + A21 = A + (RowWidthA * QuadrantSize); + B21 = B + (RowWidthB * QuadrantSize); + C21 = C + (RowWidthC * QuadrantSize); + A22 = A21 + QuadrantSize; + B22 = B21 + QuadrantSize; + C22 = C21 + QuadrantSize; + + /* Allocate Heap Space Here */ + StartHeap = Heap = malloc(QuadrantSizeInBytes * NumberOfVariables); + /* ensure that heap is on cache boundary */ + if ( ((PTR) Heap) & 31) + Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) ); + + /* Distribute the heap space over the variables */ + S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S5 = (REAL*) Heap; Heap += 
QuadrantSizeInBytes; + S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes; + + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column++) { + + /*********************************************************** + ** Within this loop, the following holds for MatrixOffset: + ** MatrixOffset = (Row * RowWidth) + Column + ** (note: that the unit of the offset is number of reals) + ***********************************************************/ + /* Element of Global Matrix, such as A, B, C */ + #define E(Matrix) (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) ) + #define EA(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) ) + #define EB(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) ) + + /* FIXME - may pay to expand these out - got higher speed-ups below */ + /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */ + E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) ); + + /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */ + E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21); + + /* S3 = A11 - A21 */ + E(S3) = EA(A11) - EA(A21); + + /* S7 = B22 - B12 */ + 
E(S7) = EB(B22) - EB(B12); + + TempMatrixOffset += sizeof(REAL); + MatrixOffsetA += sizeof(REAL); + MatrixOffsetB += sizeof(REAL); + } /* end row loop*/ + + MatrixOffsetA += RowIncrementA; + MatrixOffsetB += RowIncrementB; + } /* end column loop */ + + if (Depth < bots_cutoff_value) + { + /* M2 = A11 x B11 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + + /* M5 = S1 * S5 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T1 = S2 x S6 + M2 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T2 = T1 + S3 x S7 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of C11 = M2 + A12 * B21 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + + /* Step 1 of C21 = T2 - A22 * S8 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); + + /********************************************** + ** Synchronization Point + **********************************************/ + #pragma omp taskwait + } + else + { + /* M2 = A11 x B11 */ + OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + /* M5 = S1 * S5 */ + OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + /* Step 1 of T1 = S2 x S6 + M2 */ + OptimizedStrassenMultiply_par(T1sMULT, S2, S6, 
QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + /* Step 1 of T2 = T1 + S3 x S7 */ + OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + /* Step 1 of C11 = M2 + A12 * B21 */ + OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + /* Step 1 of C21 = T2 - A22 * S8 */ + OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); + } + + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column += 4) { + REAL LocalM5_0 = *(M5); + REAL LocalM5_1 = *(M5+1); + REAL LocalM5_2 = *(M5+2); + REAL LocalM5_3 = *(M5+3); + REAL LocalM2_0 = *(M2); + REAL LocalM2_1 = *(M2+1); + REAL LocalM2_2 = *(M2+2); + REAL LocalM2_3 = *(M2+3); + REAL T1_0 = *(T1sMULT) + LocalM2_0; + REAL T1_1 = *(T1sMULT+1) + LocalM2_1; + REAL T1_2 = *(T1sMULT+2) + LocalM2_2; + REAL T1_3 = *(T1sMULT+3) + LocalM2_3; + REAL T2_0 = *(C22) + T1_0; + REAL T2_1 = *(C22+1) + T1_1; + REAL T2_2 = *(C22+2) + T1_2; + REAL T2_3 = *(C22+3) + T1_3; + (*(C11)) += LocalM2_0; + (*(C11+1)) += LocalM2_1; + (*(C11+2)) += LocalM2_2; + (*(C11+3)) += LocalM2_3; + (*(C12)) += 
LocalM5_0 + T1_0; + (*(C12+1)) += LocalM5_1 + T1_1; + (*(C12+2)) += LocalM5_2 + T1_2; + (*(C12+3)) += LocalM5_3 + T1_3; + (*(C22)) = LocalM5_0 + T2_0; + (*(C22+1)) = LocalM5_1 + T2_1; + (*(C22+2)) = LocalM5_2 + T2_2; + (*(C22+3)) = LocalM5_3 + T2_3; + (*(C21 )) = (- *(C21 )) + T2_0; + (*(C21+1)) = (- *(C21+1)) + T2_1; + (*(C21+2)) = (- *(C21+2)) + T2_2; + (*(C21+3)) = (- *(C21+3)) + T2_3; + M5 += 4; + M2 += 4; + T1sMULT += 4; + C11 += 4; + C12 += 4; + C21 += 4; + C22 += 4; + } + C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC); + C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC); + C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC); + C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC); + } + free(StartHeap); +} +#else +void OptimizedStrassenMultiply_par(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth) +{ + unsigned QuadrantSize = MatrixSize >> 1; /* MatixSize / 2 */ + unsigned QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * QuadrantSize + + 32; + unsigned Column, Row; + + /************************************************************************ + ** For each matrix A, B, and C, we'll want pointers to each quandrant + ** in the matrix. 
These quandrants will be addressed as follows: + ** -- -- + ** | A11 A12 | + ** | | + ** | A21 A22 | + ** -- -- + ************************************************************************/ + REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12, + *A21, *B21, *C21, *A22, *B22, *C22; + + REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT; + #define T2sMULT C22 + #define NumberOfVariables 11 + + PTR TempMatrixOffset = 0; + PTR MatrixOffsetA = 0; + PTR MatrixOffsetB = 0; + + char *Heap; + void *StartHeap; + + /* Distance between the end of a matrix row and the start of the next row */ + PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3; + PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3; + PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3; + + if (MatrixSize <= bots_app_cutoff_value) { + MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0); + return; + } + + /* Initialize quandrant matrices */ + #define A11 A + #define B11 B + #define C11 C + A12 = A11 + QuadrantSize; + B12 = B11 + QuadrantSize; + C12 = C11 + QuadrantSize; + A21 = A + (RowWidthA * QuadrantSize); + B21 = B + (RowWidthB * QuadrantSize); + C21 = C + (RowWidthC * QuadrantSize); + A22 = A21 + QuadrantSize; + B22 = B21 + QuadrantSize; + C22 = C21 + QuadrantSize; + + /* Allocate Heap Space Here */ + StartHeap = Heap = malloc(QuadrantSizeInBytes * NumberOfVariables); + /* ensure that heap is on cache boundary */ + if ( ((PTR) Heap) & 31) + Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) ); + + /* Distribute the heap space over the variables */ + S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M2 = (REAL*) Heap; Heap 
+= QuadrantSizeInBytes; + M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes; + + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column++) { + + /*********************************************************** + ** Within this loop, the following holds for MatrixOffset: + ** MatrixOffset = (Row * RowWidth) + Column + ** (note: that the unit of the offset is number of reals) + ***********************************************************/ + /* Element of Global Matrix, such as A, B, C */ + #define E(Matrix) (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) ) + #define EA(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) ) + #define EB(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) ) + + /* FIXME - may pay to expand these out - got higher speed-ups below */ + /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */ + E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) ); + + /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */ + E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21); + + /* S3 = A11 - A21 */ + E(S3) = EA(A11) - EA(A21); + + /* S7 = B22 - B12 */ + E(S7) = EB(B22) - EB(B12); + + TempMatrixOffset += sizeof(REAL); + MatrixOffsetA += sizeof(REAL); + MatrixOffsetB += sizeof(REAL); + } /* end row loop*/ + + MatrixOffsetA += RowIncrementA; + 
MatrixOffsetB += RowIncrementB; + } /* end column loop */ + + /* M2 = A11 x B11 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + + /* M5 = S1 * S5 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T1 = S2 x S6 + M2 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T2 = T1 + S3 x S7 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of C11 = M2 + A12 * B21 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + + /* Step 1 of C21 = T2 - A22 * S8 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); + + /********************************************** + ** Synchronization Point + **********************************************/ + #pragma omp taskwait + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + 
*************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column += 4) { + REAL LocalM5_0 = *(M5); + REAL LocalM5_1 = *(M5+1); + REAL LocalM5_2 = *(M5+2); + REAL LocalM5_3 = *(M5+3); + REAL LocalM2_0 = *(M2); + REAL LocalM2_1 = *(M2+1); + REAL LocalM2_2 = *(M2+2); + REAL LocalM2_3 = *(M2+3); + REAL T1_0 = *(T1sMULT) + LocalM2_0; + REAL T1_1 = *(T1sMULT+1) + LocalM2_1; + REAL T1_2 = *(T1sMULT+2) + LocalM2_2; + REAL T1_3 = *(T1sMULT+3) + LocalM2_3; + REAL T2_0 = *(C22) + T1_0; + REAL T2_1 = *(C22+1) + T1_1; + REAL T2_2 = *(C22+2) + T1_2; + REAL T2_3 = *(C22+3) + T1_3; + (*(C11)) += LocalM2_0; + (*(C11+1)) += LocalM2_1; + (*(C11+2)) += LocalM2_2; + (*(C11+3)) += LocalM2_3; + (*(C12)) += LocalM5_0 + T1_0; + (*(C12+1)) += LocalM5_1 + T1_1; + (*(C12+2)) += LocalM5_2 + T1_2; + (*(C12+3)) += LocalM5_3 + T1_3; + (*(C22)) = LocalM5_0 + T2_0; + (*(C22+1)) = LocalM5_1 + T2_1; + (*(C22+2)) = LocalM5_2 + T2_2; + (*(C22+3)) = LocalM5_3 + T2_3; + (*(C21 )) = (- *(C21 )) + T2_0; + (*(C21+1)) = (- *(C21+1)) + T2_1; + (*(C21+2)) = (- *(C21+2)) + T2_2; + (*(C21+3)) = (- *(C21+3)) + T2_3; + M5 += 4; + M2 += 4; + T1sMULT += 4; + C11 += 4; + C12 += 4; + C21 += 4; + C22 += 4; + } + C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC); + C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC); + C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC); + C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC); + } + free(StartHeap); +} +#endif +/* + * Set an n by n matrix A to random values. The distance between + * rows is an + */ +void init_matrix(int n, REAL *A, int an) +{ + int i, j; + + for (i = 0; i < n; ++i) + for (j = 0; j < n; ++j) + ELEM(A, an, i, j) = ((double) rand()) / (double) RAND_MAX; +} + +/* + * Compare two matrices. Print an error message if they differ by + * more than EPSILON. 
+ */ +int compare_matrix(int n, REAL *A, int an, REAL *B, int bn) +{ + int i, j; + REAL c; + + for (i = 0; i < n; ++i) + for (j = 0; j < n; ++j) { + /* compute the relative error c */ + c = ELEM(A, an, i, j) - ELEM(B, bn, i, j); + if (c < 0.0) + c = -c; + + c = c / ELEM(A, an, i, j); + if (c > EPSILON) { + bots_message("Strassen: Wrong answer!\n"); + return BOTS_RESULT_UNSUCCESSFUL; + } + } + + return BOTS_RESULT_SUCCESSFUL; +} + +/* + * Allocate a matrix of side n (therefore n^2 elements) + */ +REAL *alloc_matrix(int n) +{ + return malloc(n * n * sizeof(REAL)); +} + +void strassen_main_par(REAL *A, REAL *B, REAL *C, int n) +{ + bots_message("Computing parallel Strassen algorithm (n=%d) ", n); + OptimizedStrassenMultiply_par(C, A, B, n, n, n, n, 1); + bots_message(" completed!\n"); +} +void strassen_main_seq(REAL *A, REAL *B, REAL *C, int n) +{ + bots_message("Computing sequential Strassen algorithm (n=%d) ", n); + OptimizedStrassenMultiply_seq(C, A, B, n, n, n, n, 1); + bots_message(" completed!\n"); +} + diff --git a/ompss/strassen/strassen.h b/ompss/strassen/strassen.h new file mode 100644 index 0000000..7944f77 --- /dev/null +++ b/ompss/strassen/strassen.h @@ -0,0 +1,66 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ +#ifndef _STRASSEN_H +#define _STRASSEN_H +/* ******************************************************************* */ +/* STRASSEN APPLICATION CUT OFF's */ +/* ******************************************************************* */ +/* Strassen uses three different functions to compute Matrix Multiply. */ +/* Each of them is related to an application cut off value: */ +/* - Initial algorithm: OptimizedStrassenMultiply() */ +/* - bots_app_cutoff_value: MultiplyByDivideAndConquer() */ +/* - SizeAtWhichNaiveAlgorithmIsMoreEfficient: FastAdditiveNaiveMatrixMultiply() */ +/* ******************************************************************* */ + +/*FIXME: at the moment we use a constant value, change to parameter ???*/ +/* Below this cut off strassen uses FastAdditiveNaiveMatrixMultiply algorithm */ +#define SizeAtWhichNaiveAlgorithmIsMoreEfficient 16 + +/*********************************************************************** + * maximum tolerable relative error (for the checking routine) + **********************************************************************/ +#define EPSILON (1.0E-6) +/*********************************************************************** + * Matrices are stored in row-major order; A is a pointer to + * the first element of the matrix, and an is the number of elements + * between two rows. 
This macro produces the element A[i,j] + * given A, an, i and j + **********************************************************************/ +#define ELEM(A, an, i, j) (A[(i)*(an)+(j)]) + +void matrixmul(int n, REAL *A, int an, REAL *B, int bn, REAL *C, int cn); +void FastNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB); +void FastAdditiveNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB); +void MultiplyByDivideAndConquer(REAL *C, REAL *A, REAL *B, + unsigned MatrixSize, + unsigned RowWidthC, + unsigned RowWidthA, + unsigned RowWidthB, + int AdditiveMode + ); +void OptimizedStrassenMultiply_par(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth); +void OptimizedStrassenMultiply_seq(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth); +REAL *alloc_matrix(int n); +#endif + diff --git a/ompss/uts/Makefile b/ompss/uts/Makefile new file mode 100644 index 0000000..f464268 --- /dev/null +++ b/ompss/uts/Makefile @@ -0,0 +1,36 @@ +############################################################################################## +# This program is part of the Barcelona OpenMP Tasks Suite # +# Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion # +# Copyright (C) 2009 Universitat Politecnica de Catalunya # +# # +# This program is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 2 of the License, or # +# (at your option) any later version. 
# +# # +# This program is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with this program; if not, write to the Free Software # +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # +############################################################################################## + +LIBS = -lm +PROGRAM_OBJS=uts.o brg_sha1.o + +#CUTOFF_VERSIONS = manual if_clause +TIED_VERSIONS = YES + +BASE_DIR = ../../ + +# +# Don't change below here +# + +include ../Makefile.version +include $(BASE_DIR)/common/Makefile.common + + diff --git a/ompss/uts/app-desc.h b/ompss/uts/app-desc.h new file mode 100644 index 0000000..9cfe6a6 --- /dev/null +++ b/ompss/uts/app-desc.h @@ -0,0 +1,45 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. 
*/ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ +#include "ompss-app.h" +#include "uts.h" + +#define BOTS_APP_NAME "Unbalance Tree Search" +#define BOTS_APP_PARAMETERS_DESC "%s" +#define BOTS_APP_PARAMETERS_LIST ,bots_arg_file + +#define BOTS_APP_USES_ARG_FILE +#define BOTS_APP_DEF_ARG_FILE "Input filename" +#define BOTS_APP_DESC_ARG_FILE "UTS input file (mandatory)" + +#define BOTS_APP_INIT \ + Node root; \ + uts_read_file(bots_arg_file); + +#define KERNEL_INIT uts_initRoot(&root); + +unsigned long long parallel_uts ( Node *); + +#define KERNEL_CALL bots_number_of_tasks = parallel_uts(&root); + +#define KERNEL_FINI uts_show_stats(); + +#define KERNEL_CHECK uts_check_result(); + + diff --git a/ompss/uts/brg_endian.h b/ompss/uts/brg_endian.h new file mode 100644 index 0000000..302112f --- /dev/null +++ b/ompss/uts/brg_endian.h @@ -0,0 +1,141 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/**********************************************************************************************/ +/* + --------------------------------------------------------------------------- + Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. 
distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue 20/10/2006 +*/ + +#ifndef BRG_ENDIAN_H +#define BRG_ENDIAN_H + +#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ +#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ + +/* Include files where endian defines and byteswap functions may reside */ +#if defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ ) +# include +#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \ + defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ ) +# include +#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ ) +# if !defined( __MINGW32__ ) +# include +# if !defined( __BEOS__ ) +# include +# endif +# endif +#endif + +/* Now attempt to set the define for platform byte order using any */ +/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */ +/* seem to encompass most endian symbol definitions */ + +#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN ) +# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN +# define 
PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN ) +# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( _BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( _LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN ) +# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ ) +# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +/* if the platform byte order could not be determined, then try to */ +/* set this define using common machine defines */ +#if !defined(PLATFORM_BYTE_ORDER) + +#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ + defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ + defined( 
__OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ + defined( vax ) || defined( vms ) || defined( VMS ) || \ + defined( __VMS ) || defined( _M_X64 ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + +#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ + defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \ + defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \ + defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ + defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ + defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \ + defined( THINK_C ) || defined( __VMCMS__ ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + +#elif 0 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#elif 0 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#else +# error Please edit lines 126 or 128 in brg_endian.h to set the platform byte order +#endif + +#endif + +#endif diff --git a/ompss/uts/brg_sha1.c b/ompss/uts/brg_sha1.c new file mode 100644 index 0000000..bef3d14 --- /dev/null +++ b/ompss/uts/brg_sha1.c @@ -0,0 +1,341 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/**********************************************************************************************/ +/* + --------------------------------------------------------------------------- + Copyright (c) 2002, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. 
distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue Date: 01/08/2005 + + This is a byte oriented version of SHA1 that operates on arrays of bytes + stored in memory. +*/ + +#include /* for memcpy() etc. 
*/ +#include + +#include "brg_sha1.h" +#include "brg_endian.h" +#include "bots.h" + +#if defined(__cplusplus) +extern "C" +{ +#endif + +/** BEGIN: UTS RNG Harness **/ + +void rng_init(RNG_state *newstate, int seed) +{ + struct sha1_context ctx; + struct state_t gen; + int i; + + for (i=0; i < 16; i++) + gen.state[i] = 0; + gen.state[16] = (u_int8_t) (0xFF & (seed >> 24)); + gen.state[17] = (u_int8_t) (0xFF & (seed >> 16)); + gen.state[18] = (u_int8_t) (0xFF & (seed >> 8)); + gen.state[19] = (u_int8_t) (0xFF & (seed >> 0)); + + sha1_begin(&ctx); + sha1_hash(gen.state, 20, &ctx); + sha1_end(newstate, &ctx); +} + +void rng_spawn(RNG_state *mystate, RNG_state *newstate, int spawnnumber) +{ + struct sha1_context ctx; + u_int8_t bytes[4]; + + bytes[0] = (u_int8_t) (0xFF & (spawnnumber >> 24)); + bytes[1] = (u_int8_t) (0xFF & (spawnnumber >> 16)); + bytes[2] = (u_int8_t) (0xFF & (spawnnumber >> 8)); + bytes[3] = (u_int8_t) (0xFF & spawnnumber); + + sha1_begin(&ctx); + sha1_hash(mystate, 20, &ctx); + sha1_hash(bytes, 4, &ctx); + sha1_end(newstate, &ctx); +} + +int rng_rand(RNG_state *mystate){ + int r; + uint32 b = (mystate[16] << 24) | (mystate[17] << 16) + | (mystate[18] << 8) | (mystate[19] << 0); + b = b & POS_MASK; + + r = (int) b; + bots_debug("b: %d\t, r: %d\n", b, r); + return r; +} + +int rng_nextrand(RNG_state *mystate){ + struct sha1_context ctx; + int r; + uint32 b; + + sha1_begin(&ctx); + sha1_hash(mystate, 20, &ctx); + sha1_end(mystate, &ctx); + b = (mystate[16] << 24) | (mystate[17] << 16) + | (mystate[18] << 8) | (mystate[19] << 0); + b = b & POS_MASK; + + r = (int) b; + return r; +} + +/* condense state into string to display during debugging */ +char * rng_showstate(RNG_state *state, char *s){ + sprintf(s,"%.2X%.2X...", state[0],state[1]); + return s; +} + +/* describe random number generator type into string */ +void rng_showtype( void ) { + bots_message("SHA-1 (state size = %luB)\n", sizeof(struct state_t)); +} + +/** END: UTS RNG Harness **/ + +#if 
defined( _MSC_VER ) && ( _MSC_VER > 800 ) +#pragma intrinsic(memcpy) +#endif + +#if 0 && defined(_MSC_VER) +#define rotl32 _lrotl +#define rotr32 _lrotr +#else +#define rotl32(x,n) (((x) << n) | ((x) >> (32 - n))) +#define rotr32(x,n) (((x) >> n) | ((x) << (32 - n))) +#endif + +#if !defined(bswap_32) +#define bswap_32(x) ((rotr32((x), 24) & 0x00ff00ff) | (rotr32((x), 8) & 0xff00ff00)) +#endif + +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) +#define SWAP_BYTES +#else +#undef SWAP_BYTES +#endif + +#if defined(SWAP_BYTES) +#define bsw_32(p,n) \ + { int _i = (n); while(_i--) ((uint_32t*)p)[_i] = bswap_32(((uint_32t*)p)[_i]); } +#else +#define bsw_32(p,n) +#endif + +#define SHA1_MASK (SHA1_BLOCK_SIZE - 1) + +#if 0 + +#define ch(x,y,z) (((x) & (y)) ^ (~(x) & (z))) +#define parity(x,y,z) ((x) ^ (y) ^ (z)) +#define maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) + +#else /* Discovered by Rich Schroeppel and Colin Plumb */ + +#define ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z)))) +#define parity(x,y,z) ((x) ^ (y) ^ (z)) +#define maj(x,y,z) (((x) & (y)) | ((z) & ((x) ^ (y)))) + +#endif + +/* Compile 64 bytes of hash data into SHA1 context. 
Note */
+/* that this routine assumes that the byte order in the */
+/* ctx->wbuf[] at this point is in such an order that low */
+/* address bytes in the ORIGINAL byte stream will go in */
+/* this buffer to the high end of 32-bit words on BOTH big */
+/* and little endian systems */
+
+/* q(v,n) names working variable n: element v[n] when ARRAY is defined, */
+/* otherwise the pasted scalar v0 .. v4 */
+#ifdef ARRAY
+#define q(v,n)  v[n]
+#else
+#define q(v,n)  v##n
+#endif
+
+/* One SHA-1 step: e += rotl(a,5) + f(b,c,d) + k + h, then b = rotl(b,30) */
+/* (written with right rotates by 27 and 2, which are the same rotations) */
+#define one_cycle(v,a,b,c,d,e,f,k,h) \
+    q(v,e) += rotr32(q(v,a),27) + \
+              f(q(v,b),q(v,c),q(v,d)) + k + h; \
+    q(v,b) = rotr32(q(v,b), 2)
+
+/* Five consecutive steps; the variable roles rotate by one each step, */
+/* so no values have to be moved between the working variables */
+#define five_cycle(v,f,k,i) \
+    one_cycle(v, 0,1,2,3,4, f,k,hf(i )); \
+    one_cycle(v, 4,0,1,2,3, f,k,hf(i+1)); \
+    one_cycle(v, 3,4,0,1,2, f,k,hf(i+2)); \
+    one_cycle(v, 2,3,4,0,1, f,k,hf(i+3)); \
+    one_cycle(v, 1,2,3,4,0, f,k,hf(i+4))
+
+/* Process the 16 words in ctx->wbuf (80 steps) and add the result into */
+/* ctx->hash; ctx->wbuf is overwritten by the message schedule */
+VOID_RETURN sha1_compile(sha1_ctx ctx[1])
+{   uint_32t *w = ctx->wbuf;
+
+#ifdef ARRAY
+    uint_32t v[5];
+    memcpy(v, ctx->hash, 5 * sizeof(uint_32t));
+#else
+    uint_32t v0, v1, v2, v3, v4;
+    v0 = ctx->hash[0]; v1 = ctx->hash[1];
+    v2 = ctx->hash[2]; v3 = ctx->hash[3];
+    v4 = ctx->hash[4];
+#endif
+
+/* steps 0..15 read the message words directly */
+#define hf(i) w[i]
+
+    five_cycle(v, ch, 0x5a827999, 0);
+    five_cycle(v, ch, 0x5a827999, 5);
+    five_cycle(v, ch, 0x5a827999, 10);
+    one_cycle(v,0,1,2,3,4, ch, 0x5a827999, hf(15)); \
+
+/* NOTE(review): the backslash ending the line above only splices it with */
+/* the empty line that follows; it looks like a leftover and has no effect */
+/* from step 16 on, each word is computed in place in the 16-word buffer */
+#undef hf
+#define hf(i) (w[(i) & 15] = rotl32( \
+                 w[((i) + 13) & 15] ^ w[((i) + 8) & 15] \
+               ^ w[((i) + 2) & 15] ^ w[(i) & 15], 1))
+
+    one_cycle(v,4,0,1,2,3, ch, 0x5a827999, hf(16));
+    one_cycle(v,3,4,0,1,2, ch, 0x5a827999, hf(17));
+    one_cycle(v,2,3,4,0,1, ch, 0x5a827999, hf(18));
+    one_cycle(v,1,2,3,4,0, ch, 0x5a827999, hf(19));
+
+    five_cycle(v, parity, 0x6ed9eba1, 20);
+    five_cycle(v, parity, 0x6ed9eba1, 25);
+    five_cycle(v, parity, 0x6ed9eba1, 30);
+    five_cycle(v, parity, 0x6ed9eba1, 35);
+
+    five_cycle(v, maj, 0x8f1bbcdc, 40);
+    five_cycle(v, maj, 0x8f1bbcdc, 45);
+    five_cycle(v, maj, 0x8f1bbcdc, 50);
+    five_cycle(v, maj, 0x8f1bbcdc, 55);
+
+    five_cycle(v, parity, 0xca62c1d6, 60);
+    five_cycle(v, parity, 0xca62c1d6, 65);
+    five_cycle(v, parity, 0xca62c1d6, 70);
+    five_cycle(v, parity, 0xca62c1d6, 75);
+
+#ifdef ARRAY
+    ctx->hash[0] += v[0]; ctx->hash[1] += v[1];
+    ctx->hash[2] += v[2]; ctx->hash[3] += v[3];
+    ctx->hash[4] += v[4];
+#else
+    ctx->hash[0] += v0; ctx->hash[1] += v1;
+    ctx->hash[2] += v2; ctx->hash[3] += v3;
+    ctx->hash[4] += v4;
+#endif
+}
+
+/* Reset a context: clear the byte counter and load the five initial */
+/* hash words */
+VOID_RETURN sha1_begin(sha1_ctx ctx[1])
+{
+    ctx->count[0] = ctx->count[1] = 0;
+    ctx->hash[0] = 0x67452301;
+    ctx->hash[1] = 0xefcdab89;
+    ctx->hash[2] = 0x98badcfe;
+    ctx->hash[3] = 0x10325476;
+    ctx->hash[4] = 0xc3d2e1f0;
+}
+
+/* SHA1 hash data in an array of bytes into hash buffer and */
+/* call the hash_compile function as required. */
+/* 'pos' is the number of bytes already buffered in ctx->wbuf and */
+/* 'space' the room left in the current block */
+/* NOTE(review): count[] is uint_32t while len is unsigned long, so the */
+/* carry test below compares a 32-bit sum with a possibly wider len; */
+/* the callers visible in this file pass 20 and 4 - confirm for others */
+
+VOID_RETURN sha1_hash(const unsigned char data[], unsigned long len, sha1_ctx ctx[1])
+{   uint_32t pos = (uint_32t)(ctx->count[0] & SHA1_MASK),
+             space = SHA1_BLOCK_SIZE - pos;
+    const unsigned char *sp = data;
+
+    if((ctx->count[0] += len) < len)
+        ++(ctx->count[1]);
+
+    while(len >= space)     /* transfer whole blocks if possible */
+    {
+        memcpy(((unsigned char*)ctx->wbuf) + pos, sp, space);
+        sp += space; len -= space; space = SHA1_BLOCK_SIZE; pos = 0;
+        bsw_32(ctx->wbuf, SHA1_BLOCK_SIZE >> 2);
+        sha1_compile(ctx);
+    }
+
+    /* keep the remaining partial block for the next call or sha1_end */
+    memcpy(((unsigned char*)ctx->wbuf) + pos, sp, len);
+}
+
+/* SHA1 final padding and digest calculation */
+/* writes SHA1_DIGEST_SIZE bytes to hval; the context is consumed */
+
+VOID_RETURN sha1_end(unsigned char hval[], sha1_ctx ctx[1])
+{   uint_32t i = (uint_32t)(ctx->count[0] & SHA1_MASK);
+
+    /* put bytes in the buffer in an order in which references to */
+    /* 32-bit words will put bytes with lower addresses into the */
+    /* top of 32 bit words on BOTH big and little endian machines */
+    bsw_32(ctx->wbuf, (i + 3) >> 2);
+
+    /* we now need to mask valid bytes and add the padding which is */
+    /* a single 1 bit and as many zero bits as necessary. Note that */
+    /* we can always add the first padding byte here because the */
+    /* buffer always has at least one empty slot */
+    ctx->wbuf[i >> 2] &= 0xffffff80 << 8 * (~i & 3);
+    ctx->wbuf[i >> 2] |= 0x00000080 << 8 * (~i & 3);
+
+    /* we need 9 or more empty positions, one for the padding byte */
+    /* (above) and eight for the length count. If there is not */
+    /* enough space, pad and empty the buffer */
+    if(i > SHA1_BLOCK_SIZE - 9)
+    {
+        if(i < 60) ctx->wbuf[15] = 0;
+        sha1_compile(ctx);
+        i = 0;
+    }
+    else    /* compute a word index for the empty buffer positions */
+        i = (i >> 2) + 1;
+
+    while(i < 14) /* and zero pad all but last two positions */
+        ctx->wbuf[i++] = 0;
+
+    /* the following 32-bit length fields are assembled in the */
+    /* wrong byte order on little endian machines but this is */
+    /* corrected later since they are only ever used as 32-bit */
+    /* word values. */
+    ctx->wbuf[14] = (ctx->count[1] << 3) | (ctx->count[0] >> 29);
+    ctx->wbuf[15] = ctx->count[0] << 3;
+    sha1_compile(ctx);
+
+    /* extract the hash value as bytes in case the hash buffer is */
+    /* misaligned for 32-bit words */
+    for(i = 0; i < SHA1_DIGEST_SIZE; ++i)
+        hval[i] = (unsigned char)(ctx->hash[i >> 2] >> (8 * (~i & 3)));
+}
+
+/* One-shot helper: hash len bytes of data and store the digest in hval */
+VOID_RETURN sha1(unsigned char hval[], const unsigned char data[], unsigned long len)
+{   sha1_ctx cx[1];
+
+    sha1_begin(cx); sha1_hash(data, len, cx); sha1_end(hval, cx);
+}
+
+#if defined(__cplusplus)
+}
+#endif
diff --git a/ompss/uts/brg_sha1.h b/ompss/uts/brg_sha1.h
new file mode 100644
index 0000000..31e2aa5
--- /dev/null
+++ b/ompss/uts/brg_sha1.h
@@ -0,0 +1,109 @@
+/**********************************************************************************************/
+/* This program is part of the Barcelona OpenMP Tasks Suite */
+/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */
+/* Copyright (C) 2009 Universitat Politecnica de Catalunya */
+/**********************************************************************************************/ +/* + --------------------------------------------------------------------------- + Copyright (c) 2002, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. 
+ ---------------------------------------------------------------------------
+ Issue Date: 01/08/2005
+*/
+
+#ifndef _SHA1_H
+#define _SHA1_H
+
+#include   /* NOTE(review): header name missing in this copy (apparently stripped in extraction) - restore from the repository */
+#include "brg_types.h"
+
+#define SHA1_BLOCK_SIZE  64   /* bytes consumed per sha1_compile call */
+#define SHA1_DIGEST_SIZE 20   /* bytes written by sha1_end */
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+/** BEGIN: UTS RNG Harness **/
+
+#define POS_MASK    0x7fffffff   /* clears the top bit so the value fits a non-negative int */
+#define HIGH_BITS   0x80000000
+
+#define sha1_context sha1_ctx_s   /* lets the harness write 'struct sha1_context' */
+typedef u_int8_t  RNG_state;      /* a state is handled as a pointer to its bytes */
+typedef u_int32_t uint32;
+//typedef char * caddr_t;
+
+/**********************************/
+/* random number generator state */
+/**********************************/
+struct state_t {
+  u_int8_t state[20];   /* same length as SHA1_DIGEST_SIZE */
+};
+
+
+/***************************************/
+/* random number generator operations */
+/***************************************/
+void   rng_init(RNG_state *state, int seed);
+void   rng_spawn(RNG_state *mystate, RNG_state *newstate, int spawnNumber);
+int    rng_rand(RNG_state *mystate);
+int    rng_nextrand(RNG_state *mystate);
+char * rng_showstate(RNG_state *state, char *s);
+void   rng_showtype( void );
+
+/** END: UTS RNG Harness **/
+/* type to hold the SHA1 context */
+
+struct sha1_ctx_s
+{   uint_32t count[2];   /* byte count: low word, then high word */
+    uint_32t hash[5];    /* running 160-bit hash value */
+    uint_32t wbuf[16];   /* one 64-byte block being assembled */
+};
+
+typedef struct sha1_ctx_s sha1_ctx;
+
+/* Note that these prototypes are the same for both bit and */
+/* byte oriented implementations. However the length fields */
+/* are in bytes or bits as appropriate for the version used */
+/* and bit sequences are input as arrays of bytes in which */
+/* bit sequences run from the most to the least significant */
+/* end of each byte */
+
+VOID_RETURN sha1_compile(sha1_ctx ctx[1]);
+
+VOID_RETURN sha1_begin(sha1_ctx ctx[1]);
+VOID_RETURN sha1_hash(const unsigned char data[], unsigned long len, sha1_ctx ctx[1]);
+VOID_RETURN sha1_end(unsigned char hval[], sha1_ctx ctx[1]);
+VOID_RETURN sha1(unsigned char hval[], const unsigned char data[], unsigned long len);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/ompss/uts/brg_types.h b/ompss/uts/brg_types.h
new file mode 100644
index 0000000..ae9f717
--- /dev/null
+++ b/ompss/uts/brg_types.h
@@ -0,0 +1,205 @@
+/**********************************************************************************************/
+/* This program is part of the Barcelona OpenMP Tasks Suite */
+/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */
+/* Copyright (C) 2009 Universitat Politecnica de Catalunya */
+/**********************************************************************************************/
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2006, Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software in both source and binary
+ form is allowed (with or without changes) provided that:
+
+ 1. distributions of this source code include the above copyright
+ notice, this list of conditions and the following disclaimer;
+
+ 2. distributions in binary form include the above copyright
+ notice, this list of conditions and the following disclaimer
+ in the documentation and/or other associated materials;
+
+ 3. the copyright holder's name is not used to endorse products
+ built using this software without specific written permission.
+ + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue 09/09/2006 + + The unsigned integer types defined here are of the form uint_<nn>t where + <nn> is the length of the type; for example, the unsigned 32-bit type is + 'uint_32t'. These are NOT the same as the 'C99 integer types' that are + defined in the inttypes.h and stdint.h headers since attempts to use these + types have shown that support for them is still highly variable. However, + since the latter are of the form uint<nn>_t, a regular expression search + and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t') + can be used to convert the types used here to the C99 standard types.
+*/
+
+#ifndef BRG_TYPES_H
+#define BRG_TYPES_H
+
+#include   /* NOTE(review): header name missing in this copy (apparently stripped in extraction) - restore from the repository */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#include   /* NOTE(review): header name missing here as well - restore from the repository */
+
+/* Try this if you get an error from one of the typedefs below */
+#ifdef BRG_STD_TYPES
+#  define BRG_UI8
+   typedef u_int8_t uint_8t;
+#  define BRG_UI16
+   typedef u_int16_t uint_16t;
+#  define BRG_UI32
+#  define li_32(h) 0x##h##u
+   typedef u_int32_t uint_32t;
+#  define BRG_UI64
+#  define li_64(h) 0x##h##u
+   typedef u_int64_t uint_64t;
+#endif /* BRG_STD_TYPES */
+
+#ifndef BRG_UI8
+#  define BRG_UI8
+#  if UCHAR_MAX == 255u
+     typedef unsigned char uint_8t;
+#  else
+#    error Please define uint_8t as an 8-bit unsigned integer type in brg_types.h
+#  endif
+#endif
+
+#ifndef BRG_UI16
+#  define BRG_UI16
+#  if USHRT_MAX == 65535u
+     typedef unsigned short uint_16t;
+#  else
+#    error Please define uint_16t as a 16-bit unsigned short type in brg_types.h
+#  endif
+#endif
+
+#ifndef BRG_UI32
+#  define BRG_UI32
+#  if UINT_MAX == 4294967295u
+#    define li_32(h) 0x##h##u
+     typedef unsigned int uint_32t;
+#  elif ULONG_MAX == 4294967295u
+#    define li_32(h) 0x##h##ul
+     typedef unsigned long uint_32t;
+#  elif defined( _CRAY )
+#    error This code needs 32-bit data types, which Cray machines do not provide
+#  else
+#    error Please define uint_32t as a 32-bit unsigned integer type in brg_types.h
+#  endif
+#endif
+
+#ifndef BRG_UI64
+#  if defined( __BORLANDC__ ) && !defined( __MSDOS__ )
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ull
+     typedef unsigned __int64 uint_64t;
+#  elif defined( _MSC_VER ) && ( _MSC_VER < 1300 ) /* 1300 == VC++ 7.0 */
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ui64
+     typedef unsigned __int64 uint_64t;
+#  elif defined( __sun ) && defined(ULONG_MAX) && ULONG_MAX == 0xfffffffful
+#    define BRG_UI64
+#    define li_64(h) 0x##h##ull
+     typedef unsigned long long uint_64t;
+#  elif defined( UINT_MAX ) && UINT_MAX > 4294967295u
+#    if UINT_MAX == 18446744073709551615u
+#      define BRG_UI64
+#      define li_64(h) 0x##h##u
+       typedef unsigned int uint_64t;
+#    endif
+#  elif defined( ULONG_MAX ) && ULONG_MAX > 4294967295u
+#    if ULONG_MAX == 18446744073709551615ul
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ul
+       typedef unsigned long uint_64t;
+#    endif
+#  elif defined( ULLONG_MAX ) && ULLONG_MAX > 4294967295u
+#    if ULLONG_MAX == 18446744073709551615ull
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ull
+       typedef unsigned long long uint_64t;
+#    endif
+#  elif defined( ULONG_LONG_MAX ) && ULONG_LONG_MAX > 4294967295u
+#    if ULONG_LONG_MAX == 18446744073709551615ull
+#      define BRG_UI64
+#      define li_64(h) 0x##h##ull
+       typedef unsigned long long uint_64t;
+#    endif
+#  endif
+#endif
+
+#if defined( NEED_UINT_64T ) && !defined( BRG_UI64 )
+#  error Please define uint_64t as an unsigned 64 bit type in brg_types.h
+#endif
+
+#ifndef RETURN_VALUES
+#  define RETURN_VALUES
+#  if defined( DLL_EXPORT )
+#    if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
+#      define VOID_RETURN __declspec( dllexport ) void __stdcall
+#      define INT_RETURN __declspec( dllexport ) int __stdcall
+#    elif defined( __GNUC__ )
+#      define VOID_RETURN __declspec( __dllexport__ ) void
+#      define INT_RETURN __declspec( __dllexport__ ) int
+#    else
+#      error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+#    endif
+#  elif defined( DLL_IMPORT )
+#    if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
+#      define VOID_RETURN __declspec( dllimport ) void __stdcall
+#      define INT_RETURN __declspec( dllimport ) int __stdcall
+#    elif defined( __GNUC__ )
+#      define VOID_RETURN __declspec( __dllimport__ ) void
+#      define INT_RETURN __declspec( __dllimport__ ) int
+#    else
+#      error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
+#    endif
+#  elif defined( __WATCOMC__ )
+#    define VOID_RETURN void __cdecl
+#    define INT_RETURN int __cdecl
+#  else
+#    define VOID_RETURN void
+#    define INT_RETURN int
+#  endif
+#endif
+
+/* These defines are used to declare buffers in a way that allows
+ faster operations on longer variables to be used. In all these
+ defines 'size' must be a power of 2 and >= 8
+
+ dec_unit_type(size,x) declares a variable 'x' of length
+ 'size' bits
+
+ dec_bufr_type(size,bsize,x) declares a buffer 'x' of length 'bsize'
+ bytes defined as an array of variables
+ each of 'size' bits (bsize must be a
+ multiple of size / 8)
+
+ ptr_cast(x,size) casts a pointer to a pointer to a
+ variable of length 'size' bits
+*/
+
+#define ui_type(size) uint_##size##t
+#define dec_unit_type(size,x) typedef ui_type(size) x
+#define dec_bufr_type(size,bsize,x) typedef ui_type(size) x[bsize / (size >> 3)]
+#define ptr_cast(x,size) ((ui_type(size)*)(x))
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/ompss/uts/uts.c b/ompss/uts/uts.c
new file mode 100644
index 0000000..a380315
--- /dev/null
+++ b/ompss/uts/uts.c
@@ -0,0 +1,279 @@
+/**********************************************************************************************/
+/* This program is part of the Barcelona OpenMP Tasks Suite */
+/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */
+/* Copyright (C) 2009 Universitat Politecnica de Catalunya */
+/**********************************************************************************************/
+/*
+ * Copyright (c) 2007 The Unbalanced Tree Search (UTS) Project Team:
+ * -----------------------------------------------------------------
+ *
+ * This file is part of the unbalanced tree search benchmark. This
+ * project is licensed under the MIT Open Source license. See the LICENSE
+ * file for copyright and licensing information.
+ *
+ * UTS is a collaborative project between researchers at the University of
+ * Maryland, the University of North Carolina at Chapel Hill, and the Ohio
+ * State University.
+ *
+ * University of Maryland:
+ * Chau-Wen Tseng(1)
+ *
+ * University of North Carolina, Chapel Hill:
+ * Jun Huan
+ *
+ * The Ohio State University:
+ * James Dinan
+ *
+ * Supercomputing Research Center
+ * D.
Pryor + * + * (1) - indicates project PI + * + * UTS Recursive Depth-First Search (DFS) version developed by James Dinan + * + * Adapted for OpenMP 3.0 Task-based version by Stephen Olivier + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + */ +#include +#include +#include +#include +#include +#include + +#include "app-desc.h" +#include "bots.h" +#include "uts.h" + +/*********************************************************** + * Global state * + ***********************************************************/ +unsigned long long nLeaves = 0; +int maxTreeDepth = 0; +/*********************************************************** + * Tree generation strategy is controlled via various * + * parameters set from the command line. The parameters * + * and their default values are given below. * + * Trees are generated using a Galton-Watson process, in * + * which the branching factor of each node is a random * + * variable. 
*
+ *                                                         *
+ * The random variable follows a binomial distribution.    *
+ ***********************************************************/
+double b_0   = 4.0; // default branching factor at the root
+int   rootId = 0;   // default seed for RNG state at root
+/***********************************************************
+ *  The branching factor at the root is specified by b_0.
+ *  The branching factor below the root follows an
+ *     identical binomial distribution at all nodes.
+ *  A node has m children with prob q, or no children with
+ *     prob (1-q).  The expected branching factor is q * m.
+ *
+ *  Default parameter values
+ ***********************************************************/
+int    nonLeafBF   = 4;            // m
+double nonLeafProb = 15.0 / 64.0;  // q
+/***********************************************************
+ * compute granularity - number of rng evaluations per
+ * tree node
+ ***********************************************************/
+int computeGranularity = 1;
+/***********************************************************
+ * expected results for execution
+ ***********************************************************/
+unsigned long long  exp_tree_size = 0;
+int        exp_tree_depth = 0;
+unsigned long long  exp_num_leaves = 0;
+/***********************************************************
+ *  FUNCTIONS                                              *
+ ***********************************************************/
+
+// Interpret 32 bit positive integer as value on [0,1)
+// A negative n is reported on stdout and mapped to 0.0.
+double rng_toProb(int n)
+{
+  if (n < 0) {
+    printf("*** toProb: rand n = %d out of range\n",n);
+  }
+  return ((n<0)? 0.0 : ((double) n)/2147483648.0);
+}
+
+/* Initialise the root node: height 0, child count undetermined (-1) and
+ * RNG state seeded from the global rootId. */
+void uts_initRoot(Node * root)
+{
+   root->height = 0;
+   root->numChildren = -1;      // means not yet determined
+   rng_init(root->state.state, rootId);
+
+   /* %p requires a void pointer argument */
+   bots_message("Root node at %p\n", (void *) root);
+}
+
+
+/* Binomial rule used below the root: nonLeafBF children with probability
+ * nonLeafProb, otherwise none.  Reads the parent's state, does not advance it. */
+int uts_numChildren_bin(Node * parent)
+{
+  // distribution is identical everywhere below root
+  int    v = rng_rand(parent->state.state);
+  double d = rng_toProb(v);
+
+  return (d < nonLeafProb) ? nonLeafBF : 0;
+}
+
+/* Number of children of `parent`: floor(b_0) for the root (height 0),
+ * the binomial rule otherwise, capped at MAXNUMCHILDREN below the root. */
+int uts_numChildren(Node *parent)
+{
+  int numChildren = 0;
+
+  /* Determine the number of children */
+  if (parent->height == 0) numChildren = (int) floor(b_0);
+  else numChildren = uts_numChildren_bin(parent);
+
+  // limit number of children
+  // only a BIN root can have more than MAXNUMCHILDREN
+  if (parent->height == 0) {
+    /* floor(b_0) never exceeds ceil(b_0), so this branch only guards
+       against future changes to the root rule above */
+    int rootBF = (int) ceil(b_0);
+    if (numChildren > rootBF) {
+      bots_debug("*** Number of children of root truncated from %d to %d\n", numChildren, rootBF);
+      numChildren = rootBF;
+    }
+  }
+  else {
+    if (numChildren > MAXNUMCHILDREN) {
+      bots_debug("*** Number of children truncated from %d to %d\n", numChildren, MAXNUMCHILDREN);
+      numChildren = MAXNUMCHILDREN;
+    }
+  }
+
+  return numChildren;
+}
+
+/***********************************************************
+ * Recursive depth-first implementation                    *
+ ***********************************************************/
+
+/* Entry point: determine the root's child count, walk the whole tree and
+ * return the number of nodes visited (the root included). */
+unsigned long long parallel_uts ( Node *root )
+{
+   unsigned long long num_nodes = 0 ;
+   root->numChildren = uts_numChildren(root);
+
+   bots_message("Computing Unbalance Tree Search algorithm ");
+
+   num_nodes = parTreeSearch( 0, root, root->numChildren );
+
+   bots_message(" completed!");
+
+   return num_nodes;
+}
+
+/* Count the nodes of the subtree rooted at `parent` (parent included).
+ *
+ * Each child is generated on this frame's stack, its RNG state is spawned
+ * from the parent's (computeGranularity times, which is the unit of work),
+ * and its own subtree is counted in a separate task.  The taskwait keeps
+ * n[] and partialCount[] alive until every child task has finished. */
+unsigned long long parTreeSearch(int depth, Node *parent, int numChildren)
+{
+  unsigned long long subtreesize = 1;   /* this node itself */
+  int i, j;
+
+  /* A leaf has nothing to spawn.  Return before the arrays below are
+     declared: a variable-length array must have a size greater than zero. */
+  if (numChildren <= 0)
+    return subtreesize;
+
+  Node n[numChildren], *nodePtr;
+  unsigned long long partialCount[numChildren];
+
+  // Recurse on the children
+  for (i = 0; i < numChildren; i++) {
+     nodePtr = &n[i];
+
+     nodePtr->height = parent->height + 1;
+
+     // The following line is the work (one or more SHA-1 ops)
+     for (j = 0; j < computeGranularity; j++) {
+        rng_spawn(parent->state.state, nodePtr->state.state, i);
+     }
+
+     nodePtr->numChildren = uts_numChildren(nodePtr);
+
+     /* keep 'untied' as the first clause: the -tied build rules in
+        Makefile.common strip it with a sed pattern on "task untied" */
+     #pragma omp task untied firstprivate(i, nodePtr) shared(partialCount)
+        partialCount[i] = parTreeSearch(depth+1, nodePtr, nodePtr->numChildren);
+  }
+
+  #pragma omp taskwait
+
+  for (i = 0; i < numChildren; i++) {
+     subtreesize += partialCount[i];
+  }
+
+  return subtreesize;
+}
+
+/* Load the tree parameters and expected results from `filename`, then print
+ * them.  Exits with status -1 when the file cannot be opened or does not
+ * contain all eight values. */
+void uts_read_file ( char *filename )
+{
+   FILE *fin;
+   int nread;
+
+   if ((fin = fopen(filename, "r")) == NULL) {
+      bots_message("Could not open input file (%s)\n", filename);
+      exit (-1);
+   }
+   nread = fscanf(fin,"%lf %lf %d %d %d %llu %d %llu",
+             &b_0,
+             &nonLeafProb,
+             &nonLeafBF,
+             &rootId,
+             &computeGranularity,
+             &exp_tree_size,
+             &exp_tree_depth,
+             &exp_num_leaves
+   );
+   fclose(fin);
+
+   /* a short or malformed file would silently leave defaults in place */
+   if (nread != 8) {
+      bots_message("Could not parse input file (%s)\n", filename);
+      exit (-1);
+   }
+
+   computeGranularity = max(1,computeGranularity);
+
+      // Printing input data
+   bots_message("\n");
+   bots_message("Root branching factor                = %f\n", b_0);
+   bots_message("Root seed (0 <= 2^31)                = %d\n", rootId);
+   bots_message("Probability of non-leaf node         = %f\n", nonLeafProb);
+   bots_message("Number of children for non-leaf node = %d\n", nonLeafBF);
+   bots_message("E(n)                                 = %f\n", (double) ( nonLeafProb * nonLeafBF ) );
+   bots_message("E(s)                                 = %f\n", (double) ( 1.0 / (1.0 - nonLeafProb * nonLeafBF) ) );
+   bots_message("Compute granularity                  = %d\n", computeGranularity);
+   bots_message("Random number generator              = "); rng_showtype();
+}
+
+/* Print run statistics taken from the bots_* globals.
+ * NOTE(review): nLeaves, maxTreeDepth and chunkSize are never updated in
+ * the code visible here, so those lines report their initial values. */
+void uts_show_stats( void )
+{
+   int nPes = atoi(bots_resources);
+   int chunkSize = 0;
+
+   bots_message("\n");
+   bots_message("Tree size                            = %llu\n", (unsigned long long)  bots_number_of_tasks );
+   bots_message("Maximum tree depth                   = %d\n", maxTreeDepth );
+   bots_message("Chunk size                           = %d\n", chunkSize );
+   bots_message("Number of leaves                     = %llu (%.2f%%)\n", nLeaves, nLeaves/(float)bots_number_of_tasks*100.0 );
+   bots_message("Number of PE's                       = %.4d threads\n", nPes );
+   bots_message("Wallclock time                       = %.3f sec\n", bots_time_program );
+   bots_message("Overall performance                  = %.0f nodes/sec\n", (bots_number_of_tasks / bots_time_program) );
+   bots_message("Performance per PE                   = %.0f nodes/sec\n", (bots_number_of_tasks / bots_time_program / nPes) );
+}
+
+/* Compare the measured tree size with the expected one read from the input
+ * file; only the size is checked, not the depth or the leaf count. */
+int uts_check_result ( void )
+{
+   int answer = BOTS_RESULT_SUCCESSFUL;
+
+   if ( bots_number_of_tasks != exp_tree_size ) {
+      answer = BOTS_RESULT_UNSUCCESSFUL;
+      /* cast as uts_show_stats does, so the argument matches %llu */
+      bots_message("Incorrect tree size result (%llu instead of %llu).\n", (unsigned long long) bots_number_of_tasks, exp_tree_size);
+   }
+
+   return answer;
+}
diff --git a/ompss/uts/uts.h b/ompss/uts/uts.h
new file mode 100644
index 0000000..c913b24
--- /dev/null
+++ b/ompss/uts/uts.h
@@ -0,0 +1,81 @@
+/*
+ *         ---- The Unbalanced Tree Search (UTS) Benchmark ----
+ *
+ *  This file is part of the unbalanced tree search benchmark.  This
+ *  project is licensed under the MIT Open Source license.  See the LICENSE
+ *  file for copyright and licensing information.
+ *
+ *  UTS is a collaborative project between researchers at the University of
+ *  Maryland, the University of North Carolina at Chapel Hill, and the Ohio
+ *  State University.  See AUTHORS file for more information.
+ *
+ *  ** THIS IS A PRE-RELEASE VERSION OF UTS. **
+ */
+
+#ifndef _UTS_H
+#define _UTS_H
+
+#include "brg_sha1.h"
+
+#define UTS_VERSION "2.1"
+
+/***********************************************************
+ *  Tree node descriptor and statistics                    *
+ ***********************************************************/
+
+#define MAXNUMCHILDREN    100  // cap on children (BIN root is exempt)
+
+struct node_t {
+  int height;        // depth of this node in the tree
+  int numChildren;   // number of children, -1 => not yet determined
+
+  /* for RNG state associated with this node */
+  struct state_t state;
+};
+
+typedef struct node_t Node;
+
+/* Tree type
+ *   Trees are generated using a Galton-Watson process, in
+ *   which the branching factor of each node is a random
+ *   variable.
+ *
+ *   The random variable can follow a binomial distribution
+ *   or a geometric distribution.  Hybrid trees are
+ *   generated with geometric distributions near the
+ *   root and binomial distributions towards the leaves.
+ */
+/* Tree parameters */
+extern double   b_0;            /* branching factor at the root */
+extern int      rootId;         /* seed for the root RNG state */
+extern int      nonLeafBF;      /* m: children of a non-leaf node */
+extern double   nonLeafProb;    /* q: probability that a node is a non-leaf */
+
+/* Benchmark parameters */
+extern int    computeGranularity;   /* rng_spawn calls per generated node */
+extern int    debug;     /* NOTE(review): no definition in the uts.c shown with this header - confirm it exists elsewhere */
+extern int    verbose;   /* NOTE(review): same as 'debug' - confirm */
+
+/* Utility Functions */
+#define max(a,b) (((a) > (b)) ? (a) : (b))   /* evaluates one argument twice */
+#define min(a,b) (((a) < (b)) ? (a) : (b))   /* evaluates one argument twice */
+
+unsigned long long parTreeSearch(int depth, Node *parent, int numChildren);
+
+int    uts_paramsToStr(char *strBuf, int ind);   /* NOTE(review): not defined in the uts.c shown with this header - confirm */
+void   uts_read_file(char *file);
+void   uts_print_params();                       /* NOTE(review): not defined in the uts.c shown with this header - confirm */
+
+double rng_toProb(int n);
+
+/* Common tree routines */
+void   uts_initRoot(Node * root);
+int    uts_numChildren(Node *parent);
+int    uts_numChildren_bin(Node * parent);
+int    uts_numChildren_geo(Node * parent);       /* NOTE(review): not defined in the uts.c shown with this header - confirm */
+int    uts_childType(Node *parent);              /* NOTE(review): not defined in the uts.c shown with this header - confirm */
+
+void uts_show_stats( void );
+int uts_check_result ( void );
+
+#endif /* _UTS_H */