## Reservation & deployment

In [1]:
from fabfile import *
from real_hpl import *

In [2]:
# Reserve the same number of nodes on two clusters, with the same image and
# walltime; only the site, cluster and queue differ between the two jobs.
nb_nodes = 4
dahu = Job.oarsub_cluster(
    site='grenoble',
    username='tocornebize',
    clusters=['dahu'],
    walltime=Time(hours=4),
    nb_nodes=nb_nodes,
    deploy='debian9-x64-base',
    queue='testing',
)
gr20 = Job.oarsub_cluster(
    site='nancy',
    username='tocornebize',
    clusters=['grvingt'],
    walltime=Time(hours=4),
    nb_nodes=nb_nodes,
    deploy='debian9-x64-base',
    queue='production',
)

alljobs = [dahu, gr20]
# Show what was obtained and flag (log only, no abort) any job that did not
# get the requested number of nodes.
for job in alljobs:
    print(job.hostnames)
    if not len(job.hostnames) == nb_nodes:
        logger.critical('Bad number of nodes')

[32m[2018-07-20 14:34:46][INFO] [37m[frontend | /home/tocornebize] oarsub -n "☕" -q testing -t deploy  -l "{cluster in ('dahu')}/nodes=4,walltime=04:00:00" -r "2018-07-20 14:34:46"[0m
[32m[2018-07-20 14:34:49][INFO] [37m[frontend | /home/tocornebize] oarsub -n "☕" -q production -t deploy  -l "{cluster in ('grvingt')}/nodes=4,walltime=04:00:00" -r "2018-07-20 14:34:49"[0m
[32m[2018-07-20 14:35:12][INFO] [37m[frontend | /home/tocornebize] oarstat -fJ -j 1805726[0m
[32m[2018-07-20 14:35:17][INFO] [37m[frontend | /home/tocornebize] oarstat -fJ -j 1805726[0m
[32m[2018-07-20 14:35:28][INFO] [37m[frontend | /home/tocornebize] oarstat -fJ -j 1805726[0m
[32m[2018-07-20 14:35:50][INFO] [37m[frontend | /home/tocornebize] oarstat -fJ -j 1805726[0m
[32m[2018-07-20 14:36:38][INFO] [37m[frontend | /home/tocornebize] oarstat -fJ -j 1805726[0m
[32m[2018-07-20 14:37:45][INFO] [37m[frontend | /home/tocornebize] oarstat -fJ -j 1805726[0m
[32m[2018-07-20 14:38:53][INFO] [37m[front

['dahu-27.grenoble.grid5000.fr', 'dahu-28.grenoble.grid5000.fr', 'dahu-29.grenoble.grid5000.fr', 'dahu-8.grenoble.grid5000.fr']
['grvingt-5.nancy.grid5000.fr', 'grvingt-6.nancy.grid5000.fr', 'grvingt-7.nancy.grid5000.fr', 'grvingt-8.nancy.grid5000.fr']


In [3]:
# Deploy, on the nodes of each reservation, the environment stored in job.deploy.
for job in alljobs:
    job.kadeploy(job.deploy)

[32m[2018-07-20 14:39:05][INFO] [37m[frontend | /home/tocornebize] kadeploy3 -k -f /var/lib/oar/1805726 -e debian9-x64-base[0m
[32m[2018-07-20 14:42:46][INFO] [37m[frontend | /home/tocornebize] kadeploy3 -k -f /var/lib/oar/1617657 -e debian9-x64-base[0m


In [4]:
# Run the installation helper on each job (system packages, OpenBLAS and HPL,
# according to the commands it logs).
for job in alljobs:
    install(job)

[32m[2018-07-20 14:46:43][INFO] [37m[allnodes | /tmp] echo "hello world"[0m
[32m[2018-07-20 14:46:43][INFO] [37m[allnodes | /tmp] apt update && DEBIAN_FRONTEND=noninteractive apt upgrade -yq[0m
[32m[2018-07-20 14:47:32][INFO] [37m[allnodes | /tmp] DEBIAN_FRONTEND=noninteractive apt install -y build-essential zip make git time hwloc pciutils cmake cpufrequtils linux-cpupower openmpi-bin libopenmpi-dev net-tools[0m
[32m[2018-07-20 14:47:56][INFO] [37m[allnodes | /tmp] wget https://github.com/xianyi/OpenBLAS/archive/v0.3.1.zip -O openblas.zip[0m
[32m[2018-07-20 14:48:13][INFO] [37m[allnodes | /tmp] unzip openblas.zip && mv OpenBLAS-* openblas[0m
[32m[2018-07-20 14:48:19][INFO] [37m[allnodes | /tmp/openblas] make -j 64[0m
[32m[2018-07-20 14:48:27][INFO] [37m[allnodes | /tmp/openblas] make install PREFIX=/tmp[0m
[32m[2018-07-20 14:48:27][INFO] [37m[allnodes | /tmp] wget http://www.netlib.org/benchmark/hpl/hpl-2.2.tar.gz[0m
[32m[2018-07-20 14:48:28][INFO] [37m[allno

## Performance tuning, estimation of the peak

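We disable hyperthreading. Disabling DVFS (forcing the performance frequency setting) is currently left commented out, because it fails on Dahu (no cpufreq driver).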

In [5]:
# Disable hyperthreading on the nodes of every job.
for job in alljobs:
    job.nodes.disable_hyperthreading()
# The frequency setup below is intentionally left disabled: it fails on Dahu,
# which has no cpufreq driver. The two prints are inspection helpers for the
# frequency settings and are disabled along with it.
#job.nodes.set_frequency_performance() # ← fail on Dahu, no cpufreq driver
#print(job.nodes.frequency_information)
#print(job.nodes.current_frequency_information)

[32m[2018-07-20 14:51:59][INFO] [37m[allnodes | /tmp] lstopo topology.xml && cat topology.xml[0m
[32m[2018-07-20 14:51:59][INFO] [37m[allnodes | /tmp] echo -n '0' | tee /sys/devices/system/cpu/cpu32/online /sys/devices/system/cpu/cpu34/online /sys/devices/system/cpu/cpu36/online /sys/devices/system/cpu/cpu38/online /sys/devices/system/cpu/cpu40/online /sys/devices/system/cpu/cpu42/online /sys/devices/system/cpu/cpu44/online /sys/devices/system/cpu/cpu46/online /sys/devices/system/cpu/cpu48/online /sys/devices/system/cpu/cpu50/online /sys/devices/system/cpu/cpu52/online /sys/devices/system/cpu/cpu54/online /sys/devices/system/cpu/cpu56/online /sys/devices/system/cpu/cpu58/online /sys/devices/system/cpu/cpu60/online /sys/devices/system/cpu/cpu62/online /sys/devices/system/cpu/cpu33/online /sys/devices/system/cpu/cpu35/online /sys/devices/system/cpu/cpu37/online /sys/devices/system/cpu/cpu39/online /sys/devices/system/cpu/cpu41/online /sys/devices/system/cpu/cpu43/online /sys/devices

In [6]:
# Repeat the peak estimation ten times per job and print one value per line,
# so that the run-to-run variability is visible.
for job in alljobs:
    gflops = list(map(str, (estimate_peak(job) for _ in range(10))))
    print(*gflops, sep='\n')

[32m[2018-07-20 14:52:10][INFO] [37m[allnodes | /tmp] OMP_NUM_THREADS=32 LD_LIBRARY_PATH=/tmp/lib ./dgemm_test 8192 8192 8192 8192 8192 8192 [0m
[32m[2018-07-20 14:52:10][INFO] [37m[allnodes | /tmp] wget https://raw.githubusercontent.com/Ezibenroc/m2_internship_scripts/master/cblas_tests/dgemm_test.c[0m
[32m[2018-07-20 14:52:11][INFO] [37m[allnodes | /tmp] LD_LIBRARY_PATH=/tmp/lib gcc -DUSE_OPENBLAS ./dgemm_test.c -fopenmp -I /tmp/include                 /tmp/lib/libopenblas.so -O3 -o ./dgemm_test[0m
[32m[2018-07-20 14:52:11][INFO] [37m[allnodes | /tmp] OMP_NUM_THREADS=32 LD_LIBRARY_PATH=/tmp/lib ./dgemm_test 8192 8192 8192 8192 8192 8192 [0m
[32m[2018-07-20 14:52:15][INFO] [37m[allnodes | /tmp] OMP_NUM_THREADS=32 LD_LIBRARY_PATH=/tmp/lib ./dgemm_test 8192 8192 8192 8192 8192 8192 [0m
[32m[2018-07-20 14:52:19][INFO] [37m[allnodes | /tmp] OMP_NUM_THREADS=32 LD_LIBRARY_PATH=/tmp/lib ./dgemm_test 8192 8192 8192 8192 8192 8192 [0m
[32m[2018-07-20 14:52:23][INFO] [37m[al

3433.675135260615
3450.571164098867
3478.874792167511
3456.497521208855
3500.4783371049803
3462.8572984026814
3495.8838839977766
3458.973047607286
3474.45121580165
3467.224895329025


[32m[2018-07-20 14:52:50][INFO] [37m[allnodes | /tmp] wget https://raw.githubusercontent.com/Ezibenroc/m2_internship_scripts/master/cblas_tests/dgemm_test.c[0m
[32m[2018-07-20 14:52:50][INFO] [37m[allnodes | /tmp] LD_LIBRARY_PATH=/tmp/lib gcc -DUSE_OPENBLAS ./dgemm_test.c -fopenmp -I /tmp/include                 /tmp/lib/libopenblas.so -O3 -o ./dgemm_test[0m
[32m[2018-07-20 14:52:50][INFO] [37m[allnodes | /tmp] OMP_NUM_THREADS=32 LD_LIBRARY_PATH=/tmp/lib ./dgemm_test 8192 8192 8192 8192 8192 8192 [0m
[32m[2018-07-20 14:52:54][INFO] [37m[allnodes | /tmp] OMP_NUM_THREADS=32 LD_LIBRARY_PATH=/tmp/lib ./dgemm_test 8192 8192 8192 8192 8192 8192 [0m
[32m[2018-07-20 14:52:58][INFO] [37m[allnodes | /tmp] OMP_NUM_THREADS=32 LD_LIBRARY_PATH=/tmp/lib ./dgemm_test 8192 8192 8192 8192 8192 8192 [0m
[32m[2018-07-20 14:53:02][INFO] [37m[allnodes | /tmp] OMP_NUM_THREADS=32 LD_LIBRARY_PATH=/tmp/lib ./dgemm_test 8192 8192 8192 8192 8192 8192 [0m
[32m[2018-07-20 14:53:05][INFO] [37m[al

3478.236121485156
3478.172460622556
3461.808771626366
3511.56923152929
3499.183366656925
3503.427153995062
3467.233848417146
3511.484980843
3476.5503826352915
3460.5699661529443


## First (small) HPL run

In [7]:
# For each job: distribute an SSH key between its nodes, then do one small HPL
# run, timed from the notebook side with %time.
for job in alljobs:
    send_key(job)
    # run() returns a 3-tuple unpacked as (time, gflops, output).
    # bcast/pfact/rfact/depth are presumably HPL.dat algorithm selectors — TODO confirm in real_hpl.run.
    # NOTE(review): `time` rebinds any existing `time` name in the notebook namespace.
    %time time, gflops, output = run(job, size=2**14, block_size=512, proc_p=2, proc_q=2, bcast=3, pfact=2, rfact=2, depth=1)
    print(time, gflops)

[32m[2018-07-20 14:54:17][INFO] [37m[director | /root] ssh-keygen -b 2048 -t rsa -f .ssh/id_rsa -q -N ""[0m
[32m[2018-07-20 14:54:18][INFO] [37m[director] get: /root/.ssh/id_rsa.pub → /home/tom/Dropbox/Documents/Fac/phd/mpi_calibration/tmpzgk8lvc7[0m
[32m[2018-07-20 14:54:18][INFO] [37m[orchestra] put: /home/tom/Dropbox/Documents/Fac/phd/mpi_calibration/tmpzgk8lvc7 → /tmp/id_rsa.pub[0m
[32m[2018-07-20 14:54:19][INFO] [37m[orchestra | /root] cat /tmp/id_rsa.pub >> .ssh/authorized_keys[0m
[32m[2018-07-20 14:54:19][INFO] [37m[director | /root] ssh -o "StrictHostKeyChecking no" dahu-28.grenoble.grid5000.fr hostname[0m
[32m[2018-07-20 14:54:19][INFO] [37m[director | /root] ssh -o "StrictHostKeyChecking no" dahu-28 hostname[0m
[32m[2018-07-20 14:54:19][INFO] [37m[director | /root] ssh -o "StrictHostKeyChecking no" dahu-29.grenoble.grid5000.fr hostname[0m
[32m[2018-07-20 14:54:20][INFO] [37m[director | /root] ssh -o "StrictHostKeyChecking no" dahu-29 hostname[0m
[32m[

CPU times: user 260 ms, sys: 62.5 ms, total: 322 ms
Wall time: 9.62 s
4.08 718.5


[32m[2018-07-20 14:54:31][INFO] [37m[orchestra] put: /home/tom/Dropbox/Documents/Fac/phd/mpi_calibration/tmp4hg3lxiq → /tmp/id_rsa.pub[0m
[32m[2018-07-20 14:54:31][INFO] [37m[orchestra | /root] cat /tmp/id_rsa.pub >> .ssh/authorized_keys[0m
[32m[2018-07-20 14:54:31][INFO] [37m[director | /root] ssh -o "StrictHostKeyChecking no" grvingt-6.nancy.grid5000.fr hostname[0m
[32m[2018-07-20 14:54:31][INFO] [37m[director | /root] ssh -o "StrictHostKeyChecking no" grvingt-6 hostname[0m
[32m[2018-07-20 14:54:32][INFO] [37m[director | /root] ssh -o "StrictHostKeyChecking no" grvingt-7.nancy.grid5000.fr hostname[0m
[32m[2018-07-20 14:54:32][INFO] [37m[director | /root] ssh -o "StrictHostKeyChecking no" grvingt-7 hostname[0m
[32m[2018-07-20 14:54:32][INFO] [37m[director | /root] ssh -o "StrictHostKeyChecking no" grvingt-8.nancy.grid5000.fr hostname[0m
[32m[2018-07-20 14:54:33][INFO] [37m[director | /root] ssh -o "StrictHostKeyChecking no" grvingt-8 hostname[0m
[32m[2018-07-2

CPU times: user 251 ms, sys: 56.2 ms, total: 307 ms
Wall time: 9.62 s
8.14 360.0


## Investigation

We saw that the `dahu` and the `grvingt` nodes have roughly the same performance when running `dgemm`. However, according to the times reported by HPL, the `grvingt` nodes are about twice as slow as the `dahu` nodes (8.14 s vs. 4.08 s in the run above).

In [10]:
# One larger run (size=2**15, block_size=128) per cluster, each timed from the
# notebook side with %time; the full results are kept in a dict keyed by job name.
results = {}
%time results['dahu'] = run(dahu, size=2**15, block_size=128, proc_p=2, proc_q=2, bcast=3, pfact=2, rfact=2, depth=1)
%time results['gr20'] = run(gr20, size=2**15, block_size=128, proc_p=2, proc_q=2, bcast=3, pfact=2, rfact=2, depth=1)

# Print each result without its last element (the output object, whose .stdout
# is too long to show here).
print('dahu', results['dahu'][:-1])
print('gr20', results['gr20'][:-1])

[32m[2018-07-20 15:16:54][INFO] [37m[allnodes] put: /home/tom/Dropbox/Documents/Fac/phd/mpi_calibration/tmpvqoo8lxf → /tmp/hpl-2.2/bin/Debian/HPL.dat[0m
[32m[2018-07-20 15:16:54][INFO] [37m[director | /tmp/hpl-2.2/bin/Debian] mpirun --allow-run-as-root --bind-to none --timestamp-output -np 4 -x OMP_NUM_THREADS=32 -H dahu-27.grenoble.grid5000.fr,dahu-28.grenoble.grid5000.fr,dahu-29.grenoble.grid5000.fr,dahu-8.grenoble.grid5000.fr -x LD_LIBRARY_PATH=/tmp/lib ./xhpl[0m
[32m[2018-07-20 15:17:30][INFO] [37m[allnodes] put: /home/tom/Dropbox/Documents/Fac/phd/mpi_calibration/tmpteh0y0ez → /tmp/hpl-2.2/bin/Debian/HPL.dat[0m


CPU times: user 830 ms, sys: 190 ms, total: 1.02 s
Wall time: 36.5 s


[32m[2018-07-20 15:17:31][INFO] [37m[director | /tmp/hpl-2.2/bin/Debian] mpirun --allow-run-as-root --bind-to none --timestamp-output -np 4 -x OMP_NUM_THREADS=32 -H grvingt-5.nancy.grid5000.fr,grvingt-6.nancy.grid5000.fr,grvingt-7.nancy.grid5000.fr,grvingt-8.nancy.grid5000.fr -x LD_LIBRARY_PATH=/tmp/lib ./xhpl[0m


CPU times: user 810 ms, sys: 159 ms, total: 968 ms
Wall time: 35 s
dahu (20.28, 1157.0)
gr20 (39.82, 589.1)


Alright, this is becoming really weird. Each of the two runs took about 35 seconds as measured from **my laptop** (36.5 s for `dahu`, 35 s for `gr20`). However, on the **HPL side**, the time reported for `gr20` is 39.82 seconds, which is longer than the whole command took and far too large a gap to be explained by clock drift.

In [12]:
# Show the raw HPL output of the dahu run (last element of the result tuple).
print(results['dahu'][-1].stdout)

Fri Jul 20 15:16:55 2018<stdout>:HPLinpack 2.2  --  High-Performance Linpack benchmark  --   February 24, 2016
Fri Jul 20 15:16:55 2018<stdout>:Written by A. Petitet and R. Clint Whaley,  Innovative Computing Laboratory, UTK
Fri Jul 20 15:16:55 2018<stdout>:Modified by Piotr Luszczek, Innovative Computing Laboratory, UTK
Fri Jul 20 15:16:55 2018<stdout>:Modified by Julien Langou, University of Colorado Denver
Fri Jul 20 15:16:55 2018<stdout>:
Fri Jul 20 15:16:55 2018<stdout>:An explanation of the input/output parameters follows:
Fri Jul 20 15:16:55 2018<stdout>:T/V    : Wall time / encoded variant.
Fri Jul 20 15:16:55 2018<stdout>:N      : The order of the coefficient matrix A.
Fri Jul 20 15:16:55 2018<stdout>:NB     : The partitioning blocking factor.
Fri Jul 20 15:16:55 2018<stdout>:P      : The number of process rows.
Fri Jul 20 15:16:55 2018<stdout>:Q      : The number of process columns.
Fri Jul 20 15:16:55 2018<stdout>:Time   : Time in seconds to solve the linear system.
Fri Jul 

In [13]:
# Show the raw HPL output of the gr20 run (last element of the result tuple).
print(results['gr20'][-1].stdout)

Fri Jul 20 15:17:31 2018<stdout>:HPLinpack 2.2  --  High-Performance Linpack benchmark  --   February 24, 2016
Fri Jul 20 15:17:31 2018<stdout>:Written by A. Petitet and R. Clint Whaley,  Innovative Computing Laboratory, UTK
Fri Jul 20 15:17:31 2018<stdout>:Modified by Piotr Luszczek, Innovative Computing Laboratory, UTK
Fri Jul 20 15:17:31 2018<stdout>:Modified by Julien Langou, University of Colorado Denver
Fri Jul 20 15:17:31 2018<stdout>:
Fri Jul 20 15:17:31 2018<stdout>:An explanation of the input/output parameters follows:
Fri Jul 20 15:17:31 2018<stdout>:T/V    : Wall time / encoded variant.
Fri Jul 20 15:17:31 2018<stdout>:N      : The order of the coefficient matrix A.
Fri Jul 20 15:17:31 2018<stdout>:NB     : The partitioning blocking factor.
Fri Jul 20 15:17:31 2018<stdout>:P      : The number of process rows.
Fri Jul 20 15:17:31 2018<stdout>:Q      : The number of process columns.
Fri Jul 20 15:17:31 2018<stdout>:Time   : Time in seconds to solve the linear system.
Fri Jul 

Observing HPL output:
- On `dahu`, the program starts at `15:16:55`, the factorization ends at `15:17:21`, hence an upper bound of 26 seconds. Also, `HPL_pdgesv()` lasts for about 20 seconds. The reported time for the factorization alone is 20.28 seconds.
- On `gr20`, the program starts at `15:17:31`, the factorization ends at `15:17:56`, hence an upper bound of 25 seconds. Also, `HPL_pdgesv()` lasts for about 19 seconds. The reported time for the factorization alone is 39.82 seconds.

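There is clearly a contradiction on the `grvingt` cluster: the time reported by HPL (39.82 seconds) exceeds the 25-second upper bound given by the timestamps. The time (and thus the Gflops) reported by HPL is therefore wrong here.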

## Platform information

In [14]:
# Collect per-node platform details for the dahu job; the returned dict is the
# cell output (keyed by hostname, with kernel, gcc, MPI, CPU and ARP entries).
dahu.platform_information()

[32m[2018-07-20 15:30:08][INFO] [37m[allnodes | /tmp] uname -r[0m
[32m[2018-07-20 15:30:08][INFO] [37m[allnodes | /tmp] cat /proc/version[0m
[32m[2018-07-20 15:30:08][INFO] [37m[allnodes | /tmp] gcc -dumpversion[0m
[32m[2018-07-20 15:30:09][INFO] [37m[allnodes | /tmp] mpirun --version | head -n 1[0m
[32m[2018-07-20 15:30:09][INFO] [37m[allnodes | /tmp] cat /proc/cpuinfo  | grep "name"| uniq | cut -d: -f2 [0m
[32m[2018-07-20 15:30:09][INFO] [37m[allnodes | /tmp] arp -a[0m


{'dahu-27.grenoble.grid5000.fr': {'kernel': '4.9.0-6-amd64',
  'version': 'Linux version 4.9.0-6-amd64 (debian-kernel@lists.debian.org) (gcc version 6.3.0 20170516 (Debian 6.3.0-18+deb9u1) ) #1 SMP Debian 4.9.88-1+deb9u1 (2018-05-07)',
  'gcc': '6.3.0',
  'mpi': 'mpirun (Open MPI) 2.0.2',
  'cpu': 'Intel(R) Xeon(R) Gold 6130 CPU @ 2.10GHz',
  'arp': {'dahu-8.grenoble.grid5000.fr': ['(172.16.20.8) at 3c:fd:fe:54:a6:58 [ether] on enp24s0f0'],
   'dahu-29.grenoble.grid5000.fr': ['(172.16.20.29) at 3c:fd:fe:55:dd:c0 [ether] on enp24s0f0'],
   'kadeploy.grenoble.grid5000.fr': ['(172.16.31.102) at 00:16:3e:c8:38:51 [ether] on enp24s0f0'],
   'fgrenoble.grenoble.grid5000.fr': ['(172.16.31.101) at 00:16:3e:06:dd:61 [ether] on enp24s0f0'],
   'dahu-28.grenoble.grid5000.fr': ['(172.16.20.28) at 3c:fd:fe:54:2a:e0 [ether] on enp24s0f0'],
   'dns.grenoble.grid5000.fr': ['(172.16.31.110) at 00:16:3e:4f:65:bf [ether] on enp24s0f0'],
   'gw.grenoble.grid5000.fr': ['(172.16.31.254) at 00:04:96:1d:09:40

In [15]:
# Collect per-node platform details for the gr20 job; the returned dict is the
# cell output (keyed by hostname, with kernel, gcc, MPI, CPU and ARP entries).
gr20.platform_information()

[32m[2018-07-20 15:30:14][INFO] [37m[allnodes | /tmp] uname -r[0m
[32m[2018-07-20 15:30:14][INFO] [37m[allnodes | /tmp] cat /proc/version[0m
[32m[2018-07-20 15:30:14][INFO] [37m[allnodes | /tmp] gcc -dumpversion[0m
[32m[2018-07-20 15:30:14][INFO] [37m[allnodes | /tmp] mpirun --version | head -n 1[0m
[32m[2018-07-20 15:30:14][INFO] [37m[allnodes | /tmp] cat /proc/cpuinfo  | grep "name"| uniq | cut -d: -f2 [0m
[32m[2018-07-20 15:30:15][INFO] [37m[allnodes | /tmp] arp -a[0m


{'grvingt-5.nancy.grid5000.fr': {'kernel': '4.9.0-6-amd64',
  'version': 'Linux version 4.9.0-6-amd64 (debian-kernel@lists.debian.org) (gcc version 6.3.0 20170516 (Debian 6.3.0-18+deb9u1) ) #1 SMP Debian 4.9.88-1+deb9u1 (2018-05-07)',
  'gcc': '6.3.0',
  'mpi': 'mpirun (Open MPI) 2.0.2',
  'cpu': 'Intel(R) Xeon(R) Gold 6130 CPU @ 2.10GHz',
  'arp': {'fnancy.nancy.grid5000.fr': ['(172.16.79.101) at 00:16:3e:6a:bb:1b [ether] on enp24s0f0'],
   'kadeploy.nancy.grid5000.fr': ['(172.16.79.102) at 00:16:3e:65:d3:82 [ether] on enp24s0f0'],
   'grvingt-7.nancy.grid5000.fr': ['(172.16.76.7) at 3c:fd:fe:57:a7:38 [ether] on enp24s0f0'],
   'grvingt-6.nancy.grid5000.fr': ['(172.16.76.6) at 3c:fd:fe:57:a6:d8 [ether] on enp24s0f0'],
   'sgravillon2.nancy.grid5000.fr': ['(172.16.79.254) at e8:65:49:cc:e6:7f [ether] on enp24s0f0'],
   'grvingt-8.nancy.grid5000.fr': ['(172.16.76.8) at 3c:fd:fe:57:72:98 [ether] on enp24s0f0'],
   'dns.nancy.grid5000.fr': ['(172.16.79.106) at 00:16:3e:30:fe:c0 [ether] on