# Reordering matrix option

### Description 
Test of a new 1.9.1 option: reordering the matrix with an RCM (reverse Cuthill-McKee) algorithm, in order to reduce the matrix bandwidth and possibly improve computational intensity (in sequential runs) and reduce MPI communications (in parallel runs).

The test case is a 3D flow into a channel with the P0P1 discretization (pressure unknowns on cells AND nodes).

Then, the test case is run on the P0 discretization (pressure unknowns on cells only) and P1 discretization (pressure unknowns on nodes only).

The 3 test cases use a GC/SSOR solver (relaxation 1.6) from the PETSc library.

In [None]:
from trustutils import run
run.TRUST_parameters()
run.introduction('Pierre LEDAC (CEA/DES/ISAS/DM2S/STMF/LGLS)')

# Dataset generation settings
NP = 1          # parallel (PAR_*) variants are only generated when NP > 1
MESH = "7 7 4"  # 34 560 cells
# Each entry: (case/directory name, plot label, text substituted for _reorder_matrix_)
cases = [
    ("no_ordering", "No ordering for sbaij matrix (default)", "reorder_matrix 0"),
    ("no_ordering_aij", "No ordering for aij matrix", "aij reorder_matrix 0"),
    ("ordering", "Ordering for aij matrix", "aij reorder_matrix 1"),
]

run.reset()
run.initCaseSuite()
for case, label, syntax in cases:
    # Sequential case: dedicated directory, private copy of base.data, link to post_run
    create_cmd = "cas=%s;mkdir -p $cas;cd $cas;cp ../base.data $cas.data;ln -s -f ../post_run ." % case
    run.executeCommand(create_cmd, verbose=False)
    cas = run.addCase(case, "%s.data" % case)
    # Fill in the two placeholders of the copied dataset
    cas.substitute("_reorder_matrix_", syntax)
    cas.substitute("_MESH_", MESH)
    if NP <= 1:
        continue
    # Parallel variant: make_PAR.data is run in the case directory, then PAR_<case>.data is registered on NP cores
    split_cmd = "cas=%s;cd $cas;make_PAR.data $cas %s;exit 0" % (case, NP)
    run.executeCommand(split_cmd, verbose=False)
    cas = run.addCase(case, "PAR_%s.data" % case, NP)

run.printCases()

In [None]:
# Launch the calculations (presumably every case registered through run.addCase() in the setup cell).
run.runCases()

# Matrix sparsity (P0P1)

In [None]:
def plot_matrix(str):
    """Display the matrix sparsity picture of every case side by side.

    For each entry of the global ``cases`` list, the ``<case>.out`` file of the
    build directory is scanned for lines containing "PETSc" to pick up the row
    count (7th space-separated field, last matching line wins), and
    ``matrix/matrix_0.ppm`` is shown in its own subplot.

    Args:
        str: Discretization name written in each subplot title (e.g. "P0P1").
            The name shadows the builtin ``str``; it is kept unchanged so that
            existing callers keep working.
    """
    from PIL import Image
    import matplotlib.pyplot as plt

    # squeeze=False always returns a 2D array of axes, so indexing also works
    # when `cases` holds a single entry.
    fig, axes = plt.subplots(1, len(cases), squeeze=False)
    for i, entry in enumerate(cases):
        case, label = entry[0], entry[1]
        case_dir = run.BUILD_DIRECTORY+"/"+case

        # Reset for every case: avoids an unbound name, or silently reusing the
        # previous case's value, when no usable "PETSc" line is found.
        rows = "?"
        with open(case_dir+"/"+case+".out") as out_file:
            for line in out_file:
                if "PETSc" not in line:
                    continue
                fields = line.split(" ")
                if len(fields) > 6:
                    rows = fields[6].strip()

        ppm = Image.open(case_dir+"/matrix/matrix_0.ppm")
        run.saveFileAccumulator(case+"/matrix/matrix_0.ppm")
        ax = axes[0][i]
        ax.axis('off')
        ax.set_title(label+"\n"+str+":"+rows+" rows")
        ax.imshow(ppm)
    fig.set_size_inches(18.5, 10.5)
plot_matrix("P0P1")

# Convergence (P0P1)

In [None]:
from trustutils import plot
    
# One curve per case: columns 0 and 1 of <case>.res are plotted as x and y.
# The title typo "fist time step" is corrected to "first time step".
a = plot.Graph("Relative residual ||Ax(it)-b||/||Ax(0)-b|| during the first time step:","",1,1,[10,5])

for case,label,syntax in cases:
    cols = plot.loadText(case+"/%s.res" % case)
    a.add(cols[0],cols[1],label="%s" % label, marker='o')
    if NP>1:
        # Parallel run of the same case, read from PAR_<case>.res
        cols = plot.loadText(case+"/PAR_%s.res" % case)
        a.add(cols[0],cols[1],label="%s (%s MPI cores)" % (label,NP), marker='o')

a.label("Iteration","Residual")
# Logarithmic y axis for the residual
a.subplot.set_yscale('log')

Ordering the matrix may slightly improve the convergence.

# Memory used (P0P1)

In [None]:
def plot_memory():
    """Plot the "Max RAM per core" curve of every case on a single graph.

    Columns 0 and 1 of ``<case>/<case>.ram`` are drawn for each entry of the
    global ``cases`` list; when ``NP`` is greater than 1 the matching
    ``PAR_<case>.ram`` curve is added as well.
    """
    graph = plot.Graph("Max RAM per core used during calculation:","",1,1,[10,5])
    for name, legend, _syntax in cases:
        sequential = plot.loadText(name + "/%s.ram" % name)
        graph.add(sequential[0], sequential[1], label="%s" % legend)
        if NP <= 1:
            continue
        # Parallel counterpart of the same case
        parallel = plot.loadText(name + "/PAR_%s.ram" % name)
        graph.add(parallel[0], parallel[1], label="%s (%s MPI cores)" % (legend, NP), marker='-o')
    graph.label("Sample","RAM [MB]")
plot_memory()

# CPU time (P0P1)

In [None]:
def cpu_time():
    """Plot the pressure-solve CPU time of every case on a single graph.

    Columns 0 and 1 of ``<case>/<case>.cpu`` are drawn for each entry of the
    global ``cases`` list; when ``NP`` is greater than 1 the matching
    ``PAR_<case>.cpu`` curve is added as well.
    """
    graph = plot.Graph("CPU time of pressure solve during calculation:","",1,1,[10,5])
    for name, legend, _syntax in cases:
        sequential = plot.loadText(name + "/%s.cpu" % name)
        graph.add(sequential[0], sequential[1], label="%s" % legend)
        if NP <= 1:
            continue
        # Parallel counterpart of the same case
        parallel = plot.loadText(name + "/PAR_%s.cpu" % name)
        graph.add(parallel[0], parallel[1], label="%s (%s MPI cores)" % (legend, NP), marker='-o')
    graph.label("Time step","CPU [s]")
cpu_time()

The matrix ordering strategy brings no gain on this P0P1 matrix (the default in VEF).

# Sparsity and run time for a P0 matrix

In [None]:
# Switch every case to the P0 discretization, rerun, and show the new matrices.
for cas in run.getCases():
    # Appends "Lire dis { P0 }" after the "VEFPreP1B dis" text.
    # NOTE: the P1 cell further down matches the exact text "dis { P0 }" inserted here.
    cas.substitute("VEFPreP1B dis","VEFPreP1B dis Lire dis { P0 }")
run.runCases()
plot_matrix("P0")

In [None]:
# Pressure-solve CPU time graph, redrawn after the P0 runs (cpu_time reads the <case>.cpu files).
cpu_time()

# Sparsity and run time for a P1 matrix

In [None]:
# Switch every case from P0 to the P1 discretization, rerun, and show the new matrices.
for cas in run.getCases():
    # Relies on the "dis { P0 }" text put in place by the P0 cell, so that cell must have run first.
    cas.substitute("dis { P0 }","dis { P1 }")
run.runCases()
plot_matrix("P1")

In [None]:
# Pressure-solve CPU time graph, redrawn after the P1 runs (cpu_time reads the <case>.cpu files).
cpu_time()

# Conclusion
Ordering a matrix of typical size for TRUST (~30 000 rows/MPI rank) doesn't increase performance. A gain appears only when the matrix size increases (~300 000 rows), probably because the matrix no longer fits completely in the L3 cache.