In [None]:
%load_ext autoreload
%autoreload 2
from notebook import *
from CSE142L.notebook import *
from cfiddle import *
%xmode minimal
# If you get a message about NUMEXPR_MAX_THREADS being set incorrectly, don't worry.  It's not a problem.

#KEY include Namebox.ipynb

<div style=" font-size: 300% !important;
    margin-top: 1.5em;
    margin-bottom: 1.5em;
    font-weight: bold;
    line-height: 1.0;
    text-align:center;">Lab 5: Parallelism -- Demos</div>


In [None]:
# Submit a `cse142` job for the `parallel` lab whose only command is `hostname`
# (presumably a quick check that job submission works -- confirm).
# NOTE(review): `--take NOTHING` presumably means no files are collected from the job -- confirm against the cse142 CLI.
!cse142 job run --lab parallel --take NOTHING "hostname"


In [None]:
# Build a minimal C function, `foo`, and keep the build result in `t`.
# `foo` sets a local `i` to 0, tests it, and returns either way, so it does no useful work.
t = build(code(r"""
void foo() {
    int i = 0;
    if (i)
        return;
}
"""))

In [None]:
# Call `foo` from the build held in `t` over and over.
# The loop has no exit condition and every `Exception` raised by `run` is discarded,
# so this cell only stops when the kernel is interrupted (KeyboardInterrupt is not an
# `Exception` subclass, so it still propagates).
# NOTE(review): this cell blocks "Restart & Run All" and hides failures -- confirm it is
# meant to stay in the notebook (it looks like a stress/debug loop).
while True:
    try:
        run(t, "foo")
    except Exception:
        pass

# Branch Prediction

In [None]:
#KEY mutable
!make clean
t = fiddle("branch.cpp", function="branchy", opt="-O3",
code=r"""
#include"function_map.hpp"
#include"archlab.hpp"
#include<cstdint>

extern "C"
uint64_t __attribute__((noinline)) go(uint64_t * data, uint64_t size, uint64_t threshold) {
    uint64_t sum = 0;
    for (unsigned i = 0; i < size; ++i) {
        if (data[i] >= threshold){
            sum++;
            data[i]--;
        } else {
            data[i]++;
        }
    }
    return sum;
}

extern "C"
uint64_t* branchy(uint64_t threads, uint64_t * data, uint64_t size, uint64_t arg1, uint64_t arg2, uint64_t arg3) {

    uint64_t seed = 0xDEADBEEF;
    uint64_t sum = 0;
    for (unsigned i = 0; i < size; ++i)
        data[i] = fast_rand(&seed);

    if(arg1) {
        std::sort(data, data + size);
    }
   
    for(unsigned k = 0; k < arg2; k++){
        uint64_t threshold = fast_rand(&seed);
        sum += go(data, size, threshold);
    }
    return data + sum;
}
FUNCTION(one_array_2arg, branchy);
""", run=["perf_count"], 
           cmdline=f"--size 10000000 --arg1 1 0 --arg2 100", 
           perf_cmdline="--stat-set BPred.cfg --MHz 3500")



In [None]:
# Inspect the compiled `go` symbol in build/branch.so.
# NOTE(review): `do_cfg` presumably renders that function's control-flow graph -- confirm.
do_cfg("build/branch.so", symbol="go")

In [None]:
# Load the measurements written by the branch-prediction run.
df = render_csv("build/branch.csv")

# Turn the numeric `arg1` flag into a readable label for tables and plots.
df["sort"] = df["arg1"].map(lambda was_sorted: "sorted" if was_sorted else "unsorted")

# Table of the headline metrics, then one bar chart per metric, grouped by input order.
display(df[["sort", "IC", "CPI", "CT", "ET", "bpred_miss_rate"]])
plotPEBar(
    df=df,
    what=[
        ("sort", "CPI"),
        ("sort", "ET"),
        ("sort", "bpred_miss_rate"),
    ],
)


# OpenMP Assembly

In [None]:
# Show the part of matexp_solution.hpp selected by the "//START" and "//END" markers
# (presumably the region between them -- confirm against render_code).
render_code("matexp_solution.hpp", show=("//START", "//END"))

In [None]:
# Run the Makefile's `clean` target, then build the assembly listing (build/matexp.s)
# and the benchmark executable (matexp.exe).
!make clean
!make build/matexp.s
!make matexp.exe

In [None]:
# Demangle the C++ symbol names in the assembly listing and print the lines that start
# with `void copy_matrix`, i.e. the demangled names of the copy_matrix functions.
!c++filt < build/matexp.s | grep '^void copy_matrix'

In [None]:
# Show the assembly (GNU assembler syntax) for the `copy_matrix<unsigned long>` function itself.
render_code("build/matexp.s", show="void copy_matrix<unsigned long>(tensor_t<unsigned long>&, tensor_t<unsigned long> const&)", lang="gas")

In [None]:
# Show the assembly for the `[clone ._omp_fn.0]` variant of `copy_matrix<unsigned long>`.
# NOTE(review): presumably this is the function the compiler outlines for the OpenMP
# parallel region -- confirm.
render_code("build/matexp.s", show="void copy_matrix<unsigned long>(tensor_t<unsigned long>&, tensor_t<unsigned long> const&) [clone ._omp_fn.0]", lang="gas")

# NUMA

This demo won't run for students. It needs to run directly on a bare-metal machine.

In [None]:
#KEY delete cell
#KEY mutable
# Build the NUMA demo (compiled with OPTIMIZE="-O3") and keep the build in `numa_demo`.
#
# The embedded C++ does the following:
#   * `go(id, count)` takes and releases the global mutex `count` times; inside the
#     critical section it compares the volatile global `shared` with its local `last`.
#   * `numa_demo(count, core_A, core_B)` resets `shared`, starts a second thread running
#     `go`, binds that thread to `core_A` and the calling thread to `core_B`, runs `go`
#     on the calling thread too, and joins -- all between start_measurement() and
#     end_measurement().
#
# NOTE(review): `shared` is reset to 0 and each thread's `last` starts at 0, so the
# `shared != last` test never succeeds and the increment never runs; only the lock
# handoff is exercised. Confirm whether that is intended.
# NOTE(review): `go`'s `id` parameter is unused, and the uint64_t `count` is narrowed to `int`.
numa_demo = build(code(r"""
#include"cfiddle.hpp"
#include<cstdint>
#include<iostream>
#include<thread>
#include<mutex>
#include"threads.hpp"
#include"pthread.h"

std::mutex lock;
volatile int shared = 0;
void go(uint64_t id,int count) {
    int last = 0;
    for(int i= 0; i < count; i++){
        lock.lock();
        if (shared != last) {
            shared++;
            last = shared;
        }
        lock.unlock();
    }
}

extern "C"
void numa_demo(uint64_t count, uint64_t core_A, uint64_t core_B) {
    shared = 0;
    start_measurement();
    std::thread other (go, 1, count);
    bind_to_core(other, core_A);

    bind_to_core(pthread_self(), core_B);
    go(0, count);
    other.join();
    end_measurement();
}
"""), arg_map(OPTIMIZE="-O3"))


In [None]:
# Run the NUMA demo with 1,000,000 iterations per thread. `core_A` is fixed at 0 while
# `core_B` is given the values 0..15 (presumably one run per value -- confirm arg_map semantics).
# NOTE(review): `local_execution()` presumably forces the run onto this machine instead
# of a remote runner -- confirm.
with local_execution():
    numa_data = run(numa_demo, "numa_demo", arg_map(count=1000000, core_A=0, core_B=range(0,16)))

In [None]:
# Bar chart of ET against the core the second participant was bound to (`core_B`).
plotPEBar(df=numa_data.as_df(), what=[("core_B", "ET")])

# Lab Performance

## Our Machine

In [None]:
# Start authentication for the given account.
# NOTE(review): the account address is hardcoded; anyone else running this notebook must
# replace it with their own -- consider reading it from the environment instead.
login("swanson@eng.ucsd.edu")

In [None]:
# Supply the authentication token. The string is empty here.
# NOTE(review): presumably the token is pasted in at run time -- never commit a real token.
token("")

In [None]:
# Run the Makefile's `clean` target and rebuild the benchmark executable.
!make clean
!make matexp.exe

In [None]:
# Submit matexp.exe as a `cse142` job: run `bench_solution` with 1, 2, 3, 4, 5, 6 and 12
# threads, all of --p1..--p5 set to 1, and write the statistics to mat_mul.csv.
# NOTE(review): this uses `--lab caches2` while the earlier job in this notebook uses
# `--lab parallel` -- confirm which lab name is correct here.
!cse142 job run --lab caches2 "./matexp.exe --MHz 3500 --stat-set  ./L1.cfg --stats mat_mul.csv  --thread 1 2 3 4 5 6 12 --function bench_solution  --p1 1 --p2 1 --p3 1  --p4 1 --p5 1"

In [None]:
# Load the statistics for the thread sweep and tag each row with its workload,
# written as "<power>-<size>".
mat_mul = render_csv("mat_mul.csv")
mat_mul["label"] = mat_mul["power"].astype(str) + "-" + mat_mul["size"].astype(str)

# One independent copy of the rows for each of the three workloads.
big, medium, small = (
    mat_mul[mat_mul["label"] == workload].copy()
    for workload in ("2-600", "25-350", "320-120")
)

# Speedup is relative to the first row of each workload.
for runs in (small, medium, big):
    runs["speedup"] = runs.iloc[0]["ET"] / runs["ET"]

# Speedup versus thread count, largest workload first.
plotPE(df=big, lines=True, what=[("thread", "speedup")])
plotPE(df=medium, lines=True, what=[("thread", "speedup")])
plotPE(df=small, lines=True, what=[("thread", "speedup")])

## Big Machine

### Just run the same code

In [None]:
#KEY mutable
# Rebuild and run the benchmark directly on this machine (no job submission):
# `bench_solution` with 1..24 and 48 threads, all of --p1..--p5 set to 1,
# statistics written to mat_mul.csv.
!make clean
!make matexp.exe
!./matexp.exe --MHz 3500 --stat-set  ./L1.cfg --stats mat_mul.csv  --thread 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 48 --function bench_solution  --p1 1 --p2 1 --p3 1  --p4 1 --p5 1

In [None]:
# Read the statistics from mat_mul.csv and label every row "<power>-<size>".
mat_mul = render_csv("mat_mul.csv")
mat_mul["label"] = mat_mul["power"].astype(str) + "-" + mat_mul["size"].astype(str)

# Split the rows into the three workloads, copying so each frame can be modified freely.
big, medium, small = (
    mat_mul[mat_mul["label"] == workload].copy()
    for workload in ("2-600", "25-350", "320-120")
)

# Each workload's first row is its speedup baseline.
for runs in (small, medium, big):
    runs["speedup"] = runs.iloc[0]["ET"] / runs["ET"]

# Plot speedup against thread count for each workload.
plotPE(df=big, lines=True, what=[("thread", "speedup")])
plotPE(df=medium, lines=True, what=[("thread", "speedup")])
plotPE(df=small, lines=True, what=[("thread", "speedup")])

### Break up the outer loop into smaller pieces

In [None]:
 
# Run `bench_solution` with 1..24 and 48 threads and --p2 set to 13 (every other --pN is 1);
# statistics go to mat_mul3.csv.
!./matexp.exe --MHz 3500 --stat-set  ./L1.cfg --stats mat_mul3.csv  --thread 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 48 --function bench_solution  --p1 1 --p2 13 --p3 1  --p4 1 --p5 1

In [None]:
# Read the statistics from mat_mul3.csv and label every row "<power>-<size>".
mat_mul = render_csv("mat_mul3.csv")
mat_mul["label"] = mat_mul["power"].astype(str) + "-" + mat_mul["size"].astype(str)

# Independent copies of the rows for the three workloads.
big, medium, small = (
    mat_mul[mat_mul["label"] == workload].copy()
    for workload in ("2-600", "25-350", "320-120")
)

# Speedup relative to the first row of each workload.
for runs in (small, medium, big):
    runs["speedup"] = runs.iloc[0]["ET"] / runs["ET"]

# Speedup versus thread count for each workload.
plotPE(df=big, lines=True, what=[("thread", "speedup")])
plotPE(df=medium, lines=True, what=[("thread", "speedup")])
plotPE(df=small, lines=True, what=[("thread", "speedup")])

## STOP HERE


In [None]:
#KEY mutable
# The clean step is commented out here, so this rebuild reuses existing build products.
#!make clean
!make matexp.exe
# Sweep --p2 over 1..12 and the even values 14..24 with the thread count fixed at 6;
# statistics go to mat_mul_p2.csv.
!./matexp.exe --MHz 3500 --stat-set  ./L1.cfg --stats mat_mul_p2.csv  --thread 6 --function bench_solution  --p1 1 --p2 1 2 3 4 5 6 7 8 9 10 11 12 14 16 18 20 22 24 --p3 1  --p4 1 --p5 1

In [None]:
# Read the statistics for the p2 sweep and label every row "<power>-<size>".
mat_mul = render_csv("mat_mul_p2.csv")
mat_mul["label"] = mat_mul["power"].astype(str) + "-" + mat_mul["size"].astype(str)

# Independent copies of the rows for the three workloads.
big, medium, small = (
    mat_mul[mat_mul["label"] == workload].copy()
    for workload in ("2-600", "25-350", "320-120")
)

# Speedup relative to the first row of each workload.
for runs in (small, medium, big):
    runs["speedup"] = runs.iloc[0]["ET"] / runs["ET"]

# Speedup as a function of p2 for each workload.
plotPE(df=big, lines=True, what=[("p2", "speedup")])
plotPE(df=medium, lines=True, what=[("p2", "speedup")])
plotPE(df=small, lines=True, what=[("p2", "speedup")])


# Move ILP demos here?