## Motivating example
We'll stick with Poisson from the previous notebook, but this time we'll implement more in-depth `Matvec`s.

Since the `gpu_matvec` and `cpu_matvec` routines just need to return a callable that takes a single argument (the vector to multiply), we can use any backend we'd like!

In [None]:
import numpy as np

import pyopencl as cl
import pyopencl.array as cl_array

from discr_tools.discretization import Discretization
from discr_tools.matvecs import MatvecBase

class PyOpenCLMatvec(MatvecBase):
    """Exercise scaffold: a matvec whose GPU path goes through PyOpenCL.

    The ``(...)`` placeholder inside ``gpu_matvec`` is intentionally left for
    the reader to fill in, so the GPU path is not runnable as written.
    """

    def cpu_matvec(self):
        """CPU path; not implemented in this exercise (returns ``None``)."""
        pass

    def gpu_matvec(self):
        """Return a callable that takes one vector ``u`` and returns the result."""
        # copy the vector u to the device and multiply it by 2
        # return the result as a numpy array (hint: use result.get())
        def matvec(u):
            # NOTE(review): self._queue is presumably populated by MatvecBase
            # from the ``queue`` keyword argument -- confirm against MatvecBase.
            u_d = cl_array.to_device(self._queue, u)
            # Placeholder: replace ``...`` with the device-side expression
            # built from u_d before calling .get().
            return (...).get()
        
        return matvec


# Create an OpenCL context and a command queue bound to it; the queue is
# handed to the matvec below.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# create the discretization and matvec (hint: you need to pass queue to the queue kwarg)
# ``discr = ...`` is an exercise placeholder: it must be replaced by a real
# discretization object before the lines below can run.
discr = ...
mv = PyOpenCLMatvec(discr, queue=queue)
# Apply the matvec to a vector of ones shaped like the first component of the
# mapped element array, after passing it through discr.scatter.
mv(discr.scatter(np.ones_like(discr.mapped_elements[0])))

## Other backends

This will be a quick walkthrough of a simpler, less optimized version of the `PoissonMatvec` GPU implementation in `discr_tools.matvecs`.

We'll use Loopy as a backend, which ultimately uses PyOpenCL to run on a GPU.

We define the kernel as follows:

In [None]:
import loopy as lp

# Discretization parameters, passed positionally to Discretization below.
# NOTE(review): a and b are presumably the domain interval endpoints and
# nelts_1d the number of elements per direction -- confirm against
# Discretization's signature.
order = 3
a, b = -1, 1
dim = 2
nelts_1d = 16

discr = Discretization(order, a, b, dim, nelts_1d)
# mapped_elements has three axes; the last two are used as the element count
# and the per-element point count for the kernel argument shapes.
_, nelts, npts = discr.mapped_elements.shape

# Loopy kernel computing lap_u[e, i] for every element e and local point i.
#   - First string: the loop domain; e ranges over [0, ne), and i, j, l each
#     range over [0, np).
#   - Second string: substitution rules (``:=``) followed by the single
#     assignment to lap_u.  ur/us apply dr/ds to u, ux/uy combine them with
#     the entries of g, and uxx/uyy apply dr/ds again with swapped indices
#     (dr[j, i], ds[j, i]) while reducing over j.
#   - Third argument: the array arguments and their shapes; lap_u is the
#     only output.
knl = lp.make_kernel(
    "{[e,i,j,l] : 0 <= e < ne and 0 <= i, j, l < np }",
    """
    ur(e, i) := sum([l], dr[i,l] * u[e,l])
    us(e, i) := sum([l], ds[i,l] * u[e,l])

    drdx(e, i) := g[0,0,e,i]
    dsdx(e, i) := g[0,1,e,i]
    
    drdy(e, i) := g[1,0,e,i]
    dsdy(e, i) := g[1,1,e,i]

    ux(e, i) := ur(e, i) * drdx(e, i) + us(e, i) * dsdx(e, i)
    uy(e, i) := ur(e, i) * drdy(e, i) + us(e, i) * dsdy(e, i)

    uxx(e, i) := sum([j], dr[j, i] * ux(e, j))
    uyy(e, i) := sum([j], ds[j, i] * uy(e, j))

    lap_u[e, i] = uxx(e, i) + uyy(e, i)
    """,
    [
        # Input field, one row per element.
        lp.GlobalArg("u", shape=(nelts, npts)),
        # Per-point dim x dim factors read by the drdx/dsdx/drdy/dsdy rules.
        lp.GlobalArg("g", shape=(dim, dim, nelts, npts)),
        # Square matrices applied along the local point index.
        lp.GlobalArg("dr", shape=(npts, npts)),
        lp.GlobalArg("ds", shape=(npts, npts)),
        # Result array written by the kernel.
        lp.GlobalArg("lap_u", shape=(nelts, npts), is_output=True)
        
    ]
)

As for the ingredients:
1. We need a domain for each "iname" (i.e. loop variable name); this is specified using "ISL syntax"
    - The Integer Set Library (ISL) is a framework used for implementing the polyhedral model
2. Each line possessing a `:=` is a substitution rule
3. Anything being accessed with brackets `[]` is assumed to be an array
4. Reductions are supported by Loopy, and the keyword `sum` reduces over a particular iname

Now we need to fix some parameters and specify how to parallelize; 
then we can use this kernel in our `Matvec` implementation!

In [None]:
# Element count and per-element point count are the trailing two axes of the
# mapped element array.
_, nelts, npts = discr.mapped_elements.shape

# Pin the symbolic bounds ``ne`` and ``np`` to concrete sizes, then tag the
# element iname "e" as "g.0" and the local point iname "i" as "l.0".
knl = lp.tag_inames(
    lp.fix_parameters(knl, ne=nelts, np=npts),
    {"e": "g.0", "i": "l.0"},
)

In [None]:
from discr_tools.geometry import jacobian_determinant, inverse_jacobian_t
import sympy as sp
import scipy.sparse.linalg as spla


class MyGPUPoissonMatvec(MatvecBase):
    """Poisson matvec whose GPU path launches the module-level Loopy kernel ``knl``."""

    def gpu_matvec(self):
        """Assemble the kernel inputs once and return the matvec callable.

        Returns:
            A function ``matvec(u)`` that runs ``knl`` on ``self._queue`` with
            the precomputed ``dr``, ``ds`` and ``g`` arrays and returns the
            kernel's first output.
        """
        from discr_tools.geometry import inverse_jacobian_t, jacobian_determinant

        discr = self.discr

        # Two-dimensional derivative matrices, built as Kronecker products of
        # the one-dimensional operator with the identity.
        d = discr.operators.diff_operator
        eye = np.eye(discr.order + 1)
        dr = np.kron(d, eye)
        ds = np.kron(eye, d)

        det_j = jacobian_determinant(discr.mapped_elements, discr.basis_cls)
        inv_j_t = inverse_jacobian_t(discr.mapped_elements, discr.basis_cls)

        wts = discr.basis_cls.weights
        wts_2d = np.kron(wts, wts)

        # g[i,j,e,p] = sum_k inv_j_t[k,i,e,p] * inv_j_t[k,j,e,p]
        #                    * det_j[e,p] * wts_2d[p]
        # The copy is taken once here rather than on every matvec call: the
        # kernel only reads g, so a single copy can be reused.
        # NOTE(review): the copy presumably exists to hand the kernel a
        # C-contiguous buffer -- confirm.
        g = np.einsum(
            "kiep,kjep,ep,p->ijep", inv_j_t, inv_j_t, det_j, wts_2d
        ).copy()

        def matvec(u):
            # Launch the kernel defined at module level; the returned event
            # is not needed here.
            _evt, out = knl(self._queue, u=u, dr=dr, ds=ds, g=g)
            return out[0]

        return matvec

    def cpu_matvec(self):
        """CPU path; not implemented here (returns ``None``)."""
        pass


# Manufactured-solution check: choose u as a product of sines, build the
# matching right-hand side, solve with CG through the GPU matvec, and report
# the relative weighted L2 error.
x, y = discr.mapped_elements
dim, nelts, npts = discr.mapped_elements.shape

x_sp = sp.symbols('x0 x1')

# u(x0, x1) = sin(pi*x0) * sin(pi*x1)
u_expr = 1.
for i in range(dim):
    u_expr *= sp.sin(sp.pi*x_sp[i])
u_lambda = sp.lambdify(x_sp, u_expr)

# Right-hand side is -laplacian(u): sum the second derivative with respect to
# EACH coordinate.  The original differentiated with respect to x_sp[0] on
# every iteration, which only gave the right answer because this particular
# u is symmetric in its variables.
lap_u_expr = 0.
for i in range(dim):
    lap_u_expr += u_expr.diff(x_sp[i], 2)
lap_u_expr = -lap_u_expr
lap_u_lambda = sp.lambdify(x_sp, lap_u_expr)

rhs = lap_u_lambda(x, y)
det_j = jacobian_determinant(discr.mapped_elements, discr.basis_cls)

wts = discr.basis_cls.weights
wts_2d = np.kron(wts, wts)

# Load vector: rhs scaled pointwise by det_j and the tensor-product weights,
# then passed through the discretization's mask and scatter.
f = discr.scatter(discr.apply_mask(det_j * rhs * wts_2d))

mv = MyGPUPoissonMatvec(discr, queue=queue)
# f.shape*2 repeats the 1-tuple, giving the square operator shape (n, n).
lin_op = spla.LinearOperator(f.shape*2, mv)

out, _ = spla.cg(lin_op, f)
out = discr.gather(out)

u_true = u_lambda(x, y)

# Relative error in the L2 norm weighted by det_j and wts_2d.
abs_err = abs(out - u_true)
l2_abs = np.sqrt(np.sum(abs_err**2 * det_j * wts_2d))
l2_u = np.sqrt(np.sum(u_true**2 * det_j * wts_2d))
l2_err = l2_abs / l2_u
print(f"Error: {l2_err}")