drm/v3d: Add support for compute shader dispatch.
The compute shader dispatch interface is pretty simple -- just pass in
the regs that userspace has handed us, with no CLs to run.  However,
having no CL to run means that we need to do manual cache flushing of
the L2 after the HW execution completes (for the SSBO, atomic, and
image_load_store writes that are the output of compute shaders).
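
As a rough illustration of the intended userspace flow (not part of this
commit -- the uapi header is added elsewhere in the series, so the
DRM_IOCTL_V3D_SUBMIT_CSD name and the cfg[]/bo_handles/bo_handle_count/
in_sync/out_sync fields below are assumed from the kernel-side usage in
v3d_submit_csd_ioctl()):

#include <stdint.h>
#include <string.h>

#include <xf86drm.h>     /* drmIoctl(), from libdrm */
#include "v3d_drm.h"     /* uapi header from this series; include path varies by setup */

/*
 * Probe for CSD support and submit one compute dispatch.  cfg[] holds the
 * CSD_CURRENT_CFG0..6 register setup produced by the userspace driver; the
 * kernel writes it to the HW without validation, relying on the MMU for
 * isolation.
 */
static int submit_compute(int fd, const uint32_t cfg[7],
                          const uint32_t *bo_handles, uint32_t bo_count,
                          uint32_t in_sync, uint32_t out_sync)
{
        struct drm_v3d_get_param gp = {
                .param = DRM_V3D_PARAM_SUPPORTS_CSD,
        };
        struct drm_v3d_submit_csd csd;

        /* Older kernels return -EINVAL here; pre-4.1 HW reports 0. */
        if (drmIoctl(fd, DRM_IOCTL_V3D_GET_PARAM, &gp) != 0 || !gp.value)
                return -1;

        memset(&csd, 0, sizeof(csd));
        memcpy(csd.cfg, cfg, sizeof(csd.cfg));
        csd.bo_handles = (uintptr_t)bo_handles;  /* GEM handles the job reads/writes */
        csd.bo_handle_count = bo_count;
        csd.in_sync = in_sync;                   /* syncobj to wait on before dispatch */
        csd.out_sync = out_sync;                 /* syncobj signaled after the cache clean */

        return drmIoctl(fd, DRM_IOCTL_V3D_SUBMIT_CSD, &csd);
}

Note that out_sync gets the cache-clean job's done fence, so waiting on it
guarantees the shader's SSBO/image writes have landed in memory.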

This doesn't yet expose the L2 cache's ability to have a region of the
address space not write back to memory (which could be used for
shared_var storage).

So far, the Mesa side has been tested on V3D v4.2 simpenrose (passing
the ES31 tests), and the kernel side on 7278 (which fails the atomic
compswap tests in a way that doesn't reproduce on simpenrose).

Signed-off-by: Eric Anholt <eric@anholt.net>
anholt committed Apr 1, 2019
1 parent 72045f4 commit 58e06f6
Showing 10 changed files with 530 additions and 19 deletions.
22 changes: 22 additions & 0 deletions drivers/gpu/drm/v3d/v3d_debugfs.c
@@ -58,6 +58,17 @@ static const struct v3d_reg_def v3d_core_reg_defs[] = {
REGDEF(V3D_GMP_VIO_ADDR),
};

static const struct v3d_reg_def v3d_csd_reg_defs[] = {
REGDEF(V3D_CSD_STATUS),
REGDEF(V3D_CSD_CURRENT_CFG0),
REGDEF(V3D_CSD_CURRENT_CFG1),
REGDEF(V3D_CSD_CURRENT_CFG2),
REGDEF(V3D_CSD_CURRENT_CFG3),
REGDEF(V3D_CSD_CURRENT_CFG4),
REGDEF(V3D_CSD_CURRENT_CFG5),
REGDEF(V3D_CSD_CURRENT_CFG6),
};

static int v3d_v3d_debugfs_regs(struct seq_file *m, void *unused)
{
struct drm_info_node *node = (struct drm_info_node *)m->private;
@@ -89,6 +100,17 @@ static int v3d_v3d_debugfs_regs(struct seq_file *m, void *unused)
V3D_CORE_READ(core,
v3d_core_reg_defs[i].reg));
}

if (v3d_has_csd(v3d)) {
for (i = 0; i < ARRAY_SIZE(v3d_csd_reg_defs); i++) {
seq_printf(m, "core %d %s (0x%04x): 0x%08x\n",
core,
v3d_csd_reg_defs[i].name,
v3d_csd_reg_defs[i].reg,
V3D_CORE_READ(core,
v3d_csd_reg_defs[i].reg));
}
}
}

return 0;
10 changes: 7 additions & 3 deletions drivers/gpu/drm/v3d/v3d_drv.c
@@ -7,9 +7,9 @@
* This driver supports the Broadcom V3D 3.3 and 4.1 OpenGL ES GPUs.
* For V3D 2.x support, see the VC4 driver.
*
* Currently only single-core rendering using the binner and renderer,
* along with TFU (texture formatting unit) rendering is supported.
* V3D 4.x's CSD (compute shader dispatch) is not yet supported.
* The V3D GPU includes a tiled renderer (composed of bin and render
* pipelines), the TFU (texture formatting unit), and the CSD (compute
* shader dispatch).
*/

#include <linux/clk.h>
@@ -193,6 +193,9 @@ static int v3d_get_param_ioctl(struct drm_device *dev, void *data,
case DRM_V3D_PARAM_SUPPORTS_TFU:
args->value = 1;
return 0;
case DRM_V3D_PARAM_SUPPORTS_CSD:
args->value = v3d_has_csd(v3d);
return 0;
default:
DRM_DEBUG("Unknown parameter %d\n", args->param);
return -EINVAL;
@@ -252,6 +255,7 @@ static const struct drm_ioctl_desc v3d_drm_ioctls[] = {
DRM_IOCTL_DEF_DRV(V3D_GET_PARAM, v3d_get_param_ioctl, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(V3D_GET_BO_OFFSET, v3d_get_bo_offset_ioctl, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(V3D_SUBMIT_TFU, v3d_submit_tfu_ioctl, DRM_RENDER_ALLOW | DRM_AUTH),
DRM_IOCTL_DEF_DRV(V3D_SUBMIT_CSD, v3d_submit_csd_ioctl, DRM_RENDER_ALLOW | DRM_AUTH),
};

static int v3d_dumb_create(struct drm_file *file_priv,
28 changes: 27 additions & 1 deletion drivers/gpu/drm/v3d/v3d_drv.h
@@ -17,9 +17,11 @@ enum v3d_queue {
V3D_BIN,
V3D_RENDER,
V3D_TFU,
V3D_CSD,
V3D_CACHE_CLEAN,
};

#define V3D_MAX_QUEUES (V3D_TFU + 1)
#define V3D_MAX_QUEUES (V3D_CACHE_CLEAN + 1)

struct v3d_queue_state {
struct drm_gpu_scheduler sched;
@@ -76,6 +78,7 @@ struct v3d_dev {
struct v3d_bin_job *bin_job;
struct v3d_render_job *render_job;
struct v3d_tfu_job *tfu_job;
struct v3d_csd_job *csd_job;

struct v3d_queue_state queue[V3D_MAX_QUEUES];

@@ -98,6 +101,12 @@ struct v3d_dev {
*/
struct mutex sched_lock;

/* Lock taken during a cache clean and when initiating an L2
* flush, to keep L2 flushes from interfering with the
* synchronous L2 cleans.
*/
struct mutex cache_clean_lock;

struct {
u32 num_allocated;
u32 pages_allocated;
@@ -110,6 +119,12 @@ to_v3d_dev(struct drm_device *dev)
return (struct v3d_dev *)dev->dev_private;
}

static inline bool
v3d_has_csd(struct v3d_dev *v3d)
{
return v3d->ver >= 41;
}

/* The per-fd struct, which tracks the MMU mappings. */
struct v3d_file_priv {
struct v3d_dev *v3d;
@@ -228,6 +243,14 @@ struct v3d_tfu_job {
struct drm_v3d_submit_tfu args;
};

struct v3d_csd_job {
struct v3d_job base;

u32 timedout_batches;

struct drm_v3d_submit_csd args;
};

/**
* _wait_for - magic (register) wait macro
*
@@ -289,11 +312,14 @@ int v3d_submit_cl_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv);
int v3d_submit_tfu_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv);
int v3d_submit_csd_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv);
int v3d_wait_bo_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv);
void v3d_job_put(struct v3d_job *job);
void v3d_reset(struct v3d_dev *v3d);
void v3d_invalidate_caches(struct v3d_dev *v3d);
void v3d_clean_caches(struct v3d_dev *v3d);

/* v3d_irq.c */
int v3d_irq_init(struct v3d_dev *v3d);
2 changes: 2 additions & 0 deletions drivers/gpu/drm/v3d/v3d_fence.c
@@ -36,6 +36,8 @@ static const char *v3d_fence_get_timeline_name(struct dma_fence *fence)
return "v3d-render";
case V3D_TFU:
return "v3d-tfu";
case V3D_CSD:
return "v3d-csd";
default:
return NULL;
}
155 changes: 150 additions & 5 deletions drivers/gpu/drm/v3d/v3d_gem.c
@@ -162,10 +162,52 @@ v3d_flush_l2t(struct v3d_dev *v3d, int core)
/* While there is a busy bit (V3D_L2TCACTL_L2TFLS), we don't
* need to wait for completion before dispatching the job --
* L2T accesses will be stalled until the flush has completed.
* However, we do need to make sure we don't try to trigger a
* new flush while the L2_CLEAN queue is trying to
* synchronously clean after a job.
*/
mutex_lock(&v3d->cache_clean_lock);
V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL,
V3D_L2TCACTL_L2TFLS |
V3D_SET_FIELD(V3D_L2TCACTL_FLM_FLUSH, V3D_L2TCACTL_FLM));
mutex_unlock(&v3d->cache_clean_lock);
}

/* Cleans texture L1 and L2 cachelines (writing back dirty data).
*
* For cleaning, which happens from the CACHE_CLEAN queue after CSD has
* executed, we need to make sure that the clean is done before
* signaling job completion. So, we synchronously wait before
* returning, and we make sure that L2 invalidates don't happen in the
* meantime to confuse our are-we-done checks.
*/
void
v3d_clean_caches(struct v3d_dev *v3d)
{
struct drm_device *dev = &v3d->drm;
int core = 0;

trace_v3d_cache_clean_begin(dev);

V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL, V3D_L2TCACTL_TMUWCF);
if (wait_for(!(V3D_CORE_READ(core, V3D_CTL_L2TCACTL) &
V3D_L2TCACTL_L2TFLS), 100)) {
DRM_ERROR("Timeout waiting for L1T write combiner flush\n");
}

mutex_lock(&v3d->cache_clean_lock);
V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL,
V3D_L2TCACTL_L2TFLS |
V3D_SET_FIELD(V3D_L2TCACTL_FLM_CLEAN, V3D_L2TCACTL_FLM));

if (wait_for(!(V3D_CORE_READ(core, V3D_CTL_L2TCACTL) &
V3D_L2TCACTL_L2TFLS), 100)) {
DRM_ERROR("Timeout waiting for L2T clean\n");
}

mutex_unlock(&v3d->cache_clean_lock);

trace_v3d_cache_clean_end(dev);
}

/* Invalidates the slice caches. These are read-only caches. */
@@ -429,7 +471,8 @@ static void
v3d_attach_fences_and_unlock_reservation(struct drm_file *file_priv,
struct v3d_job *job,
struct ww_acquire_ctx *acquire_ctx,
u32 out_sync)
u32 out_sync,
struct dma_fence *done_fence)
{
struct drm_syncobj *sync_out;
int i;
@@ -445,7 +488,7 @@ v3d_attach_fences_and_unlock_reservation(struct drm_file *file_priv,
/* Update the return sync object for the job */
sync_out = drm_syncobj_find(file_priv, out_sync);
if (sync_out) {
drm_syncobj_replace_fence(sync_out, job->done_fence);
drm_syncobj_replace_fence(sync_out, done_fence);
drm_syncobj_put(sync_out);
}
}
@@ -541,8 +584,10 @@ v3d_submit_cl_ioctl(struct drm_device *dev, void *data,
mutex_unlock(&v3d->sched_lock);

v3d_attach_fences_and_unlock_reservation(file_priv,
&render->base, &acquire_ctx,
args->out_sync);
&render->base,
&acquire_ctx,
args->out_sync,
render->base.done_fence);

if (bin)
v3d_job_put(&bin->base);
@@ -636,9 +681,107 @@ v3d_submit_tfu_ioctl(struct drm_device *dev, void *data,

v3d_attach_fences_and_unlock_reservation(file_priv,
&job->base, &acquire_ctx,
args->out_sync);
args->out_sync,
job->base.done_fence);

v3d_job_put(&job->base);

return 0;

fail_unreserve:
mutex_unlock(&v3d->sched_lock);
drm_gem_unlock_reservations(job->base.bo, job->base.bo_count,
&acquire_ctx);
fail:
v3d_job_put(&job->base);

return ret;
}

/**
* v3d_submit_csd_ioctl() - Submits a CSD (compute shader dispatch) job to the V3D.
* @dev: DRM device
* @data: ioctl argument
* @file_priv: DRM file for this fd
*
* Userspace provides the register setup for the CSD, which we don't
* need to validate since the CSD is behind the MMU.
*/
int
v3d_submit_csd_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv)
{
struct v3d_dev *v3d = to_v3d_dev(dev);
struct v3d_file_priv *v3d_priv = file_priv->driver_priv;
struct drm_v3d_submit_csd *args = data;
struct v3d_csd_job *job;
struct v3d_job *clean_job;
struct ww_acquire_ctx acquire_ctx;
int ret;

trace_v3d_submit_csd_ioctl(&v3d->drm, args->cfg[5], args->cfg[6]);

if (!v3d_has_csd(v3d)) {
DRM_DEBUG("Attempting CSD submit on non-CSD hardware\n");
return -EINVAL;
}

job = kcalloc(1, sizeof(*job), GFP_KERNEL);
if (!job)
return -ENOMEM;

ret = v3d_job_init(v3d, file_priv, &job->base,
v3d_job_free, args->in_sync);
if (ret) {
kfree(job);
return ret;
}

clean_job = kcalloc(1, sizeof(*clean_job), GFP_KERNEL);
if (!clean_job) {
v3d_job_put(&job->base);
kfree(job);
return -ENOMEM;
}

ret = v3d_job_init(v3d, file_priv, clean_job, v3d_job_free, 0);
if (ret) {
v3d_job_put(&job->base);
kfree(clean_job);
return ret;
}

job->args = *args;

ret = v3d_lookup_bos(dev, file_priv, &job->base,
args->bo_handles, args->bo_handle_count);
if (ret)
goto fail;

ret = v3d_lock_bo_reservations(job->base.bo, job->base.bo_count,
&acquire_ctx);
if (ret)
goto fail;

mutex_lock(&v3d->sched_lock);
ret = v3d_push_job(v3d_priv, &job->base, V3D_CSD);
if (ret)
goto fail_unreserve;

clean_job->in_fence = dma_fence_get(job->base.done_fence);
ret = v3d_push_job(v3d_priv, clean_job, V3D_CACHE_CLEAN);
if (ret)
goto fail_unreserve;
mutex_unlock(&v3d->sched_lock);

v3d_attach_fences_and_unlock_reservation(file_priv,
&job->base,
&acquire_ctx,
args->out_sync,
clean_job->done_fence);

v3d_job_put(&job->base);
v3d_job_put(clean_job);

return 0;

@@ -648,6 +791,7 @@ v3d_submit_tfu_ioctl(struct drm_device *dev, void *data,
&acquire_ctx);
fail:
v3d_job_put(&job->base);
v3d_job_put(clean_job);

return ret;
}
@@ -667,6 +811,7 @@ v3d_gem_init(struct drm_device *dev)
mutex_init(&v3d->bo_lock);
mutex_init(&v3d->reset_lock);
mutex_init(&v3d->sched_lock);
mutex_init(&v3d->cache_clean_lock);

/* Note: We don't allocate address 0. Various bits of HW
* treat 0 as special, such as the occlusion query counters
16 changes: 13 additions & 3 deletions drivers/gpu/drm/v3d/v3d_irq.c
@@ -4,9 +4,9 @@
/**
* DOC: Interrupt management for the V3D engine
*
* When we take a bin, render, or TFU done interrupt, we need to
* signal the fence for that job so that the scheduler can queue up
* the next one and unblock any waiters.
* When we take a bin, render, TFU done, or CSD done interrupt, we
* need to signal the fence for that job so that the scheduler can
* queue up the next one and unblock any waiters.
*
* When we take the binner out of memory interrupt, we need to
* allocate some new memory and pass it to the binner so that the
@@ -20,6 +20,7 @@
#define V3D_CORE_IRQS ((u32)(V3D_INT_OUTOMEM | \
V3D_INT_FLDONE | \
V3D_INT_FRDONE | \
V3D_INT_CSDDONE | \
V3D_INT_GMPV))

#define V3D_HUB_IRQS ((u32)(V3D_HUB_INT_MMU_WRV | \
@@ -112,6 +113,15 @@ v3d_irq(int irq, void *arg)
status = IRQ_HANDLED;
}

if (intsts & V3D_INT_CSDDONE) {
struct v3d_fence *fence =
to_v3d_fence(v3d->csd_job->base.irq_fence);

trace_v3d_csd_irq(&v3d->drm, fence->seqno);
dma_fence_signal(&fence->base);
status = IRQ_HANDLED;
}

/* We shouldn't be triggering these if we have GMP in
* always-allowed mode.
*/
