Skip to content

Commit

Permalink
Merge pull request #688 from beehive-lab/dev/misc/juan
Browse files Browse the repository at this point in the history
Documentation + Thread Scheduler updated
  • Loading branch information
jjfumero committed Feb 16, 2021
2 parents 7a91ce2 + d7e755c commit e8c3161
Show file tree
Hide file tree
Showing 31 changed files with 264 additions and 123 deletions.
2 changes: 2 additions & 0 deletions assembly/src/docs/TORNADOVM_TESTED_DRIVERS.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@

The following drivers have been tested on Linux >= CentOS 7.3

* 21.06.18993: OK ( OpenCL 3.0 )
* 21.05.18936: OK ( OpenCL 3.0 )
* 21.04.18912: OK ( OpenCL 3.0 )
* 21.03.18857: OK ( OpenCL 3.0 )
* 21.02.18820: OK ( OpenCL 3.0 )
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,15 @@
import uk.ac.manchester.tornado.runtime.common.TornadoOptions;
import uk.ac.manchester.tornado.runtime.tasks.meta.TaskMetaData;

import java.util.Arrays;

public abstract class OCLKernelScheduler {

protected final OCLDeviceContext deviceContext;

protected double min;
protected double max;

public static final String WARNING_THREAD_LOCAL = "[TornadoVM OCL] Warning: TornadoVM changed the user-defined local size to null. Now, the OpenCL driver will select the best configuration.";

/**
 * Creates a kernel scheduler bound to the given OpenCL device context.
 *
 * @param context
 *            device context stored in {@code deviceContext} for later use by
 *            this scheduler.
 */
OCLKernelScheduler(final OCLDeviceContext context) {
    this.deviceContext = context;
}
Expand Down Expand Up @@ -69,35 +69,51 @@ public int launch(final OCLKernel kernel, final TaskMetaData meta, final int[] w
long[] global = grid.getGlobalWork();
long[] offset = grid.getGlobalOffset();
long[] local = grid.getLocalWork();
if (local != null) {
OCLGridInfo gridInfo = new OCLGridInfo(deviceContext.getDevice(), local);
boolean checkedDimensions = gridInfo.checkGridDimensions();
if (!checkedDimensions) {
System.out.println("Warning: TornadoVM changed the user-defined local size to null. Now, the OpenCL driver will select the best configuration.");
local = null;
}
}
if (meta.isDebug()) {
meta.printThreadDims(local, null);
}
return deviceContext.enqueueNDRangeKernel(kernel, grid.dimension(), offset, global, local, waitEvents);
} else {
return deviceContext.enqueueNDRangeKernel(kernel, meta.getDims(), meta.getGlobalOffset(), meta.getGlobalWork(), (meta.shouldUseOpenCLDriverScheduling() ? null : meta.getLocalWork()),
waitEvents);
}
}

public int submit(final OCLKernel kernel, final TaskMetaData meta, final int[] waitEvents, long batchThreads) {
/**
 * Validates the user-defined local work group of the task's worker grid
 * against the target device.
 *
 * <p>
 * The worker grid is looked up through {@code meta.getWorkerGrid(meta.getId())}.
 * When the grid has no local work size, nothing is done. Otherwise the local
 * size is checked with {@link OCLGridInfo#checkGridDimensions()}; if that check
 * fails, {@link #WARNING_THREAD_LOCAL} is printed to standard output and both
 * the local work size and the number of work groups of the grid are reset to
 * null. As the warning message states, the OpenCL driver then selects the
 * configuration, so the resulting local work sizes depend on each OpenCL
 * driver.
 *
 * @param meta
 *            TaskMetaData that provides the task identifier and its worker
 *            grid.
 */
private void checkLocalWorkGroupFitsOnDevice(final TaskMetaData meta) {
    final WorkerGrid workerGrid = meta.getWorkerGrid(meta.getId());
    final long[] localWork = workerGrid.getLocalWork();
    if (localWork == null) {
        // No user-defined local size: nothing to validate.
        return;
    }

    final OCLGridInfo deviceGridInfo = new OCLGridInfo(deviceContext.getDevice(), localWork);
    if (deviceGridInfo.checkGridDimensions()) {
        // The requested local size passed the device check; keep it as is.
        return;
    }

    // The requested local size was rejected: warn and drop it from the grid.
    System.out.println(WARNING_THREAD_LOCAL);
    workerGrid.setLocalWorkToNull();
    workerGrid.setNumberOfWorkgroupsToNull();
}

public int submit(final OCLKernel kernel, final TaskMetaData meta, final int[] waitEvents, long batchThreads) {
if (!meta.isWorkerGridAvailable()) {
if (!meta.isGlobalWorkDefined()) {
calculateGlobalWork(meta, batchThreads);
}
if (!meta.isLocalWorkDefined()) {
calculateLocalWork(meta);
}
} else {
checkLocalWorkGroupFitsOnDevice(meta);
}

if (meta.isDebug()) {
meta.printThreadDims();
}
final int taskEvent = launch(kernel, meta, waitEvents, batchThreads);
updateProfiler(taskEvent, meta);
return taskEvent;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -266,9 +266,9 @@ public int submitWithEvents(final OCLCallStack stack, final ObjectBuffer atomicS
} else {
if (meta.isDebug()) {
System.out.println("Running on: ");
System.out.println("\tPlatform: " + meta.getDevice().getPlatformName());
if (meta.getDevice() instanceof OCLTornadoDevice) {
System.out.println("\tDevice : " + ((OCLTornadoDevice) meta.getDevice()).getDevice().getDeviceName());
System.out.println("\tPlatform: " + meta.getLogicDevice().getPlatformName());
if (meta.getLogicDevice() instanceof OCLTornadoDevice) {
System.out.println("\tDevice : " + ((OCLTornadoDevice) meta.getLogicDevice()).getPhysicalDevice().getDeviceName());
}
}
if (meta.getGlobalWork() == null) {
Expand Down Expand Up @@ -298,7 +298,7 @@ private void executeSingleThread() {

/**
 * Calls {@code printThreadDims} on the task metadata when
 * {@code meta.isDebug()} returns true; does nothing otherwise.
 *
 * NOTE(review): the two consecutive {@code printThreadDims} calls below look
 * like the removed and the added version of the same line shown without diff
 * markers - confirm that only the no-argument call is meant to remain.
 *
 * @param meta
 *            task metadata queried for the debug flag.
 */
private void debugInfo(final TaskMetaData meta) {
if (meta.isDebug()) {
meta.printThreadDims(null, null);
meta.printThreadDims();
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ public int[] getDriverAndDevice() {
int deviceIndex = 0;
for (int i = 0; i < numDev; i++) {
TornadoAcceleratorDevice device = TornadoCoreRuntime.getTornadoRuntime().getDriver(OCLDriver.class).getDevice(i);
OCLTargetDevice dev = (OCLTargetDevice) device.getDevice();
OCLTargetDevice dev = (OCLTargetDevice) device.getPhysicalDevice();
if (dev == deviceContext.getDevice()) {
deviceIndex = i;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ protected void run(StructuredGraph graph, TornadoHighTierContext context) {

TornadoAcceleratorDevice device = context.getDeviceMapping();
final TornadoSchedulingStrategy strategy = device.getPreferredSchedule();
long[] maxWorkItemSizes = device.getDevice().getDeviceMaxWorkItemSizes();
long[] maxWorkItemSizes = device.getPhysicalDevice().getDeviceMaxWorkItemSizes();

graph.getNodes().filter(ParallelRangeNode.class).forEach(node -> {
if (context.getMeta().enableParallelization() && maxWorkItemSizes[node.index()] > 1) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import uk.ac.manchester.tornado.api.common.Event;
import uk.ac.manchester.tornado.api.common.SchedulableTask;
import uk.ac.manchester.tornado.api.enums.TornadoDeviceType;
import uk.ac.manchester.tornado.api.enums.TornadoVMBackend;
import uk.ac.manchester.tornado.api.exceptions.TornadoBailoutRuntimeException;
import uk.ac.manchester.tornado.api.exceptions.TornadoInternalError;
import uk.ac.manchester.tornado.api.exceptions.TornadoMemoryException;
Expand Down Expand Up @@ -137,7 +138,7 @@ public String getPlatformName() {
}

/**
 * Returns the {@code device} field of this object.
 *
 * NOTE(review): two signatures ({@code getDevice} and
 * {@code getPhysicalDevice}) appear back to back; this looks like the removed
 * and added lines of a rename shown without diff markers - confirm which one
 * the file should keep.
 */
@Override
public OCLTargetDevice getDevice() {
public OCLTargetDevice getPhysicalDevice() {
return device;
}

Expand Down Expand Up @@ -248,8 +249,8 @@ private TornadoInstalledCode compileTask(SchedulableTask task) {
OCLProviders providers = (OCLProviders) getBackend().getProviders();
TornadoProfiler profiler = task.getProfiler();
// profiler
profiler.registerDeviceID(ProfilerType.DEVICE_ID, taskMeta.getId(), taskMeta.getDevice().getDriverIndex() + ":" + taskMeta.getDeviceIndex());
profiler.registerDeviceName(ProfilerType.DEVICE, taskMeta.getId(), taskMeta.getDevice().getDevice().getDeviceName());
profiler.registerDeviceID(ProfilerType.DEVICE_ID, taskMeta.getId(), taskMeta.getLogicDevice().getDriverIndex() + ":" + taskMeta.getDeviceIndex());
profiler.registerDeviceName(ProfilerType.DEVICE, taskMeta.getId(), taskMeta.getLogicDevice().getPhysicalDevice().getDeviceName());
profiler.start(ProfilerType.TASK_COMPILE_GRAAL_TIME, taskMeta.getId());
final OCLCompilationResult result = OCLCompiler.compileSketchForDevice(sketch, executable, providers, getBackend());

Expand Down Expand Up @@ -787,4 +788,9 @@ public void enableThreadSharing() {
/**
 * Stores the given buffer in {@code reuseBuffer}, replacing any previous value.
 *
 * @param bufferAtomics
 *            buffer to keep; presumably the region used for atomics - verify
 *            against callers.
 */
public void setAtomicRegion(ObjectBuffer bufferAtomics) {
reuseBuffer = bufferAtomics;
}

/**
 * Reports the backend of this device.
 *
 * @return always {@code TornadoVMBackend.OpenCL}.
 */
@Override
public TornadoVMBackend getTornadoVMBackend() {
return TornadoVMBackend.OpenCL;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import uk.ac.manchester.tornado.api.common.Event;
import uk.ac.manchester.tornado.api.common.SchedulableTask;
import uk.ac.manchester.tornado.api.enums.TornadoDeviceType;
import uk.ac.manchester.tornado.api.enums.TornadoVMBackend;
import uk.ac.manchester.tornado.api.exceptions.TornadoBailoutRuntimeException;
import uk.ac.manchester.tornado.api.exceptions.TornadoInternalError;
import uk.ac.manchester.tornado.api.mm.ObjectBuffer;
Expand Down Expand Up @@ -107,7 +108,7 @@ public String getPlatformName() {
}

/**
 * Returns the {@code device} field of this object.
 *
 * NOTE(review): two signatures ({@code getDevice} and
 * {@code getPhysicalDevice}) appear back to back; this looks like the removed
 * and added lines of a rename shown without diff markers - confirm which one
 * the file should keep.
 */
@Override
public OCLTargetDevice getDevice() {
public OCLTargetDevice getPhysicalDevice() {
return device;
}

Expand Down Expand Up @@ -449,4 +450,9 @@ public Object getAtomic() {
/**
 * Empty implementation: the supplied mapping is ignored.
 *
 * @param mappingAtomics
 *            mapping passed by the caller; not read or stored here.
 */
public void setAtomicsMapping(ConcurrentHashMap<Object, Integer> mappingAtomics) {

}

/**
 * Reports the backend of this device.
 *
 * @return always {@code TornadoVMBackend.VIRTUAL}.
 */
@Override
public TornadoVMBackend getTornadoVMBackend() {
return TornadoVMBackend.VIRTUAL;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,11 @@ public int enqueueKernelLaunch(PTXModule module, byte[] kernelParams, int[] grid
assert Arrays.stream(blockDim).filter(i -> i <= 0).count() == 0;

if (module.metaData.isDebug()) {
module.metaData.printThreadDims(Arrays.stream(blockDim).mapToLong(i -> i).toArray(), Arrays.stream(gridDim).mapToLong(i -> i).toArray());
long[] blockDims = Arrays.stream(blockDim).mapToLong(i -> i).toArray();
long[] gridDims = Arrays.stream(gridDim).mapToLong(i -> i).toArray();
module.metaData.setPtxBlockDim(blockDims);
module.metaData.setPtxGridDim(gridDims);
module.metaData.printThreadDims();
}

return registerEvent(cuLaunchKernel(module.moduleWrapper, module.kernelFunctionName, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], DYNAMIC_SHARED_MEMORY_BYTES,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,10 @@
import org.graalvm.compiler.phases.common.IterativeConditionalEliminationPhase;
import org.graalvm.compiler.phases.common.LoweringPhase;
import org.graalvm.compiler.phases.common.RemoveValueProxyPhase;
import org.graalvm.compiler.phases.common.UseTrappingNullChecksPhase;
import org.graalvm.compiler.phases.schedule.SchedulePhase;
import uk.ac.manchester.tornado.api.TornadoDeviceContext;
import uk.ac.manchester.tornado.drivers.common.graal.compiler.DumpLowTierGraph;
import uk.ac.manchester.tornado.drivers.ptx.graal.phases.PTXMulAddPhase;
import uk.ac.manchester.tornado.drivers.ptx.graal.phases.PTXFMAPhase;
import uk.ac.manchester.tornado.runtime.common.TornadoOptions;
import uk.ac.manchester.tornado.runtime.graal.compiler.TornadoLowTier;
import uk.ac.manchester.tornado.runtime.graal.phases.TornadoFeatureExtraction;
Expand Down Expand Up @@ -74,7 +73,7 @@ public PTXLowTier(OptionValues options, TornadoDeviceContext tornadoDeviceContex

appendPhase(new TornadoLoopCanonicalization());

appendPhase(new PTXMulAddPhase());
appendPhase(new PTXFMAPhase());

appendPhase(new SchedulePhase(SchedulePhase.SchedulingStrategy.LATEST_OUT_OF_LOOPS));

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, APT Group, Department of Computer Science,
* Copyright (c) 2020, 2021, APT Group, Department of Computer Science,
* School of Engineering, The University of Manchester. All rights reserved.
* Copyright (c) 2009, 2017, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
Expand Down Expand Up @@ -33,9 +33,9 @@
import jdk.vm.ci.meta.Value;
import uk.ac.manchester.tornado.drivers.ptx.graal.lir.PTXArithmeticTool;

@NodeInfo(shortName = "MulAdd")
public class PTXMultiplyAddNode extends FloatingNode implements ArithmeticLIRLowerable {
public static final NodeClass<PTXMultiplyAddNode> TYPE = NodeClass.create(PTXMultiplyAddNode.class);
@NodeInfo(shortName = "PTX-FMA")
public class PTXFMANode extends FloatingNode implements ArithmeticLIRLowerable {
public static final NodeClass<PTXFMANode> TYPE = NodeClass.create(PTXFMANode.class);

@Input
protected ValueNode x;
Expand All @@ -44,7 +44,7 @@ public class PTXMultiplyAddNode extends FloatingNode implements ArithmeticLIRLow
@Input
protected ValueNode z;

public PTXMultiplyAddNode(ValueNode x, ValueNode y, ValueNode z) {
public PTXFMANode(ValueNode x, ValueNode y, ValueNode z) {
super(TYPE, StampFactory.forKind(x.getStackKind()));

this.x = x;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, APT Group, Department of Computer Science,
* Copyright (c) 2020, 2021, APT Group, Department of Computer Science,
* School of Engineering, The University of Manchester. All rights reserved.
* Copyright (c) 2009, 2017, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
Expand Down Expand Up @@ -28,10 +28,10 @@
import org.graalvm.compiler.nodes.calc.MulNode;
import org.graalvm.compiler.phases.Phase;

import uk.ac.manchester.tornado.drivers.ptx.graal.nodes.PTXMultiplyAddNode;
import uk.ac.manchester.tornado.drivers.ptx.graal.nodes.PTXFMANode;
import uk.ac.manchester.tornado.drivers.ptx.graal.nodes.vector.VectorElementOpNode;

public class PTXMulAddPhase extends Phase {
public class PTXFMAPhase extends Phase {

@Override
protected void run(StructuredGraph graph) {
Expand All @@ -48,7 +48,7 @@ protected void run(StructuredGraph graph) {
return;
}

PTXMultiplyAddNode newNode = new PTXMultiplyAddNode(x, y, z);
PTXFMANode newNode = new PTXFMANode(x, y, z);
graph.addWithoutUnique(newNode);

mul.removeUsage(addNode);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import org.graalvm.compiler.debug.DebugContext;
import org.graalvm.compiler.nodes.ConstantNode;
import org.graalvm.compiler.nodes.StructuredGraph;
import org.graalvm.compiler.nodes.ValueNode;
import org.graalvm.compiler.nodes.calc.AddNode;
import org.graalvm.compiler.nodes.calc.MulNode;
import org.graalvm.compiler.phases.BasePhase;
Expand Down Expand Up @@ -73,7 +72,7 @@ protected void run(StructuredGraph graph, TornadoHighTierContext context) {

PTXTornadoDevice device = (PTXTornadoDevice) context.getDeviceMapping();
final TornadoSchedulingStrategy strategy = device.getPreferredSchedule();
long[] maxWorkItemSizes = device.getDevice().getDeviceMaxWorkItemSizes();
long[] maxWorkItemSizes = device.getPhysicalDevice().getDeviceMaxWorkItemSizes();

graph.getNodes().filter(ParallelRangeNode.class).forEach(node -> {
if (context.getMeta().enableParallelization() && maxWorkItemSizes[node.index()] > 1) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
import uk.ac.manchester.tornado.api.common.Event;
import uk.ac.manchester.tornado.api.common.SchedulableTask;
import uk.ac.manchester.tornado.api.enums.TornadoDeviceType;
import uk.ac.manchester.tornado.api.enums.TornadoVMBackend;
import uk.ac.manchester.tornado.api.exceptions.TornadoBailoutRuntimeException;
import uk.ac.manchester.tornado.api.exceptions.TornadoInternalError;
import uk.ac.manchester.tornado.api.exceptions.TornadoMemoryException;
Expand Down Expand Up @@ -182,8 +183,8 @@ private TornadoInstalledCode compileTask(SchedulableTask task) {
if (!deviceContext.isCached(resolvedMethod.getName(), executable)) {
PTXProviders providers = (PTXProviders) getBackend().getProviders();
// profiler
profiler.registerDeviceID(ProfilerType.DEVICE_ID, taskMeta.getId(), taskMeta.getDevice().getDriverIndex() + ":" + taskMeta.getDeviceIndex());
profiler.registerDeviceName(ProfilerType.DEVICE, taskMeta.getId(), taskMeta.getDevice().getDevice().getDeviceName());
profiler.registerDeviceID(ProfilerType.DEVICE_ID, taskMeta.getId(), taskMeta.getLogicDevice().getDriverIndex() + ":" + taskMeta.getDeviceIndex());
profiler.registerDeviceName(ProfilerType.DEVICE, taskMeta.getId(), taskMeta.getLogicDevice().getPhysicalDevice().getDeviceName());
profiler.start(ProfilerType.TASK_COMPILE_GRAAL_TIME, taskMeta.getId());
result = PTXCompiler.compileSketchForDevice(sketch, executable, providers, getBackend());
profiler.stop(ProfilerType.TASK_COMPILE_GRAAL_TIME, taskMeta.getId());
Expand Down Expand Up @@ -587,7 +588,7 @@ public PTXBackend getBackend() {
}

/**
 * Returns the {@code device} field of this object.
 *
 * NOTE(review): two signatures ({@code getDevice} and
 * {@code getPhysicalDevice}) appear back to back; this looks like the removed
 * and added lines of a rename shown without diff markers - confirm which one
 * the file should keep.
 */
@Override
public TornadoTargetDevice getDevice() {
public TornadoTargetDevice getPhysicalDevice() {
return device;
}

Expand Down Expand Up @@ -646,6 +647,11 @@ public void setAtomicsMapping(ConcurrentHashMap<Object, Integer> mappingAtomics)

}

/**
 * Reports the backend of this device.
 *
 * @return always {@code TornadoVMBackend.PTX}.
 */
@Override
public TornadoVMBackend getTornadoVMBackend() {
return TornadoVMBackend.PTX;
}

/**
* In CUDA the context is not attached to the whole process, but to individual
* threads. Therefore, in the case of new threads executing a task schedule, we
Expand All @@ -665,4 +671,5 @@ public void setAtomicRegion(ObjectBuffer bufferAtomics) {
/**
 * Builds a human-readable label for this device.
 *
 * @return the platform name and the device name joined by {@code " -- "}.
 */
public String toString() {
    final String platformName = getPlatformName();
    final String deviceName = device.getDeviceName();
    return String.format("%s -- %s", platformName, deviceName);
}

}
Loading

0 comments on commit e8c3161

Please sign in to comment.