Adding MLCompute compute context to OD (#3276)
shreyajain17 committed Jul 29, 2020
1 parent 4b3c2b3 commit 6716bbe
Showing 23 changed files with 1,446 additions and 17 deletions.
@@ -222,6 +222,7 @@ if(APPLE)
find_library(CORE_VIDEO CoreVideo)
find_library(METAL NAMES Metal)
find_library(METAL_PERFORMANCE_SHADERS NAMES MetalPerformanceShaders)
find_library(MLCOMPUTE NAMES MLCompute)
set(_TC_APPLE_DEPENDENCIES
${ACCELERATE}
${CORE_GRAPHICS}
@@ -511,6 +512,11 @@ if(NOT TC_DISABLE_OBJECT_BUILDS)
${_TC_COMMON_REQUIREMENTS} ${ACCELERATE} ${CORE_IMAGE} ${METAL}
${METAL_PERFORMANCE_SHADERS})
endif()
if(HAS_ML_COMPUTE)
set(_TC_COMMON_OBJECTS ${_TC_COMMON_OBJECTS} "$<TARGET_OBJECTS:tcmlc>")
set(_TC_COMMON_REQUIREMENTS
${_TC_COMMON_REQUIREMENTS} ${ACCELERATE} ${CORE_IMAGE} ${MLCOMPUTE})
endif()
endif()
endif()

@@ -110,6 +110,12 @@ if(APPLE)
add_definitions(-DHAS_CORE_ML)
set(HAS_CORE_ML TRUE)
endif()

# MLCompute is only present on macOS 11.0 (Big Sur) or higher; the SDK
# reports Big Sur as version 10.16, hence the check below.
if(NOT TC_BASE_SDK_VERSION VERSION_LESS 10.16)
add_definitions(-DHAS_ML_COMPUTE)
set(HAS_ML_COMPUTE TRUE)
endif()

# Core ML only supports batch inference on macOS 10.14 or higher
# Logic reversed to get around what seems to be a CMake bug.
@@ -128,6 +134,10 @@ if(APPLE)
if(NOT TC_BASE_SDK_VERSION VERSION_LESS 10.15)
add_definitions(-DHAS_MACOS_10_15)
endif()

if(NOT TC_BASE_SDK_VERSION VERSION_LESS 10.16)
add_definitions(-DHAS_MACOS_10_16)
endif()
endif()

endmacro()
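
For context on how these defines are consumed: a minimal sketch (not part of this commit) of guarding MLCompute use behind both the compile-time HAS_ML_COMPUTE define and a runtime availability check. The function name is hypothetical.

#ifdef HAS_ML_COMPUTE
#import <MLCompute/MLCompute.h>

// Returns YES when the running OS actually provides MLCompute.
static BOOL TCCanUseMLCompute(void)
{
  if (@available(macOS 10.16, *)) {
    return YES;
  }
  return NO;
}
#endif  // HAS_ML_COMPUTE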
@@ -72,6 +72,26 @@ else()
)
endif()

# When targeting macOS 11.0 or higher (and not building for iOS), build the
# MLCompute-based backend.
if(APPLE AND HAS_ML_COMPUTE AND NOT TC_BUILD_IOS)
make_library(tcmlc OBJECT
SOURCES
mlc_compute_context.mm
mlc_layer_weights.mm
mlc_od_backend.mm
mlc_utils.mm
TCMLComputeObjectDetectorDescriptor.m
TCMLComputeUtil.m
TCModelTrainerBackendGraphs.m
REQUIRES
unity_core
${ACCELERATE}
)
set(additional_unity_neural_net_requirements tcmlc)
else()
set(additional_unity_neural_net_requirements "")
endif()

make_library(unity_neural_net OBJECT
SOURCES
PortableImage.cpp
@@ -0,0 +1,37 @@
/* Copyright © 2020 Apple Inc. All rights reserved.
*
* Use of this source code is governed by a BSD-3-clause license that can
* be found in the LICENSE.txt file or at
* https://opensource.org/licenses/BSD-3-Clause
*/
#import <MLCompute/MLCompute.h>

NS_ASSUME_NONNULL_BEGIN

// Defines the parameters for the MLCompute-based implementation of the
// Object Detection model.
API_AVAILABLE(macos(10.16))
@interface TCMLComputeObjectDetectorDescriptor : NSObject

// Defines the shape of the graph's input.
@property(nonatomic) MLCTensor *inputTensor;

// Controls the number of features in the output tensor, which should be anchorBoxesCount * (5 +
// classesCount). For each output grid cell, for each anchor box, the output has x, y, h, w, object
// confidence, then the classesCount class likelihoods (conditional on an object being present).
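// For example, with 15 anchor boxes and 80 classes, outputChannels is
// 15 * (5 + 80) = 1275.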
@property(nonatomic) NSUInteger outputChannels;

// Dictionary mapping layer names to weights.
@property(nonatomic) NSDictionary<NSString *, MLCTensor *> *weights;

@end

API_AVAILABLE(macos(10.16))
@interface MLCGraph (TCMLComputeObjectDetector)

+ (instancetype)tc_graphForObjectDetectorDescriptor:
(TCMLComputeObjectDetectorDescriptor *)descriptor;

@end

NS_ASSUME_NONNULL_END
@@ -0,0 +1,142 @@
/* Copyright © 2020 Apple Inc. All rights reserved.
*
* Use of this source code is governed by a BSD-3-clause license that can
* be found in the LICENSE.txt file or at
* https://opensource.org/licenses/BSD-3-Clause
*/
#import <ml/neural_net/TCMLComputeObjectDetectorDescriptor.h>

#import <ml/neural_net/TCMLComputeUtil.h>

@implementation TCMLComputeObjectDetectorDescriptor

- (BOOL)isComplete
{
if (self.inputTensor == nil) return NO;
if (self.outputChannels == 0) return NO;
if (self.weights == nil) return NO;

return YES;
}

- (MLCConvolutionLayer *)convLayerForIndex:(NSUInteger)index
outputChannels:(NSUInteger)outputChannels
{
// Find the weights for this conv layer in our dictionary of parameters.
NSString *biasKey = [NSString stringWithFormat:@"conv%lu_bias", (unsigned long)index];
NSString *weightKey = [NSString stringWithFormat:@"conv%lu_weight", (unsigned long)index];
MLCTensor *bias = self.weights[biasKey];
MLCTensor *weights = self.weights[weightKey];

// Configure the convolution descriptor.
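// (The weights tensor packs the convolution's output and input channels into
// its channel dimension, so dividing by outputChannels recovers the input
// channel count.)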
NSUInteger inputChannels =
weights.descriptor.shape[TCMLComputeTensorSizeChannels].unsignedIntegerValue / outputChannels;
NSUInteger kernelHeight =
weights.descriptor.shape[TCMLComputeTensorSizeHeight].unsignedIntegerValue;
NSUInteger kernelWidth =
weights.descriptor.shape[TCMLComputeTensorSizeWidth].unsignedIntegerValue;
MLCConvolutionDescriptor *descriptor =
[MLCConvolutionDescriptor descriptorWithKernelSizes:@[ @(kernelHeight), @(kernelWidth) ]
inputFeatureChannelCount:inputChannels
outputFeatureChannelCount:outputChannels
strides:@[ @1, @1 ]
paddingPolicy:MLCPaddingPolicySame
paddingSizes:@[ @1, @1 ]];

return [MLCConvolutionLayer layerWithWeights:weights biases:bias descriptor:descriptor];
}

- (MLCBatchNormalizationLayer *)batchNormLayerForIndex:(NSUInteger)index
{
// Find the weights for this batch norm layer in our dictionary of parameters.
NSString *gammaKey = [NSString stringWithFormat:@"batchnorm%lu_gamma", (unsigned long)index];
NSString *betaKey = [NSString stringWithFormat:@"batchnorm%lu_beta", (unsigned long)index];
NSString *varianceKey =
[NSString stringWithFormat:@"batchnorm%lu_running_var", (unsigned long)index];
NSString *meanKey =
[NSString stringWithFormat:@"batchnorm%lu_running_mean", (unsigned long)index];

MLCTensor *gamma = self.weights[gammaKey];
MLCTensor *beta = self.weights[betaKey];
MLCTensor *variance = self.weights[varianceKey];
MLCTensor *mean = self.weights[meanKey];

NSUInteger featureChannels =
mean.descriptor.shape[TCMLComputeTensorSizeChannels].unsignedIntegerValue;

return [MLCBatchNormalizationLayer layerWithFeatureChannelCount:featureChannels
mean:mean
variance:variance
beta:beta
gamma:gamma
varianceEpsilon:1e-5f
momentum:0.9f];
}

- (MLCTensor *)addCommonLayersWithIndex:(NSUInteger)index
outputChannels:(NSUInteger)outputChannels
source:(MLCTensor *)source
graph:(MLCTrainingGraph *)graph
{
// conv
MLCConvolutionLayer *convLayer = [self convLayerForIndex:index outputChannels:outputChannels];
MLCTensor *convTensor = [graph nodeWithLayer:convLayer source:source];

// batchnorm
MLCBatchNormalizationLayer *batchNormLayer = [self batchNormLayerForIndex:index];
MLCTensor *batchNormTensor = [graph nodeWithLayer:batchNormLayer source:convTensor];

// leakyrelu
MLCActivationDescriptor *leakyReLUDesc =
[MLCActivationDescriptor descriptorWithType:MLCActivationTypeReLU a:0.1f];
MLCActivationLayer *leakyReLULayer = [MLCActivationLayer layerWithDescriptor:leakyReLUDesc];
MLCTensor *leakyReLUTensor = [graph nodeWithLayer:leakyReLULayer source:batchNormTensor];

// pool
// On the first five blocks (0-4), pool with stride 2, reducing image dimensions by a factor of 2.
// On the sixth (5), pool with stride 1, preserving image dimensions.
if (index <= 5) {
NSUInteger poolingStride = index < 5 ? 2 : 1;
MLCPoolingDescriptor *poolingDesc =
[MLCPoolingDescriptor poolingDescriptorWithType:MLCPoolingTypeMax
kernelSize:2
stride:poolingStride];
MLCLayer *poolLayer = [MLCPoolingLayer layerWithDescriptor:poolingDesc];
MLCTensor *poolTensor = [graph nodeWithLayer:poolLayer source:leakyReLUTensor];
return poolTensor;
} else {
return leakyReLUTensor;
}
}

@end

@implementation MLCGraph (TCMLComputeObjectDetector)

+ (instancetype)tc_graphForObjectDetectorDescriptor:
(TCMLComputeObjectDetectorDescriptor *)descriptor
{
if (![descriptor isComplete]) return nil;

MLCTrainingGraph *graph = [[self alloc] init];

// Start with the 8 rounds of convolution, batch norm, and leaky ReLU layers.
// Each of the first five rounds ends with a stride-2 max pool that halves the
// image size; the sixth ends with a stride-1 pool.
NSUInteger channelCounts[] = {16, 32, 64, 128, 256, 512, 1024, 1024};
MLCTensor *tensor = descriptor.inputTensor;
for (NSUInteger i = 0; i < 8; ++i) {
tensor = [descriptor addCommonLayersWithIndex:i
outputChannels:channelCounts[i]
source:tensor
graph:graph];
}

// Add the final convolution layer, which maps the tensor into the YOLO representation.
MLCConvolutionLayer *conv8Layer = [descriptor convLayerForIndex:8
outputChannels:descriptor.outputChannels];
[graph nodeWithLayer:conv8Layer source:tensor];

return graph;
}

@end
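
For context, a caller might build the training graph as in this minimal sketch (not part of the commit). The function name, input shape, anchor-box and class counts, and the pre-loaded weights dictionary are illustrative assumptions.

#import <ml/neural_net/TCMLComputeObjectDetectorDescriptor.h>

API_AVAILABLE(macos(10.16))
static MLCTrainingGraph *TCExampleBuildDetectorGraph(
    NSDictionary<NSString *, MLCTensor *> *weights)
{
  // Batch of 32 RGB images at the standard YOLO input size of 416x416 (NCHW).
  MLCTensorDescriptor *inputDesc =
      [MLCTensorDescriptor descriptorWithShape:@[ @32, @3, @416, @416 ]
                                      dataType:MLCDataTypeFloat32];

  TCMLComputeObjectDetectorDescriptor *descriptor =
      [[TCMLComputeObjectDetectorDescriptor alloc] init];
  descriptor.inputTensor = [MLCTensor tensorWithDescriptor:inputDesc];
  descriptor.outputChannels = 15 * (5 + 80);  // 15 anchor boxes, 80 classes
  descriptor.weights = weights;  // keys like @"conv0_weight", @"batchnorm0_gamma", ...

  // Returns nil if the descriptor is incomplete.
  return [MLCTrainingGraph tc_graphForObjectDetectorDescriptor:descriptor];
}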
@@ -0,0 +1,38 @@
/* Copyright © 2020 Apple Inc. All rights reserved.
*
* Use of this source code is governed by a BSD-3-clause license that can
* be found in the LICENSE.txt file or at
* https://opensource.org/licenses/BSD-3-Clause
*/
#import <Foundation/Foundation.h>
#import <MLCompute/MLCompute.h>

NS_ASSUME_NONNULL_BEGIN

#ifdef __cplusplus
extern "C" {
#endif

typedef NS_ENUM(int32_t, TCMLComputeTensorSize) {
TCMLComputeTensorSizeBatch,
TCMLComputeTensorSizeChannels,
TCMLComputeTensorSizeHeight,
TCMLComputeTensorSizeWidth,
};
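
// For example, a tensor with shape @[ @32, @3, @416, @416 ] (NCHW layout) has
// shape[TCMLComputeTensorSizeChannels].unsignedIntegerValue == 3.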

/// Returns a buffer suitable to pass to MLCompute as memory into which to write an output tensor's
/// value. MLCompute requires this memory to be page-aligned in the GPU case.
API_AVAILABLE(macos(10.16))
NSData *TCAllocateDataForOutputTensor(MLCTensor *tensor, MLCDeviceType deviceType);

API_AVAILABLE(macos(10.16))
MLCTensorData *TCMLComputeWrapData(NSData *data);

API_AVAILABLE(macos(10.16))
MLCTensorData *TCMLComputeWrapBuffer(NSMutableData *data);

#ifdef __cplusplus
} // extern "C"
#endif

NS_ASSUME_NONNULL_END
@@ -0,0 +1,42 @@
/* Copyright © 2020 Apple Inc. All rights reserved.
*
* Use of this source code is governed by a BSD-3-clause license that can
* be found in the LICENSE.txt file or at
* https://opensource.org/licenses/BSD-3-Clause
*/
#import <ml/neural_net/TCMLComputeUtil.h>

NS_ASSUME_NONNULL_BEGIN

NSData *TCAllocateDataForOutputTensor(MLCTensor *tensor, MLCDeviceType deviceType)
{
NSData *result = nil;
if (deviceType == MLCDeviceTypeCPU) {
NSUInteger size = tensor.descriptor.tensorAllocationSizeInBytes;
result = [NSData dataWithBytesNoCopy:malloc(size) length:size freeWhenDone:YES];
} else {
// For GPU devices, allocate a page-aligned buffer whose size is also a
// multiple of the page size, as MLCompute requires for output memory.
NSUInteger pageSize = (NSUInteger)(getpagesize());
NSUInteger numPages = (tensor.descriptor.tensorAllocationSizeInBytes + pageSize - 1) / pageSize;
NSUInteger size = numPages * pageSize;
result = [NSData dataWithBytesNoCopy:valloc(size) length:size freeWhenDone:YES];
// Yes, it would be conceptually cleaner if we returned NSMutableData instead of NSData, since
// we expect MLCompute to mutate the contents of the memory. But it turns out +[NSMutableData
// dataWithBytesNoCopy:length:] actually does copy the data (into an internally allocated buffer
// that is not necessarily page-aligned), despite the initializer name!
}
return result;
}

MLCTensorData *TCMLComputeWrapData(NSData *data)
{
return [MLCTensorData dataWithImmutableBytesNoCopy:data.bytes length:data.length];
}

MLCTensorData *TCMLComputeWrapBuffer(NSMutableData *data)
{
return [MLCTensorData dataWithBytesNoCopy:data.mutableBytes length:data.length];
}

NS_ASSUME_NONNULL_END
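
As a usage illustration (not part of the commit), here is a sketch that wraps input and output memory for MLCompute. The function name, buffer shape, and `outputTensor` argument are assumptions; in real code the wrapped MLCTensorData objects would be handed to MLCompute's execute/train calls.

#import <ml/neural_net/TCMLComputeUtil.h>

API_AVAILABLE(macos(10.16))
static void TCExampleWrapTensorMemory(MLCTensor *outputTensor)
{
  // Wrap a mutable input buffer without copying (batch 32, 3x416x416 floats;
  // the shape is an illustrative assumption).
  NSMutableData *inputBuffer =
      [NSMutableData dataWithLength:32 * 3 * 416 * 416 * sizeof(float)];
  MLCTensorData *inputData = TCMLComputeWrapBuffer(inputBuffer);

  // Allocate page-aligned memory for the GPU to write the output into, then
  // wrap it for MLCompute.
  NSData *outputBacking =
      TCAllocateDataForOutputTensor(outputTensor, MLCDeviceTypeGPU);
  MLCTensorData *outputData = TCMLComputeWrapData(outputBacking);

  (void)inputData;
  (void)outputData;
}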
