forked from googleapis/google-cloud-go
/
vision.go
354 lines (316 loc) · 11.3 KB
/
vision.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
// Copyright 2016 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vision
import (
	"errors"
	"image/color"
	"math"

	"cloud.google.com/go/internal/version"
	vkit "cloud.google.com/go/vision/apiv1"
	"golang.org/x/net/context"
	"google.golang.org/api/option"
	pb "google.golang.org/genproto/googleapis/cloud/vision/v1"
	cpb "google.golang.org/genproto/googleapis/type/color"
)
// Scope is the OAuth2 scope required by the Google Cloud Vision API.
// Its value is the cloud-platform scope URL.
const Scope = "https://www.googleapis.com/auth/cloud-platform"
// Client is a Google Cloud Vision API client.
// It wraps the ImageAnnotatorClient from the apiv1 package.
type Client struct {
// client is the wrapped annotator client that performs the API calls.
client *vkit.ImageAnnotatorClient
}
// NewClient builds a vision Client backed by a new image annotator client
// created with the supplied options. The annotator is tagged with this
// repository's client info ("gccl" and version.Repo) before being wrapped.
// Any error from constructing the annotator is returned unchanged.
func NewClient(ctx context.Context, opts ...option.ClientOption) (*Client, error) {
	annotator, err := vkit.NewImageAnnotatorClient(ctx, opts...)
	if err != nil {
		return nil, err
	}
	annotator.SetGoogleClientInfo("gccl", version.Repo)

	wrapped := &Client{client: annotator}
	return wrapped, nil
}
// Close closes the underlying image annotator client and returns the error,
// if any, reported by that client's Close method.
func (c *Client) Close() error {
return c.client.Close()
}
// Annotate annotates multiple images in a single batch call, each with a
// potentially different set of features.
//
// Every request is converted to its proto form and sent together via
// BatchAnnotateImages. On success the returned slice holds one *Annotations
// per response, in the order the responses were received. If the batch call
// itself fails, that error is returned and the slice is nil.
func (c *Client) Annotate(ctx context.Context, requests ...*AnnotateRequest) ([]*Annotations, error) {
	var protoReqs []*pb.AnnotateImageRequest
	for i := range requests {
		protoReqs = append(protoReqs, requests[i].toProto())
	}

	batch := &pb.BatchAnnotateImagesRequest{Requests: protoReqs}
	resp, err := c.client.BatchAnnotateImages(ctx, batch)
	if err != nil {
		return nil, err
	}

	var out []*Annotations
	for _, r := range resp.Responses {
		out = append(out, annotationsFromProto(r))
	}
	return out, nil
}
// An AnnotateRequest specifies an image to annotate and the features to look for in that image.
//
// Each Max* field both enables its feature (when greater than zero) and sets
// the maximum number of results requested. Values larger than math.MaxInt32
// are sent as math.MaxInt32.
type AnnotateRequest struct {
// Image is the image to annotate.
Image *Image
// MaxFaces is the maximum number of faces to detect in the image.
// Specifying a number greater than zero enables face detection.
MaxFaces int
// MaxLandmarks is the maximum number of landmarks to detect in the image.
// Specifying a number greater than zero enables landmark detection.
MaxLandmarks int
// MaxLogos is the maximum number of logos to detect in the image.
// Specifying a number greater than zero enables logo detection.
MaxLogos int
// MaxLabels is the maximum number of labels to detect in the image.
// Specifying a number greater than zero enables label detection.
MaxLabels int
// MaxTexts is the maximum number of separate pieces of text to detect in the
// image. Specifying a number greater than zero enables text detection.
MaxTexts int
// DocumentText specifies whether a dense text document OCR should be run
// on the image. When true, takes precedence over MaxTexts.
// NOTE(review): when both are set, both features are added to the request;
// the precedence is presumably applied by the service — confirm.
DocumentText bool
// SafeSearch specifies whether a safe-search detection should be run on the image.
SafeSearch bool
// ImageProps specifies whether image properties should be obtained for the image.
ImageProps bool
// Web specifies whether web annotations should be obtained for the image.
Web bool
// CropHints, when non-nil, requests crop hints for the image and supplies
// the aspect ratios to use. A nil value means crop hints are not requested.
CropHints *CropHintsParams
}
// toProto converts the request into its pb.AnnotateImageRequest form.
//
// Features are appended in a fixed order: the count-limited detections
// (faces, landmarks, logos, labels, texts), then the on/off features
// (document text, safe search, image properties, web, crop hints). A
// count-limited feature is included only when its maximum is positive, and
// the maximum is capped at math.MaxInt32. On/off features are sent with
// MaxResults 0. When crop hints are requested, the aspect ratios are stored
// on the image context, which is created if the image did not supply one.
func (ar *AnnotateRequest) toProto() *pb.AnnotateImageRequest {
	img, ictx := ar.Image.toProtos()

	var features []*pb.Feature
	appendFeature := func(typ pb.Feature_Type, limit int32) {
		features = append(features, &pb.Feature{Type: typ, MaxResults: limit})
	}

	// Detections enabled by a positive result limit.
	counted := []struct {
		typ pb.Feature_Type
		max int
	}{
		{pb.Feature_FACE_DETECTION, ar.MaxFaces},
		{pb.Feature_LANDMARK_DETECTION, ar.MaxLandmarks},
		{pb.Feature_LOGO_DETECTION, ar.MaxLogos},
		{pb.Feature_LABEL_DETECTION, ar.MaxLabels},
		{pb.Feature_TEXT_DETECTION, ar.MaxTexts},
	}
	for _, f := range counted {
		if f.max <= 0 {
			continue
		}
		// MaxResults is an int32; cap larger values instead of wrapping.
		limit := int32(math.MaxInt32)
		if f.max <= math.MaxInt32 {
			limit = int32(f.max)
		}
		appendFeature(f.typ, limit)
	}

	// Features that are simply switched on or off.
	wantCropHints := ar.CropHints != nil
	toggled := []struct {
		typ     pb.Feature_Type
		enabled bool
	}{
		{pb.Feature_DOCUMENT_TEXT_DETECTION, ar.DocumentText},
		{pb.Feature_SAFE_SEARCH_DETECTION, ar.SafeSearch},
		{pb.Feature_IMAGE_PROPERTIES, ar.ImageProps},
		{pb.Feature_WEB_DETECTION, ar.Web},
		{pb.Feature_CROP_HINTS, wantCropHints},
	}
	for _, f := range toggled {
		if f.enabled {
			appendFeature(f.typ, 0)
		}
	}

	if wantCropHints {
		if ictx == nil {
			ictx = &pb.ImageContext{}
		}
		ictx.CropHintsParams = &pb.CropHintsParams{
			AspectRatios: ar.CropHints.AspectRatios,
		}
	}

	return &pb.AnnotateImageRequest{
		Image:        img,
		Features:     features,
		ImageContext: ictx,
	}
}
// CropHintsParams are parameters for a request for crop hints.
type CropHintsParams struct {
// AspectRatios lists the aspect ratios for desired crop hints, each
// representing the ratio of the width to the height of the image. For
// example, if the desired aspect ratio is 4:3, the corresponding float
// value should be 1.33333. If not specified, the best possible crop is
// returned. The number of provided aspect ratios is limited to a maximum
// of 16; any aspect ratios provided after the 16th are ignored.
AspectRatios []float32
}
// annotateOne annotates a single image with a single feature.
//
// It sends req through Annotate and returns the first (and only expected)
// Annotations value. If Annotate fails, its error is returned. If the reply
// contains no usable Annotations, an error is returned instead of panicking
// on an out-of-range index or nil dereference.
func (c *Client) annotateOne(ctx context.Context, req *AnnotateRequest) (*Annotations, error) {
	annsSlice, err := c.Annotate(ctx, req)
	if err != nil {
		return nil, err
	}
	// The service is expected to return one response per request, but the
	// response list comes from the network, so do not index it blindly.
	if len(annsSlice) == 0 || annsSlice[0] == nil {
		return nil, errors.New("vision: no annotations returned for request")
	}
	anns := annsSlice[0]
	// When there is only one image and one feature, the Annotations.Error field is
	// unambiguously about that one detection, so we "promote" it to the error return value.
	return anns, anns.Error
}
// TODO(jba): add examples for all single-feature functions (below).
// DetectFaces runs face detection on img and returns the face annotations.
// maxResults is sent as the maximum number of faces to detect; it must be
// greater than zero for face detection to be requested.
func (c *Client) DetectFaces(ctx context.Context, img *Image, maxResults int) ([]*FaceAnnotation, error) {
	req := &AnnotateRequest{Image: img, MaxFaces: maxResults}
	result, err := c.annotateOne(ctx, req)
	if err != nil {
		return nil, err
	}
	return result.Faces, nil
}
// DetectLandmarks runs landmark detection on img and returns the landmark
// annotations. maxResults is sent as the maximum number of landmarks to
// detect; it must be greater than zero for landmark detection to be requested.
func (c *Client) DetectLandmarks(ctx context.Context, img *Image, maxResults int) ([]*EntityAnnotation, error) {
	req := &AnnotateRequest{Image: img, MaxLandmarks: maxResults}
	result, err := c.annotateOne(ctx, req)
	if err != nil {
		return nil, err
	}
	return result.Landmarks, nil
}
// DetectLogos runs logo detection on img and returns the logo annotations.
// maxResults is sent as the maximum number of logos to detect; it must be
// greater than zero for logo detection to be requested.
func (c *Client) DetectLogos(ctx context.Context, img *Image, maxResults int) ([]*EntityAnnotation, error) {
	req := &AnnotateRequest{Image: img, MaxLogos: maxResults}
	result, err := c.annotateOne(ctx, req)
	if err != nil {
		return nil, err
	}
	return result.Logos, nil
}
// DetectLabels runs label detection on img and returns the label annotations.
// maxResults is sent as the maximum number of labels to detect; it must be
// greater than zero for label detection to be requested.
func (c *Client) DetectLabels(ctx context.Context, img *Image, maxResults int) ([]*EntityAnnotation, error) {
	req := &AnnotateRequest{Image: img, MaxLabels: maxResults}
	result, err := c.annotateOne(ctx, req)
	if err != nil {
		return nil, err
	}
	return result.Labels, nil
}
// DetectTexts runs text detection on img and returns the text annotations.
// maxResults is sent as the maximum number of separate pieces of text to
// detect; it must be greater than zero for text detection to be requested.
func (c *Client) DetectTexts(ctx context.Context, img *Image, maxResults int) ([]*EntityAnnotation, error) {
	req := &AnnotateRequest{Image: img, MaxTexts: maxResults}
	result, err := c.annotateOne(ctx, req)
	if err != nil {
		return nil, err
	}
	return result.Texts, nil
}
// DetectDocumentText runs full text (OCR) detection on img and returns the
// resulting full-text annotation.
func (c *Client) DetectDocumentText(ctx context.Context, img *Image) (*TextAnnotation, error) {
	req := &AnnotateRequest{Image: img, DocumentText: true}
	result, err := c.annotateOne(ctx, req)
	if err != nil {
		return nil, err
	}
	return result.FullText, nil
}
// DetectSafeSearch runs safe-search detection on img and returns the
// safe-search annotation.
func (c *Client) DetectSafeSearch(ctx context.Context, img *Image) (*SafeSearchAnnotation, error) {
	req := &AnnotateRequest{Image: img, SafeSearch: true}
	result, err := c.annotateOne(ctx, req)
	if err != nil {
		return nil, err
	}
	return result.SafeSearch, nil
}
// DetectImageProps requests the image-properties feature for img and returns
// the computed properties.
func (c *Client) DetectImageProps(ctx context.Context, img *Image) (*ImageProps, error) {
	req := &AnnotateRequest{Image: img, ImageProps: true}
	result, err := c.annotateOne(ctx, req)
	if err != nil {
		return nil, err
	}
	return result.ImageProps, nil
}
// DetectWeb requests web detection for img and returns the web annotation.
func (c *Client) DetectWeb(ctx context.Context, img *Image) (*WebDetection, error) {
	req := &AnnotateRequest{Image: img, Web: true}
	result, err := c.annotateOne(ctx, req)
	if err != nil {
		return nil, err
	}
	return result.Web, nil
}
// CropHints computes crop hints for img using params and returns them.
// A nil params is accepted and treated as an empty CropHintsParams.
func (c *Client) CropHints(ctx context.Context, img *Image, params *CropHintsParams) ([]*CropHint, error) {
	req := &AnnotateRequest{Image: img, CropHints: params}
	// In an AnnotateRequest a nil CropHints disables the feature. The caller
	// of this method explicitly wants crop hints, so substitute empty
	// parameters rather than silently requesting nothing.
	if req.CropHints == nil {
		req.CropHints = &CropHintsParams{}
	}

	result, err := c.annotateOne(ctx, req)
	if err != nil {
		return nil, err
	}
	return result.CropHints, nil
}
// A Likelihood is an approximate representation of a probability.
// Its values are the numeric values of the corresponding pb.Likelihood
// enum constants.
type Likelihood int
// Likelihood values, each converted from the pb.Likelihood constant of the
// matching name.
const (
// LikelihoodUnknown means the likelihood is unknown.
LikelihoodUnknown = Likelihood(pb.Likelihood_UNKNOWN)
// VeryUnlikely means the image is very unlikely to belong to the feature specified.
VeryUnlikely = Likelihood(pb.Likelihood_VERY_UNLIKELY)
// Unlikely means the image is unlikely to belong to the feature specified.
Unlikely = Likelihood(pb.Likelihood_UNLIKELY)
// Possible means the image possibly belongs to the feature specified.
Possible = Likelihood(pb.Likelihood_POSSIBLE)
// Likely means the image is likely to belong to the feature specified.
Likely = Likelihood(pb.Likelihood_LIKELY)
// VeryLikely means the image is very likely to belong to the feature specified.
VeryLikely = Likelihood(pb.Likelihood_VERY_LIKELY)
)
// A Property is an arbitrary name-value pair.
type Property struct {
// Name is the property's name.
Name string
// Value is the property's value, as a string.
Value string
}
// propertyFromProto copies the name and value of a pb.Property into a
// Property. p must be non-nil.
func propertyFromProto(p *pb.Property) Property {
	var prop Property
	prop.Name = p.Name
	prop.Value = p.Value
	return prop
}
// ColorInfo consists of a color, its score, and the fraction of the image
// that the color occupies.
type ColorInfo struct {
// Color holds the red, green, blue and alpha components of the color as
// a non-alpha-premultiplied 64-bit color.
Color color.NRGBA64
// Score is the image-specific score for this color, in the range [0, 1].
Score float32
// PixelFraction is the fraction of pixels the color occupies in the image,
// in the range [0, 1].
PixelFraction float32
}
// colorInfoFromProto builds a ColorInfo from its proto form: the color is
// converted with colorFromProto, and the score and pixel fraction are copied
// as-is. ci must be non-nil.
func colorInfoFromProto(ci *pb.ColorInfo) *ColorInfo {
	info := new(ColorInfo)
	info.Color = colorFromProto(ci.Color)
	info.Score = ci.Score
	info.PixelFraction = ci.PixelFraction
	return info
}
// colorFromProto converts a cpb.Color into a color.NRGBA64.
//
// Each channel is expected to lie in [0.0, 1.0] and is scaled to the uint16
// range, rounding to nearest. Channels outside that range are clamped to the
// nearest bound and NaN is treated as 0, because converting an
// unrepresentable float to an integer yields an implementation-dependent
// value in Go. If the proto has no alpha wrapper, the color is fully opaque.
// A nil c yields the zero color.NRGBA64 instead of panicking.
//
// Should this go into protobuf/ptypes? The color proto is in google/types, so
// not specific to this API.
func colorFromProto(c *cpb.Color) color.NRGBA64 {
	// A message-typed proto field may be unset; do not dereference it.
	if c == nil {
		return color.NRGBA64{}
	}

	// Convert a color component from [0.0, 1.0] to a uint16.
	cvt := func(f float32) uint16 {
		switch {
		case f >= 1:
			return math.MaxUint16
		case f > 0:
			return uint16(f*math.MaxUint16 + 0.5)
		default:
			// f <= 0, or f is NaN (every comparison above is false).
			return 0
		}
	}

	var alpha float32 = 1
	if c.Alpha != nil {
		alpha = c.Alpha.Value
	}
	return color.NRGBA64{
		R: cvt(c.Red),
		G: cvt(c.Green),
		B: cvt(c.Blue),
		A: cvt(alpha),
	}
}