forked from minio/minio
/
csv.go
339 lines (292 loc) · 9.67 KB
/
csv.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
/*
* Minio Cloud Storage, (C) 2018 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package csv
import (
"encoding/csv"
"encoding/xml"
"io"
"strconv"
"strings"
"github.com/tidwall/sjson"
"github.com/minio/minio/pkg/ioutil"
"github.com/minio/minio/pkg/s3select/format"
)
// Options options are passed to the underlying encoding/csv reader.
type Options struct {
// HasHeader when true, will treat the first row as a header row.
HasHeader bool
// RecordDelimiter is the string that records are delimited by.
RecordDelimiter string
// FieldDelimiter is the string that fields are delimited by.
FieldDelimiter string
// Comments is the string the first character of a line of
// text matches the comment character.
Comments string
// Name of the table that is used for querying
Name string
// ReadFrom is where the data will be read from.
ReadFrom io.Reader
// If true then we need to add gzip or bzip reader.
// to extract the csv.
Compressed string
// SQL expression meant to be evaluated.
Expression string
// Output CSV will be delimited by.
OutputFieldDelimiter string
// Output CSV record will be delimited by.
OutputRecordDelimiter string
// Size of incoming object
StreamSize int64
// Whether Header is "USE" or another
HeaderOpt bool
// Progress enabled, enable/disable progress messages.
Progress bool
// Output format type, supported values are CSV and JSON
OutputType format.Type
}
// cinput represents a record producing input from a formatted object.
type cinput struct {
options *Options
reader *csv.Reader
firstRow []string
header []string
minOutputLength int
stats struct {
BytesScanned int64
BytesReturned int64
BytesProcessed int64
}
}
// New sets up a new Input, the first row is read when this is run.
// If there is a problem with reading the first row, the error is returned.
// Otherwise, the returned reader can be reliably consumed with Read().
// until Read() return err.
func New(opts *Options) (format.Select, error) {
// DelimitedReader treats custom record delimiter like `\r\n`,`\r`,`ab` etc and replaces it with `\n`.
normalizedReader := ioutil.NewDelimitedReader(opts.ReadFrom, []rune(opts.RecordDelimiter))
reader := &cinput{
options: opts,
reader: csv.NewReader(normalizedReader),
}
reader.stats.BytesScanned = opts.StreamSize
reader.stats.BytesProcessed = 0
reader.stats.BytesReturned = 0
reader.firstRow = nil
reader.reader.FieldsPerRecord = -1
if reader.options.FieldDelimiter != "" {
reader.reader.Comma = rune(reader.options.FieldDelimiter[0])
}
if reader.options.Comments != "" {
reader.reader.Comment = rune(reader.options.Comments[0])
}
// QuoteCharacter - " (defaulted currently)
reader.reader.LazyQuotes = true
if err := reader.readHeader(); err != nil {
return nil, err
}
return reader, nil
}
// Replace the spaces in columnnames with underscores
func cleanHeader(columns []string) []string {
for i := range columns {
// Even if header name is specified, some CSV's
// might have column header names might be empty
// and non-empty. In such a scenario we prepare
// indexed value.
if columns[i] == "" {
columns[i] = "_" + strconv.Itoa(i)
}
columns[i] = strings.Replace(columns[i], " ", "_", -1)
}
return columns
}
// readHeader reads the header into the header variable if the header is present
// as the first row of the csv
func (reader *cinput) readHeader() error {
var readErr error
if reader.options.HasHeader {
reader.firstRow, readErr = reader.reader.Read()
if readErr != nil {
return format.ErrCSVParsingError
}
reader.header = cleanHeader(reader.firstRow)
reader.firstRow = nil
} else {
reader.firstRow, readErr = reader.reader.Read()
if readErr != nil {
return format.ErrCSVParsingError
}
reader.header = make([]string, len(reader.firstRow))
for i := range reader.firstRow {
reader.header[i] = "_" + strconv.Itoa(i)
}
}
reader.minOutputLength = len(reader.header)
return nil
}
// Progress - return true if progress was requested.
func (reader *cinput) Progress() bool {
return reader.options.Progress
}
// UpdateBytesProcessed - populates the bytes Processed
func (reader *cinput) UpdateBytesProcessed(size int64) {
reader.stats.BytesProcessed += size
}
// Read returns byte sequence
func (reader *cinput) Read() ([]byte, error) {
dec := reader.readRecord()
if dec != nil {
var data []byte
var err error
// Navigate column values in reverse order to preserve
// the input order for AWS S3 compatibility, because
// sjson adds json key/value pairs in first in last out
// fashion. This should be fixed in sjson ideally. Following
// work around is needed to circumvent this issue for now.
for i := len(dec) - 1; i >= 0; i-- {
data, err = sjson.SetBytes(data, reader.header[i], dec[i])
if err != nil {
return nil, err
}
}
return data, nil
}
return nil, nil
}
// OutputFieldDelimiter - returns the requested output field delimiter.
func (reader *cinput) OutputFieldDelimiter() string {
return reader.options.OutputFieldDelimiter
}
// OutputRecordDelimiter - returns the requested output record delimiter.
func (reader *cinput) OutputRecordDelimiter() string {
return reader.options.OutputRecordDelimiter
}
// HasHeader - returns true or false depending upon the header.
func (reader *cinput) HasHeader() bool {
return reader.options.HasHeader
}
// Expression - return the Select Expression for
func (reader *cinput) Expression() string {
return reader.options.Expression
}
// UpdateBytesReturned - updates the Bytes returned for
func (reader *cinput) UpdateBytesReturned(size int64) {
reader.stats.BytesReturned += size
}
// Header returns the header of the reader. Either the first row if a header
// set in the options, or c#, where # is the column number, starting with 0.
func (reader *cinput) Header() []string {
return reader.header
}
// readRecord reads a single record from the stream and it always returns successfully.
// If the record is empty, an empty []string is returned.
// Record expand to match the current row size, adding blank fields as needed.
// Records never return less then the number of fields in the first row.
// Returns nil on EOF
// In the event of a parse error due to an invalid record, it is logged, and
// an empty []string is returned with the number of fields in the first row,
// as if the record were empty.
//
// In general, this is a very tolerant of problems reader.
func (reader *cinput) readRecord() []string {
var row []string
var fileErr error
if reader.firstRow != nil {
row = reader.firstRow
reader.firstRow = nil
return row
}
row, fileErr = reader.reader.Read()
emptysToAppend := reader.minOutputLength - len(row)
if fileErr == io.EOF || fileErr == io.ErrClosedPipe {
return nil
} else if _, ok := fileErr.(*csv.ParseError); ok {
emptysToAppend = reader.minOutputLength
}
if emptysToAppend > 0 {
for counter := 0; counter < emptysToAppend; counter++ {
row = append(row, "")
}
}
return row
}
// CreateStatXML is the function which does the marshaling from the stat
// structs into XML so that the progress and stat message can be sent
func (reader *cinput) CreateStatXML() (string, error) {
if reader.options.Compressed == "NONE" {
reader.stats.BytesProcessed = reader.options.StreamSize
reader.stats.BytesScanned = reader.stats.BytesProcessed
}
out, err := xml.Marshal(&format.Stats{
BytesScanned: reader.stats.BytesScanned,
BytesProcessed: reader.stats.BytesProcessed,
BytesReturned: reader.stats.BytesReturned,
})
if err != nil {
return "", err
}
return xml.Header + string(out), nil
}
// CreateProgressXML is the function which does the marshaling from the progress
// structs into XML so that the progress and stat message can be sent
func (reader *cinput) CreateProgressXML() (string, error) {
if reader.options.HasHeader {
reader.stats.BytesProcessed += format.ProcessSize(reader.header)
}
if reader.options.Compressed == "NONE" {
reader.stats.BytesScanned = reader.stats.BytesProcessed
}
out, err := xml.Marshal(&format.Progress{
BytesScanned: reader.stats.BytesScanned,
BytesProcessed: reader.stats.BytesProcessed,
BytesReturned: reader.stats.BytesReturned,
})
if err != nil {
return "", err
}
return xml.Header + string(out), nil
}
// Type - return the data format type
func (reader *cinput) Type() format.Type {
return format.CSV
}
// OutputType - return the data format type
func (reader *cinput) OutputType() format.Type {
return reader.options.OutputType
}
// ColNameErrs is a function which makes sure that the headers are requested are
// present in the file otherwise it throws an error.
func (reader *cinput) ColNameErrs(columnNames []string) error {
for i := 0; i < len(columnNames); i++ {
if columnNames[i] == "" {
continue
}
if !format.IsInt(columnNames[i]) && !reader.options.HeaderOpt {
return format.ErrInvalidColumnIndex
}
if format.IsInt(columnNames[i]) {
tempInt, _ := strconv.Atoi(columnNames[i])
if tempInt > len(reader.Header()) || tempInt == 0 {
return format.ErrInvalidColumnIndex
}
} else {
if reader.options.HeaderOpt && !format.StringInSlice(columnNames[i], reader.Header()) {
return format.ErrParseInvalidPathComponent
}
}
}
return nil
}