-
Notifications
You must be signed in to change notification settings - Fork 0
/
ExcelDataStreams.jl
97 lines (79 loc) · 2.84 KB
/
ExcelDataStreams.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
module ExcelDataStreams
# package code goes here
using DataStreams
using Taro
using DataFrames
using DataArrays
export ExcelDataStream
initialized = false
type ExcelDataStream <: Data.Source
data::DataFrame
end
"""
`ExcelDataStream( filename::AbstractString; kwargs... )`
Constructor for type which gives access to an Excel file as a DataStream
### Arguments
* `filename::AbstractString` : the name of the Excel file to stream
### Keyword Arguments
* `columnindices::Vector{Int}` -- The indices of columns to use as fields. Defaults to empty.
* `names::Vector{Symbol}` -- The names to use for the fields. Defaults to empty.
* `eltypes::Vector{DataTypes}` -- The types expected for each field. Conversion will be attemped. Defaults to empty.
* `sheet::String` -- Which sheet of the Excel file contains the data of interest. Defaults to `sheet1`.
* `skipstart::Int` -- How many initial rows of the Excel file to skip. Defaults to `0`.
* `nastrings::Vector{String}` -- Strings to consider to be missing data. Defaults to empty.
"""
function ExcelDataStream(
filename::AbstractString,
;
columnindices::Vector{Int} = Int[],
names::Vector{Symbol} = String[],
eltypes::Vector{DataType} = DataType[],
sheet::String = "Sheet1",
skipstart::Int = 0,
nastrings::Vector{String} = String[],
)
global initialized
if !initialized
Taro.init()
initialized = true
end
excel = Workbook( filename )
sheetdata = getSheet( excel, sheet )
columns = [DataArray( eltype, 0 ) for eltype in eltypes]
done = false
rownum = skipstart
while !done
row = getRow( sheetdata, rownum )
done = isnull(row)
if !done
for i = 1:length(names)
cell = getCell( row, columnindices[i] )
cellvalue = getCellValue( cell )
if cellvalue == nothing || cellvalue in nastrings
push!( columns[i], missing )
else
push!( columns[i], cellvalue )
done = false
end
end
if done
# the last row is all missing
[pop!( column ) for column in columns]
end
rownum += 1
end
end
df = DataFrame()
for i = 1:length(names)
df[names[i]] = columns[i]
end
return ExcelDataStream( df )
end
Data.schema( ed::ExcelDataStream ) =
Data.Schema( eltypes(ed.data), string.(names(ed.data)), size(ed.data,1) )
Data.isdone( ed::ExcelDataStream, row::Int, col::Int ) = any((row,col).>size(ed.data))
Data.streamtype( ::Type{ExcelDataStream}, ::Type{Data.Field} ) = true
Data.streamfrom(ed::ExcelDataStream, ::Type{Data.Field}, ::Type{T}, row::Int, col::Int) where {T} =
T(ed.data[row,col])
Data.accesspattern( ::ExcelDataStream ) = Data.RandomAccess
end # module