Skip to content

Commit b8e1177

Browse files
authored
[Feature][Connector-V2] Support use EasyExcel as read excel engine (#8064)
1 parent 37612d9 commit b8e1177

File tree

20 files changed

+897
-158
lines changed

20 files changed

+897
-158
lines changed

docs/en/connector-v2/sink/LocalFile.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ By default, we use 2PC commit to ensure `exactly-once`
3333

3434
## Options
3535

36-
| Name | Type | Required | Default | Description |
36+
| Name | Type | Required | Default | Description |
3737
|---------------------------------------|---------|----------|--------------------------------------------|---------------------------------------------------------------------------------------------------|
3838
| path | string | yes | - | |
3939
| tmp_path | string | no | /tmp/seatunnel | The result file will write to a tmp path first and then use `mv` to submit tmp dir to target dir. |

docs/en/connector-v2/source/LocalFile.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you
5656
| skip_header_row_number | long | no | 0 |
5757
| schema | config | no | - |
5858
| sheet_name | string | no | - |
59+
| excel_engine | string | no | POI | |
5960
| xml_row_tag | string | no | - |
6061
| xml_use_attr_format | boolean | no | - |
6162
| file_filter_pattern | string | no | |
@@ -239,6 +240,16 @@ Only need to be configured when file_format is excel.
239240

240241
The sheet of the workbook to read.
241242

243+
### excel_engine [string]
244+
245+
Only need to be configured when file_format is excel.
246+
247+
The following excel engines are supported:
248+
`POI` `EasyExcel`
249+
250+
The default Excel reading engine is POI. However, POI can easily run out of memory when reading Excel files with more than 65,000 rows; in that case you can switch to EasyExcel as the reading engine.
251+
252+
242253
### xml_row_tag [string]
243254

244255
Only need to be configured when file_format is xml.

seatunnel-common/src/main/java/org/apache/seatunnel/common/utils/DateTimeUtils.java

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,18 @@ public class DateTimeUtils {
5656
FORMATTER_MAP.put(
5757
Formatter.YYYY_MM_DD_HH_MM_SS_SLASH,
5858
DateTimeFormatter.ofPattern(Formatter.YYYY_MM_DD_HH_MM_SS_SLASH.value));
59+
FORMATTER_MAP.put(
60+
Formatter.YYYY_M_D_HH_MM_SS_SLASH,
61+
DateTimeFormatter.ofPattern(Formatter.YYYY_M_D_HH_MM_SS_SLASH.value));
62+
FORMATTER_MAP.put(
63+
Formatter.YYYY_M_D_HH_MM_SS_ISO8601,
64+
DateTimeFormatter.ofPattern(Formatter.YYYY_M_D_HH_MM_SS_ISO8601.value));
65+
FORMATTER_MAP.put(
66+
Formatter.YYYY_M_D_HH_MM_SLASH,
67+
DateTimeFormatter.ofPattern(Formatter.YYYY_M_D_HH_MM_SLASH.value));
68+
FORMATTER_MAP.put(
69+
Formatter.YYYY_M_D_HH_MM_ISO8601,
70+
DateTimeFormatter.ofPattern(Formatter.YYYY_M_D_HH_MM_ISO8601.value));
5971
FORMATTER_MAP.put(
6072
Formatter.YYYY_MM_DD_HH_MM_SS_NO_SPLIT,
6173
DateTimeFormatter.ofPattern(Formatter.YYYY_MM_DD_HH_MM_SS_NO_SPLIT.value));
@@ -73,9 +85,26 @@ public class DateTimeUtils {
7385
DateTimeFormatter.ofPattern(Formatter.YYYY_MM_DD_HH_MM_SS_SSSSSSSSS_ISO8601.value));
7486
}
7587

88+
// if the datetime string length is 17 or 18 (one-digit month and/or day), find the DateTimeFormatter from this map
89+
public static final Map<Pattern, DateTimeFormatter> YYYY_M_D_HH_MM_SS_17_FORMATTER_MAP =
90+
new LinkedHashMap<>();
91+
92+
// if the datetime string length is 14, 15 or 16 (no seconds, one- or two-digit month/day), find the DateTimeFormatter from this map
93+
public static final Map<Pattern, DateTimeFormatter> YYYY_M_D_HH_MM_15_FORMATTER_MAP =
94+
new LinkedHashMap<>();
95+
96+
// entries (Pattern -> DateTimeFormatter) copied from YYYY_M_D_HH_MM_SS_17_FORMATTER_MAP, iterated when matching
97+
public static Set<Map.Entry<Pattern, DateTimeFormatter>>
98+
YYYY_M_D_HH_MM_SS_17_FORMATTER_MAP_ENTRY_SET = new LinkedHashSet<>();
99+
100+
// entries (Pattern -> DateTimeFormatter) copied from YYYY_M_D_HH_MM_15_FORMATTER_MAP, iterated when matching
101+
public static Set<Map.Entry<Pattern, DateTimeFormatter>>
102+
YYYY_M_D_HH_MM_15_FORMATTER_MAP_ENTRY_SET = new LinkedHashSet<>();
103+
76104
// if the datetime string length is 19, find the DateTimeFormatter from this map
77105
public static final Map<Pattern, DateTimeFormatter> YYYY_MM_DD_HH_MM_SS_19_FORMATTER_MAP =
78106
new LinkedHashMap<>();
107+
79108
public static Set<Map.Entry<Pattern, DateTimeFormatter>>
80109
YYYY_MM_DD_HH_MM_SS_19_FORMATTER_MAP_ENTRY_SET = new LinkedHashSet<>();
81110

@@ -115,6 +144,22 @@ public class DateTimeUtils {
115144
Pattern.compile("\\d{4}/\\d{2}/\\d{2}\\s\\d{2}:\\d{2}:\\d{2}"),
116145
DateTimeFormatter.ofPattern(Formatter.YYYY_MM_DD_HH_MM_SS_SLASH.value));
117146

147+
YYYY_M_D_HH_MM_15_FORMATTER_MAP.put(
148+
Pattern.compile("\\d{4}/\\d{1,2}/\\d{1,2}\\s\\d{2}:\\d{2}"),
149+
DateTimeFormatter.ofPattern(Formatter.YYYY_M_D_HH_MM_SLASH.value));
150+
151+
YYYY_M_D_HH_MM_15_FORMATTER_MAP.put(
152+
Pattern.compile("\\d{4}-\\d{1,2}-\\d{1,2}\\s\\d{2}:\\d{2}"),
153+
DateTimeFormatter.ofPattern(Formatter.YYYY_M_D_HH_MM_ISO8601.value));
154+
155+
YYYY_M_D_HH_MM_SS_17_FORMATTER_MAP.put(
156+
Pattern.compile("\\d{4}/\\d{1,2}/\\d{1,2}\\s\\d{2}:\\d{2}:\\d{2}"),
157+
DateTimeFormatter.ofPattern(Formatter.YYYY_M_D_HH_MM_SS_SLASH.value));
158+
159+
YYYY_M_D_HH_MM_SS_17_FORMATTER_MAP.put(
160+
Pattern.compile("\\d{4}-\\d{1,2}-\\d{1,2}\\s\\d{2}:\\d{2}:\\d{2}"),
161+
DateTimeFormatter.ofPattern(Formatter.YYYY_M_D_HH_MM_SS_ISO8601.value));
162+
118163
YYYY_MM_DD_HH_MM_SS_M19_FORMATTER_MAP.put(
119164
Pattern.compile("\\d{4}/\\d{2}/\\d{2}\\s\\d{2}:\\d{2}.*"),
120165
new DateTimeFormatterBuilder()
@@ -159,6 +204,12 @@ public class DateTimeUtils {
159204
YYYY_MM_DD_HH_MM_SS_19_FORMATTER_MAP.entrySet());
160205
YYYY_MM_DD_HH_MM_SS_M19_FORMATTER_MAP_ENTRY_SET.addAll(
161206
YYYY_MM_DD_HH_MM_SS_M19_FORMATTER_MAP.entrySet());
207+
208+
YYYY_M_D_HH_MM_SS_17_FORMATTER_MAP_ENTRY_SET.addAll(
209+
YYYY_M_D_HH_MM_SS_17_FORMATTER_MAP.entrySet());
210+
211+
YYYY_M_D_HH_MM_15_FORMATTER_MAP_ENTRY_SET.addAll(
212+
YYYY_M_D_HH_MM_15_FORMATTER_MAP.entrySet());
162213
}
163214

164215
/**
@@ -176,14 +227,40 @@ public static DateTimeFormatter matchDateTimeFormatter(String dateTime) {
176227
return entry.getValue();
177228
}
178229
}
230+
for (Map.Entry<Pattern, DateTimeFormatter> entry :
231+
YYYY_M_D_HH_MM_SS_17_FORMATTER_MAP_ENTRY_SET) {
232+
if (entry.getKey().matcher(dateTime).matches()) {
233+
return entry.getValue();
234+
}
235+
}
179236
} else if (dateTime.length() > 19) {
180237
for (Map.Entry<Pattern, DateTimeFormatter> entry :
181238
YYYY_MM_DD_HH_MM_SS_M19_FORMATTER_MAP_ENTRY_SET) {
182239
if (entry.getKey().matcher(dateTime).matches()) {
183240
return entry.getValue();
184241
}
185242
}
243+
} else if (dateTime.length() == 17 || dateTime.length() == 18) {
244+
for (Map.Entry<Pattern, DateTimeFormatter> entry :
245+
YYYY_M_D_HH_MM_SS_17_FORMATTER_MAP_ENTRY_SET) {
246+
if (entry.getKey().matcher(dateTime).matches()) {
247+
return entry.getValue();
248+
}
249+
}
250+
} else if (dateTime.length() == 15 || dateTime.length() == 16) {
251+
for (Map.Entry<Pattern, DateTimeFormatter> entry :
252+
YYYY_M_D_HH_MM_15_FORMATTER_MAP_ENTRY_SET) {
253+
if (entry.getKey().matcher(dateTime).matches()) {
254+
return entry.getValue();
255+
}
256+
}
186257
} else if (dateTime.length() == 14) {
258+
for (Map.Entry<Pattern, DateTimeFormatter> entry :
259+
YYYY_M_D_HH_MM_15_FORMATTER_MAP_ENTRY_SET) {
260+
if (entry.getKey().matcher(dateTime).matches()) {
261+
return entry.getValue();
262+
}
263+
}
187264
return YYYY_MM_DD_HH_MM_SS_14_FORMATTER;
188265
}
189266
return null;
@@ -247,6 +324,10 @@ public enum Formatter {
247324
YYYY_MM_DD_HH_MM_SS_SSSSSS("yyyy-MM-dd HH:mm:ss.SSSSSS"),
248325
YYYY_MM_DD_HH_MM_SS_SPOT("yyyy.MM.dd HH:mm:ss"),
249326
YYYY_MM_DD_HH_MM_SS_SLASH("yyyy/MM/dd HH:mm:ss"),
327+
YYYY_M_D_HH_MM_SLASH("yyyy/M/d HH:mm"),
328+
YYYY_M_D_HH_MM_ISO8601("yyyy-M-d HH:mm"),
329+
YYYY_M_D_HH_MM_SS_SLASH("yyyy/M/d HH:mm:ss"),
330+
YYYY_M_D_HH_MM_SS_ISO8601("yyyy-M-d HH:mm:ss"),
250331
YYYY_MM_DD_HH_MM_SS_NO_SPLIT("yyyyMMddHHmmss"),
251332
YYYY_MM_DD_HH_MM_SS_ISO8601("yyyy-MM-dd'T'HH:mm:ss"),
252333
YYYY_MM_DD_HH_MM_SS_SSS_ISO8601("yyyy-MM-dd'T'HH:mm:ss.SSS"),

seatunnel-common/src/main/java/org/apache/seatunnel/common/utils/DateUtils.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ public class DateUtils {
6060
Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}(\\.\\d{1,9})?Z"),
6161
Pattern.compile("\\d{2}:\\d{2}:\\d{2}\\+\\d{2}:\\d{2}"),
6262
Pattern.compile("\\d{2}:\\d{2}:\\d{2}(\\.\\d{1,9})?"),
63+
Pattern.compile("\\d{4}/\\d{1,2}/\\d{1,2}")
6364
};
6465

6566
public static final Map<Pattern, DateTimeFormatter> DATE_FORMATTER_MAP = new HashMap();
@@ -147,6 +148,12 @@ public class DateUtils {
147148
.toFormatter());
148149
DATE_FORMATTER_MAP.put(PATTERN_ARRAY[6], ISO_OFFSET_TIME);
149150
DATE_FORMATTER_MAP.put(PATTERN_ARRAY[7], ISO_LOCAL_TIME);
151+
DATE_FORMATTER_MAP.put(
152+
PATTERN_ARRAY[8],
153+
new DateTimeFormatterBuilder()
154+
.parseCaseInsensitive()
155+
.append(DateTimeFormatter.ofPattern("yyyy/M/d"))
156+
.toFormatter());
150157
}
151158

152159
/**
@@ -184,6 +191,7 @@ public static String toString(LocalDate date, Formatter formatter) {
184191

185192
public enum Formatter {
186193
YYYY_MM_DD("yyyy-MM-dd"),
194+
YYYY_M_D("yyyy/M/d"),
187195
YYYY_MM_DD_SPOT("yyyy.MM.dd"),
188196
YYYY_MM_DD_SLASH("yyyy/MM/dd");
189197
private final String value;

seatunnel-common/src/main/java/org/apache/seatunnel/common/utils/TimeUtils.java

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import java.time.format.DateTimeFormatter;
2222
import java.util.HashMap;
2323
import java.util.Map;
24+
import java.util.regex.Pattern;
2425

2526
public class TimeUtils {
2627
private static final Map<Formatter, DateTimeFormatter> FORMATTER_MAP =
@@ -37,6 +38,29 @@ public static LocalTime parse(String time, Formatter formatter) {
3738
return LocalTime.parse(time, FORMATTER_MAP.get(formatter));
3839
}
3940

41+
public static final Pattern[] PATTERN_ARRAY =
42+
new Pattern[] {
43+
Pattern.compile("\\d{2}:\\d{2}:\\d{2}"),
44+
Pattern.compile("\\d{2}:\\d{2}:\\d{2}.\\d{3}"),
45+
};
46+
47+
public static Formatter matchTimeFormatter(String dateTime) {
48+
for (int j = 0; j < PATTERN_ARRAY.length; j++) {
49+
if (PATTERN_ARRAY[j].matcher(dateTime).matches()) {
50+
Formatter dateTimeFormatter = Time_FORMATTER_MAP.get(PATTERN_ARRAY[j]);
51+
return dateTimeFormatter;
52+
}
53+
}
54+
return null;
55+
}
56+
57+
/**
 * Maps each pattern in {@code PATTERN_ARRAY} to the {@code Formatter} used for strings of
 * that shape. Populated once in the static initializer below.
 */
public static final Map<Pattern, Formatter> Time_FORMATTER_MAP = new HashMap<>();

static {
    // Keys are the Pattern instances from PATTERN_ARRAY, so lookups must use those same objects.
    Time_FORMATTER_MAP.put(PATTERN_ARRAY[0], Formatter.parse(Formatter.HH_MM_SS.value));
    Time_FORMATTER_MAP.put(PATTERN_ARRAY[1], Formatter.parse(Formatter.HH_MM_SS_SSS.value));
}
63+
4064
public static String toString(LocalTime time, Formatter formatter) {
4165
return time.format(FORMATTER_MAP.get(formatter));
4266
}

seatunnel-common/src/test/java/org/apache/seatunnel/common/utils/DateTimeUtilsTest.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,15 @@ public void testAutoDateTimeFormatter() {
6767
datetimeStr = "2020/10/10 10:10:10";
6868
Assertions.assertEquals("2020-10-10T10:10:10", DateTimeUtils.parse(datetimeStr).toString());
6969

70+
datetimeStr = "2020/1/1 10:10";
71+
Assertions.assertEquals("2020-01-01T10:10", DateTimeUtils.parse(datetimeStr).toString());
72+
73+
datetimeStr = "2024/12/2 10:10";
74+
Assertions.assertEquals("2024-12-02T10:10", DateTimeUtils.parse(datetimeStr).toString());
75+
76+
datetimeStr = "2024/12/1 10:10";
77+
Assertions.assertEquals("2024-12-01T10:10", DateTimeUtils.parse(datetimeStr).toString());
78+
7079
datetimeStr = "2020年10月10日 10时10分10秒";
7180
Assertions.assertEquals("2020-10-10T10:10:10", DateTimeUtils.parse(datetimeStr).toString());
7281

seatunnel-common/src/test/java/org/apache/seatunnel/common/utils/DateUtilsTest.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,5 +66,17 @@ public void testMatchDateTimeFormatter() {
6666
Assertions.assertEquals(
6767
"2020-10-10",
6868
DateUtils.parse(datetimeStr, DateUtils.matchDateFormatter(datetimeStr)).toString());
69+
datetimeStr = "2024/1/1";
70+
Assertions.assertEquals(
71+
"2024-01-01",
72+
DateUtils.parse(datetimeStr, DateUtils.matchDateFormatter(datetimeStr)).toString());
73+
datetimeStr = "2024/10/1";
74+
Assertions.assertEquals(
75+
"2024-10-01",
76+
DateUtils.parse(datetimeStr, DateUtils.matchDateFormatter(datetimeStr)).toString());
77+
datetimeStr = "2024/1/10";
78+
Assertions.assertEquals(
79+
"2024-01-10",
80+
DateUtils.parse(datetimeStr, DateUtils.matchDateFormatter(datetimeStr)).toString());
6981
}
7082
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.seatunnel.common.utils;
19+
20+
import org.junit.jupiter.api.Assertions;
21+
import org.junit.jupiter.api.Test;
22+
23+
public class TimeUtilsTest {
24+
@Test
25+
public void testMatchTimeFormatter() {
26+
String timeStr = "12:12:12";
27+
Assertions.assertEquals(
28+
"12:12:12",
29+
TimeUtils.parse(timeStr, TimeUtils.matchTimeFormatter(timeStr)).toString());
30+
31+
timeStr = "12:12:12.123";
32+
Assertions.assertEquals(
33+
"12:12:12.123",
34+
TimeUtils.parse(timeStr, TimeUtils.matchTimeFormatter(timeStr)).toString());
35+
}
36+
}

seatunnel-connectors-v2/connector-file/connector-file-base/pom.xml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@
3939
<hadoop-minikdc.version>3.1.4</hadoop-minikdc.version>
4040
<dom4j.version>2.1.4</dom4j.version>
4141
<jaxen.version>2.0.0</jaxen.version>
42+
<easyexcel.version>4.0.3</easyexcel.version>
43+
<fastexcel-reader.version>0.18.4</fastexcel-reader.version>
4244
</properties>
4345

4446
<dependencyManagement>
@@ -158,6 +160,13 @@
158160
<artifactId>jaxen</artifactId>
159161
<version>${jaxen.version}</version>
160162
</dependency>
163+
164+
<dependency>
165+
<groupId>com.alibaba</groupId>
166+
<artifactId>easyexcel</artifactId>
167+
<version>${easyexcel.version}</version>
168+
</dependency>
169+
161170
</dependencies>
162171

163172
<build>

seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/BaseSourceConfigOptions.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,12 @@ public class BaseSourceConfigOptions {
140140
.noDefaultValue()
141141
.withDescription("To be read sheet name,only valid for excel files");
142142

143+
public static final Option<ExcelEngine> EXCEL_ENGINE =
144+
Options.key("excel_engine")
145+
.enumType(ExcelEngine.class)
146+
.defaultValue(ExcelEngine.POI)
147+
.withDescription("To switch excel read engine, e.g. POI , EasyExcel");
148+
143149
public static final Option<String> XML_ROW_TAG =
144150
Options.key("xml_row_tag")
145151
.stringType()

0 commit comments

Comments
 (0)