Skip to content

Commit

Permalink
fix(spark): Fix the LdbcSample2GraphAr result and use csv as output (#528)
Browse files Browse the repository at this point in the history



Signed-off-by: acezen <qiaozi.zwb@alibaba-inc.com>
  • Loading branch information
acezen committed Jun 20, 2024
1 parent 8d5111f commit d6ce836
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -92,23 +92,14 @@ object LdbcSample2GraphAr {
writer.PutVertexData("Person", person_df)

// read edges with type "Person"->"Knows"->"Person" from given path as a DataFrame
// FIXME(@acezen): the schema should be inferred from the data, but graphar spark
// library does not support timestamp type yet
val schema = StructType(
Array(
StructField("src", IntegerType, true),
StructField("dst", IntegerType, true),
StructField("creationDate", StringType, true)
)
)
val produced_edge_df = spark.read
val knows_edge_df = spark.read
.option("delimiter", "|")
.option("header", "true")
.schema(schema)
.option("inferSchema", "true")
.format("csv")
.load(personKnowsPersonInputPath)
// put into writer, source vertex label is "Person", edge label is "Knows"
// target vertex label is "Person"
writer.PutEdgeData(("Person", "Knows", "Person"), produced_edge_df)
writer.PutEdgeData(("Person", "Knows", "Person"), knows_edge_df)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,12 @@ object Utils {
def sparkDataType2GraphArTypeName(dataType: DataType): String = {
val typeName = dataType.typeName
val grapharTypeName = typeName match {
case "string" => "string"
case "integer" => "int"
case "long" => "int64"
case "double" => "double"
case "boolean" => "bool"
case "string" => "string"
case "integer" => "int"
case "long" => "int64"
case "double" => "double"
case "boolean" => "bool"
case "timestamp" => "timestamp"
case _ =>
throw new IllegalArgumentException(
"Expected string, integral, double or boolean type, got " + typeName + " type"
Expand Down
2 changes: 1 addition & 1 deletion maven-projects/spark/scripts/run-ldbc-sample2graphar.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,6 @@ output_dir="/tmp/graphar/ldbc_sample"

vertex_chunk_size=100
edge_chunk_size=1024
file_type="parquet"
file_type="csv"
spark-submit --class org.apache.graphar.example.LdbcSample2GraphAr ${jar_file} \
${person_input_file} ${person_knows_person_input_file} ${output_dir} ${vertex_chunk_size} ${edge_chunk_size} ${file_type}

0 comments on commit d6ce836

Please sign in to comment.