You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
use std::{fs::File, path::Path, sync::Arc};use parquet::{
basic::Compression,
data_type::{ByteArray,ByteArrayType,Int32Type},
file::{
properties::{WriterProperties,WriterVersion},
reader::FileReader,
serialized_reader::SerializedFileReader,
writer::SerializedFileWriter,},
record::{Row,RowAccessor},
schema::parser::parse_message_type,};constMESSAGE_TYPE:&'static str = "message Log { OPTIONAL INT32 eventType; REPEATED BYTE_ARRAY category;}";pubstructItem{pubevent_type:i32,pubcategory:Vec<String>,}pubstructBatch{pubevent_types:Vec<i32>,pubcategories:Vec<ByteArray>,}fndata() -> Batch{let items = vec![Item{
event_type: 1,
category: vec!["test11".to_string(), "test12".to_string()],
},
Item{
event_type: 2,
category: vec!["test21".to_string(), "test22".to_string()],
},
];letmut b = Batch{event_types:vec![],categories:vec![],};for item in&items {
b.event_types.push(item.event_type);for cate in&item.category{
b.categories.push(ByteArray::from(cate.as_str()));}}
b
}fnwrite(){let path = Path::new("sample.parquet");let file = File::create(&path).unwrap();let schema = Arc::new(parse_message_type(MESSAGE_TYPE).unwrap());let props = Arc::new(WriterProperties::builder().set_compression(Compression::SNAPPY).set_writer_version(WriterVersion::PARQUET_2_0).build(),);letmut writer = SerializedFileWriter::new(file, schema, props).unwrap();letmut row_group_writer = writer.next_row_group().unwrap();let batch = data();// column 0letmut col_writer = row_group_writer
.next_column().expect("next column").unwrap();
col_writer
.typed::<Int32Type>().write_batch(&batch.event_types,None,None).expect("writing column");
col_writer.close().expect("close column");// question1 column 1 how write REPEATED?letmut col_writer = row_group_writer
.next_column().expect("next column").unwrap();
col_writer
.typed::<ByteArrayType>().write_batch(&batch.categories,None,None).expect("writing column");
col_writer.close().expect("close column");let rg_md = row_group_writer.close().expect("close row group");println!("total rows written: {}", rg_md.num_rows());
writer.close().unwrap();}fnread(){let path = Path::new("sample.parquet");let file = File::open(path).expect("Unable to open file");let reader = SerializedFileReader::new(file).expect("Unable to read file");let iter = reader.get_row_iter(None).expect("get iterator");for record in iter {let event_type = record.get_int(0).unwrap();read_category(&record,1);println!("event_type{}", event_type);}}// public static List<String> getCategory(Group value) {// List<String> categoryList = new ArrayList<>();// try {// int count = value.getFieldRepetitionCount("category");// if (count > 0) {// int index = 0;// while (index < count) {// categoryList.add(value.getString("category", index++).trim());// }// }// } catch (Exception e) {// }// return categoryList;// }fnread_category(record:&Row,i:usize){// question2 where is getFieldRepetitionCount, how to read REPEATED?match record.get_bytes(i){Ok(v) => println!("{:?}", v.as_utf8()),Err(_) => {}};}
Describe your question
How do I read and write the repeated `category` field using parquet?
Question 1: how do I write column 1, which is REPEATED?
Question 2: where is the equivalent of `getFieldRepetitionCount`, and how do I read a REPEATED field?
Additional context
Add any other context about the problem here.
The text was updated successfully, but these errors were encountered:
Hi, I'm not very familiar with parquet-mr which your example appears to be based on, nor am I hugely knowledgeable about the record APIs for reading parquet, but I'll try to help out here 😅
FWIW I would strongly encourage you to consider trying out the arrow interface, it should be faster, better tested and better documented than the record APIs which are somewhat orphaned at the moment...
Which part is this question about
Describe your question
How do I read and write the repeated `category` field using parquet?
Question 1: how do I write column 1, which is REPEATED?
Question 2: where is the equivalent of `getFieldRepetitionCount`, and how do I read a REPEATED field?
Additional context
Add any other context about the problem here.
The text was updated successfully, but these errors were encountered: