Skip to content

Commit

Permalink
ARROW-7775: [Rust] fix: Don't let safe code arbitrarily transmute rea…
Browse files Browse the repository at this point in the history
…ders and writers

This code were quite clearly unsafe (which seems to be known, judging
from the comment about it). As the functions appeared to callable
indirectly from other public functions I changed these to just panic
instead to keep the API.

Closes #6256 from Marwes/unsafe_rust and squashes the following commits:

97e022a <Markus Westerlind> s/typ/type/g
9fe6ddf <Markus Westerlind> Remove unnecessary Send/Sync impls for RecordBatch
d484e17 <Markus Westerlind> Better error message for typed conversion
66ce5b1 <Markus Westerlind> fix: Don't let safe code arbitrarily transmute readers and writers

Authored-by: Markus Westerlind <markus.westerlind@distilnetworks.com>
Signed-off-by: Chao Sun <sunchao@apache.org>
  • Loading branch information
Markus Westerlind authored and sunchao committed Feb 15, 2020
1 parent 5411915 commit f41b863
Show file tree
Hide file tree
Showing 4 changed files with 139 additions and 61 deletions.
5 changes: 1 addition & 4 deletions rust/arrow/src/record_batch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ impl RecordBatch {
}
if columns[i].data_type() != schema.field(i).data_type() {
return Err(ArrowError::InvalidArgumentError(format!(
"column types must match schema types, expected {:?} but found {:?} at column index {}",
"column types must match schema types, expected {:?} but found {:?} at column index {}",
schema.field(i).data_type(),
columns[i].data_type(),
i)));
Expand Down Expand Up @@ -130,9 +130,6 @@ impl Into<StructArray> for RecordBatch {
}
}

unsafe impl Send for RecordBatch {}
unsafe impl Sync for RecordBatch {}

/// Definition of record batch reader.
pub trait RecordBatchReader {
/// Returns schemas of this record batch reader.
Expand Down
20 changes: 7 additions & 13 deletions rust/parquet/src/column/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
use std::{
cmp::{max, min},
collections::HashMap,
mem,
};

use super::page::{Page, PageReader};
Expand Down Expand Up @@ -90,21 +89,16 @@ pub fn get_column_reader(
/// Gets a typed column reader for the specific type `T`, by "up-casting" `col_reader` of
/// non-generic type to a generic column reader type `ColumnReaderImpl`.
///
/// NOTE: the caller MUST guarantee that the actual enum value for `col_reader` matches
/// the type `T`. Otherwise, disastrous consequence could happen.
/// Panics if actual enum value for `col_reader` does not match the type `T`.
pub fn get_typed_column_reader<T: DataType>(
col_reader: ColumnReader,
) -> ColumnReaderImpl<T> {
match col_reader {
ColumnReader::BoolColumnReader(r) => unsafe { mem::transmute(r) },
ColumnReader::Int32ColumnReader(r) => unsafe { mem::transmute(r) },
ColumnReader::Int64ColumnReader(r) => unsafe { mem::transmute(r) },
ColumnReader::Int96ColumnReader(r) => unsafe { mem::transmute(r) },
ColumnReader::FloatColumnReader(r) => unsafe { mem::transmute(r) },
ColumnReader::DoubleColumnReader(r) => unsafe { mem::transmute(r) },
ColumnReader::ByteArrayColumnReader(r) => unsafe { mem::transmute(r) },
ColumnReader::FixedLenByteArrayColumnReader(r) => unsafe { mem::transmute(r) },
}
T::get_column_reader(col_reader).unwrap_or_else(|| {
panic!(
"Failed to convert column reader into a typed column reader for `{}` type",
T::get_physical_type()
)
})
}

/// Typed value reader for a particular primitive column.
Expand Down
57 changes: 20 additions & 37 deletions rust/parquet/src/column/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

//! Contains column writer API.

use std::{cmp, collections::VecDeque, mem, rc::Rc};
use std::{cmp, collections::VecDeque, rc::Rc};

use crate::basic::{Compression, Encoding, PageType, Type};
use crate::column::page::{CompressedPage, Page, PageWriteSpec, PageWriter};
Expand Down Expand Up @@ -98,57 +98,40 @@ pub fn get_column_writer(
/// Gets a typed column writer for the specific type `T`, by "up-casting" `col_writer` of
/// non-generic type to a generic column writer type `ColumnWriterImpl`.
///
/// NOTE: the caller MUST guarantee that the actual enum value for `col_writer` matches
/// the type `T`. Otherwise, disastrous consequence could happen.
/// Panics if actual enum value for `col_writer` does not match the type `T`.
pub fn get_typed_column_writer<T: DataType>(
col_writer: ColumnWriter,
) -> ColumnWriterImpl<T> {
match col_writer {
ColumnWriter::BoolColumnWriter(r) => unsafe { mem::transmute(r) },
ColumnWriter::Int32ColumnWriter(r) => unsafe { mem::transmute(r) },
ColumnWriter::Int64ColumnWriter(r) => unsafe { mem::transmute(r) },
ColumnWriter::Int96ColumnWriter(r) => unsafe { mem::transmute(r) },
ColumnWriter::FloatColumnWriter(r) => unsafe { mem::transmute(r) },
ColumnWriter::DoubleColumnWriter(r) => unsafe { mem::transmute(r) },
ColumnWriter::ByteArrayColumnWriter(r) => unsafe { mem::transmute(r) },
ColumnWriter::FixedLenByteArrayColumnWriter(r) => unsafe { mem::transmute(r) },
}
T::get_column_writer(col_writer).unwrap_or_else(|| {
panic!(
"Failed to convert column writer into a typed column writer for `{}` type",
T::get_physical_type()
)
})
}

/// Similar to `get_typed_column_writer` but returns a reference.
pub fn get_typed_column_writer_ref<T: DataType>(
col_writer: &ColumnWriter,
) -> &ColumnWriterImpl<T> {
match col_writer {
ColumnWriter::BoolColumnWriter(ref r) => unsafe { mem::transmute(r) },
ColumnWriter::Int32ColumnWriter(ref r) => unsafe { mem::transmute(r) },
ColumnWriter::Int64ColumnWriter(ref r) => unsafe { mem::transmute(r) },
ColumnWriter::Int96ColumnWriter(ref r) => unsafe { mem::transmute(r) },
ColumnWriter::FloatColumnWriter(ref r) => unsafe { mem::transmute(r) },
ColumnWriter::DoubleColumnWriter(ref r) => unsafe { mem::transmute(r) },
ColumnWriter::ByteArrayColumnWriter(ref r) => unsafe { mem::transmute(r) },
ColumnWriter::FixedLenByteArrayColumnWriter(ref r) => unsafe {
mem::transmute(r)
},
}
T::get_column_writer_ref(col_writer).unwrap_or_else(|| {
panic!(
"Failed to convert column writer into a typed column writer for `{}` type",
T::get_physical_type()
)
})
}

/// Similar to `get_typed_column_writer` but returns a reference.
pub fn get_typed_column_writer_mut<T: DataType>(
col_writer: &mut ColumnWriter,
) -> &mut ColumnWriterImpl<T> {
match col_writer {
ColumnWriter::BoolColumnWriter(ref mut r) => unsafe { mem::transmute(r) },
ColumnWriter::Int32ColumnWriter(ref mut r) => unsafe { mem::transmute(r) },
ColumnWriter::Int64ColumnWriter(ref mut r) => unsafe { mem::transmute(r) },
ColumnWriter::Int96ColumnWriter(ref mut r) => unsafe { mem::transmute(r) },
ColumnWriter::FloatColumnWriter(ref mut r) => unsafe { mem::transmute(r) },
ColumnWriter::DoubleColumnWriter(ref mut r) => unsafe { mem::transmute(r) },
ColumnWriter::ByteArrayColumnWriter(ref mut r) => unsafe { mem::transmute(r) },
ColumnWriter::FixedLenByteArrayColumnWriter(ref mut r) => unsafe {
mem::transmute(r)
},
}
T::get_column_writer_mut(col_writer).unwrap_or_else(|| {
panic!(
"Failed to convert column writer into a typed column writer for `{}` type",
T::get_physical_type()
)
})
}

/// Typed column writer for a primitive column.
Expand Down
118 changes: 111 additions & 7 deletions rust/parquet/src/data_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ use std::mem;
use byteorder::{BigEndian, ByteOrder};

use crate::basic::Type;
use crate::column::reader::{ColumnReader, ColumnReaderImpl};
use crate::column::writer::{ColumnWriter, ColumnWriterImpl};
use crate::errors::{ParquetError, Result};
use crate::util::memory::{ByteBuffer, ByteBufferPtr};
use std::str::from_utf8;
Expand Down Expand Up @@ -376,10 +378,30 @@ pub trait DataType: 'static {

/// Returns size in bytes for Rust representation of the physical type.
fn get_type_size() -> usize;

fn get_column_reader(column_writer: ColumnReader) -> Option<ColumnReaderImpl<Self>>
where
Self: Sized;

fn get_column_writer(column_writer: ColumnWriter) -> Option<ColumnWriterImpl<Self>>
where
Self: Sized;

fn get_column_writer_ref(
column_writer: &ColumnWriter,
) -> Option<&ColumnWriterImpl<Self>>
where
Self: Sized;

fn get_column_writer_mut(
column_writer: &mut ColumnWriter,
) -> Option<&mut ColumnWriterImpl<Self>>
where
Self: Sized;
}

macro_rules! make_type {
($name:ident, $physical_ty:path, $native_ty:ty, $size:expr) => {
($name:ident, $physical_ty:path, $reader_ident: ident, $writer_ident: ident, $native_ty:ty, $size:expr) => {
pub struct $name {}

impl DataType for $name {
Expand All @@ -392,27 +414,109 @@ macro_rules! make_type {
fn get_type_size() -> usize {
$size
}

fn get_column_reader(
column_writer: ColumnReader,
) -> Option<ColumnReaderImpl<Self>> {
match column_writer {
ColumnReader::$reader_ident(w) => Some(w),
_ => None,
}
}

fn get_column_writer(
column_writer: ColumnWriter,
) -> Option<ColumnWriterImpl<Self>> {
match column_writer {
ColumnWriter::$writer_ident(w) => Some(w),
_ => None,
}
}

fn get_column_writer_ref(
column_writer: &ColumnWriter,
) -> Option<&ColumnWriterImpl<Self>> {
match column_writer {
ColumnWriter::$writer_ident(w) => Some(w),
_ => None,
}
}

fn get_column_writer_mut(
column_writer: &mut ColumnWriter,
) -> Option<&mut ColumnWriterImpl<Self>> {
match column_writer {
ColumnWriter::$writer_ident(w) => Some(w),
_ => None,
}
}
}
};
}

// Generate struct definitions for all physical types

make_type!(BoolType, Type::BOOLEAN, bool, 1);
make_type!(Int32Type, Type::INT32, i32, 4);
make_type!(Int64Type, Type::INT64, i64, 8);
make_type!(Int96Type, Type::INT96, Int96, mem::size_of::<Int96>());
make_type!(FloatType, Type::FLOAT, f32, 4);
make_type!(DoubleType, Type::DOUBLE, f64, 8);
make_type!(
BoolType,
Type::BOOLEAN,
BoolColumnReader,
BoolColumnWriter,
bool,
1
);
make_type!(
Int32Type,
Type::INT32,
Int32ColumnReader,
Int32ColumnWriter,
i32,
4
);
make_type!(
Int64Type,
Type::INT64,
Int64ColumnReader,
Int64ColumnWriter,
i64,
8
);
make_type!(
Int96Type,
Type::INT96,
Int96ColumnReader,
Int96ColumnWriter,
Int96,
mem::size_of::<Int96>()
);
make_type!(
FloatType,
Type::FLOAT,
FloatColumnReader,
FloatColumnWriter,
f32,
4
);
make_type!(
DoubleType,
Type::DOUBLE,
DoubleColumnReader,
DoubleColumnWriter,
f64,
8
);
make_type!(
ByteArrayType,
Type::BYTE_ARRAY,
ByteArrayColumnReader,
ByteArrayColumnWriter,
ByteArray,
mem::size_of::<ByteArray>()
);
make_type!(
FixedLenByteArrayType,
Type::FIXED_LEN_BYTE_ARRAY,
FixedLenByteArrayColumnReader,
FixedLenByteArrayColumnWriter,
ByteArray,
mem::size_of::<ByteArray>()
);
Expand Down

0 comments on commit f41b863

Please sign in to comment.