In [27]:
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.compute as pc
from pyarrow import csv
import pyarrow.dataset as ds
from pathlib import Path

In [3]:
# Column values for the small `bdays` table built in the next cell. The element
# types are given explicitly (int8 / int16) instead of being inferred by pyarrow.
days = pa.array([1, 12, 17, 23, 28], type=pa.int8())
months = pa.array([1, 3, 4, 7, 1], type=pa.int8())
years = pa.array([1990, 2000, 1995, 2000, 1995], type=pa.int16())

In [4]:
# Assemble the three arrays into one table; `names` supplies the column names
# in the same order as the arrays.
bdays = pa.table([days, months, years], names=["day", "month", "year"])
bdays

pyarrow.Table
day: int8
month: int8
year: int16
----
day: [[1,12,17,23,28]]
month: [[1,3,4,7,1]]
year: [[1990,2000,1995,2000,1995]]

In [5]:
bdays[0]

<pyarrow.lib.ChunkedArray object at 0x128e619e0>
[
  [
    1,
    12,
    17,
    23,
    28
  ]
]

In [6]:
bdays[1]

<pyarrow.lib.ChunkedArray object at 0x128e63a10>
[
  [
    1,
    3,
    4,
    7,
    1
  ]
]

In [7]:
bdays[2]

<pyarrow.lib.ChunkedArray object at 0x128e63790>
[
  [
    1990,
    2000,
    1995,
    2000,
    1995
  ]
]

Even though each column of the table is a `ChunkedArray`, `bdays[0][0]` returns the first element of the column, as if all of its chunks had been concatenated into one flat array.

In [14]:
bdays[0][0]

<pyarrow.Int8Scalar: 1>

In [13]:
dir(bdays[0][0])

['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'as_py',
 'cast',
 'equals',
 'is_valid',
 'type']

In [26]:
def row(table, i):
    """Return row ``i`` of ``table`` as a tuple of plain Python values.

    Args:
        table: Object exposing ``num_columns`` and integer column indexing
            (``table[j]``), where each column can itself be indexed by row
            and yields scalars with an ``as_py()`` method.
        i: Row index, passed unchanged to each column's indexing.

    Returns:
        A tuple with one entry per column, in column order.
    """
    values = []
    for j in range(table.num_columns):
        # Column first, then row; as_py() unwraps the scalar.
        scalar = table[j][i]
        values.append(scalar.as_py())
    return tuple(values)

In [27]:
# Print every row of the table, one tuple of plain Python values per line.
for i in range(bdays.num_rows):
    print(row(bdays, i))

(1, 1, 1990)
(12, 3, 2000)
(17, 4, 1995)
(23, 7, 2000)
(28, 1, 1995)


In [30]:
# Write the table to a Parquet file. The path is relative, so the file is
# created in the kernel's current working directory.
pq.write_table(bdays, "bdays.parquet")

In [31]:
# Read the file back so the round-tripped table can be compared with `bdays`.
bdays2 = pq.read_table("bdays.parquet")

In [32]:
bdays2

pyarrow.Table
day: int8
month: int8
year: int16
----
day: [[1,12,17,23,28]]
month: [[1,3,4,7,1]]
year: [[1990,2000,1995,2000,1995]]

In [37]:
# Count how often each distinct year occurs. The result is a StructArray with
# two children: the distinct values and their counts.
year_vals = pc.value_counts(bdays["year"])
year_vals

<pyarrow.lib.StructArray object at 0x138d75420>
-- is_valid: all not null
-- child 0 type: int16
  [
    1990,
    2000,
    1995
  ]
-- child 1 type: int64
  [
    1,
    2,
    2
  ]

In [38]:
dir(year_vals)

['__array__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '_debug_print',
 '_export_to_c',
 '_import_from_c',
 '_name',
 '_to_pandas',
 'buffers',
 'cast',
 'dictionary_encode',
 'diff',
 'drop_null',
 'equals',
 'field',
 'fill_null',
 'filter',
 'flatten',
 'format',
 'from_arrays',
 'from_buffers',
 'from_pandas',
 'get_total_buffer_size',
 'index',
 'is_null',
 'is_valid',
 'nbytes',
 'null_count',
 'offset',
 'slice',
 'sum',
 'take',
 'to_numpy',
 'to_pandas',
 'to_pylist',
 'to_string',
 'tolist',
 'type',
 'unique',
 'validate',
 'value_counts',
 'view']

In [40]:
year_vals.to_pandas()

0    {'values': 1990, 'counts': 1}
1    {'values': 2000, 'counts': 2}
2    {'values': 1995, 'counts': 2}
dtype: object

In [41]:
dir(bdays)

['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '_column',
 '_ensure_integer_index',
 '_to_pandas',
 'add_column',
 'append_column',
 'cast',
 'column',
 'column_names',
 'columns',
 'combine_chunks',
 'drop',
 'drop_null',
 'equals',
 'field',
 'filter',
 'flatten',
 'from_arrays',
 'from_batches',
 'from_pandas',
 'from_pydict',
 'from_pylist',
 'get_total_buffer_size',
 'group_by',
 'itercolumns',
 'join',
 'nbytes',
 'num_columns',
 'num_rows',
 'remove_column',
 'rename_columns',
 'replace_schema_metadata',
 'schema',
 'select',
 'set_column',
 'shape',
 'slice',
 'sort_by',
 'take',
 'to_batches',
 'to_pandas',
 'to_pydict',
 'to_pylist',
 'to

In [42]:
# For this table flatten() returns the same schema and data as `bdays` (compare
# the recorded output with the table shown earlier).
# NOTE(review): presumably because flatten() only expands struct-typed columns
# and `bdays` has none — confirm against the pyarrow docs.
bdays.flatten()

pyarrow.Table
day: int8
month: int8
year: int16
----
day: [[1,12,17,23,28]]
month: [[1,3,4,7,1]]
year: [[1990,2000,1995,2000,1995]]

In [3]:
# Read one TSV file of the Criteo sample into an in-memory table.
# The path is anchored at the user's home directory (as the later `src` / `dst`
# cells do) rather than at a hardcoded /Users/<name> prefix, so the cell is not
# tied to one machine. `filepath` stays a plain string.
filepath = str(Path.home() / "temp/kaggle-criteo-small/train_small_00.tsv")
criteo = csv.read_csv(
    filepath,
    # Generate column names (f0, f1, ...) instead of taking them from the first row.
    read_options=csv.ReadOptions(autogenerate_column_names=True),
    # Fields are separated by tabs.
    parse_options=csv.ParseOptions(delimiter="\t"),
)

In [8]:
len(criteo)

1000000

In [11]:
criteo[23][0]

<pyarrow.StringScalar: 'ce8e716d'>

In [14]:
# Save the table as a Parquet file written with gzip compression.
# The path is anchored at the user's home directory rather than at a hardcoded
# /Users/<name> prefix; `filepath` stays a plain string.
# NOTE(review): the ".gz" suffix is only a naming choice here — the compression
# is requested through `compression="gzip"`, not by gzipping the finished file.
filepath = str(Path.home() / "temp/kaggle-criteo-small/train_small_00.parquet.gz")
pq.write_table(criteo, filepath, compression="gzip")

In [15]:
criteo

pyarrow.Table
f0: int64
f1: int64
f2: int64
f3: int64
f4: int64
f5: int64
f6: int64
f7: int64
f8: int64
f9: int64
f10: int64
f11: int64
f12: int64
f13: int64
f14: string
f15: string
f16: string
f17: string
f18: string
f19: string
f20: string
f21: string
f22: string
f23: string
f24: string
f25: string
f26: string
f27: string
f28: string
f29: string
f30: string
f31: string
f32: string
f33: string
f34: string
f35: string
f36: string
f37: string
f38: string
f39: string
----
f0: [[0,1,0,0,0,...,0,1,0,0,0],[0,0,1,0,1,...,0,0,1,0,0],...,[0,0,0,1,0,...,0,0,0,0,1],[0,0,1,0,0,...,0,0,0,0,0]]
f1: [[null,1,null,null,2,...,14,0,1,0,null],[null,4,0,null,9,...,null,null,1,0,null],...,[3,null,null,null,2,...,2,null,7,null,null],[null,null,0,15,24,...,0,null,null,0,null]]
f2: [[253,35,3,0,0,...,1,574,0,45,36],[1,11,5,1,1,...,366,3,6,-1,1],...,[-1,1,40,2,0,...,12,1,1393,-1,12],[0,1818,99,306,303,...,0,290,0,27,0]]
f3: [[34,6,25,2,2,...,1,18,6,1,215],[1,9,null,2,82,...,1,3,17,14,null],...,[null,2,null,nu

In [20]:
# Column names for the TSV files: one label column, then the integer columns
# i1..i13, then the string columns s1..s26 (40 names in total).
intcols = [f"i{i}" for i in range(1, 14)]
strcols = [f"s{i}" for i in range(1, 27)]
colnames = ["label", *intcols, *strcols]
colnames

['label',
 'i1',
 'i2',
 'i3',
 'i4',
 'i5',
 'i6',
 'i7',
 'i8',
 'i9',
 'i10',
 'i11',
 'i12',
 'i13',
 's1',
 's2',
 's3',
 's4',
 's5',
 's6',
 's7',
 's8',
 's9',
 's10',
 's11',
 's12',
 's13',
 's14',
 's15',
 's16',
 's17',
 's18',
 's19',
 's20',
 's21',
 's22',
 's23',
 's24',
 's25',
 's26']

In [24]:
# Explicit schema for the dataset: an int8 label, int32 for every integer
# column and string for every string column. Each part is a list of
# (name, type) pairs; the parts are concatenated in column order.
labelschema = [("label", pa.int8())]
intschema = [(name, pa.int32()) for name in intcols]
strschema = [(name, pa.string()) for name in strcols]

schema = pa.schema([*labelschema, *intschema, *strschema])
schema

label: int8
i1: int32
i2: int32
i3: int32
i4: int32
i5: int32
i6: int32
i7: int32
i8: int32
i9: int32
i10: int32
i11: int32
i12: int32
i13: int32
s1: string
s2: string
s3: string
s4: string
s5: string
s6: string
s7: string
s8: string
s9: string
s10: string
s11: string
s12: string
s13: string
s14: string
s15: string
s16: string
s17: string
s18: string
s19: string
s20: string
s21: string
s22: string
s23: string
s24: string
s25: string
s26: string

In [26]:
# Describe how the TSV files are parsed: tab-delimited fields, with the column
# names supplied from `colnames`.
tsv_parse_options = csv.ParseOptions(delimiter="\t")
tsv_read_options = csv.ReadOptions(column_names=colnames)
fileformat = ds.CsvFileFormat(
    parse_options=tsv_parse_options,
    read_options=tsv_read_options,
)
fileformat

<CsvFileFormat parse_options=<pyarrow._csv.ParseOptions object at 0x16a9e0980>>

In [49]:
# Directory containing the TSV input files, located under the home directory.
src = Path.home().joinpath("temp/criteo-kaggle-small/data/tsv")
src

PosixPath('/Users/avilay/temp/criteo-kaggle-small/data/tsv')

In [50]:
# Open the files found under `src` as one dataset, parsed with `fileformat`
# and typed by `schema`.
# NOTE: this rebinds `criteo`, which earlier held the table read from a single
# TSV file; from here on it refers to the dataset object.
criteo = ds.dataset(source=src, schema=schema, format=fileformat)

In [51]:
criteo.files

['/Users/avilay/temp/criteo-kaggle-small/data/tsv/train_small_00.tsv',
 '/Users/avilay/temp/criteo-kaggle-small/data/tsv/train_small_01.tsv',
 '/Users/avilay/temp/criteo-kaggle-small/data/tsv/train_small_02.tsv',
 '/Users/avilay/temp/criteo-kaggle-small/data/tsv/train_small_03.tsv']

In [37]:
# Reads all the files in memory
criteo_tbl = criteo.to_table()

In [34]:
len(criteo_tbl)

4000000

In [35]:
# The dataset object does not support len(): this call raises the TypeError
# recorded below (unlike len(criteo_tbl) on the materialised table).
# NOTE(review): `criteo.count_rows()` looks like the dataset-level way to get
# the row count — confirm for the installed pyarrow version.
len(criteo)

TypeError: object of type 'pyarrow._dataset.FileSystemDataset' has no len()

In [36]:
criteo_tbl

pyarrow.Table
label: int8
i1: int32
i2: int32
i3: int32
i4: int32
i5: int32
i6: int32
i7: int32
i8: int32
i9: int32
i10: int32
i11: int32
i12: int32
i13: int32
s1: string
s2: string
s3: string
s4: string
s5: string
s6: string
s7: string
s8: string
s9: string
s10: string
s11: string
s12: string
s13: string
s14: string
s15: string
s16: string
s17: string
s18: string
s19: string
s20: string
s21: string
s22: string
s23: string
s24: string
s25: string
s26: string
----
label: [[0,1,0,0,0,...,0,1,0,0,0],[0,0,1,0,1,...,0,0,1,0,0],...,[0,1,0,0,0,...,0,0,0,1,0],[1,1,1,0,0,...,0,0,0,0,0]]
i1: [[null,1,null,null,2,...,14,0,1,0,null],[null,4,0,null,9,...,null,null,1,0,null],...,[0,0,null,null,null,...,null,null,null,null,null],[null,null,1,2,1,...,0,null,null,0,10]]
i2: [[253,35,3,0,0,...,1,574,0,45,36],[1,11,5,1,1,...,366,3,6,-1,1],...,[0,1,0,0,1,...,7,3,2,1,0],[2590,0,-1,24,13,...,107,1,39,46,507]]
i3: [[34,6,25,2,2,...,1,18,6,1,215],[1,9,null,2,82,...,1,3,17,14,null],...,[66,16,2,23,1,...,2,1,10

In [39]:
# The dataset can also be read in batches instead of as one table.
# The default batch_size is supposed to be 1_000_000, but the run recorded
# below produced 928 batches for 4,000,000 rows (about 4,310 rows per batch).
# NOTE(review): batch_size appears to act as an upper bound rather than an exact
# size, with the actual size presumably determined by how the CSV reader splits
# each file into blocks — confirm against the pyarrow docs.
batch_sizes = []
for batch in criteo.to_batches():
    batch_sizes.append(batch.num_rows)

In [42]:
len(batch_sizes)

928

In [43]:
sum(batch_sizes)

4000000

In [44]:
# Same scan with batch_size=1000. In the recorded output every run of four
# 1000-row batches is followed by a remainder of roughly 300 rows, which
# suggests larger underlying chunks are being cut into pieces of at most 1000 rows.
batch_sizes = [batch.num_rows for batch in criteo.to_batches(batch_size=1000)]


In [45]:
len(batch_sizes)

4636

In [46]:
sum(batch_sizes)

4000000

In [47]:
batch_sizes

[1000,
 1000,
 1000,
 1000,
 321,
 1000,
 1000,
 1000,
 1000,
 308,
 1000,
 1000,
 1000,
 1000,
 316,
 1000,
 1000,
 1000,
 1000,
 311,
 1000,
 1000,
 1000,
 1000,
 316,
 1000,
 1000,
 1000,
 1000,
 314,
 1000,
 1000,
 1000,
 1000,
 313,
 1000,
 1000,
 1000,
 1000,
 313,
 1000,
 1000,
 1000,
 1000,
 313,
 1000,
 1000,
 1000,
 1000,
 314,
 1000,
 1000,
 1000,
 1000,
 303,
 1000,
 1000,
 1000,
 1000,
 306,
 1000,
 1000,
 1000,
 1000,
 316,
 1000,
 1000,
 1000,
 1000,
 308,
 1000,
 1000,
 1000,
 1000,
 311,
 1000,
 1000,
 1000,
 1000,
 306,
 1000,
 1000,
 1000,
 1000,
 303,
 1000,
 1000,
 1000,
 1000,
 305,
 1000,
 1000,
 1000,
 1000,
 304,
 1000,
 1000,
 1000,
 1000,
 317,
 1000,
 1000,
 1000,
 1000,
 315,
 1000,
 1000,
 1000,
 1000,
 308,
 1000,
 1000,
 1000,
 1000,
 312,
 1000,
 1000,
 1000,
 1000,
 310,
 1000,
 1000,
 1000,
 1000,
 315,
 1000,
 1000,
 1000,
 1000,
 312,
 1000,
 1000,
 1000,
 1000,
 312,
 1000,
 1000,
 1000,
 1000,
 316,
 1000,
 1000,
 1000,
 1000,
 315,
 1000,
 1000,


In [52]:
type(criteo)

pyarrow._dataset.FileSystemDataset

In [53]:
# Output directory for the Parquet version of the dataset, under the home directory.
dst = Path.home().joinpath("temp/criteo-kaggle-small/data/parquet")
dst

PosixPath('/Users/avilay/temp/criteo-kaggle-small/data/parquet')

In [54]:
# write_dataset() does not accept a `compression` keyword: this call raises the
# TypeError recorded below. The later cells pass the compression setting
# through `file_options` instead.
ds.write_dataset(criteo, dst, format="parquet", compression="gzip")

TypeError: write_dataset() got an unexpected keyword argument 'compression'

In [55]:
dir(ds.FileWriteOptions)

['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'format']

In [65]:
# Build Parquet write options carrying the gzip setting, to be handed to
# ds.write_dataset() through `file_options`.
# Original note: "This does not seem to work".
# NOTE(review): the options object itself is created without error (see the
# recorded output), so the remark presumably refers to the written files not
# being smaller — confirm against the size ratios computed further down.
ff = ds.ParquetFileFormat()
wo = ff.make_write_options(compression="gzip")
wo

<pyarrow._dataset_parquet.ParquetFileWriteOptions at 0x173bf60e0>

In [67]:
ds.write_dataset(criteo, dst, format="parquet", file_options=wo)

In [60]:
# Hand-entered sizes used for the compression ratios in the following cells.
# NOTE(review): the unit is not stated (presumably MB) and the values are typed
# in rather than measured from the files — re-measure after re-running the writes.
train_small_size = 928  # presumably all TSV files together (= 4 x train_small_00_size)
train_small_00_size = 232  # presumably the single file train_small_00.tsv
train_small_00_pq_size = 65  # presumably that file as Parquet with default settings — confirm
train_small_00_gz_size = 47  # presumably that file as gzip-compressed Parquet — confirm
train_small_pq_size = 376  # presumably the whole dataset written as Parquet — confirm

In [61]:
train_small_00_pq_size / train_small_00_size

0.2801724137931034

In [63]:
train_small_00_gz_size / train_small_00_size

0.2025862068965517

In [64]:
train_small_pq_size / train_small_size

0.4051724137931034