In [1]:
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.csv as pcsv
import pyarrow.dataset as ds
from pathlib import Path

In [7]:
# Root directory for this notebook's data, resolved under the current
# user's home directory so the path is not tied to one machine.
DATAROOT = Path.home().joinpath("temp/criteo-kaggle-small/data")
DATAROOT

PosixPath('/Users/avilay/temp/criteo-kaggle-small/data')

In [4]:
# Column names, in file order: "label", then i1..i13, then s1..s26.
# `intcols` and `strcols` are kept as separate lists because later cells
# assign a different Arrow type to each group.
intcols = list(f"i{i}" for i in range(1, 14))
strcols = list(f"s{i}" for i in range(1, 27))
colnames = ["label", *intcols, *strcols]
colnames

['label',
 'i1',
 'i2',
 'i3',
 'i4',
 'i5',
 'i6',
 'i7',
 'i8',
 'i9',
 'i10',
 'i11',
 'i12',
 'i13',
 's1',
 's2',
 's3',
 's4',
 's5',
 's6',
 's7',
 's8',
 's9',
 's10',
 's11',
 's12',
 's13',
 's14',
 's15',
 's16',
 's17',
 's18',
 's19',
 's20',
 's21',
 's22',
 's23',
 's24',
 's25',
 's26']

In [5]:
# CSV reader configuration for the source files: fields are separated by
# tabs, and the column names are supplied explicitly from `colnames`
# rather than taken from the files.
fileformat = ds.CsvFileFormat(
    read_options=pcsv.ReadOptions(column_names=colnames),
    parse_options=pcsv.ParseOptions(delimiter="\t"),
)
fileformat

<CsvFileFormat parse_options=<pyarrow._csv.ParseOptions object at 0x1279bd980>>

In [8]:
# Input directory: the "tsv" folder directly under DATAROOT.
src = DATAROOT.joinpath("tsv")
src

PosixPath('/Users/avilay/temp/criteo-kaggle-small/data/tsv')

In [9]:
# Explicit Arrow schema, built as (name, type) pairs in column order:
# the label is int8, every i* column is int32, every s* column is string.
labelschema = [("label", pa.int8())]
intschema = [(name, pa.int32()) for name in intcols]
strschema = [(name, pa.string()) for name in strcols]

schema = pa.schema([*labelschema, *intschema, *strschema])
schema

label: int8
i1: int32
i2: int32
i3: int32
i4: int32
i5: int32
i6: int32
i7: int32
i8: int32
i9: int32
i10: int32
i11: int32
i12: int32
i13: int32
s1: string
s2: string
s3: string
s4: string
s5: string
s6: string
s7: string
s8: string
s9: string
s10: string
s11: string
s12: string
s13: string
s14: string
s15: string
s16: string
s17: string
s18: string
s19: string
s20: string
s21: string
s22: string
s23: string
s24: string
s25: string
s26: string

In [10]:
# Dataset over everything under `src`, read with the tab-delimited CSV
# format and the explicit schema defined in the earlier cells.
criteo = ds.dataset(
    source=src,
    format=fileformat,
    schema=schema,
)

In [12]:
# Output directory: the "parquet" folder directly under DATAROOT.
dst = DATAROOT.joinpath("parquet")
dst

PosixPath('/Users/avilay/temp/criteo-kaggle-small/data/parquet')

In [13]:
# Parquet write options requesting gzip compression; passed to the
# dataset writer as `file_options`.
wo = (
    ds.ParquetFileFormat()
    .make_write_options(compression="gzip")
)
wo

<pyarrow._dataset_parquet.ParquetFileWriteOptions at 0x106c4da80>

In [14]:
# Write the `criteo` dataset out in Parquet format under `dst`, using the
# write options in `wo`.
ds.write_dataset(
    data=criteo,
    base_dir=dst,
    format="parquet",
    file_options=wo,
)