In [9]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [53]:
N = 10_000_000

df = pd.DataFrame(
    np.random.choice([True, False], size=(N, 6)), 
    columns=list('ABCDEF'))
df

Unnamed: 0,A,B,C,D,E,F
0,False,False,False,True,False,False
1,False,True,True,True,True,False
2,False,False,True,False,False,True
3,False,False,False,True,True,False
4,True,True,False,True,False,True
...,...,...,...,...,...,...
9999995,False,False,False,True,False,True
9999996,False,True,False,False,True,True
9999997,False,True,True,False,False,True
9999998,True,True,False,False,True,False


In [3]:
ALPHABET = np.array(list("ATGC"), dtype=(np.str_, 1))
def rands_array(nchars, size, dtype="O"):
    """Generate an array of byte strings."""
    retval = (
        np.random.choice(ALPHABET, size=nchars * np.prod(size))
        .view((np.str_, nchars))
        .reshape(size)
    )
    if dtype is None:
        return retval
    else:
        return retval.astype(dtype)

df["motif"] = pd.Series(rands_array(nchars=8, size=N))
df

Unnamed: 0,A,B,C,D,E,F,motif
0,True,True,False,False,False,True,TTAAGAGC
1,True,False,False,True,True,True,CCCGATTC
2,True,True,False,True,True,True,ACAAACTG
3,True,True,False,False,False,False,TTCCTCTT
4,False,False,True,False,False,False,CCGATTCC
...,...,...,...,...,...,...,...
9999995,True,False,False,True,False,False,CAGTTCAG
9999996,False,True,True,False,False,True,ACTCACTG
9999997,False,False,True,False,True,True,TTGTAGAT
9999998,True,True,True,True,True,True,TGCACGCT


In [4]:
df["group"] = df.motif.apply(lambda x: ''.join(sorted(x)))
df

Unnamed: 0,A,B,C,D,E,F,motif,group
0,True,True,False,False,False,True,TTAAGAGC,AAACGGTT
1,True,False,False,True,True,True,CCCGATTC,ACCCCGTT
2,True,True,False,True,True,True,ACAAACTG,AAAACCGT
3,True,True,False,False,False,False,TTCCTCTT,CCCTTTTT
4,False,False,True,False,False,False,CCGATTCC,ACCCCGTT
...,...,...,...,...,...,...,...,...
9999995,True,False,False,True,False,False,CAGTTCAG,AACCGGTT
9999996,False,True,True,False,False,True,ACTCACTG,AACCCGTT
9999997,False,False,True,False,True,True,TTGTAGAT,AAGGTTTT
9999998,True,True,True,True,True,True,TGCACGCT,ACCCGGTT


In [8]:
table = pa.Table.from_pandas(df)
table

pyarrow.Table
A: bool
B: bool
C: bool
D: bool
E: bool
F: bool
motif: string
group: string

In [26]:
np.random.choice(list(range(6)), size=N)

array([3, 2, 4, ..., 0, 3, 0])

In [39]:
thresholds = pa.UInt8Array.from_pandas(
    pd.Series(
        np.random.choice(list(range(6)), size=N)
    )
)

In [47]:
thresholds

<pyarrow.lib.Int64Array object at 0x7f6dc7060160>
[
  5,
  0,
  3,
  0,
  1,
  1,
  4,
  3,
  5,
  5,
  ...
  5,
  0,
  4,
  2,
  4,
  5,
  0,
  2,
  5,
  4
]

In [28]:
motifs = pa.StringArray.from_pandas(pd.Series(rands_array(nchars=8, size=N)))

In [54]:
table = pa.Table.from_pandas(df)
table

pyarrow.Table
A: bool
B: bool
C: bool
D: bool
E: bool
F: bool

In [59]:
use_dict = False
pyarrow_compression = "snappy"

In [60]:
pq.write_table(table, f'pa_only_bools_binary_{N}_dict{use_dict}_compr{pyarrow_compression}_pyarrow.parquet', 
               use_dictionary=use_dict,
            compression=pyarrow_compression)

In [5]:
for c in ["snappy", "gzip", "brotli"]:
    df.to_parquet(f"output_binary_10_000_000_{c}.parquet", compression=c)