In [1]:
import pyarrow as pa
import numpy as np
import pandas as pd

In [2]:
# Display the installed pyarrow version; the saved output of this cell shows 13.0.0.
pa.__version__

'13.0.0'

In [2]:
# Build a small Arrow array from a plain Python list of ints (no explicit type given).
arr = pa.array([1,2,3,4])

### Create a Record Batch

In [3]:
# Build a small RecordBatch of random doubles: NCOLS columns of NROWS values each.
NROWS = 5
NCOLS = 3
SEED = 42  # fixed seed so Restart & Run All regenerates the same values

# A seeded Generator replaces the unseeded global np.random.randn call, which
# produced different numbers on every run and made the output irreproducible.
rng = np.random.default_rng(SEED)
data = [pa.array(rng.standard_normal(NROWS)) for _ in range(NCOLS)]
cols = [f"c{i}" for i in range(NCOLS)]

# from_arrays pairs each array with the column name at the same position.
rb = pa.RecordBatch.from_arrays(data, cols)
print("Schema:")
print(rb.schema)
print("Number of rows:", rb.num_rows)
rb

Schema:
c0: double
c1: double
c2: double
Number of rows: 5


pyarrow.RecordBatch
c0: double
c1: double
c2: double
----
c0: [-0.8563064329314133,-0.41848941306191845,-0.2938948616854316,0.7191179100660653,1.6326755634821601]
c1: [1.1776877910333934,-0.2911940098038575,0.24063503807159045,-1.5131669281641935,0.4043618336674261]
c2: [0.0060882069584162896,1.0682068256979511,-1.7676270681387618,1.9072153643808951,-1.2221326893076547]

In [4]:
# Row-wise records: one dict per archer with the keys 'archer', 'location', 'year'.
# The raw values are kept in a compact table and expanded into dicts below.
archer_list = [
    {'archer': name, 'location': home, 'year': year}
    for name, home, year in [
        ('Legolas', 'Mirkwood', 1954),
        ('Oliver', 'Star City', 1941),
        ('Merida', 'Scotland', 2012),
        ('Lara', 'London', 1996),
        ('Artemis', 'Greece', -600),
    ]
]

In [5]:
# Arrow struct type for one archer record: two string fields and a 16-bit year.
archer_type = pa.struct([
    pa.field('archer', pa.utf8()),
    pa.field('location', pa.utf8()),
    pa.field('year', pa.int16()),
])

In [6]:
# Convert the row-wise dicts into a single Arrow array, using archer_type as the
# explicit element type instead of letting pyarrow infer one.
archers = pa.array(archer_list, type=archer_type)
# Print the resulting array type (the saved output shows the struct type).
print(archers.type)

struct<archer: string, location: string, year: int16>


### Create record batch - zero copy manipulation

In [7]:
# flatten() splits the struct array into one child array per field.
# The column names are read from the struct type itself rather than repeated as
# string literals, so they cannot drift out of sync with the declared schema.
archer_columns = [field.name for field in archers.type]
rb = pa.RecordBatch.from_arrays(archers.flatten(), archer_columns)
print("number of rows: ", rb.num_rows)
print("number of cols: ", rb.num_columns)
rb.to_pandas()

number of rows:  5
number of cols:  3


Unnamed: 0,archer,location,year
0,Legolas,Mirkwood,1954
1,Oliver,Star City,1941
2,Merida,Scotland,2012
3,Lara,London,1996
4,Artemis,Greece,-600


### Slicing a Record Batch

In [48]:
# Take a window of the batch: 3 rows starting at row offset 1.
rb_slice = rb.slice(offset=1, length=3)
# Column index 2 is the third column of the batch.
print(rb_slice.column(2))
rb_slice.to_pandas()

[
  1941,
  2012,
  1996
]


Unnamed: 0,archer,location,year
0,Oliver,Star City,1941
1,Merida,Scotland,2012
2,Lara,London,1996


Take a (row-wise) list of objects with the following structure and convert them to a column-oriented record batch:
{ id: int, cost: double, cost_components: list<double> }

An example might be { "id": 4, "cost": 241.21, "cost_components": [100.00, 140.10, 1.11] } for a single object.

In [8]:
# Row-wise records: each dict has an integer "id", a float "cost", and a list of
# floats under "cost_components". Values are tabulated once and expanded below.
components_list = [
    {"id": item_id, "cost": total, "cost_components": parts}
    for item_id, total, parts in [
        (4, 241.21, [100.0, 140.10, 1.11]),
        (5, 251.21, [200.0, 240.10, 2.11]),
    ]
]

In [9]:
# Arrow struct type for one cost record: int16 id, float64 cost, and a
# variable-length list of float64 cost components.
components_type = pa.struct([
    pa.field("id", pa.int16()),
    pa.field("cost", pa.float64()),
    pa.field("cost_components", pa.list_(pa.float64())),
])

In [10]:
# Convert the row-wise dicts into one Arrow struct array with the explicit type.
components_array = pa.array(components_list, type=components_type)
# flatten() yields one child array per struct field; the column names are taken
# from the struct type so they always match the schema instead of being
# repeated as string literals.
component_columns = [field.name for field in components_array.type]
rb = pa.RecordBatch.from_arrays(components_array.flatten(), component_columns)

In [11]:
# Render the record batch as a pandas DataFrame for display.
rb.to_pandas()

Unnamed: 0,id,cost,cost_components
0,4,241.21,"[100.0, 140.1, 1.11]"
1,5,251.21,"[200.0, 240.1, 2.11]"


In [16]:
# Show the batch's column names (the saved output lists id, cost, cost_components).
rb.column_names

['id', 'cost', 'cost_components']

In [29]:
# Convert the batch to a plain Python dict mapping each column name to a list of
# its values. The result is bound to a descriptive name rather than `dict`:
# rebinding `dict` shadows the builtin and breaks any later dict(...) call
# executed in the same kernel session.
components_dict = rb.to_pydict()
components_dict

{'id': [4, 5],
 'cost': [241.21, 251.21],
 'cost_components': [[100.0, 140.1, 1.11], [200.0, 240.1, 2.11]]}