In [1]:
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime

In [31]:
d = {
    "a": "aa",
    "b": 123,
    "c": 45.6,
    "d": datetime.strptime(d["d"], "%Y-%m-%d")
}

In [43]:
pat = pa.Table.from_pylist([d])
pat

pyarrow.Table
a: string
b: int64
c: double
d: timestamp[us]
----
a: [["aa"]]
b: [[123]]
c: [[45.6]]
d: [[2023-03-01 00:00:00.000000]]

In [33]:
pq.write_table(pat, 'example.parquet')

In [36]:
df = pq.read_table('example.parquet').to_pandas()
df

Unnamed: 0,a,b,c,d
0,aa,123,45.6,2023-03-01


In [37]:
df.dtypes

a            object
b             int64
c           float64
d    datetime64[ns]
dtype: object

In [52]:
item = {
    "title": "spam",
    "isbn13": 1234,
    "what": "nope"
}
schema = pa.schema([
    ("what", pa.string()),
    ("title", pa.string()),
    ("isbn13", pa.int64())
])

# print(set(item.keys()).symmetric_difference(set(schema.names)))
assert len(set(item.keys()).symmetric_difference(set(schema.names))) == 0

from toolz import apply
diff_func = lambda x, y: len(x.symmetric_difference(y))
diff = diff_func(*map(set, [item.keys(), schema.names]))
assert diff == 0

In [50]:
from toolz import apply
diff_func = lambda x, y: len(x.symmetric_difference(y))
diff = diff_func(*map(set, [item.keys(), schema.names]))
assert diff == 0

In [45]:
list(w)

[<function __main__.<lambda>(x)>, <map at 0x7f9c4c761890>]

In [54]:
import random
random.sample(a, 1)

['w']

In [5]:
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime

In [7]:
schema = pa.schema([
    ("title", pa.string()),
    ("isbn13", pa.int64()),
    ("author", pa.string()),
    ("date_pub", pa.string()),
    ("mean_rating", pa.float64()),
    ("num_rating", pa.int64()),
    ("user_rating", pa.int64()),
    ("date_read", pa.string()),
    ("date_added", pa.string()),
    ("review_text", pa.string()),
    ("shelf_url", pa.string()),
])
df = pq.read_table('../data', schema=schema).to_pandas()
# df.dtypes
print(len(df))
df.head()

220


Unnamed: 0,title,isbn13,author,date_pub,mean_rating,num_rating,user_rating,date_read,date_added,review_text,shelf_url
0,"Scales of Gold (The House of Niccolo, #4)",9780395000000.0,"Dunnett, Dorothy",1991,4.47,2287,5,"Jul 09, 2015","July 09, 2015",,https://www.goodreads.com/review/list/40648422...
1,The Lymond Poetry,9780141000000.0,"Dunnett, Dorothy","Jan 01, 2003",4.41,73,5,,"July 07, 2015",,https://www.goodreads.com/review/list/40648422...
2,The White Boar,,"Palmer, Marian",1968,3.73,99,5,"Jul 02, 2015","June 28, 2015","""The Cat, the Rat, and Lovell our Dog",https://www.goodreads.com/review/list/40648422...
3,"The Daughter of Time (Inspector Alan Grant, #5)",,"Tey, Josephine",1951,3.91,30976,5,,"June 28, 2015",I'll admit it - after reading Sharon Kay Penma...,https://www.goodreads.com/review/list/40648422...
4,The Traitor's Wife: A Novel of the Reign of Ed...,9781583000000.0,"Higginbotham, Susan","Jul 25, 2005",3.76,3649,4,"Jan 14, 2018","June 11, 2015",This is a time in the history of England that ...,https://www.goodreads.com/review/list/40648422...


In [98]:
df = pq.read_table('../data').to_pandas()
print(len(df))
df["review_text"][:20]

220


0                                                  None
1     This review is for the Fionavar Tapestry trilo...
2                                                  None
3                                                  None
4                                                  None
5     I read this for a book club. And I am still pr...
6                                                  None
7                                                  None
8                                                  None
9                                                  None
10                                                 None
11                                                 None
12    I enjoyed this story very much. It's a period ...
13                                                 None
14    My friend who is a librarian suggested that I ...
15    I love Stella Riley. Her books are amazing. Wh...
16                                                 None
17                                              

In [22]:
schema = pa.schema([
    ("title", pa.string()),
    ("isbn13", pa.int32()),
    ("author", pa.string()),
    ("date_pub", pa.string()),
    ("mean_rating", pa.float32()),
    ("num_rating", pa.int32()),
    ("user_rating", pa.int32()),
    ("date_read", pa.string()),
    ("date_added", pa.string()),
    ("review_text", pa.string()),
    ("shelf_url", pa.string()),
])
df = pq.read_table('../data/00001.parquet')
df.schema

title: string
isbn13: int64
author: string
date_pub: string
mean_rating: double
num_rating: int64
user_rating: int64
date_read: string
date_added: string
review_text: null
shelf_url: string

In [18]:
type(df.schema)

pyarrow.lib.Schema

In [19]:
schema = pa.schema([
    ("title": pa.string()),
    ("isbn13": pa.int32()),
    ("author": pa.string()),
    ("date_pub": pa.string()),
    ("mean_rating": pa.float()),
    ("num_rating": pa.int32()),
    ("user_rating": pa.int32()),
    ("date_read": pa.string()),
    ("date_added": pa.string()),
    ("review_text": pa.string()),
    ("shelf_url": pa.string()),
])

some_int: int32
some_string: string

In [77]:
def convert_rating(rating: str):
    rating_map = {
        "x": 0,
        "y": 1,
    }
    for k, v in rating_map.items():
        if rating == k:
            return v

In [80]:
a = [
    {"a": "x", "b": 10},
    {"a": "y", "b": 11},
    {"a": "x", "b": 12},
]


In [81]:
%%timeit
for d in a:
    d["a"] = convert_rating(d["a"])

809 ns ± 71.5 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [85]:
def convert_rating(d: dict):
    rating_map = {
        "x": 0,
        "y": 1,
    }
    for k, v in rating_map.items():
        if d["a"] == k:
            d["a"] =  v
    return d

In [90]:
a = [
    {"a": "x", "b": 10},
    {"a": "y", "b": 11},
    {"a": "x", "b": 12},
]
# a = list(map(convert_rating, a))
# a

In [92]:
%%timeit
list(map(convert_rating, a))

994 ns ± 41 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
