In [1]:
import os

import ibis
from pyiceberg.catalog import load_catalog
from pyiceberg.exceptions import NoSuchNamespaceError, NoSuchTableError

In [35]:
# Open the Steam reviews Parquet file through ibis, then materialise it as an
# in-memory PyArrow table; later cells use its schema and data for Iceberg.
df = ibis.read_parquet("../data/parquet/steam_reviews.parquet")
arrow_table = df.to_pyarrow()

In [54]:
# Load the 'reviews' catalog from the local catalog endpoint and point its
# S3 file IO at the local object-store endpoint.
#
# The S3 credentials are taken from the AWS_ACCESS_KEY_ID and
# AWS_SECRET_ACCESS_KEY environment variables instead of being written into
# the notebook, so secrets are never saved or committed alongside the code.
# A missing variable raises KeyError immediately rather than failing later
# with an opaque authentication error.
catalog = load_catalog(
    'reviews',
    uri='http://localhost:19120/iceberg',
    **{
        "s3.endpoint": "http://localhost:9000",
        "s3.access-key-id": os.environ["AWS_ACCESS_KEY_ID"],
        "s3.secret-access-key": os.environ["AWS_SECRET_ACCESS_KEY"],
    },
)

In [43]:
# Remove any previous copy of the table so the cells below can rebuild it.
# On a fresh catalog the table does not exist yet; that is not an error for
# this reset step, so only the "no such table" case is tolerated.
try:
    catalog.drop_table('reviews.reviews')
except NoSuchTableError:
    pass

In [44]:
# Remove the namespace left over from a previous run so it can be recreated.
# A missing namespace (fresh catalog) is tolerated; any other failure, such
# as the namespace still containing tables, is allowed to propagate.
try:
    catalog.drop_namespace('reviews')
except NoSuchNamespaceError:
    pass

In [45]:
# Show the namespaces currently in the catalog (checked right after the drop).
catalog.list_namespaces()

[]

In [46]:
# Create the 'reviews' namespace that will hold the reviews table.
catalog.create_namespace("reviews")

In [47]:
# List namespaces again to confirm the namespace was created.
catalog.list_namespaces()

[('reviews',)]

In [48]:
# Create the Iceberg table 'reviews.reviews', deriving its schema from the
# PyArrow table loaded earlier. The returned table object is the cell output.
catalog.create_table(
    "reviews.reviews",
    schema=arrow_table.schema,
)

reviews(
  1: recommendationid: optional long,
  2: language: optional string,
  3: timestamp_created: optional timestamptz,
  4: timestamp_updated: optional timestamptz,
  5: voted_up: optional boolean,
  6: votes_up: optional long,
  7: votes_funny: optional long,
  8: weighted_vote_score: optional double,
  9: comment_count: optional long,
  10: steam_purchase: optional boolean,
  11: received_for_free: optional long,
  12: written_during_early_access: optional long,
  13: hidden_in_steam_china: optional long,
  14: steam_china_location: optional string,
  15: author_steamid: optional long,
  16: author_num_games_owned: optional long,
  17: author_num_reviews: optional long,
  18: author_playtime_forever: optional long,
  19: author_playtime_last_two_weeks: optional long,
  20: author_playtime_at_review: optional long,
  21: author_last_played: optional timestamptz,
  22: year_created: optional long,
  23: month_created: optional long,
  24: game_id: optional string
),
partition by:

In [55]:
# Fetch a handle to the 'reviews.reviews' table from the catalog.
table = catalog.load_table("reviews.reviews")

In [56]:
# Write the PyArrow data into the Iceberg table, replacing its contents.
table.overwrite(arrow_table)

In [65]:
# Scan only the selected columns from the Iceberg table and hand the result
# to DuckDB under the name 'reviews', which the SQL query below reads from.
conn = (
    table
    .scan(selected_fields=("language", "game_id", "voted_up"))
    .to_duckdb("reviews")
)

In [82]:
# Find the most-reviewed game for each language:
#   1. lang_reviews counts reviews per (language, game_id) pair.
#   2. max_reviews ranks games within each language by that count, highest
#      first. RANK() is used, so games tied for first place are all kept.
#   3. The final select keeps only rank 1 and orders languages by review
#      count, largest first.
r = conn.sql("""with lang_reviews as (
    SELECT language, game_id, count(*) as num_reviews 
    FROM reviews group by all
), max_reviews as (
    select 
    language, 
    game_id, 
    num_reviews,
    RANK() OVER (partition by language order by num_reviews desc) as ordering 
    from lang_reviews
)
select language, game_id, num_reviews from max_reviews
where ordering = 1
order by num_reviews desc
    
""")

In [83]:
# Print the query result (top game per language) as a text table.
r.show()

┌────────────┬─────────┬─────────────┐
│  language  │ game_id │ num_reviews │
│  varchar   │ varchar │    int64    │
├────────────┼─────────┼─────────────┤
│ english    │ 730     │     2102886 │
│ russian    │ 730     │     2006616 │
│ schinese   │ 578080  │     1166691 │
│ brazilian  │ 730     │      435306 │
│ polish     │ 730     │      417014 │
│ turkish    │ 730     │      389142 │
│ spanish    │ 730     │      283405 │
│ german     │ 730     │      205558 │
│ french     │ 730     │      123453 │
│ koreana    │ 578080  │      119080 │
│  ·         │  ·      │         ·   │
│  ·         │  ·      │         ·   │
│  ·         │  ·      │         ·   │
│ thai       │ 730     │       22326 │
│ latam      │ 730     │       19880 │
│ italian    │ 730     │       19369 │
│ dutch      │ 730     │       17458 │
│ norwegian  │ 730     │       14019 │
│ vietnamese │ 730     │       10445 │
│ bulgarian  │ 730     │        9670 │
│ japanese   │ 1172470 │        8976 │
│ greek      │ 730     │ 