In [1]:
import duckdb

In [2]:
# Open the DuckDB connection shared by the conn.query()/conn.execute() cells
# below. No database path is passed; per DuckDB's documentation that yields a
# transient in-memory database, so nothing created here outlives the kernel.
conn = duckdb.connect()

In [3]:
# On-disk size of each raw CSV export, as the baseline for the Parquet
# conversion further down.
!du -sh /data/reddit-climate/*.csv

3.9G	/data/reddit-climate/the-reddit-climate-change-dataset-comments.csv
250M	/data/reddit-climate/the-reddit-climate-change-dataset-posts.csv


In [4]:
# Peek at the header row and the first records of the comments CSV to see the
# column names (note the dotted ones such as subreddit.name, which need
# double quotes in SQL).
!head /data/reddit-climate/the-reddit-climate-change-dataset-comments.csv

type,id,subreddit.id,subreddit.name,subreddit.nsfw,created_utc,permalink,body,sentiment,score
comment,imlddn9,2qh3l,news,false,1661990368,https://old.reddit.com/r/news/comments/x2cszk/us_life_expectancy_down_for_secondstraight_year/imlddn9/,"Yeah but what the above commenter is saying is their base doesn’t want any of that. They detest all of those things, even the small gradual changes. Investing in nuclear energy is a tacit acknowledgement of man made climate change. Any acknowledgement or concession and they will be primaried out in a minute",0.5719,2
comment,imldbeh,2qn7b,ohio,false,1661990340,https://old.reddit.com/r/Ohio/comments/x2awnp/state_government_may_soon_kill_a_solar_project_in/imldbeh/,"Any comparison of efficiency between solar and fossil fuels is nonsensical at best and intentionally misleading at worst. In no universe is light -&gt; photovoltaic cell -&gt; electricity less efficient than light -&gt; entire food chain -&gt; biomass -&gt; decomposition -&gt; millions of

In [5]:
# Row count of the raw comments CSV; the file path is used directly as the
# FROM target, so no table has to be created first.
csv_row_count_sql = """
SELECT COUNT(*)
FROM '/data/reddit-climate/the-reddit-climate-change-dataset-comments.csv'
"""
conn.query(csv_row_count_sql).fetchall()

[(4600698,)]

In [6]:
import os

import pyarrow as pa
from pyarrow.csv import open_csv, ParseOptions
import pyarrow.parquet as pq

csv_files = !ls /data/reddit-climate/*.csv
csv_files

for filename in csv_files:
    print(f"Reading {filename}...")
    mmap = pa.memory_map(filename)

    reader = open_csv(mmap, parse_options=ParseOptions(newlines_in_values=True))

    destination_file = os.path.splitext(filename)[0] + ".parquet"
    if os.path.isfile(destination_file):
        continue

    with pq.ParquetWriter(destination_file, reader.schema) as writer:
        while True:
            try:
                batch = reader.read_next_batch()
                writer.write_batch(batch)
            except StopIteration:
                break

Reading /data/reddit-climate/the-reddit-climate-change-dataset-comments.csv...
Reading /data/reddit-climate/the-reddit-climate-change-dataset-posts.csv...


In [7]:
# On-disk size of the converted Parquet files, for comparison with the CSV
# sizes measured earlier.
!du -sh /data/reddit-climate/*.parquet

2.3G	/data/reddit-climate/the-reddit-climate-change-dataset-comments.parquet
97M	/data/reddit-climate/the-reddit-climate-change-dataset-posts.parquet


In [8]:
# Sanity check on the conversion: the Parquet file should report the same
# number of rows as the CSV it was built from.
parquet_row_count_sql = """
SELECT COUNT(*)
FROM '/data/reddit-climate/the-reddit-climate-change-dataset-comments.parquet'
"""
conn.execute(parquet_row_count_sql).fetchall()

[(4600698,)]

---

In [9]:
# Give the Parquet file a short name for the queries that follow.
# OR REPLACE keeps this cell re-runnable: a plain CREATE VIEW raises once
# `comments` already exists on this connection.
conn.execute("""
CREATE OR REPLACE VIEW comments AS
SELECT * FROM '/data/reddit-climate/the-reddit-climate-change-dataset-comments.parquet'
""")

<duckdb.DuckDBPyConnection at 0x7ff9250a0a70>

In [10]:
# The ten subreddits with the most comments in the dataset, busiest first.
# "subreddit.name" is double-quoted because the column name contains a dot.
busiest_subreddits_sql = """
SELECT
  "subreddit.name" AS subreddit_name,
  COUNT(*) AS num_comments,
FROM comments
GROUP BY subreddit_name
ORDER BY num_comments DESC
LIMIT 10
"""
conn.execute(busiest_subreddits_sql).fetchall()

[('politics', 370018),
 ('worldnews', 351195),
 ('askreddit', 259848),
 ('collapse', 94696),
 ('news', 94558),
 ('futurology', 89945),
 ('science', 71453),
 ('environment', 70444),
 ('canada', 66813),
 ('australia', 60239)]

In [11]:
# Comment count plus mean and standard deviation of `sentiment` for the ten
# busiest subreddits. The IN (...) subquery picks those ten by comment count;
# the outer query then aggregates only their rows.
sentiment_summary_sql = """
SELECT
  "subreddit.name" AS subreddit_name,
  COUNT(*) AS num_comments,
  AVG(sentiment) AS average_sentiment,
  STDDEV(sentiment) AS stddev_sentiment,
FROM comments
WHERE subreddit_name IN (
  SELECT "subreddit.name" AS subreddit_name
  FROM comments
  GROUP BY subreddit_name
  ORDER BY COUNT(*) DESC
  LIMIT 10
)
GROUP BY subreddit_name
ORDER BY num_comments DESC
"""
conn.execute(sentiment_summary_sql).fetchall()

[('politics', 370018, -0.018118589649651677, 0.6600297061407984),
 ('worldnews', 351195, -0.058001587387908116, 0.6405990095462727),
 ('askreddit', 259848, -0.06863721863923525, 0.608974871810146),
 ('collapse', 94696, -0.1332661626390421, 0.6667106776062661),
 ('news', 94558, -0.09367126059175679, 0.6276134461239277),
 ('futurology', 89945, 0.001863748911563026, 0.6506820198836208),
 ('science', 71453, 0.04588216852922981, 0.6248484283076317),
 ('environment', 70444, -0.015670189810190065, 0.6467846578160436),
 ('canada', 66813, 0.021118244331091524, 0.6408319443539501),
 ('australia', 60239, -0.021869519296547842, 0.6405803819103513)]

In [12]:
# Load the IPython extension that provides the %sql magic used in the
# following cells.
%load_ext sql

In [13]:
# Point the %sql magic at a DuckDB in-memory database via a connection URL.
# NOTE(review): this looks like a separate database from `conn` above (the
# `comments` view is created again for it in the next cell) -- confirm.
%sql duckdb:///:memory:

In [14]:
%sql CREATE VIEW comments AS SELECT * FROM '/data/reddit-climate/the-reddit-climate-change-dataset-comments.parquet'

 * duckdb:///:memory:
Done.


Count


In [15]:
# Same row count as before, this time issued through the %sql magic against
# the `comments` view defined for that session.
%sql SELECT COUNT(*) FROM comments

 * duckdb:///:memory:
Done.


count_star()
4600698


---

In [16]:
# Keep the top-10 query as a relation object (`rel`) instead of fetching its
# rows right away; the following cells convert it to pandas, Arrow and Polars
# and chain further operations onto it.
top_subreddits_sql = """
SELECT
  "subreddit.name" AS subreddit_name,
  COUNT(*) AS num_comments,
FROM comments
GROUP BY subreddit_name
ORDER BY num_comments DESC
LIMIT 10
"""
rel = conn.query(top_subreddits_sql)

In [17]:
# Show which Python type conn.query() handed back.
type(rel)

duckdb.DuckDBPyRelation

In [18]:
# Display the relation's own repr (result columns and a preview of the rows).
rel

---------------------
--- Relation Tree ---
---------------------
Subquery

---------------------
-- Result Columns  --
---------------------
- subreddit_name (VARCHAR)
- num_comments (BIGINT)

---------------------
-- Result Preview  --
---------------------
subreddit_name	num_comments	
VARCHAR	BIGINT	
[ Rows: 10]
politics	370018
worldnews	351195
askreddit	259848
collapse	94696
news	94558
futurology	89945
science	71453
environment	70444
canada	66813
australia	60239



In [19]:
# Convert the relation's result into a pandas DataFrame and display it.
rel.df()  # pandas

Unnamed: 0,subreddit_name,num_comments
0,politics,370018
1,worldnews,351195
2,askreddit,259848
3,collapse,94696
4,news,94558
5,futurology,89945
6,science,71453
7,environment,70444
8,canada,66813
9,australia,60239


In [20]:
import polars as pl

In [21]:
# Export the relation's result as Arrow data. `data` is reused below, both to
# build a Polars DataFrame and to register a named table on `conn`.
data = rel.arrow()  # Arrow data

In [22]:
# Build a Polars DataFrame from the Arrow data exported above and display it.
pl.DataFrame(data)  # Polars

subreddit_name,num_comments
str,i64
"""politics""",370018
"""worldnews""",351195
"""askreddit""",259848
"""collapse""",94696
"""news""",94558
"""futurology""",89945
"""science""",71453
"""environment""",70444
"""canada""",66813
"""australia""",60239


In [23]:
# Keep the pandas result under a name; the SQL cells below refer to this
# Python variable directly in their FROM clause.
df_most_comments = rel.df()

In [24]:
# First five rows of the pandas DataFrame.
df_most_comments.head()  # pandas

Unnamed: 0,subreddit_name,num_comments
0,politics,370018
1,worldnews,351195
2,askreddit,259848
3,collapse,94696
4,news,94558


In [25]:
# `df_most_comments` is never registered with DuckDB; the FROM clause simply
# names the pandas DataFrame variable defined a few cells up, and the query
# reads from it.
first_five_from_df_sql = """
SELECT subreddit_name
FROM df_most_comments  -- Sorcery!
LIMIT 5
"""
conn.execute(first_five_from_df_sql).fetchall()

[('politics',), ('worldnews',), ('askreddit',), ('collapse',), ('news',)]

In [26]:
# Register the Arrow data on the connection under an explicit table name, so
# SQL can refer to it as `most_comments_arrow` rather than by a Python
# variable name.
conn.register("most_comments_arrow", data)

<duckdb.DuckDBPyConnection at 0x7f55342cb670>

In [27]:
# Same query as for the DataFrame variable, now against the table name that
# was registered for the Arrow data.
first_five_from_arrow_sql = """
SELECT subreddit_name
FROM most_comments_arrow
LIMIT 5
"""
conn.execute(first_five_from_arrow_sql).fetchall()

[('politics',), ('worldnews',), ('askreddit',), ('collapse',), ('news',)]

In [28]:
# `output << ...` stores the query result in the Python variable `output`
# instead of displaying it. The query itself reads from the pandas DataFrame
# variable `df_most_comments`.
%sql output << SELECT subreddit_name FROM df_most_comments LIMIT 5

 * duckdb:///:memory:
Done.
Returning data to local variable output


In [29]:
# Convert the %sql result held in `output` to a pandas DataFrame and display it.
output.DataFrame()  # pandas

Unnamed: 0,subreddit_name
0,politics
1,worldnews
2,askreddit
3,collapse
4,news


---

In [30]:
# Chain further operations onto the relation: keep only subreddits with more
# than 100000 comments, sort them alphabetically, then convert to pandas.
(
    rel
    .filter("num_comments > 100000")
    .order("subreddit_name")
    .df()
)

Unnamed: 0,subreddit_name,num_comments
0,askreddit,259848
1,politics,370018
2,worldnews,351195


In [31]:
!./duckdb -c ' \
SELECT \
  "subreddit.name" AS subreddit_name, \
  COUNT(*) AS num_comments \
FROM "/data/reddit-climate/the-reddit-climate-change-dataset-comments.parquet" \
GROUP BY subreddit_name \
LIMIT 10 \
'

┌────────────────────┬──────────────┐
│   subreddit_name   │ num_comments │
├────────────────────┼──────────────┤
│ christianity       │ 9746         │
│ technology         │ 24145        │
│ videos             │ 26813        │
│ truscum            │ 93           │
│ gardening          │ 1390         │
│ conspiracy_commons │ 2252         │
│ askreddit          │ 259848       │
│ interestingasfuck  │ 17063        │
│ gamingcirclejerk   │ 2719         │
│ funnymemes         │ 271          │
└────────────────────┴──────────────┘
