In [1]:
import duckdb

In [2]:
# Open a throwaway in-memory DuckDB database; the queries below read the
# CSV/Parquet files directly from disk, so nothing needs to be persisted here.
conn = duckdb.connect(database=":memory:")

In [3]:
# Show the on-disk size of each raw CSV file in the dataset directory
# (human-readable, one summary line per file).
!du -sh /data/reddit-climate/*.csv

3.9G	/data/reddit-climate/the-reddit-climate-change-dataset-comments.csv
250M	/data/reddit-climate/the-reddit-climate-change-dataset-posts.csv


In [4]:
# Peek at the start of the comments CSV to see the header row and how the
# free-text `body` column is quoted.
!head /data/reddit-climate/the-reddit-climate-change-dataset-comments.csv

type,id,subreddit.id,subreddit.name,subreddit.nsfw,created_utc,permalink,body,sentiment,score
comment,imlddn9,2qh3l,news,false,1661990368,https://old.reddit.com/r/news/comments/x2cszk/us_life_expectancy_down_for_secondstraight_year/imlddn9/,"Yeah but what the above commenter is saying is their base doesn’t want any of that. They detest all of those things, even the small gradual changes. Investing in nuclear energy is a tacit acknowledgement of man made climate change. Any acknowledgement or concession and they will be primaried out in a minute",0.5719,2
comment,imldbeh,2qn7b,ohio,false,1661990340,https://old.reddit.com/r/Ohio/comments/x2awnp/state_government_may_soon_kill_a_solar_project_in/imldbeh/,"Any comparison of efficiency between solar and fossil fuels is nonsensical at best and intentionally misleading at worst. In no universe is light -&gt; photovoltaic cell -&gt; electricity less efficient than light -&gt; entire food chain -&gt; biomass -&gt; decomposition -&gt; millions of

In [5]:
# Baseline row count, read straight from the raw CSV. DuckDB treats the quoted
# file path as a table. Uses `conn.execute` like every other query cell in
# this notebook so all queries go through the same API.
conn.execute("""
SELECT COUNT(*)
FROM '/data/reddit-climate/the-reddit-climate-change-dataset-comments.csv'
""").fetchall()

[(4600698,)]

In [6]:
import glob
import os

import pyarrow as pa
from pyarrow.csv import open_csv, ParseOptions
import pyarrow.parquet as pq

# Convert every CSV in the dataset directory into a Parquet file stored next
# to it (same name, `.parquet` extension). Files are streamed batch by batch,
# so the multi-gigabyte comments file never has to fit in memory.
#
# glob (instead of `!ls`) returns an empty list when nothing matches, rather
# than handing an error message to the loop as if it were a filename.
csv_files = sorted(glob.glob("/data/reddit-climate/*.csv"))

for filename in csv_files:
    destination_file = os.path.splitext(filename)[0] + ".parquet"

    # Decide whether there is anything to do *before* opening the source file.
    if os.path.isfile(destination_file):
        print(f"Skipping {filename}: {destination_file} already exists")
        continue

    print(f"Reading {filename}...")

    # Write to a temporary name and rename only on success. Otherwise an
    # interrupted run would leave a truncated .parquet behind that the
    # existence check above would treat as a finished conversion.
    partial_file = destination_file + ".partial"
    try:
        with pa.memory_map(filename) as source:
            # newlines_in_values=True: quoted free-text fields may contain
            # embedded line breaks, which the default parser rejects.
            reader = open_csv(
                source,
                parse_options=ParseOptions(newlines_in_values=True),
            )
            with pq.ParquetWriter(partial_file, reader.schema) as writer:
                for batch in reader:
                    writer.write_batch(batch)
        os.replace(partial_file, destination_file)
    finally:
        # After a successful rename the partial file no longer exists, so this
        # only cleans up after a failure or a kernel interrupt.
        if os.path.exists(partial_file):
            os.remove(partial_file)

Reading /data/reddit-climate/the-reddit-climate-change-dataset-comments.csv...
Reading /data/reddit-climate/the-reddit-climate-change-dataset-posts.csv...


In [7]:
# Show the on-disk size of the generated Parquet files, for comparison with
# the CSV sizes listed earlier.
!du -sh /data/reddit-climate/*.parquet

2.3G	/data/reddit-climate/the-reddit-climate-change-dataset-comments.parquet
97M	/data/reddit-climate/the-reddit-climate-change-dataset-posts.parquet


In [8]:
# Sanity check on the conversion: the Parquet file should report the same
# number of rows as the CSV it was generated from.
parquet_count_sql = """
SELECT COUNT(*)
FROM '/data/reddit-climate/the-reddit-climate-change-dataset-comments.parquet'
"""
conn.execute(parquet_count_sql).fetchall()

[(4600698,)]

---

In [9]:
# Expose the comments Parquet file under the short name `comments` so later
# queries do not have to repeat the full path.
# OR REPLACE keeps this cell idempotent: re-running it on a live kernel
# redefines the view instead of failing because `comments` already exists.
conn.execute("""
CREATE OR REPLACE VIEW comments AS
SELECT * FROM '/data/reddit-climate/the-reddit-climate-change-dataset-comments.parquet'
""").fetchall()

[]

In [10]:
# The ten subreddits with the most comments in the dataset, largest first.
# Each result row is (subreddit_name, num_comments).
top_subreddits_sql = """
SELECT
  "subreddit.name" AS subreddit_name,
  COUNT(*) AS num_comments,
FROM comments
GROUP BY subreddit_name
ORDER BY num_comments DESC
LIMIT 10
"""
conn.execute(top_subreddits_sql).fetchall()

[('politics', 370018),
 ('worldnews', 351195),
 ('askreddit', 259848),
 ('collapse', 94696),
 ('news', 94558),
 ('futurology', 89945),
 ('science', 71453),
 ('environment', 70444),
 ('canada', 66813),
 ('australia', 60239)]

In [11]:
# Sentiment statistics for the ten most-commented subreddits.
# The inner query selects the ten subreddit names with the highest comment
# counts; the outer query then aggregates comment count, mean sentiment and
# sentiment standard deviation for just those subreddits.
# Each result row is (subreddit_name, num_comments, average_sentiment,
# stddev_sentiment), ordered by comment count, largest first.
sentiment_by_subreddit_sql = """
SELECT
  "subreddit.name" AS subreddit_name,
  COUNT(*) AS num_comments,
  AVG(sentiment) AS average_sentiment,
  STDDEV(sentiment) AS stddev_sentiment,
FROM comments
WHERE subreddit_name IN (
  SELECT "subreddit.name" AS subreddit_name
  FROM comments
  GROUP BY subreddit_name
  ORDER BY COUNT(*) DESC
  LIMIT 10
)
GROUP BY subreddit_name
ORDER BY num_comments DESC
"""
conn.execute(sentiment_by_subreddit_sql).fetchall()

[('politics', 370018, -0.018118589649651688, 0.6600297061408),
 ('worldnews', 351195, -0.05800158738790874, 0.6405990095462698),
 ('askreddit', 259848, -0.06863721863923522, 0.6089748718101494),
 ('collapse', 94696, -0.13326616263904179, 0.6667106776062686),
 ('news', 94558, -0.09367126059175668, 0.6276134461239254),
 ('futurology', 89945, 0.00186374891156306, 0.6506820198836242),
 ('science', 71453, 0.04588216852922977, 0.6248484283076321),
 ('environment', 70444, -0.015670189810189975, 0.6467846578160411),
 ('canada', 66813, 0.021118244331091468, 0.6408319443539481),
 ('australia', 60239, -0.021869519296548054, 0.6405803819103516)]