In [1]:
%load_ext lab_black

In [2]:
import warnings
import pathlib
import newspaper

from psycopg2 import connect
from urllib3.exceptions import LocationParseError
from multiprocessing import Pool
from math import floor, ceil
from IPython.display import clear_output

In [3]:
def article_content_fetcher(row):
    """Download one article and cache its text and title under ``../data/raw``.

    Parameters
    ----------
    row : tuple
        ``(id, url, date, day)`` as returned by the sampling query. Only the
        id (used for the file names) and the url are used here.

    Returns
    -------
    None
        The function works purely through side effects: it writes
        ``<id>.text`` and ``<id>.title``. Both files are always produced, and
        they are empty when the article could not be retrieved.
    """
    path = pathlib.Path("../data/raw")
    path.mkdir(parents=True, exist_ok=True)
    _id, url, _date, _day = row

    text_dump = path / f"{_id}.text"
    title_dump = path / f"{_id}.title"

    # row_writer reads *both* files, so the article only counts as cached when
    # both exist; a run interrupted between the two writes is repaired here.
    if text_dump.is_file() and title_dump.is_file():
        return

    try:
        article = newspaper.Article(url=url)
        article.download()
        article.parse()

        article_text = article.text or ""
        article_title = article.title or ""

    except (TypeError, newspaper.ArticleException, LocationParseError):
        # TypeError will occur when there is no url
        # ArticleException will occur when the response forbids webscraping
        # not sure what causes LocationParseError, but it's rare
        article_text = ""
        article_title = ""

    # Scraped text is arbitrary Unicode: encode explicitly instead of relying
    # on the platform's default encoding. The .text file is written last so
    # that its presence implies the .title file is already on disk.
    title_dump.write_text(article_title, encoding="utf-8")
    text_dump.write_text(article_text, encoding="utf-8")

    return

In [4]:
def row_writer(row, conn):
    """Insert one sampled row and its cached article into ``source_text``.

    Parameters
    ----------
    row : tuple
        ``(id, url, date, day)`` as returned by the sampling query.
    conn : connection
        Open DB-API connection (psycopg2 in this notebook). The insert is
        committed on this connection before returning.

    Raises
    ------
    FileNotFoundError
        If ``../data/raw/<id>.title`` or ``../data/raw/<id>.text`` is missing.
    """
    _id, url, date, day = row
    path = pathlib.Path("../data/raw/")

    title = (path / f"{_id}.title").read_text(encoding="utf-8")
    text = (path / f"{_id}.text").read_text(encoding="utf-8")

    # Let the driver quote and escape every value. Interpolating them into the
    # statement produced invalid SQL for any text value (and was injectable),
    # and rendered a missing title/text as the bare token ``None``.
    cmd = (
        "INSERT INTO source_text(id, source, date, day, title, text) "
        "VALUES (%s, %s, %s, %s, %s, %s)"
    )
    # Empty strings mean "nothing was scraped"; store them as NULL.
    params = (_id, url, date, day, title or None, text or None)

    cursor = conn.cursor()
    try:
        cursor.execute(cmd, params)
    finally:
        cursor.close()

    conn.commit()

In [5]:
# Value interpolated into ``SELECT SETSEED(...)`` in the sampling query, i.e. the
# seed for PostgreSQL's RANDOM(). (PostgreSQL documents SETSEED as taking a
# value between -1 and 1.)
SQL_RANDOM_SEED = 0.42
# Upper bound on the per-week rank in the sampling query
# (``WHERE row <= ENTRIES_PER_WEEK``), i.e. the cap on rows kept per week.
ENTRIES_PER_WEEK = 100

In [6]:
# Multi-statement command executed in one round trip:
#   1. (re)create the destination table ``source_text``;
#   2. seed RANDOM() with SQL_RANDOM_SEED;
#   3. return up to ENTRIES_PER_WEEK randomly chosen rows (id, source, date, day)
#      per calendar week, with each source URL appearing at most once.
#
# De-duplication happens in its own CTE *before* ranking. PostgreSQL evaluates
# window functions before DISTINCT, so ranking and DISTINCT ON in the same
# SELECT ranks the duplicated rows and can leave far fewer than
# ENTRIES_PER_WEEK rows per week after the duplicates are dropped.
# ``ORDER BY SOURCEURL, id`` makes the surviving row per URL well defined, and
# ROW_NUMBER() (unlike RANK()) can never exceed the cap on ties.
cmd = f"""
      DROP TABLE IF EXISTS source_text;
      CREATE TABLE source_text(id integer PRIMARY KEY, source text, date timestamp, day text, title text, text text);
      SELECT SETSEED ({SQL_RANDOM_SEED});
      WITH unique_sources AS (
          SELECT
              DISTINCT ON (SOURCEURL) SOURCEURL AS source,
              id,
              TO_DATE(Day::text, 'YYYYMMDD') AS date,
              Day as day
          FROM events
          ORDER BY SOURCEURL, id
      ),
      grouped_by_week AS (
          SELECT
              source,
              id,
              date,
              day,
              ROW_NUMBER() OVER (PARTITION BY date_trunc('week', date) ORDER BY RANDOM()) as row
          FROM unique_sources
      )
      SELECT
          id,
          source,
          date,
          day
      FROM grouped_by_week
      WHERE row <= {ENTRIES_PER_WEEK};
      """

In [7]:
# Read the database password, then release the file straight away. A trailing
# newline (added by most editors) is dropped so it is not sent as part of the
# password.
with open("../etc/postgres.password") as psql_pass_file:
    postgres_password = psql_pass_file.read().rstrip("\r\n")

# The worker pool is created before the database connection, as in the
# original cell, so forked workers do not inherit the connection's socket.
pool = Pool()

# ``with pool`` terminates the workers when the download phase ends, including
# when it is interrupted, instead of leaving them running.
with pool:
    # Keyword arguments let psycopg2 build the connection string, so a password
    # containing quotes or spaces cannot break it.
    # NOTE: ``conn`` is intentionally left open so later cells can keep using it.
    conn = connect(
        host="localhost",
        dbname="gdelt",
        user="postgres",
        password=postgres_password,
    )
    cursor = conn.cursor()
    cursor.execute(cmd)
    rows = cursor.fetchall()
    conn.commit()

    total_rows = len(rows)
    print("Source article download progress 0% complete")
    for percent in range(100):
        # Consecutive, non-overlapping slices that together cover every row
        # exactly once. The previous floor/ceil bounds overlapped at the slice
        # edges, so boundary rows were dispatched twice.
        bot = percent * total_rows // 100
        top = (percent + 1) * total_rows // 100

        pool.map(article_content_fetcher, rows[bot:top])

        clear_output(wait=True)
        print(f"Source article download progress {percent + 1}% complete")

# Load the cached articles into PostgreSQL one row at a time.
for n, row in enumerate(rows):
    row_writer(row, conn)

    clear_output(wait=True)
    print("Source article download progress 100% complete")
    print(f"{n + 1}/{total_rows} rows written to postgreSQL table")

Source article download progress 12% complete


Process ForkPoolWorker-3:
Process ForkPoolWorker-7:
Process ForkPoolWorker-5:
Process ForkPoolWorker-2:
Process ForkPoolWorker-8:
Process ForkPoolWorker-4:
Traceback (most recent call last):
Process ForkPoolWorker-1:
Traceback (most recent call last):
Process ForkPoolWorker-6:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.10/multiprocessing/process.py

KeyboardInterrupt: 

  File "/home/awalsh/.venvs/ga/lib/python3.10/site-packages/newspaper/network.py", line 62, in get_html_2XX_only
    response = requests.get(
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.10/http/client.py", line 1374, in getresponse
    response.begin()
  File "/usr/lib/python3.10/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/home/awalsh/.venvs/ga/lib/python3.10/site-packages/requests/api.py", line 73, in get
    return request("get", url, params=params, **kwargs)
  File "/usr/lib/python3.10/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/usr/lib/python3.10/multiprocessing/pool.py", line 48, in mapstar
    return list(map(*args))
  File "/usr/lib/python3.10/http/client.py", line 318, in begin
    version, status, reason = self._read_status()
  File "/home/awalsh/.venvs/ga/lib/python3.10/site