In [1]:
# Load the lab_black extension for this kernel session.
%load_ext lab_black
import os

# Exported before the heavier libraries below are imported.
# NOTE(review): presumably meant to quiet TensorFlow's native INFO/WARNING logging —
# confirm that TensorFlow is actually loaded somewhere in this environment.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

In [2]:
import spacy
import time
import datetime

import pandas as pd
import numpy as np

from psycopg2 import connect
from IPython.display import clear_output

In [3]:
# Pull every article that has both a title and a body from the `source_text` table.
cmd = """
      SELECT *
      FROM source_text
      WHERE 
          text IS NOT NULL AND
          title IS NOT NULL;
      """

# Read the password on its own so the file handle is released before the database
# work starts. The line ending is dropped because it is not part of the password.
with open("../etc/postgres.password") as psql_pass_file:
    postgres_password = psql_pass_file.read().rstrip("\r\n")

# Keyword arguments avoid having to quote/escape the password inside a DSN string.
conn = connect(
    host="localhost", dbname="gdelt", user="postgres", password=postgres_password
)
try:
    # A plain SELECT needs no commit; the cursor is closed by its context manager.
    with conn.cursor() as cursor:
        cursor.execute(cmd)
        text_data = cursor.fetchall()
finally:
    # psycopg2's `with conn:` only ends the transaction, so close explicitly.
    conn.close()

# NOTE(review): the column names assume `SELECT *` returns exactly these six
# columns in this order — confirm against the table definition.
text_df = pd.DataFrame(
    text_data, columns=["id", "source", "date", "day", "title", "text"]
)

# Convert to datetimes *before* comparing against the cutoff so the filter always
# compares like with like.
text_df["date"] = pd.to_datetime(text_df["date"])

# NOTE(review): the 1921 cutoff presumably removes bogus/sentinel dates — confirm.
text_df = text_df[text_df["date"] > pd.Timestamp("1921-01-01")].sort_values(
    by="date"
)

# Snap every article date back to the Monday of its week.
# https://stackoverflow.com/questions/51208570/get-the-first-day-of-the-week-for-a-pandas-series
text_df = text_df.assign(
    date=text_df["date"] - pd.to_timedelta(text_df["date"].dt.weekday, unit="D")
).set_index("date")

# Number of articles per week; used later to replicate the weekly labels.
week_counts = text_df.groupby("date")["id"].count()
week_counts

date
2005-02-28      1
2005-03-07      2
2005-03-28      1
2005-04-11      1
2005-04-25      3
             ... 
2022-12-26    190
2023-01-02    213
2023-01-09    212
2023-01-16    201
2023-01-23    229
Name: id, Length: 588, dtype: int64

In [4]:
# Gas price series keyed by the CSV's "date" column.
gas_prices = pd.read_csv("../data/combined.csv").set_index("date")["gas_price"]

# Row-over-row change expressed as a fraction of the *current* row's price.
relative_change = gas_prices.diff() / gas_prices

# Lag by one row so each date carries the previous row's change; the leading
# rows that have no value after the lag are filled with 0.
delta = relative_change.shift(1).fillna(0)

In [5]:
def binary_classes(delta):
    """Encode the direction of each change as +1 or -1.

    Values greater than or equal to zero become 1 and negative values become -1.
    Entries for which neither comparison holds (e.g. NaN) come out as 0.
    """
    not_falling = (delta >= 0).astype(int)
    falling = (delta < 0).astype(int)

    return not_falling - falling


def ternary_classes(delta, threshold):
    """Encode each change as +1, 0 or -1 using a symmetric dead band.

    Values at or above ``threshold`` become 1, values at or below ``-threshold``
    become -1, and everything strictly in between becomes 0.
    """
    rose = (delta >= threshold).astype(int)
    fell = (delta <= -threshold).astype(int)

    return rose - fell

In [6]:
# Two encodings of the same lagged price change.
binary_labels = binary_classes(delta)
# this threshold produces relatively balanced classes after null rows are dropped
ternary_labels = ternary_classes(delta, 0.01)

labels = pd.DataFrame({"binary": binary_labels, "ternary": ternary_labels})

# Re-key the labels on a datetime "date" index so they can be aligned with the
# weekly article counts.
labels = labels.reset_index()
labels["date"] = pd.to_datetime(labels["date"]).values
labels = labels.set_index("date")

# Keep only the weeks present in both the labels and the article counts.
label_counts = pd.concat(
    [labels, week_counts.rename("exploder")], axis=1, join="inner"
)

# Repeat each week's label row once per article published that week: put a list
# of that length in a scratch column, explode it, then discard the scratch column.
label_counts["exploder"] = [
    [1] * int(n_articles) for n_articles in label_counts["exploder"].values
]
label_counts = label_counts.explode("exploder").drop(columns=["exploder"])

In [7]:
# Attach the weekly price labels to every article by joining on the shared week
# ("date") index rather than pasting two frames side by side by row position.
# Positional pasting silently mislabels every later article as soon as one
# article week has no price label, or the price file is not in date order.
# The inner join keeps only articles whose week has a label.
data = (
    text_df.join(labels[["binary", "ternary"]], how="inner")
    .rename_axis("date")
    .sort_index(kind="stable")  # chronological; stable keeps within-week order
)
data

Unnamed: 0_level_0,id,source,day,title,text,binary,ternary
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2005-02-28,11601,http://www.4-traders.com/news/Top-Canada-econo...,20050228,Top Canada economy forecaster: Lower oil to hu...,Brent crude prices <LCOc1> halved between June...,-1,0
2005-03-07,23637,http://www.youthhealthmag.com/articles/11229/2...,20050308,Are You Using Fake Olive Oil?,Sign up to get the latest news delivered to yo...,1,0
2005-03-07,27148,http://www.4-traders.com/GAZPROM-OAO-6491735/n...,20050311,GAZPROM : Stock Market News and Information,Or log in with,1,0
2005-03-28,51079,http://customstoday.com.pk/ukraine-pays-329-pe...,20050328,Ukraine pays $329 per cubic meters of gas to R...,MOSCOW: Ukraine has paid $329 per thousand cub...,1,1
2005-04-11,76974,http://www.naturalgasintel.com/articles/101965...,20050415,Millennials’ Push for ‘Circular Economy’ Signa...,Opponents of oil and natural gas have discover...,1,1
...,...,...,...,...,...,...,...
2023-01-23,3470364,https://www.digitaljournal.com/world/protester...,20230124,Protesters brave tear gas to demand ouster of ...,A woman confronts a riot police cordon during ...,1,1
2023-01-23,3470749,https://www.aljazeera.com/news/2023/1/24/ugand...,20230124,"Uganda launches first oil drilling programme, ...",Uganda discovered commercial oil reserves near...,1,1
2023-01-23,3470519,https://wcfcourier.com/lifestyles/technology/n...,20230124,"Nevada governor touts education investment, su...","CARSON CITY, Nev. (AP) — Nevada Gov. Joe Lomba...",1,1
2023-01-23,3470640,https://www.finanznachrichten.de/nachrichten-2...,20230124,Oil-Dri Corporation Of America: Amlan® Interna...,"CHICAGO, Jan. 24, 2023 (GLOBE NEWSWIRE) -- Aml...",1,1


In [8]:
# Load the spaCy pipeline that later cells use to tokenise and lemmatise the
# article titles and bodies.
nlp = spacy.load("en_core_web_lg")

In [9]:
def _clean_doc(doc):
    """Return the lower-cased lemmas of ``doc`` joined by single spaces.

    Stop words, punctuation, out-of-vocabulary tokens, whitespace tokens and
    pure-digit tokens are dropped.
    """
    return " ".join(
        str(token.lemma_).lower()
        for token in doc
        if not (
            token.is_stop
            or token.is_punct
            or token.is_oov
            or token.is_space
            or token.is_digit
        )
    )


def _format_duration(seconds):
    """Format a duration in seconds as ``H:MM:SS``.

    The value is rounded to whole seconds first, so the seconds field can never
    read ``60``. Hours are not wrapped into days.
    """
    total_seconds = max(0, int(round(seconds)))
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours}:{minutes:02}:{secs:02}"


N = len(data)
# Total characters to process; the remaining-time estimate is weighted by
# characters rather than by row count.
N_chars = data["title"].apply(len).sum() + data["text"].apply(len).sum()

title_cleaned = []
text_cleaned = []

start_time = time.time()

chars_processed = 0

# The recorded full run over 75666 rows took about 1h11m.
for idx, (raw_title, raw_text) in enumerate(zip(data["title"], data["text"])):
    title_cleaned.append(_clean_doc(nlp(raw_title)))
    text_cleaned.append(_clean_doc(nlp(raw_text)))

    chars_processed += len(raw_title) + len(raw_text)

    elapsed_time = time.time() - start_time
    elapsed_time_string = _format_duration(elapsed_time)

    if chars_processed:
        # Extrapolate linearly from the characters processed so far.
        estimated_time_remaining = (
            elapsed_time * (N_chars) / (chars_processed)
        ) - elapsed_time
        estimated_remaining_time_string = _format_duration(estimated_time_remaining)
    else:
        # Only empty strings seen so far: no rate to extrapolate from yet.
        estimated_remaining_time_string = "unknown"

    clear_output(wait=True)
    print(f"{idx + 1}/{N} rows parsed")
    print(f"Elapsed time: {elapsed_time_string}")
    print(f"Estimated remaining time: {estimated_remaining_time_string}")

75666/75666 rows parsed
Elapsed time: 1:11:06
Estimated remaining time: 0:00:00


In [10]:
# Store the cleaned strings next to the raw columns; both lists are in the same
# row order as `data`.
data = data.assign(title_cleaned=title_cleaned, text_cleaned=text_cleaned)

In [11]:
def _load_terms(path):
    """Read one search term per line from ``path``.

    Surrounding whitespace (including the line ending) is stripped and blank
    lines are skipped. A term that still carried its newline could never be found
    in the space-joined cleaned text, and an empty term would match every article.
    """
    with open(path, "r") as term_file:
        return [line.strip() for line in term_file if line.strip()]


def _mentions_any(frame, terms):
    """Return a 0/1 Series flagging rows that mention any of ``terms``.

    A row is flagged when any term occurs, case-insensitively, as a substring of
    its ``title_cleaned`` or ``text_cleaned`` value.
    """
    needles = [term.lower() for term in terms]

    def contains_term(cleaned):
        haystack = cleaned.lower()
        return any(needle in haystack for needle in needles)

    in_title = frame["title_cleaned"].apply(contains_term)
    in_text = frame["text_cleaned"].apply(contains_term)
    return (in_title | in_text).astype(int)


opec_terms = _load_terms("../data/opec_members.txt")
brent_terms = _load_terms("../data/brent_producers.txt")
wti_terms = _load_terms("../data/wti_producers.txt")

# NOTE(review): this is plain substring matching against lemmatised text, so a
# term can match inside a longer word and multi-word terms only match if their
# lemmas survive the cleaning step unchanged — confirm that is acceptable.
data["is_opec"] = _mentions_any(data, opec_terms)
data["is_brent"] = _mentions_any(data, brent_terms)
data["is_wti"] = _mentions_any(data, wti_terms)

In [12]:
# Count the whitespace-separated lemmas left in each cleaned title and body.
for source_column in ("title", "text"):
    data[f"{source_column}_lemma_count"] = (
        data[f"{source_column}_cleaned"].str.split().str.len()
    )

In [13]:
# Keep only articles with more than one title lemma and more than 150 body lemmas.
title_long_enough = data["title_lemma_count"].gt(1)
text_long_enough = data["text_lemma_count"].gt(150)
corpus = data.loc[title_long_enough & text_long_enough]

In [14]:
# Persist the filtered corpus; the "date" index is written as the first CSV column.
corpus.to_csv("../data/corpus.csv")