In [2]:
# Install PySpark into the kernel's environment. Restart the Jupyter kernel
# afterwards if imports misbehave (the installer prints the same advice).
%package install pyspark

Running: micromamba install pyspark --yes --log-level=error

  Package     Version  Build         Channel          Size
────────────────────────────────────────────────────────────
  Install:
────────────────────────────────────────────────────────────

  [32m+ py4j   [0m  0.10.9.7  pyhd8ed1ab_0  conda-forge     186kB
  [32m+ pyspark[0m     3.5.5  pyhd8ed1ab_0  conda-forge     311MB

  Summary:

  Install: 2 packages

  Total download: 311MB

Note: Packages not from Bloomberg channels are not vetted by Bloomberg.
[93mPlease restart the Jupyter kernel if you run into any issues after installing or updating packages via %package.[0m



In [3]:
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
from pyspark.sql.types import TimestampType

import boto3
from IPython.display import display

import os

In [4]:
# Per-user settings read from the process environment; a missing variable
# raises KeyError right here.
user_bucket_name = os.environ["BQUANT_SANDBOX_USER_BUCKET"]
bqnt_username = os.environ["BQUANT_USERNAME"]

def get_spark_session(
    executors="10",
    executor_memory="8g",
    driver_memory="32g",
    executor_cores="2",
    driver_max_result_size="1024M",
    executor_memory_overhead="2g",
    task_cpus="1",
):
    """Build (or reuse) a SparkSession with the notebook's tuning and display it.

    Every argument is passed to Spark as a string configuration value.

    Args:
        executors: Value for ``spark.executor.instances``.
        executor_memory: Value for ``spark.executor.memory``.
        driver_memory: Value for ``spark.driver.memory``.
        executor_cores: Value for ``spark.executor.cores``.
        driver_max_result_size: Value for ``spark.driver.maxResultSize``.
        executor_memory_overhead: Value for ``spark.executor.memoryOverhead``.
        task_cpus: Value for ``spark.task.cpus``.

    Returns:
        The SparkSession returned by ``getOrCreate()``.

    NOTE(review): a recorded run of this notebook failed inside this function
    with ``FileNotFoundError: '/opt/spark/./bin/spark-submit'``; presumably
    SPARK_HOME points at a missing Spark install -- confirm the kernel
    environment before relying on this helper.
    """
    # Resource sizing supplied by the caller.
    sizing = {
        "spark.driver.memory": driver_memory,
        "spark.driver.maxResultSize": driver_max_result_size,
        "spark.executor.memoryOverhead": executor_memory_overhead,
        "spark.executor.instances": executors,
        "spark.executor.memory": executor_memory,
        "spark.executor.cores": executor_cores,
        "spark.task.cpus": task_cpus,
    }

    # Fixed tuning shared by every session this notebook creates.
    tuning = {
        # Spark 3.x name of the Arrow switch; the former
        # "spark.sql.execution.arrow.enabled" key is deprecated.
        "spark.sql.execution.arrow.pyspark.enabled": "true",
        "spark.shuffle.file.buffer": "1m",
        "spark.file.transferTo": "False",
        "spark.shuffle.unsafe.file.output.buffer": "1m",
        "spark.io.compression.lz4.blockSize": "512k",
        "spark.shuffle.service.index.cache.size": "1g",
        "spark.shuffle.registration.timeout": "120000ms",
        "spark.shuffle.registration.maxAttempts": "3",
        "spark.sql.windowExec.buffer.spill.threshold": "1000000",
        "spark.sql.windowExec.buffer.in.memory.threshold": "1000000",
    }

    builder = SparkSession.builder
    for key, value in {**sizing, **tuning}.items():
        builder = builder.config(key, value)

    spark = builder.getOrCreate()

    # Render the session summary in the notebook output.
    display(spark)

    return spark

In [5]:
# Session sized for the full news data set: 100 executors, 8g and 2 cores each.
spark = get_spark_session(
    executors="100",
    executor_memory="8g",
    executor_cores="2",
)

FileNotFoundError: [Errno 2] No such file or directory: '/opt/spark/./bin/spark-submit'

In [None]:
%%time
bucket_name = "bquant-data-textual-analytics-tier-1"
bucket = boto3.resource("s3").Bucket(bucket_name)
files = [file.key for file in bucket.objects.all()]

files_csv = [
    f"s3://{bucket_name}/{file}"
    for file in files
    if "EID80001" in file and "csv" in file
]

df = (
    spark.read.option("header", "true")
    .option("multiLine", "true")
    .option("escape", "")
    .csv(files_csv)
)

In [None]:
# Identifiers matched against the "Assigned_ID_BB_GLOBAL" column when the
# news is filtered (presumably the Bloomberg global IDs of the index
# constituents -- confirm against the index definition).
index_members = [
    "BBG000B9XRY4", "BBG000BBJQV0", "BBG000BBS2Y0",
    "BBG000BCQZS4", "BBG000BCSST7", "BBG000BF0K17",
    "BBG000BH4R78", "BBG000BJ81C1", "BBG000BKZB36",
    "BBG000BLNNH6", "BBG000BMHYD1", "BBG000BMX289",
    "BBG000BN2DC2", "BBG000BNSZP1", "BBG000BP52R2",
    "BBG000BPD168", "BBG000BPH459", "BBG000BR2B91",
    "BBG000BR2TH3", "BBG000BSXQV7", "BBG000BVPV84",
    "BBG000BW8S60", "BBG000BWLMJ4", "BBG000BWXBC2",
    "BBG000C0G1D1", "BBG000C3J3C9", "BBG000C5HS04",
    "BBG000C6CFJ5", "BBG000CH5208", "BBG000DMBXR2",
    "BBG000GZQ728", "BBG000H556T9", "BBG000HS77T5",
    "BBG000K4ND22", "BBG000PSKYX7", "BBG00BN96922",
]

In [None]:
%%time
from functools import reduce

topics = {
    'INDU': 'ALLTOP'
}

topic_filter = reduce(
    lambda x, y: x | y,
    [
        F.array_contains(F.split(F.col(col), ";"), topic)
        for topic in topics.values()
        for col in ["AssignedTopicsId", "DerivedTopicsId"]
    ],
)

# Filter for just BBG news or include all news articles in the analysis.
wire_filter = (F.col("WireName") == "BN") | (F.col("WireName") == "BFW")

filters = (
    topic_filter
    & wire_filter
    & (F.col("LanguageString") == "ENGLISH")
    & (F.length(F.col("Headline")) > 25)
    & (F.col("TimeOfArrival") >= "2009-11-01")
    & (F.col("Assigned_ID_BB_GLOBAL").isin(index_members))
    & (F.col("Headline").startswith("*"))
)

df = df.withColumn("TimeOfArrival", F.col("TimeOfArrival").cast(TimestampType()))
df1 = df.filter(filters)

df1 = df1.cache()
df1.count()