In [1]:

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StructType, StructField, StringType, LongType, IntegerType
import sparknlp
spark = sparknlp.start() 
# sparknlp.start(gpu=True) >> for training on GPU
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from langdetect import detect
from pyspark.sql.functions import col,udf, lit, concat_ws
from pyspark.sql.types import FloatType

from sklearn.metrics import classification_report
import requests
from warcio import ArchiveIterator
from bs4 import BeautifulSoup
import time
import pandas as pd
import re
import yfinance as yf
import boto3
import random
import sys 
import numpy as np
import os
#PARAMETERS
path_dl_model = os.path.join(os.getcwd(), 'models/model_dl')  # directory the saved PipelineModel is loaded from
batch_size_max = sys.maxsize -1
num_records_percrawl = 10#int(os.environ['NUMRECORDS']) #number of records to attempt to extract from each crawl
ticker = 'SPY'

number_warcs_to_analyze = 10 #int(os.environ['NUMWARCS']) #number of warcs to perform sentiment analysis on, goes from most recent to farther back ones
randomsample = 'n' #str(os.environ['RANSAMPLE']).lower() #Y or N, if Y, then it will take a random sample of warcs to analyze, if N, it will take the most recent warcs


# Collect the key of every gzipped WARC under the CC-NEWS prefix of the
# 'commoncrawl' bucket. A comprehension is used so that no loop variable leaks
# into the notebook namespace (the previous loop variable was named `object`,
# which shadowed the builtin for every later cell).
s3 = boto3.resource('s3')
my_bucket = s3.Bucket('commoncrawl')
warcs = [
    s3_object.key
    for s3_object in my_bucket.objects.filter(Prefix='crawl-data/CC-NEWS/')
    if s3_object.key.endswith('.warc.gz')
]

if randomsample == 'y':
    # random.sample raises ValueError when asked for more items than exist,
    # so never request more than the number of WARCs actually listed.
    warcs = random.sample(warcs, min(number_warcs_to_analyze, len(warcs)))
else:
    # Take the tail of the listing; presumably the keys are returned in
    # chronological order so the tail is the most recent data -- TODO confirm.
    # A slice of [-0:] would return the whole list, hence the explicit guard.
    warcs = warcs[-number_warcs_to_analyze:] if number_warcs_to_analyze > 0 else []

# Turn the bucket keys into downloadable HTTPS URLs.
warcs = ['https://data.commoncrawl.org/' + warc_key for warc_key in warcs]


:: loading settings :: url = jar:file:/Users/johanvlassak/.virtualenvs/spark/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/johanvlassak/.ivy2/cache
The jars for the packages stored in: /Users/johanvlassak/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-eb788c25-f0cb-4c24-a2f5-cc3b1ae94c3f;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;4.4.0 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.828 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastutil;7.0.12 in central
	found org.projectlombok#lombok;1.16.8 in central
	found com.google.cloud#google-cloud-storage;2.16.0 in central
	found com.google.guava#guava;31.1-jre in central
	found com.googl

FileNotFoundError: [Errno 2] No such file or directory: './sentdat/topics.csv'

In [None]:
# Load the saved pipeline model from path_dl_model (set in the parameters cell).
# NOTE(review): PipelineModel is presumably brought into scope by one of the
# `from sparknlp... import *` lines -- confirm.
print('starting to load model')
FINDMODEL = PipelineModel.load(path_dl_model)
print('model loaded')

# Broadcast the model to avoid replication
# NOTE(review): a PipelineModel wraps JVM objects tied to the driver's
# SparkContext; broadcasting it for use inside a Python UDF is likely to fail
# or be unusable on executors. The usual approach is FINDMODEL.transform(df)
# on the whole DataFrame -- TODO confirm and rework.
FINDMODEL_broadcast = spark.sparkContext.broadcast(FINDMODEL)

# Create a user-defined function for the finance model
# NOTE(review): this UDF calls PipelineModel.transform on a single string value.
# transform is a DataFrame-level operation, and DataFrame operations are not
# available inside a UDF running on executors, so this is expected to fail at
# runtime -- TODO replace with a DataFrame-level FINDMODEL.transform(df).
@udf(returnType=FloatType())
def is_finance(article_text):
    """Score one article with the broadcast finance model and return the result as a float.

    Reads the model from FINDMODEL_broadcast, passes article_text to its
    transform method, and converts the first element of the
    'financial_model_pred.result' entry of the prediction to float.
    """
    local_model = FINDMODEL_broadcast.value
    prediction = local_model.transform(article_text)
    return float(prediction['financial_model_pred.result'][0])

# Function to drop non-finance articles
def drop_nonfinance_articles(df, text_col=None):
    """Return only the rows of df that the finance UDF scores as 1.0.

    Adds a 'finance' column holding is_finance(<text column>) and keeps the
    rows where that column equals 1.0.

    Parameters:
        df: Spark DataFrame containing the article text.
        text_col: name of the column holding the article text. When None, the
            column 'article_text' is used if df has it, otherwise 'text'. The
            DataFrame built in this notebook names its text column 'text', so
            the previously hard-coded 'article_text' lookup could not resolve.

    Returns:
        The filtered DataFrame, including the added 'finance' column.
    """
    if text_col is None:
        text_col = 'article_text' if 'article_text' in df.columns else 'text'
    scored = df.withColumn('finance', is_finance(df[text_col]))
    return scored.filter(scored['finance'] == 1.0)

# Convert a Common Crawl header timestamp into a y-m-d date string.
def convert_header_date(date):
    """Reformat a '%Y-%m-%dT%H:%M:%SZ' timestamp string as '%Y-%m-%d'.

    Raises ValueError (from time.strptime) when the input does not match the
    expected timestamp layout.
    """
    parsed = time.strptime(date, '%Y-%m-%dT%H:%M:%SZ')
    return time.strftime('%Y-%m-%d', parsed)


# Obtain the 'Adj Close' series for `ticker` from Yahoo Finance, starting at
# 2010-01-01 and using today's date (formatted y-m-d) as the `end` argument.
# NOTE(review): relies on the downloaded frame exposing an 'Adj Close' column;
# confirm this holds for the installed yfinance version.
currentdate = time.strftime("%Y-%m-%d")
stockdata = yf.download(ticker, start='2010-01-01', end=currentdate)['Adj Close']


In [None]:

#READING IN THE WARCS
from io import BytesIO

def extract_plaintext_from_html(html_content):
    """Replace every HTML tag in html_content with a single space and return the text."""
    # A tag is '<', then one or more non-'<' characters (lazy), then '>'.
    tag_pattern = '<[^<]+?>'
    return re.sub(tag_pattern, ' ', html_content)

def parse_warc_record(record, stockdata):
    """Parse one WARC record and return (date, plaintext), or None if it is rejected.

    A record is kept only when all of the following hold:
      * it is a 'response' record with HTTP headers,
      * its Content-Type contains 'text/html',
      * the tag-stripped text is longer than 150 characters,
      * langdetect classifies the text as English ('en'),
      * the record's WARC-Date (as y-m-d) is present in stockdata.index.

    Parameters:
        record: a record yielded by warcio's ArchiveIterator.
        stockdata: object whose `.index` supports `in` for y-m-d date strings.

    Returns:
        A (date, plaintext) tuple, or None when the record is filtered out.
    """
    if record.rec_type != 'response':
        return None

    # Records without parsed HTTP headers cannot be HTML pages.
    http_headers = record.http_headers
    if http_headers is None:
        return None

    content_type = http_headers.get_header('Content-Type')
    if not content_type or 'text/html' not in content_type:
        return None

    content = record.content_stream().read().decode('utf-8', 'replace')
    plaintext = extract_plaintext_from_html(content)

    # Cheap length filter first: avoids language detection on short pages.
    if len(plaintext) <= 150:
        return None

    # langdetect raises LangDetectException when the text has no usable
    # features (e.g. only whitespace, digits or punctuation); treat such
    # pages as "not English" instead of aborting the whole WARC.
    from langdetect import LangDetectException
    try:
        language = detect(plaintext)
    except LangDetectException:
        return None
    if language != 'en':
        return None

    date = convert_header_date(record.rec_headers.get_header('WARC-Date'))
    if date in stockdata.index:
        return (date, plaintext)

    print('date not in stockdata', date)
    return None

def parse_warc(warc_data, stockdata):
    """Iterate over the records in raw WARC bytes and collect the parsed ones.

    Each record is passed to parse_warc_record together with stockdata; only
    truthy results are kept, in the order the records appear.
    """
    with BytesIO(warc_data) as stream:
        # Lazy generator: each record is parsed before the iterator advances.
        parsed = (parse_warc_record(rec, stockdata) for rec in ArchiveIterator(stream))
        return [item for item in parsed if item]

def fetch_and_parse_warc(url, stockdata, timeout=60):
    """Download one WARC file and return the records parse_warc extracts from it.

    Parameters:
        url: HTTP(S) location of the WARC file.
        stockdata: passed through unchanged to parse_warc.
        timeout: value handed to requests.get as its `timeout` argument, so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        The list produced by parse_warc, or an empty list when the download
        fails or the server answers with a status other than 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Best effort: report the failure and carry on with the other WARCs.
        print('failed to fetch', url, exc)
        return []

    if response.status_code != 200:
        print('unexpected status', response.status_code, 'for', url)
        return []

    return parse_warc(response.content, stockdata)


# Build one (text, price, date) row per extracted article. The tuple order must
# match the field order of the schema below; rows were previously built as
# (date, text, price), which put every value in the wrong column.
date_plaintexts = []
for warc_url in warcs:
    for article_date, article_text in fetch_and_parse_warc(warc_url, stockdata):
        # Look up the 'Adj Close' value for the article's date.
        article_price = float(stockdata[article_date])
        date_plaintexts.append((article_text, article_price, article_date))

#creating schema to store text and prices
# 'price' is declared FloatType because the rows carry Python floats; a
# StringType field rejects float values when the DataFrame is created.
data = StructType([
    StructField("text", StringType(), True),
    StructField("price", FloatType(), True),
    StructField("date", StringType(), True),
])


#creating dataframe from schema
df = spark.createDataFrame(date_plaintexts, schema=data)

TODO
- Add sparkwarc and the other new dependencies to requirements.txt
- Implement the new DataFrame import
- Rework the imports
- Rewrite the final analysis

In [None]:
# Keep only the finance articles: drop_nonfinance_articles adds a 'finance'
# column and filters to rows where it equals 1.0. Note that this rebinds `df`
# to the filtered DataFrame, so the unfiltered one is no longer reachable.
df = drop_nonfinance_articles(df)

In [None]:


#CREATING THE PIPELINE FOR LATER
# Stages are chained by column name: text -> document -> sentence -> token -> lemma -> sentiment_score.

# Wraps the raw 'text' column into a 'document' annotation column.
document_assembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

# Splits each document into sentences.
sentence_detector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Splits each sentence into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(['sentence'])
    .setOutputCol('token')
)

# Maps tokens to lemmas using the dictionary file under ./sentdat.
lemmatizer = (
    Lemmatizer()
    .setInputCols(['token'])
    .setOutputCol('lemma')
    .setDictionary("./sentdat/lemmas_small.txt", key_delimiter="->", value_delimiter="\t")
)

# Dictionary-based sentiment stage reading lemmas and sentences.
# NOTE(review): this variable reuses the name of the SentimentDetector class.
SentimentDetector = (
    sentiment.SentimentDetector()
    .setInputCols(['lemma', 'sentence'])
    .setOutputCol('sentiment_score')
    .setDictionary('./sentdat/sentiment-big.csv', ',')
)

pipeline = Pipeline(stages=[
    document_assembler,
    sentence_detector,
    tokenizer,
    lemmatizer,
    SentimentDetector,
])

In [None]:
#INFERENCE

In [None]:
#PLOTTING CODE

from pyspark.sql.functions import col, avg
import matplotlib.pyplot as plt

# Average the sentiment score per date on the Spark side.
# NOTE(review): no visible cell fits or applies `pipeline` to `df`, so the
# 'sentiment_score' column may not exist at this point -- confirm the
# inference step before relying on this aggregation.
agg_df = df.groupBy("date").agg(avg("sentiment_score").alias("average_sentiment"))

# Bring the aggregate to pandas, parse 'date' as datetime for plotting, and
# order the rows chronologically.
pd_agg_df = (
    agg_df.toPandas()
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(by="date")
)

# Line plot of the daily average sentiment.
plt.plot(pd_agg_df["date"], pd_agg_df["average_sentiment"])
plt.title("Average Sentiment Score by Date")
plt.xlabel("Date")
plt.ylabel("Average Sentiment Score")
plt.show()

# Stop the Spark session
spark.stop()
