## Initialize Spark Session:

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, concat
import findspark
import logging
import time
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
findspark.init()

# Root logging setup: INFO and above, rendered as "timestamp - level - message".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Wall-clock reference point captured when this cell starts running.
start_time = time.time()


def log_time_taken(start, operation):
    """Log how long ``operation`` took.

    Args:
        start: A ``time.time()`` value captured when the operation began.
        operation: Human-readable label placed at the front of the log line.

    Returns:
        None. The elapsed time (now minus ``start``) is emitted at INFO level,
        formatted with two decimals.
    """
    elapsed = time.time() - start
    logger.info(f"{operation} completed in {elapsed:.2f} seconds")

# Announce the start, build (or reuse) the session, then report the elapsed time.
logger.info("Initializing Spark session with optimized memory settings")
spark = (
    SparkSession.builder
    .appName("Reddit Comment Context Builder")
    .master("local[*]")
    # Memory sizing for executor and driver.
    .config("spark.executor.memory", "16g")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memoryOverhead", "4096")
    .config("spark.driver.memoryOverhead", "2048")
    .config("spark.driver.maxResultSize", "8g")
    # Extra jar on the driver classpath (the file name suggests the PostgreSQL JDBC driver).
    # NOTE(review): absolute, machine-specific path - confirm it exists wherever this runs.
    .config("spark.driver.extraClassPath", "/Volumes/LaCie/wsb_archive/postgresql-42.7.3.jar")
    # G1 garbage collector on both JVM roles.
    .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC")
    .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
    .config("spark.kryoserializer.buffer.max", "200M")
    # Spark NLP is resolved as a package dependency when the session starts.
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.3.2")
    .config("spark.sql.shuffle.partitions", "200")
    .getOrCreate()
)
log_time_taken(start_time, "SparkSession initialization")

2024-04-05 18:52:22,162 - INFO - Initializing Spark session with optimized memory settings


:: loading settings :: url = jar:file:/Users/binmingli/spark-3.3.3-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/binmingli/.ivy2/cache
The jars for the packages stored in: /Users/binmingli/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-9b4f627a-653b-462d-9417-6aae07a5439a;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;5.3.2 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-s3;1.12.500 in central
	found com.amazonaws#aws-java-sdk-kms;1.12.500 in central
	found com.amazonaws#aws-java-sdk-core;1.12.500 in central
	found commons-logging#commons-logging;1.1.3 in central
	found commons-codec#commons-codec;1.15 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in central
	found org.apache.httpcomponents#httpcore;4.4.13 in central
	found software.amazon.ion#ion-java;1.0.2 in central
	found joda-time#joda-time;2.8.1 in central
	found com.amazonaws#jmespath-java;1.12.500 i

24/04/05 18:54:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2024-04-05 18:54:53,024 - INFO - SparkSession initialization completed in 150.86 seconds


## Utils:

In [2]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import col, lower
import yfinance as yf
import re

# Define the class for simplifying company names
class CompanyNameSimplifier:
    """Reduce a full company name to a short, searchable name.

    The simplification removes web-domain fragments (".com", ".ai", ...),
    strips one trailing legal-form suffix ("Inc.", "Corporation", ...) and keeps
    only the text before the first comma or " -".
    """

    def __init__(self):
        # Legal-form suffixes, tried in this order. Dotted variants are listed
        # before their undotted twins so "Inc." is tried before "Inc".
        self.suffixes = [
            'Inc.', 'Inc', 'Corporation', 'Corp.', 'Corp', 'Company', 'Co.', 'Co',
            'Limited', 'Ltd.', 'Ltd', 'PLC', 'NV', 'SA', 'AG', 'LLC', 'L.P.', 'LP'
        ]
        self.web_domains_regex = r'\.com|\.org|\.net|\.io|\.co|\.ai'

    def simplify_company_name(self, name):
        """Return the simplified form of ``name``.

        Args:
            name: Full company name. ``None`` or an empty string is accepted.

        Returns:
            The simplified name, or ``''`` when ``name`` is empty or ``None``.
        """
        if not name:
            return ''

        # Drop web-domain fragments anywhere in the name (case-insensitive) and
        # trailing whitespace so the suffix check sees the real last token.
        name = re.sub(self.web_domains_regex, '', name, flags=re.IGNORECASE).rstrip()

        for suffix in self.suffixes:
            if not name.endswith(suffix):
                continue
            head = name[:-len(suffix)]
            # Only strip when the suffix is a separate trailing token (preceded by
            # whitespace or a comma). This keeps "PepsiCo" intact, and slicing
            # instead of str.replace() leaves earlier occurrences of the same
            # text alone ("Conoco Co" -> "Conoco", not "noco").
            if head and (head[-1].isspace() or head[-1] == ','):
                name = head
                break

        # Keep only the part before the first comma or " -".
        name = re.split(',| -', name)[0]
        return name.strip()

    def get_simplified_company_name(self, ticker):
        """Look up ``ticker`` via yfinance and simplify its ``longName``.

        Args:
            ticker: Ticker symbol passed to ``yf.Ticker``.

        Returns:
            The simplified long name, or ``''`` when the info record has no
            usable ``longName``.
        """
        company_info = yf.Ticker(ticker).info or {}
        # `.get('longName', '')` would still return None when the key is present
        # with a None value, so normalise with `or`.
        full_name = company_info.get('longName') or ''
        return self.simplify_company_name(full_name)

2024-04-05 18:55:17,852 - INFO - Note: NumExpr detected 10 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2024-04-05 18:55:17,852 - INFO - NumExpr defaulting to 8 threads.


## Filter the data from the WSB_context_file by ticker

In [3]:
from pyspark.sql.functions import col, lower

def filter_comments_by_ticker(df, ticker):
    """Select the comments whose context mentions a ticker or its company name.

    Args:
        df: DataFrame with at least the columns ``comment_context``,
            ``datetime_utc``, ``comment_score`` and ``comment_body``.
        ticker: Ticker symbol; also used to look up the company name.

    Returns:
        DataFrame with columns ``datetime_utc``, ``comment_score`` and
        ``comment_body`` for rows whose lower-cased ``comment_context`` contains
        the lower-cased ticker or the lower-cased simplified company name.
    """
    simplifier = CompanyNameSimplifier()
    # Obtain the simplified company name for the given ticker
    company_name = simplifier.get_simplified_company_name(ticker)

    # Lower-case both needles so the substring search is case-insensitive
    ticker_lower = ticker.lower()
    company_name_lower = (company_name or '').lower()

    context_lower = lower(col("comment_context"))
    condition = context_lower.contains(ticker_lower)
    # An empty company name (lookup returned nothing) must not take part in the
    # match: every non-null string contains '', so it would select every row.
    if company_name_lower:
        condition = condition | context_lower.contains(company_name_lower)

    filtered_df = df.filter(condition).select("datetime_utc", "comment_score", "comment_body")

    return filtered_df

ticker = 'NVDA'
wsb_comments_with_context = spark.read.parquet("./wsb_comments_with_context")
stock_comments = filter_comments_by_ticker(wsb_comments_with_context, ticker)
# The path must be an f-string: without the prefix the data lands in a directory
# literally named "{ticker}_comments", while the sentiment cells read
# f"./stock_comments/{ticker}_comments". Overwrite mode keeps the cell re-runnable.
stock_comments.write.mode("overwrite").parquet(f'./stock_comments/{ticker}_comments')

                                                                                

## Spark NLP Sentiment Analysis on WSB comments

In [None]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import col, explode, when, size, avg
from sparknlp.pretrained import PretrainedPipeline

class StockSentimentAnalyzer:
    """Score the saved comments of one ticker with the 'analyze_sentiment' pipeline.

    Typical use is ``StockSentimentAnalyzer(ticker).run()``, which loads the
    comments, scores them, averages the per-comment score and writes the result
    to ``./stock_sentiments/{ticker}_sentiment``.
    """

    def __init__(self, ticker):
        self.ticker = ticker
        self.pipeline = PretrainedPipeline('analyze_sentiment', lang='en')
        self.spark = SparkSession.builder.appName("StockSentimentAnalysis").getOrCreate()
        # Filled by load_comments(). Starting at None lets analyze_sentiment()
        # load on demand instead of failing with AttributeError.
        self.df = None

    def load_comments(self):
        """Read this ticker's comments parquet into ``self.df`` and return it."""
        path = f"./stock_comments/{self.ticker}_comments"
        self.df = self.spark.read.parquet(path)
        return self.df

    def analyze_sentiment(self):
        """Run the sentiment pipeline over the loaded comments.

        Returns:
            DataFrame with ``datetime_utc``, ``comment_score``, ``comment_body``
            and ``comment_sentiment`` (the pipeline's ``sentiment.result``).
        """
        if self.df is None:
            self.load_comments()

        # The pipeline is fed the comment text under the column name "text"
        df_renamed = self.df.withColumnRenamed("comment_body", "text")
        result = self.pipeline.transform(df_renamed)

        # Select and rename the columns of interest
        sentiment_df = result.select(
            col("datetime_utc"),
            col("comment_score"),
            col("text").alias("comment_body"),
            col("sentiment.result").alias("comment_sentiment")
        )
        return sentiment_df

    def process_sentiments(self, df):
        """Collapse each comment's sentiment labels into one average score.

        Args:
            df: Output of :meth:`analyze_sentiment`.

        Returns:
            DataFrame with ``datetime_utc``, ``comment_score``, ``comment_body``
            and the averaged ``sentiment_score``, ordered by ``datetime_utc``.
        """
        # Keep only rows whose sentiment array has at least one entry
        filtered_df = df.filter(size(col("comment_sentiment")) > 0)

        # Explode the sentiment array: one row per individual label
        exploded_df = filtered_df.withColumn("individual_sentiment", explode(col("comment_sentiment")))

        # Map labels to numbers: 'positive' -> 1, 'negative' -> -1, anything else -> 0
        scored_df = exploded_df.withColumn("sentiment_score",
                                           when(col("individual_sentiment") == "positive", 1)
                                           .when(col("individual_sentiment") == "negative", -1)
                                           .otherwise(0))

        # Average the label scores per (datetime_utc, comment_score, comment_body) group
        stock_sentiment = scored_df.groupBy("datetime_utc", "comment_score", "comment_body").agg(avg("sentiment_score").alias("sentiment_score"))

        return stock_sentiment.orderBy("datetime_utc")

    def save_sentiments(self, df):
        """Write ``df`` as parquet to this ticker's sentiment path, replacing existing data."""
        path = f"./stock_sentiments/{self.ticker}_sentiment"
        df.write.mode("overwrite").parquet(path)

    def run(self):
        """Load, score, aggregate and save; return the aggregated DataFrame."""
        self.load_comments()
        sentiment_df = self.analyze_sentiment()
        processed_df = self.process_sentiments(sentiment_df)
        self.save_sentiments(processed_df)
        return processed_df

# Example usage: run the whole load -> score -> aggregate -> save flow for one ticker.
# `stock_sentiment` holds the aggregated DataFrame returned by run().
ticker = "NVDA"
analyzer = StockSentimentAnalyzer(ticker)
stock_sentiment = analyzer.run()

### Original Code:

In [5]:
from pyspark.sql.functions import col, explode, when, size, avg

# Load the pre-trained 'analyze_sentiment' pipeline and this ticker's saved comments
ticker = 'NVDA'
pipeline = PretrainedPipeline('analyze_sentiment', lang='en')
df = spark.read.parquet(f"./stock_comments/{ticker}_comments")

# Feed the comment text to the pipeline under the column name "text"
# (presumably the input column the pretrained pipeline expects - confirm)
df_renamed = df.withColumnRenamed("comment_body", "text")
result = pipeline.transform(df_renamed)

# Show the text next to the sentiment labels, then keep the columns of interest
result.select("text", "sentiment.result").show(truncate=False)
stock_sentiment = result.select(
    col("datetime_utc"),
    col("comment_score"),
    col("text").alias("comment_body"),
    col("sentiment.result").alias("comment_sentiment")
)
# Keep only rows whose 'comment_sentiment' array has at least one entry
filtered_df = stock_sentiment.filter(size(col("comment_sentiment")) > 0)

# Explode the sentiment array to work with individual sentiments (one row per label)
exploded_df = filtered_df.withColumn("individual_sentiment", explode(col("comment_sentiment")))

# Assign scores to sentiments: +1 for 'positive', -1 for 'negative', and 0 for any other label
scored_df = exploded_df.withColumn("sentiment_score",
                                   when(col("individual_sentiment") == "positive", 1)
                                   .when(col("individual_sentiment") == "negative", -1)
                                   .otherwise(0))

# Average the label scores per comment. The grouping key is the triple
# (datetime_utc, comment_score, comment_body); rows that share all three values
# are merged into one group.
stock_sentiment = scored_df.groupBy("datetime_utc", "comment_score", "comment_body").agg(avg("sentiment_score").alias("sentiment_score"))

# Sort chronologically and preview; the parquet write below is left disabled in this cell
stock_sentiment = stock_sentiment.orderBy("datetime_utc")
stock_sentiment.show()
# stock_sentiment.write.parquet(f"./stock_sentiments/{ticker}_sentiment")

analyze_sentiment download started this may take some time.
Approx size to download 4.8 MB
[OK!]
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------+
|text                                                                                                                                                                                                                                                                                                                                              



+-------------------+-------------+--------------------+-------------------+
|       datetime_utc|comment_score|        comment_body|    sentiment_score|
+-------------------+-------------+--------------------+-------------------+
|2012-05-11 14:12:50|            5|You gotta get the...|               -1.0|
|2012-05-11 15:30:06|            2|           [deleted]|                0.0|
|2012-05-11 15:39:28|            3|           [deleted]|                0.0|
|2012-05-11 16:07:45|            5|http://qkme.me/3p...|                0.0|
|2012-05-11 16:08:29|           10|But r/investing i...|                0.0|
|2012-05-11 16:14:17|            6|           [deleted]|                0.0|
|2012-05-11 16:18:36|            2|                 lol|                1.0|
|2012-05-11 16:18:58|            7|Hey guys! I have ...|-0.3333333333333333|
|2012-05-11 16:22:44|            4|Care to teach me ...|                1.0|
|2012-05-11 16:23:47|            5|           [deleted]|                0.0|

                                                                                

### Refactored Version:

In [9]:
from pyspark.sql.functions import col, explode, when, size, avg
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline

class SentimentAnalyzer:
    """Score one ticker's saved comments with the 'analyze_sentiment' pipeline.

    Args:
        ticker: Ticker symbol used to build the input and output parquet paths.
        spark: SparkSession used to read the comments.
    """

    def __init__(self, ticker, spark):
        self.ticker = ticker
        self.pipeline = PretrainedPipeline('analyze_sentiment', lang='en')
        self.spark = spark

    def analyze(self):
        """Read the ticker's comments and return one averaged score per comment.

        Returns:
            DataFrame with ``datetime_utc``, ``comment_score``, ``comment_body``
            and ``sentiment_score``, ordered by ``datetime_utc``.
        """
        df = self.spark.read.parquet(f"./stock_comments/{self.ticker}_comments")
        # The pipeline is fed the comment text under the column name "text"
        df_renamed = df.withColumnRenamed("comment_body", "text")
        result = self.pipeline.transform(df_renamed)

        stock_sentiment = result.select(
            col("datetime_utc"),
            col("comment_score"),
            col("text").alias("comment_body"),
            col("sentiment.result").alias("comment_sentiment")
        )

        # Keep rows with at least one sentiment label, then one row per label
        filtered_df = stock_sentiment.filter(size(col("comment_sentiment")) > 0)
        exploded_df = filtered_df.withColumn("individual_sentiment", explode(col("comment_sentiment")))

        # 'positive' -> 1, 'negative' -> -1, anything else -> 0
        scored_df = exploded_df.withColumn("sentiment_score",
                                           when(col("individual_sentiment") == "positive", 1)
                                           .when(col("individual_sentiment") == "negative", -1)
                                           .otherwise(0))

        # Average the label scores per (datetime_utc, comment_score, comment_body) group
        stock_sentiment = scored_df.groupBy("datetime_utc", "comment_score", "comment_body").agg(avg("sentiment_score").alias("sentiment_score"))
        stock_sentiment = stock_sentiment.orderBy("datetime_utc")

        return stock_sentiment

    def write_to_parquet(self, df, mode="overwrite"):
        """Write ``df`` to ``./stock_sentiments/{ticker}_sentiment`` as parquet.

        Args:
            df: DataFrame to persist.
            mode: Spark save mode. Defaults to ``"overwrite"`` so the cell can be
                re-run, matching the other writers in this notebook; pass
                ``"errorifexists"`` to refuse to replace existing output.
        """
        df.write.mode(mode).parquet(f"./stock_sentiments/{self.ticker}_sentiment")

# Example usage: score NVDA comments with the shared `spark` session, write the
# result to parquet, then preview it. The write and the show() are separate
# Spark actions on the same DataFrame.
analyzer = SentimentAnalyzer('NVDA', spark)
sentiment_df = analyzer.analyze()
analyzer.write_to_parquet(sentiment_df)
sentiment_df.show()

analyze_sentiment download started this may take some time.
Approx size to download 4.8 MB
[OK!]




+-------------------+-------------+--------------------+-------------------+
|       datetime_utc|comment_score|        comment_body|    sentiment_score|
+-------------------+-------------+--------------------+-------------------+
|2012-05-11 14:12:50|            5|You gotta get the...|               -1.0|
|2012-05-11 15:30:06|            2|           [deleted]|                0.0|
|2012-05-11 15:39:28|            3|           [deleted]|                0.0|
|2012-05-11 16:07:45|            5|http://qkme.me/3p...|                0.0|
|2012-05-11 16:08:29|           10|But r/investing i...|                0.0|
|2012-05-11 16:14:17|            6|           [deleted]|                0.0|
|2012-05-11 16:18:36|            2|                 lol|                1.0|
|2012-05-11 16:18:58|            7|Hey guys! I have ...|-0.3333333333333333|
|2012-05-11 16:22:44|            4|Care to teach me ...|                1.0|
|2012-05-11 16:23:47|            5|           [deleted]|                0.0|

                                                                                

## Convert the raw sentiment scores to daily percentages and join them with popularity:

### Popularity Calculator

In [11]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col, lower
import yfinance as yf
import re

class PopularityCalculator:
    """Compute the daily share of comments that mention a ticker.

    Args:
        ticker: Ticker symbol to look for.
        df: DataFrame with at least ``datetime_utc`` and ``comment_context``.
        simplifier: Object exposing ``get_simplified_company_name(ticker)``.
    """

    def __init__(self, ticker, df, simplifier):
        self.ticker = ticker
        self.df = df
        self.simplifier = simplifier

    def calculate_popularity(self):
        """Build, save and return the per-day popularity table.

        Returns:
            DataFrame with ``date``, ``ticker_mentions``, ``total_comments`` and
            ``popularity_percentage``, ordered by ``date``. Only dates with at
            least one mention appear, because the two daily counts are joined
            on ``date`` with the default join type. The same data is written to
            ``./stock_popularity/{ticker}_popularity``, replacing existing data.
        """
        # Convert to Eastern Time and simplify the company name
        df = self.df.withColumn("datetime_et", F.expr("from_utc_timestamp(datetime_utc, 'America/New_York')"))
        simplified_name = (self.simplifier.get_simplified_company_name(self.ticker) or '').lower()

        # Filter comments by ticker or company name (case-insensitive substring match)
        context_lower = lower(col("comment_context"))
        mentions_ticker = context_lower.contains(self.ticker.lower())
        # An empty company name must be left out of the match: every non-null
        # string contains '', which would count every comment as a mention.
        if simplified_name:
            mentions_ticker = mentions_ticker | context_lower.contains(simplified_name)
        filtered_comments = df.filter(mentions_ticker)

        # Aggregate daily mentions and total comments
        ticker_mentions = filtered_comments.groupBy(F.to_date("datetime_et").alias("date")).count().withColumnRenamed("count", "ticker_mentions")
        total_comments = df.groupBy(F.to_date("datetime_et").alias("date")).count().withColumnRenamed("count", "total_comments")

        # Calculate popularity percentage and sort by date
        popularity = ticker_mentions.join(total_comments, on="date") \
            .withColumn("popularity_percentage", F.col("ticker_mentions") / F.col("total_comments") * 100) \
            .orderBy("date")

        # Save the result
        save_path = f'./stock_popularity/{self.ticker}_popularity'
        popularity.write.mode('overwrite').parquet(save_path)

        return popularity

# Example usage: compute NVDA popularity over the full comments-with-context dataset.
# calculate_popularity() also writes the table to ./stock_popularity/NVDA_popularity.
ticker = "NVDA"
df = spark.read.parquet("./wsb_comments_with_context")
simplifier = CompanyNameSimplifier()
popularity_calculator = PopularityCalculator(ticker, df, simplifier)
stock_popularity = popularity_calculator.calculate_popularity()

                                                                                

### Sentiment as Percentages

#### Original Version:

In [13]:
# Read the per-comment sentiment scores saved for this ticker.
ticker = 'NVDA'
stock_sentiment = spark.read.parquet(f'./stock_sentiments/{ticker}_sentiment')
# `df` is the working name used by the rest of this cell.
df = stock_sentiment
# Plain-Python sentiment labelling with a +/-0.05 neutral band
def categorize_sentiment(score):
    """Map a numeric sentiment score to a label.

    Args:
        score: Numeric sentiment score.

    Returns:
        'positive' when ``score`` is above 0.05, 'negative' when it is below
        -0.05, and 'neutral' otherwise (both bounds included).

    Note:
        The visible notebook code does not call this helper; the DataFrame is
        labelled with a `when(...)` column expression using the same thresholds.
    """
    return 'positive' if score > 0.05 else 'negative' if score < -0.05 else 'neutral'

# Label every row with a column expression (not the Python function above):
# score > 0.05 -> "positive", score < -0.05 -> "negative", otherwise "neutral"
df_with_sentiment_category = df.withColumn(
    "sentiment_category",
    when(df.sentiment_score > 0.05, "positive")
    .when(df.sentiment_score < -0.05, "negative")
    .otherwise("neutral")
)
# Convert datetime_utc to Eastern Time (ET, covering both EST and EDT as appropriate)
df = df_with_sentiment_category.withColumn("datetime_et", F.expr("from_utc_timestamp(datetime_utc, 'America/New_York')"))

# Extract date from datetime_et, so days are bucketed by Eastern-Time calendar date
df = df.withColumn("date", F.to_date("datetime_et"))

# Group by date: count all rows and the rows in each category, then turn each
# category count into a percentage of that day's total
result = df.groupBy("date").agg(
    F.expr("count(1) as total_comments"),
    F.sum(F.when(F.col("sentiment_category") == "positive", 1).otherwise(0)).alias("positive_count"),
    F.sum(F.when(F.col("sentiment_category") == "neutral", 1).otherwise(0)).alias("neutral_count"),
    F.sum(F.when(F.col("sentiment_category") == "negative", 1).otherwise(0)).alias("negative_count")
).withColumn(
    "positive_percentage", F.col("positive_count") / F.col("total_comments") * 100
).withColumn(
    "neutral_percentage", F.col("neutral_count") / F.col("total_comments") * 100
).withColumn(
    "negative_percentage", F.col("negative_count") / F.col("total_comments") * 100
)
# total_comments was only needed as the denominator; drop it and sort chronologically
result = result.drop("total_comments")
result = result.orderBy("date")
# Show the result; the parquet write below is left disabled in this cell
result.show()
# result.write.mode("overwrite").parquet(f"./stock_sentiments_percentage/{ticker}_sentiment_percentage")

+----------+--------------+-------------+--------------+-------------------+------------------+-------------------+
|      date|positive_count|neutral_count|negative_count|positive_percentage|neutral_percentage|negative_percentage|
+----------+--------------+-------------+--------------+-------------------+------------------+-------------------+
|2012-05-11|             9|           15|            12|               25.0| 41.66666666666667|  33.33333333333333|
|2012-05-12|             2|            1|             3|  33.33333333333333|16.666666666666664|               50.0|
|2012-06-18|             0|            1|             0|                0.0|             100.0|                0.0|
|2012-08-08|             0|            1|             0|                0.0|             100.0|                0.0|
|2012-08-09|             4|            6|             8|  22.22222222222222| 33.33333333333333|  44.44444444444444|
|2012-08-10|             1|            2|             7|               1

### Refactored Version

In [14]:
from pyspark.sql import functions as F
from pyspark.sql.functions import when

class StockSentimentPercentageAnalyzer:
    """Turn per-comment sentiment scores into daily category percentages.

    The constructor reads ``./stock_sentiments/{ticker}_sentiment`` through the
    notebook-level ``spark`` session.
    """

    def __init__(self, ticker):
        self.ticker = ticker
        self.df = spark.read.parquet(f'./stock_sentiments/{ticker}_sentiment')

    def categorize_sentiment(self):
        """Return ``self.df`` with an added ``sentiment_category`` column.

        Scores above 0.05 are "positive", scores below -0.05 are "negative",
        everything else is "neutral".
        """
        score = self.df["sentiment_score"]
        category = (
            when(score > 0.05, "positive")
            .when(score < -0.05, "negative")
            .otherwise("neutral")
        )
        return self.df.withColumn("sentiment_category", category)

    def analyze_sentiment(self):
        """Aggregate the categorized comments per Eastern-Time calendar date.

        Returns:
            DataFrame ordered by ``date`` with the three ``*_count`` columns
            followed by the three ``*_percentage`` columns.
        """
        dated = (
            self.categorize_sentiment()
            .withColumn("datetime_et", F.expr("from_utc_timestamp(datetime_utc, 'America/New_York')"))
            .withColumn("date", F.to_date("datetime_et"))
        )

        # category label -> name of its per-day count column (insertion order is output order)
        count_names = {
            "positive": "positive_count",
            "neutral": "neutral_count",
            "negative": "negative_count",
        }
        # count column -> name of the percentage column derived from it
        percentage_names = {
            "positive_count": "positive_percentage",
            "neutral_count": "neutral_percentage",
            "negative_count": "negative_percentage",
        }

        category_counts = [
            F.sum(F.when(F.col("sentiment_category") == label, 1).otherwise(0)).alias(count_name)
            for label, count_name in count_names.items()
        ]
        daily = dated.groupBy("date").agg(F.expr("count(1) as total_comments"), *category_counts)

        for count_name, percentage_name in percentage_names.items():
            daily = daily.withColumn(percentage_name, F.col(count_name) / F.col("total_comments") * 100)

        # The total was only the denominator; it is not part of the output.
        return daily.drop("total_comments").orderBy("date")

    def save_result(self, result):
        """Write ``result`` as parquet to this ticker's percentage path, replacing existing data."""
        target = f"./stock_sentiments_percentage/{self.ticker}_sentiment_percentage"
        result.write.mode('overwrite').parquet(target)

# Example usage: build the daily sentiment percentages for NVDA, preview them,
# then write them to ./stock_sentiments_percentage/NVDA_sentiment_percentage.
analyzer = StockSentimentPercentageAnalyzer('NVDA')
result = analyzer.analyze_sentiment()
result.show()
analyzer.save_result(result)

+----------+--------------+-------------+--------------+-------------------+------------------+-------------------+
|      date|positive_count|neutral_count|negative_count|positive_percentage|neutral_percentage|negative_percentage|
+----------+--------------+-------------+--------------+-------------------+------------------+-------------------+
|2012-05-11|             9|           15|            12|               25.0| 41.66666666666667|  33.33333333333333|
|2012-05-12|             2|            1|             3|  33.33333333333333|16.666666666666664|               50.0|
|2012-06-18|             0|            1|             0|                0.0|             100.0|                0.0|
|2012-08-08|             0|            1|             0|                0.0|             100.0|                0.0|
|2012-08-09|             4|            6|             8|  22.22222222222222| 33.33333333333333|  44.44444444444444|
|2012-08-10|             1|            2|             7|               1

### Join the data to get the final table:

In [16]:
from pyspark.sql.functions import col

class StockDataMerger:
    """Join a ticker's daily popularity with its daily sentiment percentages.

    Uses the notebook-level ``spark`` session for all reads.
    """

    def __init__(self, ticker):
        self.ticker = ticker
        self.spark = spark

    def merge_data(self):
        """Return one row per date present in both inputs.

        Returns:
            DataFrame with the columns ``date``, ``popularity``, ``positive``,
            ``neutral`` and ``negative``.
        """
        popularity_df = self.spark.read.parquet(f"./stock_popularity/{self.ticker}_popularity")
        percentage_df = self.spark.read.parquet(f"./stock_sentiments_percentage/{self.ticker}_sentiment_percentage")

        # Inner join: dates missing from either side are dropped.
        joined = popularity_df.join(percentage_df, on="date", how="inner")

        # (source column, output column) pairs, in output order after "date".
        renames = (
            ("popularity_percentage", "popularity"),
            ("positive_percentage", "positive"),
            ("neutral_percentage", "neutral"),
            ("negative_percentage", "negative"),
        )
        selected = [col(source).alias(target) for source, target in renames]
        return joined.select(col("date"), *selected)

    def write_data(self, df):
        """Write ``df`` as parquet under ./stock_sentiment_and_popularity, replacing existing data."""
        target = f"./stock_sentiment_and_popularity/{self.ticker}_sentiment_percentage"
        df.write.mode('overwrite').parquet(target)

# Example usage: merge the NVDA popularity and sentiment-percentage tables,
# write the merged table to parquet, then preview it.
merger = StockDataMerger("NVDA")
merged_data = merger.merge_data()
merger.write_data(merged_data)
merged_data.show()

+----------+------------------+-----------------+------------------+-----------------+
|      date|        popularity|         positive|           neutral|         negative|
+----------+------------------+-----------------+------------------+-----------------+
|2012-05-11| 25.53191489361702|             25.0| 41.66666666666667|33.33333333333333|
|2012-05-12|15.789473684210526|33.33333333333333|16.666666666666664|             50.0|
|2012-06-18|               2.5|              0.0|             100.0|              0.0|
|2012-08-08|2.1739130434782608|              0.0|             100.0|              0.0|
|2012-08-09| 33.33333333333333|22.22222222222222| 33.33333333333333|44.44444444444444|
|2012-08-10|19.607843137254903|             10.0|              20.0|             70.0|
|2012-08-13|3.7037037037037033|            100.0|               0.0|              0.0|
|2012-08-17| 2.898550724637681|             50.0|              50.0|              0.0|
|2012-09-18|              25.0|66.666666666