In [9]:
import findspark
import logging
import time
import re
import yfinance as yf
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, explode, when, size, avg
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline

# Let findspark configure the environment for the local Spark installation.
# NOTE(review): pyspark is already imported in the import block above this call,
# so it was importable without findspark — confirm this call is still needed.
findspark.init()
# Setup basic configuration for logging: INFO and above, each line prefixed
# with a timestamp and the level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Wall-clock reference point; passed to log_time_taken() after the Spark
# session has been created.
start_time = time.time()
def log_time_taken(start, operation):
    """Log, at INFO level, how long `operation` took.

    Args:
        start: A `time.time()` reading taken when the operation began.
        operation: Human-readable label placed at the front of the log line.
    """
    elapsed_seconds = time.time() - start
    logger.info(f"{operation} completed in {elapsed_seconds:.2f} seconds")

# Announce, build (or reuse, via getOrCreate) the notebook-wide Spark session,
# then log the time elapsed since `start_time`.
logger.info("Initializing Spark session with optimized memory settings")
spark = (
    SparkSession.builder
    .appName("Reddit Comment Context Builder")
    .master("local[*]")
    # Memory sizing for executor and driver.
    .config("spark.executor.memory", "16g")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memoryOverhead", "4096")
    .config("spark.driver.memoryOverhead", "2048")
    .config("spark.driver.maxResultSize", "8g")
    # PostgreSQL JDBC driver on the driver classpath.
    # NOTE(review): absolute path on an external volume — not portable.
    .config("spark.driver.extraClassPath", "/Volumes/LaCie/wsb_archive/postgresql-42.7.3.jar")
    # G1 garbage collector on both JVM roles.
    .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC")
    .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
    .config("spark.kryoserializer.buffer.max", "200M")
    # Spark NLP package, resolved when the JVM starts.
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.3.2")
    .config("spark.sql.shuffle.partitions", "200")
    .getOrCreate()
)
log_time_taken(start_time, "SparkSession initialization")

# Define the class for simplifying company names
class CompanyNameSimplifier:
    """Reduce an official company name to a short, searchable form.

    The simplification removes web-domain fragments (".com", ".ai", ...),
    strips one trailing legal suffix ("Inc.", "Corp", "Ltd", ...), and keeps
    only the text before the first comma or " -".
    """

    def __init__(self):
        # Dotted variants are listed before their bare forms so that "Inc."
        # is preferred over "Inc" when both could match.
        self.suffixes = [
            'Inc.', 'Inc', 'Corporation', 'Corp.', 'Corp', 'Company', 'Co.', 'Co',
            'Limited', 'Ltd.', 'Ltd', 'PLC', 'NV', 'SA', 'AG', 'LLC', 'L.P.', 'LP'
        ]
        # The trailing \b stops a short domain such as ".co" from eating the
        # beginning of a longer word (e.g. the ".Co" in "A.Corp").
        self.web_domains_regex = r'\.(?:com|org|net|io|co|ai)\b'

    def simplify_company_name(self, name):
        """Return the simplified form of `name`.

        Args:
            name: Full company name; `None` or an empty string is accepted.

        Returns:
            The simplified name, or '' when `name` is empty or `None`.
        """
        if not name:
            return ''

        name = re.sub(self.web_domains_regex, '', name, flags=re.IGNORECASE).strip()

        for suffix in self.suffixes:
            if not name.endswith(suffix):
                continue
            stem = name[:-len(suffix)]
            # Only treat it as a legal suffix when it is a separate token;
            # otherwise "VISA" would lose its "SA".
            if stem and stem[-1].isalnum():
                continue
            # Slice off the trailing occurrence only. str.replace would also
            # delete the same letters elsewhere ("Incyte Inc" -> "yte").
            name = stem
            break

        return re.split(',| -', name)[0].strip()

    def get_simplified_company_name(self, ticker):
        """Look up `ticker` via yfinance and simplify its 'longName'.

        Args:
            ticker: Stock ticker symbol passed to `yf.Ticker`.

        Returns:
            The simplified company name, or '' when the lookup yields no
            'longName' (missing key or a null value).
        """
        company_info = yf.Ticker(ticker).info
        # `or ''` also covers a key that is present but set to None.
        full_name = company_info.get('longName') or ''
        return self.simplify_company_name(full_name)

class StockCommentsFilter:
    """Select the comments whose context mentions a given stock.

    Reads `./wsb_comments_with_context` on construction and, when asked,
    writes the matching comments to `./stock_comments/<ticker>_comments`.
    """

    def __init__(self, ticker):
        """Store the ticker and load the comments-with-context dataset.

        Args:
            ticker: Stock ticker symbol to search for.
        """
        self.ticker = ticker
        self.wsb_comments_with_context = spark.read.parquet("./wsb_comments_with_context")

    def filter_comments_by_ticker(self):
        """Filter comments mentioning the ticker or its simplified company name.

        Matching is a case-insensitive substring search on `comment_context`,
        so very short tickers or names will also match inside longer words.

        Returns:
            DataFrame with `datetime_utc`, `comment_score` and `comment_body`
            for the matching rows. The same rows are written (overwrite mode)
            to `./stock_comments/<ticker>_comments`.
        """
        simplifier = CompanyNameSimplifier()
        # Obtain the simplified company name for the given ticker
        company_name = simplifier.get_simplified_company_name(self.ticker)

        # Lowercase both sides for a case-insensitive search
        context_lower = lower(col("comment_context"))
        mentions_stock = context_lower.contains(self.ticker.lower())

        # Only search for the company name when one was actually found:
        # contains('') is true for every non-null row and would turn the
        # filter into a no-op.
        company_name_lower = (company_name or '').strip().lower()
        if company_name_lower:
            mentions_stock = mentions_stock | context_lower.contains(company_name_lower)

        filtered_df = self.wsb_comments_with_context.filter(mentions_stock).select(
            "datetime_utc", "comment_score", "comment_body"
        )
        filtered_df.write.mode('overwrite').parquet(f'./stock_comments/{self.ticker}_comments')
        return filtered_df

class SentimentAnalyzer:
    """Score the saved comments of one ticker with the pretrained
    'analyze_sentiment' Spark NLP pipeline."""

    def __init__(self, ticker):
        """Remember the ticker, load the pretrained pipeline, keep the session.

        Args:
            ticker: Stock ticker symbol whose comments will be analysed.
        """
        self.ticker = ticker
        self.pipeline = PretrainedPipeline('analyze_sentiment', lang='en')
        self.spark = spark

    def analyze(self):
        """Compute one averaged sentiment score per comment and persist it.

        Reads `./stock_comments/<ticker>_comments`, runs the pipeline on the
        comment text, maps each returned label to +1 / -1 / 0, and averages
        those values per (datetime_utc, comment_score, comment_body) group.

        Returns:
            DataFrame ordered by `datetime_utc` with a `sentiment_score`
            column; also written (overwrite mode) to
            `./stock_sentiments/<ticker>_sentiment`.
        """
        comments = self.spark.read.parquet(f"./stock_comments/{self.ticker}_comments")

        # The pipeline reads its input from a column named "text".
        annotated = self.pipeline.transform(
            comments.withColumnRenamed("comment_body", "text")
        )

        labelled = annotated.select(
            col("datetime_utc"),
            col("comment_score"),
            col("text").alias("comment_body"),
            col("sentiment.result").alias("comment_sentiment"),
        )

        # Keep comments that received at least one label, one row per label.
        per_label = (
            labelled
            .filter(size(col("comment_sentiment")) > 0)
            .withColumn("individual_sentiment", explode(col("comment_sentiment")))
        )

        label = col("individual_sentiment")
        numeric_score = (
            when(label == "positive", 1)
            .when(label == "negative", -1)
            .otherwise(0)
        )

        stock_sentiment = (
            per_label
            .withColumn("sentiment_score", numeric_score)
            .groupBy("datetime_utc", "comment_score", "comment_body")
            .agg(avg("sentiment_score").alias("sentiment_score"))
            .orderBy("datetime_utc")
        )
        stock_sentiment.write.mode('overwrite').parquet(f"./stock_sentiments/{self.ticker}_sentiment")
        return stock_sentiment

class PopularityCalculator:
    """Compute, per day, the share of comments that mention a stock."""

    def __init__(self, ticker, df, simplifier):
        """Store the inputs for the calculation.

        Args:
            ticker: Stock ticker symbol to search for.
            df: DataFrame with at least `datetime_utc` and `comment_context`.
            simplifier: Object exposing `get_simplified_company_name(ticker)`.
        """
        self.ticker = ticker
        self.df = df
        self.simplifier = simplifier

    def calculate_popularity(self):
        """Calculate the daily mention percentage and save it.

        Dates are derived after converting `datetime_utc` to
        'America/New_York'. Days on which the stock is not mentioned at all
        are absent from the result because the join is an inner join.

        Returns:
            DataFrame with `date`, `ticker_mentions`, `total_comments` and
            `popularity_percentage`, ordered by date; also written (overwrite
            mode) to `./stock_popularity/<ticker>_popularity`.
        """
        # Convert to Eastern Time and simplify the company name
        df = self.df.withColumn("datetime_et", F.expr("from_utc_timestamp(datetime_utc, 'America/New_York')"))
        simplified_name = (self.simplifier.get_simplified_company_name(self.ticker) or '').strip().lower()

        # Filter comments by ticker or company name (case-insensitive substring)
        context_lower = lower(col("comment_context"))
        mentions_stock = context_lower.contains(self.ticker.lower())
        # Skip the name clause when no name is available: contains('') is true
        # for every non-null row and would report ~100% popularity every day.
        if simplified_name:
            mentions_stock = mentions_stock | context_lower.contains(simplified_name)
        filtered_comments = df.filter(mentions_stock)

        # Aggregate daily mentions and total comments
        ticker_mentions = filtered_comments.groupBy(F.to_date("datetime_et").alias("date")).count().withColumnRenamed("count", "ticker_mentions")
        total_comments = df.groupBy(F.to_date("datetime_et").alias("date")).count().withColumnRenamed("count", "total_comments")

        # Calculate popularity percentage and sort by date
        popularity = ticker_mentions.join(total_comments, on="date") \
            .withColumn("popularity_percentage", F.col("ticker_mentions") / F.col("total_comments") * 100) \
            .orderBy("date")

        # Save the result
        save_path = f'./stock_popularity/{self.ticker}_popularity'
        popularity.write.mode('overwrite').parquet(save_path)

        return popularity


class StockSentimentPercentageAnalyzer:
    """Turn per-comment sentiment scores into daily positive / neutral /
    negative percentages for one ticker."""

    def __init__(self, ticker):
        """Load the saved sentiment scores for `ticker`.

        Args:
            ticker: Stock ticker symbol; selects
                `./stock_sentiments/<ticker>_sentiment`.
        """
        self.ticker = ticker
        self.df = spark.read.parquet(f'./stock_sentiments/{ticker}_sentiment')

    def categorize_sentiment(self):
        """Add a `sentiment_category` column derived from `sentiment_score`.

        Scores above 0.05 are "positive", scores below -0.05 are "negative",
        and everything else is "neutral".

        Returns:
            The loaded DataFrame with the extra `sentiment_category` column.
        """
        score = self.df.sentiment_score
        category = (
            when(score > 0.05, "positive")
            .when(score < -0.05, "negative")
            .otherwise("neutral")
        )
        return self.df.withColumn("sentiment_category", category)

    def analyze_sentiment(self):
        """Aggregate category counts and percentages per Eastern-Time date.

        Returns:
            DataFrame ordered by `date` with `total_mentions`, the three
            `*_count` columns and the three `*_percentage` columns; also
            written (overwrite mode) to
            `./stock_sentiments_percentage/<ticker>_sentiment_percentage`.
        """
        categories = ("positive", "neutral", "negative")

        dated = (
            self.categorize_sentiment()
            .withColumn("datetime_et", F.expr("from_utc_timestamp(datetime_utc, 'America/New_York')"))
            .withColumn("date", F.to_date("datetime_et"))
        )

        # One conditional-sum column per category, in the fixed order above.
        category_counts = [
            F.sum(F.when(F.col("sentiment_category") == name, 1).otherwise(0)).alias(f"{name}_count")
            for name in categories
        ]
        daily = dated.groupBy("date").agg(
            F.expr("count(1) as total_mentions"),
            *category_counts,
        )

        # Percentages are appended after all the counts, again in category order.
        for name in categories:
            daily = daily.withColumn(
                f"{name}_percentage",
                F.col(f"{name}_count") / F.col("total_mentions") * 100,
            )

        daily = daily.orderBy("date")
        daily.write.mode('overwrite').parquet(f"./stock_sentiments_percentage/{self.ticker}_sentiment_percentage")
        return daily

class StockDataMerger:
    """Combine the saved popularity and sentiment-percentage tables of a ticker."""

    def __init__(self, ticker):
        """Keep the ticker and a handle to the Spark session.

        Args:
            ticker: Stock ticker symbol selecting the input and output paths.
        """
        self.ticker = ticker
        self.spark = spark

    def merge_data(self):
        """Inner-join both tables on `date` and save the merged result.

        Returns:
            DataFrame with `date`, `mentions`, `popularity`, `positive`,
            `neutral`, `negative` and a constant `ticker` column; also written
            (overwrite mode) to
            `./stock_sentiment_and_popularity/<ticker>_sentiment_and_popularity`.
        """
        popularity = self.spark.read.parquet(f"./stock_popularity/{self.ticker}_popularity")
        sentiment_pct = self.spark.read.parquet(f"./stock_sentiments_percentage/{self.ticker}_sentiment_percentage")

        # Source column -> output column, in output order.
        renames = [
            ("total_mentions", "mentions"),
            ("popularity_percentage", "popularity"),
            ("positive_percentage", "positive"),
            ("neutral_percentage", "neutral"),
            ("negative_percentage", "negative"),
        ]

        merged = (
            popularity
            .join(sentiment_pct, "date", "inner")
            .select(col("date"), *[col(source).alias(target) for source, target in renames])
            # Tag every row with the ticker it belongs to.
            .withColumn('ticker', F.lit(self.ticker))
        )

        merged.write.mode('overwrite').parquet(f"./stock_sentiment_and_popularity/{self.ticker}_sentiment_and_popularity")
        return merged

def main(ticker='AAPL'):
    """Run the full pipeline for one ticker and show the merged result.

    Steps: filter comments, score sentiment, compute daily popularity,
    compute daily sentiment percentages, merge both tables, display them.
    Every step persists its output as parquet under a ticker-specific path.

    Args:
        ticker: Stock ticker symbol to process. Defaults to 'AAPL', the value
            that was previously hard-coded.
    """
    # Step 01: Filter comments by ticker
    stock_filter = StockCommentsFilter(ticker)
    stock_filter.filter_comments_by_ticker()

    # Step 02: Analyze sentiment
    sentiment_analyzer = SentimentAnalyzer(ticker)
    sentiment_analyzer.analyze()

    # Step 03: Calculate popularity
    df = spark.read.parquet("./wsb_comments_with_context")
    simplifier = CompanyNameSimplifier()
    popularity_calculator = PopularityCalculator(ticker, df, simplifier)
    popularity_calculator.calculate_popularity()

    # Step 04: Calculate sentiment percentage
    percentage_analyzer = StockSentimentPercentageAnalyzer(ticker)
    percentage_analyzer.analyze_sentiment()

    # Step 05: Merge popularity and sentiment percentage data
    merger = StockDataMerger(ticker)
    merger.merge_data()

    # Step 06: Show the merged data
    spark.read.parquet(f"./stock_sentiment_and_popularity/{ticker}_sentiment_and_popularity").show()

if __name__ == "__main__":
    main()

2024-04-05 21:12:26,691 - INFO - Initializing Spark session with optimized memory settings
2024-04-05 21:12:26,697 - INFO - SparkSession initialization completed in 0.01 seconds


+----------+------------------+------------------+------------------+------------------+------+
|      date|        popularity|          positive|           neutral|          negative|ticker|
+----------+------------------+------------------+------------------+------------------+------+
|2012-04-11|             100.0| 31.57894736842105|21.052631578947366|47.368421052631575|  AAPL|
|2012-04-12|              75.0|               0.0| 66.66666666666666| 33.33333333333333|  AAPL|
|2012-04-13|             100.0|             100.0|               0.0|               0.0|  AAPL|
|2012-04-16|             100.0|              20.0| 56.00000000000001|              24.0|  AAPL|
|2012-04-17|             100.0|              50.0|16.666666666666664| 33.33333333333333|  AAPL|
|2012-04-20|             100.0|               0.0| 66.66666666666666| 33.33333333333333|  AAPL|
|2012-04-22|             100.0|               0.0|             100.0|               0.0|  AAPL|
|2012-04-23| 23.52941176470588|         

In [2]:
# Ad-hoc inspection: preview the saved per-comment sentiment table for TSLA.
# NOTE(review): relies on `spark` from the setup cell and on
# ./stock_sentiments/TSLA_sentiment already existing on disk; main() in the
# cell above only processes 'AAPL', so nothing visible here writes it — confirm.
ticker = 'TSLA'
spark.read.parquet(f'./stock_sentiments/{ticker}_sentiment').show()

+-------------------+-------------+--------------------+
|       datetime_utc|comment_score|        comment_body|
+-------------------+-------------+--------------------+
|2022-12-21 12:25:36|           -2|Never said he had...|
|2022-12-21 12:25:56|            1|Jim Cramer says s...|
|2022-12-21 12:26:05|           -5|TSLA ![img](emote...|
|2022-12-21 12:26:16|            1|me neither. He is...|
|2022-12-21 12:26:24|            1|The data is comin...|
|2022-12-21 12:27:05|            2|          Not really|
|2022-12-21 12:27:15|            1|What percentage o...|
|2022-12-21 12:27:16|            1|Trump 2.0. Elon g...|
|2022-12-21 12:27:28|            2|             Really?|
|2022-12-21 12:27:34|            5|        Flat as fuck|
|2022-12-21 12:28:07|            2|According to my b...|
|2022-12-21 12:28:31|            2|himm .. that soun...|
|2022-12-21 12:28:39|            1|Fuck off you litt...|
|2022-12-21 12:28:50|            4|More reason for a...|
|2022-12-21 12:29:04|          