In [1]:
# from pyspark.sql import SparkSession
# from pyspark.sql.functions import expr, concat
# import findspark
# import logging
# import time

# findspark.init()

# # Setup basic configuration for logging
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# logger = logging.getLogger(__name__)

# def log_time_taken(start, operation):
#     end = time.time()
#     logger.info(f"{operation} completed in {end - start:.2f} seconds")

# # Start timing and log the initialization of the Spark session
# logger.info("Initializing Spark session with optimized memory settings")
# start_time = time.time()
# spark = SparkSession.builder \
#     .appName("Reddit Comment Context Builder") \
#     .master("local[*]")  \
#     .config("spark.executor.memory", "64g")  \
#     .config("spark.driver.memory", "32g")  \
#     .config("spark.executor.memoryOverhead", "4096") \
#     .config("spark.driver.memoryOverhead", "2048")  \
#     .config("spark.driver.maxResultSize", "8g") \
#     .config("spark.driver.extraClassPath", "/Volumes/LaCie/wsb_archive/postgresql-42.7.3.jar") \
#     .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC") \
#     .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC") \
#     .getOrCreate()
# log_time_taken(start_time, "SparkSession initialization")

## Alternative Spark configuration (smaller executor and driver memory)

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, concat
import findspark
import logging
import time

# NOTE(review): findspark.init() is usually called *before* importing pyspark so
# that the Spark installation can be located; in this cell pyspark has already
# been imported above — confirm whether this call is still required.
findspark.init()

# Root logging configuration: INFO and above, each record prefixed with a
# timestamp and its level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)


def log_time_taken(start, operation):
    """Log the wall-clock time elapsed since ``start`` for ``operation``.

    Args:
        start: Reference timestamp obtained from ``time.time()``.
        operation: Human-readable label placed at the front of the log message.
    """
    elapsed = time.time() - start
    logger.info(f"{operation} completed in {elapsed:.2f} seconds")

# Create (or reuse) the Spark session and log how long the start-up took.
logger.info("Initializing Spark session with optimized memory settings")
start_time = time.time()
spark = (
    SparkSession.builder
    .appName("Reddit Comment Context Builder")
    .master("local[*]")
    # Memory sizing.
    # NOTE(review): with master "local[*]" the work presumably runs inside the
    # driver JVM, so the executor.* settings may have no effect — confirm.
    .config("spark.executor.memory", "16g")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memoryOverhead", "4096")
    .config("spark.driver.memoryOverhead", "2048")
    .config("spark.driver.maxResultSize", "8g")
    # PostgreSQL JDBC driver jar added to the driver class path.
    # NOTE(review): absolute path on an external volume; not portable.
    .config("spark.driver.extraClassPath", "/Volumes/LaCie/wsb_archive/postgresql-42.7.3.jar")
    # G1 garbage collector for both driver and executor JVMs.
    .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC")
    .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
    .getOrCreate()
)
log_time_taken(start_time, "SparkSession initialization")

2024-04-04 12:05:24,371 - INFO - Initializing Spark session with optimized memory settings
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/04/04 12:05:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


2024-04-04 12:05:26,676 - INFO - SparkSession initialization completed in 2.30 seconds


In [3]:
# Load the comments dataset from the Parquet directory next to the notebook.
# Later cells read its `comment_context`, `datetime_utc`, `comment_score` and
# `comment_body` columns.
wsb_comments_with_context = spark.read.parquet("./wsb_comments_with_context")

In [4]:
import yfinance as yf
import re

class CompanyNameSimplifier:
    """Reduce a formal company name to a short name suitable for text search.

    Simplification removes web-domain fragments (".com", ".ai", ...), at most
    one trailing legal-entity suffix ("Inc.", "Corporation", ...) and anything
    that follows a comma or " -".
    """

    def __init__(self):
        # Legal-entity suffixes, matched case-sensitively at the end of the name.
        self.suffixes = [
            'Inc.', 'Inc', 'Corporation', 'Corp.', 'Corp', 'Company', 'Co.', 'Co',
            'Limited', 'Ltd.', 'Ltd', ' PLC', ' NV', ' SA', ' AG', ' LLC', ' L.P.', ' LP'
        ]
        # Web domains anywhere in the name, but only when followed by a
        # non-alphanumeric character (comma, period, space, ...) or the end of
        # the string, so ordinary words such as ".company" are left alone.
        self.web_domains_regex = r'\.(?:com|org|net|io|co|ai)(?![A-Za-z0-9])'

    def simplify_company_name(self, name):
        """Return ``name`` without web domains, legal suffix and trailing qualifiers.

        Args:
            name: Full company name, e.g. ``"Amazon.com, Inc."``. An empty or
                missing (``None``) name yields ``''``.

        Returns:
            The simplified name with surrounding whitespace removed.
        """
        if not name:
            return ''

        # Remove web domain fragments first
        name = re.sub(self.web_domains_regex, '', name, flags=re.IGNORECASE)

        # Remove at most one company suffix, and only the trailing occurrence.
        for suffix in self.suffixes:
            if not suffix or not name.endswith(suffix):
                continue
            head = name[:len(name) - len(suffix)]
            # A suffix that starts with a letter must be a separate word:
            # "Coca-Cola Co" loses "Co", whereas "PepsiCo" is kept intact.
            if suffix[0].isalnum() and head and head[-1].isalnum():
                continue
            name = head
            break

        # Additional cleanup: drop anything after a comma or " -"
        name = re.split(',| -', name)[0]

        return name.strip()

    def get_simplified_company_name(self, ticker):
        """Look up ``ticker`` via yfinance and return its simplified long name.

        Args:
            ticker: Ticker symbol passed to ``yf.Ticker``.

        Returns:
            The simplified ``longName`` from the ticker's info mapping, or ``''``
            when that entry is absent or empty.
        """
        company_info = yf.Ticker(ticker).info

        # `longName` may be absent or None; normalise both to an empty string
        full_name = company_info.get('longName') or ''

        return self.simplify_company_name(full_name)

2024-04-04 12:05:29,402 - INFO - Note: NumExpr detected 10 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2024-04-04 12:05:29,403 - INFO - NumExpr defaulting to 8 threads.


In [5]:
from pyspark.sql.functions import col, lower

In [6]:
def filter_comments_by_ticker(df, ticker):
    """Select comments whose context mentions ``ticker`` or its company name.

    The match is a case-insensitive substring test on the ``comment_context``
    column. The company name is obtained from
    ``CompanyNameSimplifier.get_simplified_company_name``.

    Args:
        df: Spark DataFrame with at least the columns ``comment_context``,
            ``datetime_utc``, ``comment_score`` and ``comment_body``.
        ticker: Ticker symbol to search for, e.g. ``"NVDA"``.

    Returns:
        A DataFrame restricted to matching rows and to the columns
        ``datetime_utc``, ``comment_score`` and ``comment_body``.
    """
    # Obtain the simplified company name for the given ticker
    company_name = CompanyNameSimplifier().get_simplified_company_name(ticker)

    # Lower-case both sides so the search is case-insensitive
    context_lower = lower(col("comment_context"))
    company_name_lower = company_name.lower()

    # NOTE(review): this is a plain substring test, so very short tickers can
    # match inside unrelated words — confirm that is acceptable.
    match_condition = context_lower.contains(ticker.lower())

    # An empty company name (lookup returned nothing) must not join the
    # filter: `contains('')` is true for every non-null context, which would
    # let every comment through.
    if company_name_lower:
        match_condition = match_condition | context_lower.contains(company_name_lower)

    return df.filter(match_condition).select("datetime_utc", "comment_score", "comment_body")

In [7]:
# Ticker symbols for which matching comments are extracted and saved.
ticker_symbols = ['NVDA', 'TSLA']

In [8]:
# Write one Parquet dataset per ticker under ./stock_comments/.
for ticker in ticker_symbols:
    comments = filter_comments_by_ticker(wsb_comments_with_context, ticker)
    # Explicit "overwrite": the default save mode raises when the target
    # directory already exists, which breaks re-running the notebook.
    comments.write.mode("overwrite").parquet(f"./stock_comments/{ticker}_comments")

                                                                                