# In [None]:
import logging
import time

import findspark

# findspark.init() puts the local Spark installation's Python libraries on
# sys.path, so it has to run before the first pyspark import. With the call
# placed after the import, the import fails on machines where pyspark is only
# reachable through SPARK_HOME.
findspark.init()

from pyspark.sql import SparkSession  # noqa: E402
from pyspark.sql.functions import expr, concat  # noqa: E402

# Setup basic configuration for logging: INFO and above, timestamped lines.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def log_time_taken(start, operation):
    """Log at INFO level how long ``operation`` took.

    Args:
        start: Value of ``time.time()`` captured when the operation began.
        operation: Label placed at the front of the log message.
    """
    elapsed = time.time() - start
    logger.info(f"{operation} completed in {elapsed:.2f} seconds")

# Create (or reuse) the Spark session and log how long start-up took.
logger.info("Initializing Spark session with optimized memory settings")
start_time = time.time()
spark = (
    SparkSession.builder
    .appName("Optimized Postgres Integration")
    .master("local[*]")
    # Memory sizing.
    # NOTE(review): with a local[*] master the executor settings presumably
    # have no effect because everything runs in the driver JVM -- confirm.
    .config("spark.executor.memory", "64g")
    .config("spark.driver.memory", "32g")
    .config("spark.executor.memoryOverhead", "4096")
    .config("spark.driver.memoryOverhead", "2048")
    # PostgreSQL JDBC driver jar made visible to the driver.
    .config("spark.driver.extraClassPath", "/Volumes/LaCie/wsb_archive/postgresql-42.7.3.jar")
    # Use the G1 garbage collector on both driver and executors.
    .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC")
    .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
    .getOrCreate()
)
log_time_taken(start_time, "SparkSession initialization")

# Function to read data from PostgreSQL using JDBC with partitioning
def read_jdbc_table(url, table, user, password, partitionColumn, lowerBound,
                    upperBound, numPartitions, driver=None):
    """Load a database table over JDBC as a range-partitioned DataFrame.

    The read goes through the module-level ``spark`` session.

    Args:
        url: JDBC connection URL.
        table: Value passed as the ``dbtable`` option.
        user: Database user name.
        password: Database password.
        partitionColumn: Column whose value range is split across the
            partitions (per the Spark docs: numeric, date or timestamp).
        lowerBound: Lower end of the range used to compute partition strides.
        upperBound: Upper end of the range used to compute partition strides.
            Per the Spark docs the bounds only decide the stride; they do not
            filter rows.
        numPartitions: Number of partitions to read with.
        driver: Optional JDBC driver class name. When ``None`` (the default)
            the ``driver`` option is not set at all, which matches the
            behaviour of the earlier signature.

    Returns:
        The DataFrame produced by ``DataFrameReader.load()``.
    """
    reader = (
        spark.read
        .format("jdbc")
        .option("url", url)
        .option("dbtable", table)
        .option("user", user)
        .option("password", password)
        .option("partitionColumn", partitionColumn)
        .option("lowerBound", lowerBound)
        .option("upperBound", upperBound)
        .option("numPartitions", numPartitions)
    )
    if driver is not None:
        reader = reader.option("driver", driver)
    return reader.load()


# Load comments with partitioning for better performance
logger.info("Starting to load comments")
start_time = time.time()
comments_df = read_jdbc_table(
    url="jdbc:postgresql://localhost:5432/subreddit_wsb_data",
    table="wsb_comments",
    user="postgres",
    password="trust",
    partitionColumn="datetime_utc",  # partition on the timestamp column
    lowerBound="2009-01-01 00:00:00",
    upperBound="2024-02-01 00:00:00",  # Adjust upperBound to be slightly above the highest expected value
    numPartitions=200,  # Adjust based on your machine's capacity and the data distribution
    driver="org.postgresql.Driver",  # same driver class as the submissions read
)
# The JDBC read is lazy, so the rows are actually pulled from PostgreSQL
# while this write runs; the timing below therefore covers read + write.
comments_df.write.parquet("wsb_comments")
log_time_taken(start_time, "Comments export to parquet")


# Load submissions with partitioning (same helper and settings as comments)
logger.info("Starting to load submissions")
start_time = time.time()
submissions_df = read_jdbc_table(
    url="jdbc:postgresql://localhost:5432/subreddit_wsb_data",
    table="wsb_submissions",
    user="postgres",
    password="trust",
    partitionColumn="datetime_utc",
    lowerBound="2009-01-01 00:00:00",
    upperBound="2024-02-01 00:00:00",
    numPartitions=200,
    driver="org.postgresql.Driver",
)
logger.info("Submissions loaded with partitioning")
submissions_df.write.parquet("wsb_submissions")
log_time_taken(start_time, "Submissions export to parquet")