In [6]:
import logging
import os
import re
import shutil
from datetime import datetime
from typing import Optional

from pyspark.sql import SparkSession, functions as F, Window


"""
ETL application for counting impressions and clicks by date/hour for a specific user agent.
Uses PySpark, environment variables (DB credentials, etc.), and writes CSV outputs.
Follows best practices: DRY, PEP-8, logging, error handling, missing data handling.
"""


# Connection settings come from the environment; the second argument of each
# lookup is the fallback used when the variable is unset.
# NOTE(review): DB_PASSWORD falls back to the literal "password" when unset --
# confirm this placeholder is never relied on outside local experiments.
DB_HOST = os.environ.get("DB_HOST", "localhost")
DB_USER = os.environ.get("DB_USER", "user")
DB_PASSWORD = os.environ.get("DB_PASSWORD", "password")

# Root logger: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)

def parse_filename_datetime(filename: str) -> Optional[str]:
    """
    Extract the processing timestamp embedded in a parquet filename.

    From a filename like
    'impressions_processed_dk_20220526113212045_172845633-172845636_1.parquet'
    the 12 digits following '_dk_' (YYYYMMDDHHMM) are parsed and returned as
    '2022-05-26 11:32'. Any further digits (seconds, milliseconds) are ignored.

    Args:
        filename: Base name (or path) of the input file.

    Returns:
        The timestamp formatted as 'YYYY-MM-DD HH:MM', or None when the name
        does not contain '_dk_' followed by at least 12 digits, or when those
        digits do not form a valid calendar date/time (e.g. month 13).
    """
    match = re.search(r'_dk_(\d{8})(\d{4})', filename)
    if not match:
        return None
    date_str, time_str = match.groups()
    try:
        parsed_dt = datetime.strptime(date_str + time_str, "%Y%m%d%H%M")
    except ValueError:
        # The digits matched the pattern but are not a real date/time; treat the
        # name as unparseable so callers can skip the file instead of crashing.
        return None
    return parsed_dt.strftime("%Y-%m-%d %H:%M")

def create_date_hour_df(spark, date_str):
    """
    Build the hour dimension for a single day.

    Args:
        spark: Session object whose ``createDataFrame`` is used to build the result.
        date_str: Day to expand, in 'YYYY-MM-DD' form (validated via strptime,
            so a malformed value raises ValueError).

    Returns:
        A DataFrame with columns ``date`` and ``hour`` holding 24 rows, one per
        hour 0-23, so that a left join against it leaves no hour missing.
    """
    # Round-trip through datetime to validate and normalise the date text once.
    day_label = datetime.strptime(date_str, "%Y-%m-%d").strftime("%Y-%m-%d")

    hourly_rows = []
    for hour in range(24):
        hourly_rows.append((day_label, hour))

    return spark.createDataFrame(hourly_rows, ["date", "hour"])

def _group_inputs_by_date(input_dir):
    """Map each date ('YYYY-MM-DD') to the [(path, hour), ...] parsed from parquet names in input_dir."""
    grouped = {}
    for file_name in sorted(os.listdir(input_dir)):
        if not file_name.endswith(".parquet"):
            continue

        try:
            date_time_str = parse_filename_datetime(file_name)
        except ValueError:
            # Digits in the name that are not a real date/time must not abort the run.
            date_time_str = None
        if not date_time_str:
            logging.warning("Skipping file %s, unable to parse date/time.", file_name)
            continue

        extracted_date, time_part = date_time_str.split(" ")
        hour_val = int(time_part.split(":")[0])
        grouped.setdefault(extracted_date, []).append(
            (os.path.join(input_dir, file_name), hour_val)
        )
    return grouped


def _aggregate_file(spark, file_path, hour_val, target_user_agent):
    """Return a one-row DataFrame (impressions_sum, clicks_sum, hour) for one parquet input."""
    df = spark.read.parquet(file_path)
    # Suppose impressions/clicks are columns: "impressions", "clicks"
    return (
        df.filter(F.col("device_settings.user_agent") == target_user_agent)
        .agg(
            F.sum("impressions").alias("impressions_sum"),
            F.sum("clicks").alias("clicks_sum"),
        )
        .withColumn("hour", F.lit(hour_val))
    )


def _remove_input(path):
    """Delete a processed input, whether it is a single file or a parquet directory."""
    if os.path.isdir(path):
        shutil.rmtree(path)
    else:
        os.remove(path)


def main(input_dir="./input_parquet", output_dir="./output_csv",
         target_user_agent="some user agent"):
    """
    Aggregate impressions and clicks per date/hour for one user agent.

    Every '*.parquet' entry in ``input_dir`` whose name carries a parseable
    timestamp is filtered to ``target_user_agent`` and summed; the hour is taken
    from the filename. All inputs belonging to the same date are combined into a
    single 24-row result (missing hours filled with 0) and written once as CSV
    to ``<output_dir>/impressions_clicks_<date>.csv``. Inputs are removed only
    after their date's output has been written.

    Args:
        input_dir: Directory scanned for parquet inputs (created if missing).
        output_dir: Directory receiving one CSV output per date.
        target_user_agent: Value matched against ``device_settings.user_agent``.

    Note:
        Outputs are written with mode "overwrite", so a later run that receives
        additional files for an already-written date replaces that date's CSV
        with the totals of the new files only.
    """
    spark = SparkSession.builder.appName("ImpressionsClicksETL").getOrCreate()
    try:
        # Force creation of the directory if it doesn't exist
        os.makedirs(input_dir, exist_ok=True)

        inputs_by_date = _group_inputs_by_date(input_dir)
        for extracted_date, entries in sorted(inputs_by_date.items()):
            # Combine the per-file totals so several files of the same date no
            # longer overwrite each other's output.
            per_file = [
                _aggregate_file(spark, path, hour_val, target_user_agent)
                for path, hour_val in entries
            ]
            combined = per_file[0]
            for other in per_file[1:]:
                combined = combined.unionByName(other)

            # Several files may share an hour; collapse them to one row per hour.
            hourly_df = combined.groupBy("hour").agg(
                F.sum("impressions_sum").alias("impressions_sum"),
                F.sum("clicks_sum").alias("clicks_sum"),
            )

            # Left-join onto the full 00-23 dimension and zero-fill the gaps
            # (also covers hours whose filtered sum was null).
            result_df = (
                create_date_hour_df(spark, extracted_date)
                .join(hourly_df, on="hour", how="left")
                .na.fill({"impressions_sum": 0, "clicks_sum": 0})
                .select(
                    "date",
                    "hour",
                    F.col("impressions_sum").alias("impression_count"),
                    F.col("clicks_sum").alias("click_count"),
                )
                .orderBy("hour")
            )

            # Write to CSV
            output_file = os.path.join(output_dir, f"impressions_clicks_{extracted_date}.csv")
            result_df.coalesce(1).write.mode("overwrite").option("header", "true").csv(output_file)

            # Remove inputs only once the date's output exists.
            for file_path, _hour in entries:
                logging.info("Processed file %s -> %s", file_path, output_file)
                _remove_input(file_path)
                logging.info("Removed input file: %s", file_path)
    finally:
        # Release the session even when reading, writing or cleanup fails.
        spark.stop()

# Entry point: run the ETL only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()

In [19]:
import os
import psycopg2
from psycopg2 import OperationalError

# Connectivity check: open a PostgreSQL connection, report the outcome, and
# always close the connection again.
# Settings are read from the environment so no secret lives in the notebook;
# the non-secret values keep their previous literals as fallbacks. DB_PASSWORD
# has no fallback: when it is unset, None is passed and psycopg2 omits the
# parameter, leaving libpq to use its own sources (PGPASSWORD, ~/.pgpass).
conn = None
try:
    conn = psycopg2.connect(
        dbname=os.getenv("DB_NAME", "adform_db"),
        user=os.getenv("DB_USER", "adform_user"),
        password=os.getenv("DB_PASSWORD"),
        host=os.getenv("DB_HOST", "localhost"),
        port=os.getenv("DB_PORT", "5433"),
    )
    print("Connected OK!")
except OperationalError as e:
    print(f"Error connecting to database: {e}")
except Exception as e:
    print(f"An error occurred: {e}")
finally:
    # Close on every path so a failure after connecting cannot leak the session.
    if conn is not None:
        conn.close()


Connected OK!
