# 01 – Data Overview (PySpark)
This notebook loads the CTR dataset using PySpark and validates file paths.

In [None]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session: getOrCreate() hands back an already
# running session when there is one, otherwise it starts a new one with the
# application name and shuffle-partition setting given here.
spark = (
    SparkSession.builder.appName("CTR_Data_Overview")
    .config("spark.sql.shuffle.partitions", "200")
    .getOrCreate()
)

# Only show warnings and errors from Spark in the notebook output.
spark.sparkContext.setLogLevel("WARN")
print(f"Spark version: {spark.version}")


In [None]:
import os

# Directory (relative to the current working directory) holding the raw CSVs.
raw_dir = os.path.join("data", "raw")

# Dataset name -> CSV file name expected inside raw_dir.
files = dict(
    user_profile="user_profile.csv",
    ad_feature="ad_feature.csv",
    raw_sample="raw_sample.csv",
    behavior_log="behavior_log.csv",
)

# For every dataset, print its name, the expected path, and whether that
# path currently exists.
for name in files:
    fname = files[name]
    path = os.path.join(raw_dir, fname)
    print(name, path, os.path.exists(path))


In [None]:
# Load every dataset whose CSV is present under raw_dir and report the ones
# that are missing. Each loaded DataFrame is capped at MAX_ROWS rows.
MAX_ROWS = 1_000_000

dfs = {}  # dataset name -> Spark DataFrame, only for files that were found
for name, fname in files.items():
    # Reuse raw_dir so the existence check and the load share one location
    # instead of rebuilding "data/raw" by hand.
    path = os.path.join(raw_dir, fname)
    # NOTE(review): os.path.exists only inspects the local filesystem --
    # confirm the data is not expected on HDFS/S3.
    if not os.path.exists(path):
        print(f"Missing file: {fname}")
        continue
    # inferSchema=True makes Spark take an extra pass over the file to
    # determine column types.
    dfs[name] = spark.read.csv(path, header=True, inferSchema=True).limit(MAX_ROWS)
    # count() triggers a Spark job; the reported figure is at most MAX_ROWS.
    print(f"{name} loaded:", dfs[name].count(), "rows")


In [None]:
# Show the column names and types of every DataFrame that was loaded.
for name in dfs:
    df = dfs[name]
    print(f"Schema for {name}:")
    df.printSchema()
