Clean up the raw match data: convert camelCase JSON keys and column names to snake_case.

In [4]:
import os, sys, re, json
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType
from pyspark.sql import SparkSession

# Ensure Python child processes use the same interpreter as this notebook.
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

# Stop any session left over from an earlier run of this cell; otherwise
# getOrCreate() below would simply hand back that existing session.
try:
    spark.stop()
except NameError:
    # First run in this kernel: `spark` has not been defined yet.
    pass
except Exception as exc:
    # Best effort: a stale session that cannot be stopped cleanly must not
    # prevent a new one from being created, but do not hide the reason.
    print("Ignoring error while stopping previous Spark session:", exc,
          file=sys.stderr)


spark = (
    SparkSession.builder
        .appName("CleanRawMatchData")
        .config("spark.driver.memory", "4g")
        .config("spark.sql.adaptive.enabled", "false")
        .getOrCreate()
)


# Smoke test: run a trivial one-row job to confirm the session is usable.
print("Spark up:", spark.range(1).count())

# Every input/output folder lives under the same working directory, so the
# shared prefix is spelled out once and the four paths are derived from it.
_CAMEL_TO_SNAKE_DIR = "C:/Users/17862/Desktop/SnexCode/lol-helper/data_processing/camel_to_snake"

# Small dataset: source folder and destination folder.
INPUT_PATH_SMALL = f"{_CAMEL_TO_SNAKE_DIR}/small_csv_batches"
OUTPUT_PATH_SMALL = f"{_CAMEL_TO_SNAKE_DIR}/small_snake_csv"

# Large dataset: source folder and destination folder.
INPUT_PATH_LARGE = f"{_CAMEL_TO_SNAKE_DIR}/large_csv_batches"
OUTPUT_PATH_LARGE = f"{_CAMEL_TO_SNAKE_DIR}/large_snake_csv"

Spark up: 1


In [None]:
# ———————————————
# 1) UDF to snake-case nested JSON keys
# Any character followed by a capitalised word (one upper-case letter plus at
# least one lower-case letter), e.g. the "hD" + "ata" in "matchData".
_BEFORE_CAPITALISED_WORD = re.compile(r'(.)([A-Z][a-z]+)')
# A lower-case letter or digit directly followed by an upper-case letter.
_LOWER_THEN_UPPER = re.compile(r'([a-z0-9])([A-Z])')


def camel_to_snake(name: str) -> str:
    """Convert a camelCase / PascalCase identifier to snake_case.

    Two substitution passes insert underscores at word boundaries and the
    result is lower-cased, e.g. ``"matchData" -> "match_data"`` and
    ``"HTTPResponse" -> "http_response"``.

    Note: an underscore that already precedes a capitalised word is kept and
    a second one is inserted (``"a_Bc" -> "a__bc"``).
    """
    with_word_breaks = _BEFORE_CAPITALISED_WORD.sub(r'\1_\2', name)
    with_all_breaks = _LOWER_THEN_UPPER.sub(r'\1_\2', with_word_breaks)
    return with_all_breaks.lower()

def rename_keys(obj):
    """Return a copy of *obj* with every dict key passed through camel_to_snake.

    Dicts and lists are walked recursively, so keys at any nesting depth are
    converted. Any value that is neither a dict nor a list is returned
    unchanged. If two keys of one dict convert to the same name, the one that
    comes later in iteration order wins.
    """
    if isinstance(obj, list):
        return [rename_keys(item) for item in obj]
    if not isinstance(obj, dict):
        # Scalars (str, numbers, bool, None, ...) carry no keys to rename.
        return obj

    renamed = {}
    for key, value in obj.items():
        snake_key = camel_to_snake(key)
        renamed[snake_key] = rename_keys(value)
    return renamed

def safe_snake_json(s: "str | None") -> "str | None":
    """Re-serialise a JSON string with all of its keys converted to snake_case.

    Args:
        s: Text of one cell; may be None, empty, or whitespace-only.

    Returns:
        The JSON text produced by ``json.dumps`` after ``rename_keys`` has
        been applied to the parsed value, or None when *s* is None/blank or
        is not valid JSON. When used as a Spark UDF, a returned None becomes
        a null in the output column.
    """
    # Null, empty, or whitespace-only cells carry no JSON to convert.
    if not s or not s.strip():
        return None
    try:
        parsed = json.loads(s)
    except json.JSONDecodeError:
        # Malformed JSON is dropped silently (null) instead of raising;
        # either log a warning here or just return None / the original.
        return None
    return json.dumps(rename_keys(parsed))

# Expose the converter as a Spark UDF that produces a string column.
snake_json = udf(f=safe_snake_json, returnType=StringType())

# ———————————————
# 2) Load every CSV found under the input folder into a single DataFrame
#    (no column selection, so all columns are kept)
df_raw = spark.read.csv(
    INPUT_PATH_LARGE,
    header=True,       # first line of each file holds the column names
    multiLine=True,    # a quoted field may span several lines
    quote='"',         # fields are wrapped in double quotes
    escape='"',        # a quote inside a quoted field appears as ""
)

# ———————————————
# 3) Apply the UDF in-place, on the matchData column: each cell is replaced
#    by its snake_cased JSON (null where the cell was blank or invalid JSON)
df_clean = df_raw.withColumn("matchData", snake_json(col("matchData")))

# ———————————————
# 4) (Optional) rename into snake_case for the column names themselves.
#    "puuid" is already snake_case, so it is left as-is.
df_clean = (
    df_clean
      .withColumnRenamed("summonerId",  "summoner_id")
      .withColumnRenamed("matchId",     "match_id")
      .withColumnRenamed("matchData",   "match_data")
)

# ———————————————
# 5) Write the cleaned DataFrame out as CSV with a header row; every column
#    of df_clean is written
(
    df_clean
        .coalesce(1)               # optional: a single partition -> a single output file
        .write
        .csv(
            OUTPUT_PATH_LARGE,
            mode="overwrite",      # replace whatever is already in this folder
            header=True,           # include column names
            quote="\"",            # use " as the quoting char
            escape="\"",           # escape internal " by doubling them
            quoteAll=True,         # quote every field (safer for JSON blobs)
        )
)