In [0]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

# Directory that holds the raw TSV exports (change this if yours live elsewhere)
base_path = "/Volumes/workspace/food_inspection/raw"

# One glob per city, matching every file for that city under base_path
chi_pattern = "/".join((base_path, "Chicago_*.tsv"))
dal_pattern = "/".join((base_path, "Dallas_*.tsv"))


In [0]:
from pyspark.sql import functions as F

# Root folder (note the trailing slash: the patterns below are appended directly)
raw_path = "/Volumes/workspace/food_inspection/raw/"

# Glob patterns, one per city
chi_pattern = f"{raw_path}Chicago_*.tsv"
dal_pattern = f"{raw_path}Dallas_*.tsv"

# Reader settings shared by both cities: tab-separated text with a header row.
# No schema inference is requested, so every source column is read as a string.
tsv_options = {"header": "true", "sep": "\t"}

# Load ALL Chicago files and tag each row with its city code
chi_raw = spark.read.format("csv").options(**tsv_options).load(chi_pattern)
chi_df = chi_raw.withColumn("city_code", F.lit("CHI"))

# Load ALL Dallas files and tag each row with its city code
dal_raw = spark.read.format("csv").options(**tsv_options).load(dal_pattern)
dal_df = dal_raw.withColumn("city_code", F.lit("DAL"))

# Quick size check: row counts first, then column counts
chi_rows, dal_rows = chi_df.count(), dal_df.count()
print("Chicago rows:", chi_rows)
print("Dallas  rows:", dal_rows)

chi_width, dal_width = len(chi_df.columns), len(dal_df.columns)
print("Chicago columns:", chi_width)
print("Dallas  columns:", dal_width)


Chicago rows: 130462
Dallas  rows: 81772
Chicago columns: 18
Dallas  columns: 115


In [0]:
# Print the schema of each city's frame under its own heading
for heading, frame in (
    ("=== Chicago schema ===", chi_df),
    ("\n=== Dallas schema ===", dal_df),
):
    print(heading)
    frame.printSchema()


=== Chicago schema ===
root
 |-- Inspection ID: string (nullable = true)
 |-- DBA Name: string (nullable = true)
 |-- AKA Name: string (nullable = true)
 |-- License #: string (nullable = true)
 |-- Facility Type: string (nullable = true)
 |-- Risk: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Zip: string (nullable = true)
 |-- Inspection Date: string (nullable = true)
 |-- Inspection Type: string (nullable = true)
 |-- Results: string (nullable = true)
 |-- Violations: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- city_code: string (nullable = false)


=== Dallas schema ===
root
 |-- Restaurant Name: string (nullable = true)
 |-- Inspection Type: string (nullable = true)
 |-- Inspection Date: string (nullable = true)
 |-- Inspection Score: string (nullable = true)
 |-- Street Number: st

In [0]:
def profile_dataframe(df, df_name, sample_n=1):
    """
    Build a per-column profile of a Spark DataFrame.

    For every field in ``df.schema`` the profile records the Spark type, the
    total row count, the non-null and null counts, the null percentage, the
    number of distinct non-null values and one example non-null value.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Frame to profile.
    df_name : str
        Label used only in the progress message printed before profiling.
    sample_n : int, default 1
        Kept for backward compatibility. Only a single example is stored per
        column, so any value >= 1 behaves the same; a value < 1 leaves
        ``sample_value`` empty (None).

    Returns
    -------
    pyspark.sql.DataFrame
        One row per input column with the fields ``column``, ``dtype``,
        ``total_rows``, ``non_null``, ``nulls``, ``null_pct``, ``distinct`` and
        ``sample_value``, ordered by ``null_pct`` descending.
    """
    total_rows = df.count()
    print(f"\nProfiling {df_name}: {total_rows} rows\n")

    rows = []
    for field in df.schema.fields:
        col_name = field.name
        dtype = field.dataType.simpleString()
        column = F.col(col_name)

        # One aggregation job per column instead of three separate jobs
        # (filter+count, countDistinct, limit+toPandas). count() and
        # countDistinct() both skip NULLs; first(ignorenulls=True) yields some
        # non-null value, or NULL when the column has none. Which value is
        # picked is not deterministic.
        # The sample is cast to string so it always matches the string-typed
        # sample_value field declared below.
        non_null, distinct, sample_val = df.agg(
            F.count(column),
            F.countDistinct(column),
            F.first(column, ignorenulls=True).cast("string"),
        ).first()

        if sample_n < 1:
            # Caller asked for no samples.
            sample_val = None

        nulls = total_rows - non_null
        # Guard against division by zero on an empty frame.
        null_pct = (nulls / total_rows) * 100 if total_rows > 0 else 0.0

        rows.append((col_name, dtype, total_rows, non_null, nulls, null_pct, distinct, sample_val))

    schema = T.StructType([
        T.StructField("column", T.StringType(), False),
        T.StructField("dtype", T.StringType(), True),
        T.StructField("total_rows", T.LongType(), True),
        T.StructField("non_null", T.LongType(), True),
        T.StructField("nulls", T.LongType(), True),
        T.StructField("null_pct", T.DoubleType(), True),
        T.StructField("distinct", T.LongType(), True),
        T.StructField("sample_value", T.StringType(), True),
    ])

    profile_df = spark.createDataFrame(rows, schema)
    # Most incomplete columns first.
    return profile_df.orderBy(F.desc("null_pct"))


In [0]:
# Profile every Chicago column, then render the resulting stats table
chi_profile = profile_dataframe(df=chi_df, df_name="Chicago_all_years")
display(chi_profile)



Profiling Chicago_all_years: 130462 rows



column,dtype,total_rows,non_null,nulls,null_pct,distinct,sample_value
Violations,string,130462,90196,40266,30.86415967868038,50222,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL EMPLOYEE; KNOWLEDGE, RESPONSIBILITIES AND REPORTING - Comments: OBSERVED NO EMPLOYEE HEALTH POLICY ON SITE. INSTRUCTED TO PROVIDE VERIFICATION FOR EACH EMPLOYEE ON SITE AT ALL TIMES. PRIORITY FOUNDATION. 7-38-010. NO CITATION ISSUED. | 51. PLUMBING INSTALLED; PROPER BACKFLOW DEVICES - Comments: OBSERVED NO COLD RUNNING WATER IN GIRLS WASHROOM NEAR OLD MAIN OFFICE AT HAND WASHING SINK ON LEFT SIDE. MUST PROVIDE HOT AND COLD RUNNING WATER AT ALL TIMES. | 55. PHYSICAL FACILITIES INSTALLED, MAINTAINED & CLEAN - Comments: OBSERVED EXCESSIVE FOOD DEBRIS IN MOP SINK IN KITCHEN CLOSET. MUST CLEAN AND MAINTAIN."
Facility Type,string,130462,129801,661,0.5066609434164737,225,School
Latitude,string,130462,129982,480,0.367923226686698,12635,41.99049312958032
Longitude,string,130462,129985,477,0.3656237065199062,12636,-87.7091921904396
Location,string,130462,129986,476,0.3648571997976422,12637,"(41.990493129580315, -87.7091921904396)"
AKA Name,string,130462,130274,188,0.1441032637856234,14586,JOAN DACH BAIS
City,string,130462,130354,108,0.082782726004507,39,CHICAGO
Risk,string,130462,130419,43,0.03295978905735,4,Risk 3 (Low)
State,string,130462,130427,35,0.0268277352792383,4,IL
Zip,string,130462,130453,9,0.0068985605003755,87,60659.0


In [0]:
# Profile every Dallas column, then render the resulting stats table
dal_profile = profile_dataframe(df=dal_df, df_name="Dallas_all_years")
display(dal_profile)



Profiling Dallas_all_years: 81772 rows



column,dtype,total_rows,non_null,nulls,null_pct,distinct,sample_value
Violation Description - 25,string,81772,2,81770,99.99755417502324,1,*42 Dirty nonfood contact surfaces
Violation Points - 25,string,81772,2,81770,99.99755417502324,1,1.0
Violation Detail - 25,string,81772,2,81770,99.99755417502324,1,"228.113 Equipment, Utensils, and Linens. Cleaning of equipment and utensils. (3) Nonfood-contact surfaces of equipment shall be kept free of an accumulation of dust, dirt, food residue, and other debris."
Violation Memo - 25,string,81772,2,81770,99.99755417502324,1,"Dirty dust on fans and vents above food prep areas and counters. dirty food containers, outside RIC doors, stoves, ovens, walls, vents, vent filters. Ice on freezer floor."
Violation Description - 24,string,81772,5,81767,99.99388543755808,3,"*46 Water, Plumbing, and Waste Plumbing Systems-good repair"
Violation Points - 24,string,81772,5,81767,99.99388543755808,1,1.0
Violation Detail - 24,string,81772,5,81767,99.99388543755808,3,"228.149 Water, Plumbing, and Waste. Plumbing, operation and maintenance. (e) System maintained in good repair. A plumbing system shall be: (2) maintained in good repair."
Violation Memo - 24,string,81772,5,81767,99.99388543755808,3,water faucets detach from sinks in all areas must be replace or repair to proper condition
Violation Description - 23,string,81772,8,81764,99.99021670009294,6,*10 Equipment and Utensils Cleaning-contamination
Violation Points - 23,string,81772,8,81764,99.99021670009294,3,3.0


In [0]:
# Column-name sets for each city
chi_cols, dal_cols = set(chi_df.columns), set(dal_df.columns)

# Shared names and the names unique to each side, alphabetically sorted
common_cols = sorted(chi_cols & dal_cols)
chi_only = sorted(chi_cols - dal_cols)
dal_only = sorted(dal_cols - chi_cols)

# Each section is a heading followed by one column name per line.
# Only the first 40 Dallas-only names are listed (Dallas has many).
sections = (
    ("=== Common columns (both cities) ===", common_cols),
    ("\n=== Columns only in CHICAGO ===", chi_only),
    ("\n=== Columns only in DALLAS ===", dal_only[:40]),
)
for heading, names in sections:
    print(heading)
    for name in names:
        print(name)
print("... (Dallas has many more violation columns)")


=== Common columns (both cities) ===
Inspection Date
Inspection Type
city_code

=== Columns only in CHICAGO ===
AKA Name
Address
City
DBA Name
Facility Type
Inspection ID
Latitude
License #
Location
Longitude
Results
Risk
State
Violations
Zip

=== Columns only in DALLAS ===
Inspection Month
Inspection Score
Inspection Year
Lat Long Location
Restaurant Name
Street Address
Street Direction
Street Name
Street Number
Street Type
Street Unit
Violation  Memo - 20
Violation Description - 1
Violation Description - 10
Violation Description - 11
Violation Description - 12
Violation Description - 13
Violation Description - 14
Violation Description - 15
Violation Description - 16
Violation Description - 17
Violation Description - 18
Violation Description - 19
Violation Description - 2
Violation Description - 20
Violation Description - 21
Violation Description - 22
Violation Description - 23
Violation Description - 24
Violation Description - 25
Violation Description - 3
Violation Description - 4
Vi

In [0]:
# Persist both profiling results as tables, replacing any earlier version
for table_name, profile in (
    ("food_inspection.chicago_profile", chi_profile),
    ("food_inspection.dallas_profile", dal_profile),
):
    profile.write.mode("overwrite").saveAsTable(table_name)


In [0]:
from pyspark.sql import functions as F

def add_chicago_score(df):
    """
    Add an ``inspection_score`` column derived from the ``Results`` column.

    Exact (case-sensitive) matches are mapped as follows: "Pass" -> 90,
    "Pass w/ Conditions" -> 80, "Fail" -> 70, "No Entry" -> 0. Any other value,
    including NULL, yields a NULL integer score.

    Returns a new DataFrame; the input frame is not modified.
    """
    result = F.col("Results")
    score_by_result = (
        ("Pass", 90),
        ("Pass w/ Conditions", 80),
        ("Fail", 70),
        ("No Entry", 0),
    )

    # Build the WHEN chain in the listed order.
    (first_label, first_points), *remaining = score_by_result
    score = F.when(result == first_label, F.lit(first_points))
    for label, points in remaining:
        score = score.when(result == label, F.lit(points))

    return df.withColumn("inspection_score", score.otherwise(F.lit(None).cast("int")))


In [0]:
# Start from the chi_df loaded earlier and attach the numeric score derived
# from the "Results" column.
chi_clean = add_chicago_score(chi_df)

# Parsed once and reused for the date, year and month output columns.
chi_inspection_date = F.to_date(F.col("Inspection Date"))

# "Zip" is read as text and can carry a float-style suffix (e.g. "60659.0").
# Padding that raw text only worked because lpad truncates to 5 characters;
# a shorter zip such as "2134.0" would have become "2134." instead of "02134".
# Take the leading run of digits first, then left-pad / truncate to 5.
# Values with no leading digits become NULL.
chi_zip_digits = F.regexp_extract(F.col("Zip"), r"^\s*(\d+)", 1)
chi_zip = F.when(chi_zip_digits != "", F.lpad(chi_zip_digits, 5, "0"))

# A single select builds the whole silver layout in its final column order.
chi_silver = chi_clean.select(
    # basic ids
    F.col("Inspection ID").alias("inspection_id"),
    F.col("city_code"),  # already present on chi_df from the load step
    F.upper(F.col("DBA Name")).alias("business_name"),
    F.upper(F.col("AKA Name")).alias("aka_name"),
    F.col("License #").cast("string").alias("license_number"),
    F.col("Address").alias("street_address"),
    F.col("City").alias("city"),
    F.col("State").alias("state"),
    chi_zip.alias("zip"),
    # dates
    chi_inspection_date.alias("inspection_date"),
    F.year(chi_inspection_date).alias("inspection_year"),
    F.month(chi_inspection_date).alias("inspection_month"),
    # inspection attrs
    F.col("Inspection Type").alias("inspection_type"),
    F.col("Results").alias("inspection_result"),
    F.col("inspection_score"),  # added by add_chicago_score
    F.col("Facility Type").alias("facility_type"),
    F.col("Risk").alias("risk_category"),
    # try_cast yields NULL instead of failing on text that is not a number
    F.expr("try_cast(Latitude as double)").alias("latitude"),
    F.expr("try_cast(Longitude as double)").alias("longitude"),
)


In [0]:
from pyspark.sql import functions as F

# Integer score; NULL when "Inspection Score" is missing or cannot be cast.
dal_score = F.col("Inspection Score").cast("int")

# Result label derived from the score. A row without a usable score gets a
# NULL result; previously such rows fell through to "Fail" even though their
# inspection_score was NULL.
dal_result = (
    F.when(dal_score.isNull(), F.lit(None).cast("string"))
     .when(F.col("Inspection Score") >= 90, F.lit("Pass"))
     .when(F.col("Inspection Score") >= 80, F.lit("Pass w/ Conditions"))
     .otherwise(F.lit("Fail"))
)

# First 5-digit run in "Zip Code". regexp_extract returns an empty string when
# nothing matches, so map that to NULL; otherwise a blank zip would pass a
# later NOT NULL check.
dal_zip_match = F.regexp_extract(F.col("Zip Code"), r"(\d{5})", 1)
dal_zip = F.when(dal_zip_match != "", dal_zip_match)

# Parsed once and reused for the date, year and month output columns.
dal_inspection_date = F.to_date(F.col("Inspection Date"))

# A single select builds the silver layout in the same column order as the
# Chicago frame so the two can be unioned by name.
dal_silver = dal_df.select(
    # NOTE(review): generated ids are not stable across runs and are not
    # guaranteed to differ from Chicago's inspection ids -- confirm consumers
    # key on (city_code, inspection_id).
    F.monotonically_increasing_id().cast("string").alias("inspection_id"),
    F.col("city_code"),  # already present on dal_df from the load step
    F.upper(F.col("Restaurant Name")).alias("business_name"),
    # Not populated for Dallas in this notebook
    F.lit(None).cast("string").alias("aka_name"),
    F.lit(None).cast("string").alias("license_number"),
    F.col("Street Address").alias("street_address"),
    F.lit("DALLAS").alias("city"),
    F.lit("TX").alias("state"),
    dal_zip.alias("zip"),
    dal_inspection_date.alias("inspection_date"),
    F.year(dal_inspection_date).alias("inspection_year"),
    F.month(dal_inspection_date).alias("inspection_month"),
    F.col("Inspection Type").alias("inspection_type"),
    dal_result.alias("inspection_result"),
    dal_score.alias("inspection_score"),
    # Not populated for Dallas in this notebook
    F.lit(None).cast("string").alias("facility_type"),
    F.lit(None).cast("string").alias("risk_category"),
    # SAFE latitude/longitude: get() returns NULL for a missing element and
    # try_cast returns NULL for text that is not a number.
    # NOTE(review): if the raw value wraps the pair in parentheses or prefixes
    # it with other text, both casts yield NULL -- confirm against the raw data.
    F.expr("try_cast(get(split(`Lat Long Location`, ','), 0) as double)").alias("latitude"),
    F.expr("try_cast(get(split(`Lat Long Location`, ','), 1) as double)").alias("longitude"),
)


In [0]:
# Columns that must be populated for a row to enter the unified silver table.
required_cols = ["business_name", "inspection_date", "inspection_type", "zip"]

def enforce_required(df, label, required=None):
    """
    Drop rows in which any required column is NULL.

    Prints how many rows are about to be removed, then returns the filtered
    frame.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Frame to validate.
    label : str
        Name shown in the printed message.
    required : iterable of str, optional
        Column names that must be non-null. Defaults to the module-level
        ``required_cols``.

    Returns
    -------
    pyspark.sql.DataFrame
        ``df`` restricted to rows where every required column is non-null.
        With an empty column list ``df`` is returned unchanged.
    """
    columns = required_cols if required is None else list(required)
    if not columns:
        # Nothing to enforce. Without this guard the condition would stay
        # None and `~cond` below would raise TypeError.
        print(f"{label}: dropping 0 rows that violate NOT NULL rules")
        return df

    # AND together one isNotNull() check per required column.
    cond = F.col(columns[0]).isNotNull()
    for name in columns[1:]:
        cond = cond & F.col(name).isNotNull()

    bad_count = df.filter(~cond).count()
    print(f"{label}: dropping {bad_count} rows that violate NOT NULL rules")
    return df.filter(cond)

# Keep only rows with every required column populated, per city
chi_silver_valid = enforce_required(df=chi_silver, label="Chicago")
dal_silver_valid = enforce_required(df=dal_silver, label="Dallas")

# Stack both cities; columns are matched by name rather than by position
silver_inspection = chi_silver_valid.unionByName(dal_silver_valid)
silver_row_count = silver_inspection.count()
print("Unified silver rows:", silver_row_count)


Chicago: dropping 9 rows that violate NOT NULL rules
Dallas: dropping 34553 rows that violate NOT NULL rules
Unified silver rows: 177672


In [0]:
# NOTE(review): this cell repeats the preceding cell verbatim and recomputes
# the same three variables; consider removing one of the two.
chi_silver_valid, dal_silver_valid = (
    enforce_required(frame, city)
    for frame, city in ((chi_silver, "Chicago"), (dal_silver, "Dallas"))
)

# Stack both cities; columns are matched by name rather than by position
silver_inspection = chi_silver_valid.unionByName(dal_silver_valid)
unified_rows = silver_inspection.count()
print("Unified silver rows:", unified_rows)


Chicago: dropping 9 rows that violate NOT NULL rules
Dallas: dropping 34553 rows that violate NOT NULL rules
Unified silver rows: 177672


In [0]:
# Persist the unified silver layer, replacing any previous version of the table
silver_writer = silver_inspection.write.mode("overwrite")
silver_writer.saveAsTable("food_inspection.silver_inspection")
