In [0]:
df1 = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/talati.ak@northeastern.edu/Chicago_profiling.csv")

In [0]:
print("Schema:")
df1.printSchema()

Schema:
root
 |-- Inspection ID: string (nullable = true)
 |-- DBA Name: string (nullable = true)
 |-- AKA Name: string (nullable = true)
 |-- License #: string (nullable = true)
 |-- Facility Type: string (nullable = true)
 |-- Risk: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Zip: string (nullable = true)
 |-- Inspection Date: string (nullable = true)
 |-- Inspection Type: string (nullable = true)
 |-- Results: string (nullable = true)
 |-- Violations: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- Location: string (nullable = true)



In [0]:
print("Total Rows:", df1.count())
print("Total Columns:", len(df1.columns))

Total Rows: 130462
Total Columns: 17


In [0]:
df1.show(5)

+-------------+--------------------+--------------------+---------+-------------+-------------+--------------------+-------+-----+-------+---------------+---------------+------------------+--------------------+--------------------+--------------------+--------------------+
|Inspection ID|            DBA Name|            AKA Name|License #|Facility Type|         Risk|             Address|   City|State|    Zip|Inspection Date|Inspection Type|           Results|          Violations|            Latitude|           Longitude|            Location|
+-------------+--------------------+--------------------+---------+-------------+-------------+--------------------+-------+-----+-------+---------------+---------------+------------------+--------------------+--------------------+--------------------+--------------------+
|      2485006|     JOAN DACHS BAIS|      JOAN DACH BAIS|2225390.0|       School| Risk 3 (Low)| 3200 W PETERSON AVE|CHICAGO|   IL|60659.0|     2021-02-22|        Canvass|Pass w/ 

In [0]:
#checking for misssing values
from pyspark.sql.functions import col, sum as _sum, when

print("Missing Value Count:")
null_counts = df1.select([_sum(when(col(c).isNull() | (col(c) == ""), 1).otherwise(0)).alias(c) for c in df1.columns])
null_counts.show()

Missing Value Count:
+-------------+--------+--------+---------+-------------+----+-------+----+-----+---+---------------+---------------+-------+----------+--------+---------+--------+
|Inspection ID|DBA Name|AKA Name|License #|Facility Type|Risk|Address|City|State|Zip|Inspection Date|Inspection Type|Results|Violations|Latitude|Longitude|Location|
+-------------+--------+--------+---------+-------------+----+-------+----+-----+---+---------------+---------------+-------+----------+--------+---------+--------+
|            0|       0|     188|        4|          661|  43|      0| 108|   35|  9|              0|              0|      0|     40266|     457|      466|     468|
+-------------+--------+--------+---------+-------------+----+-------+----+-----+---+---------------+---------------+-------+----------+--------+---------+--------+



In [0]:
# Distinct values
print("Cardinality (Distinct values per column):")
for column in df1.columns:
    print(f"{column}: {df1.select(column).distinct().count()}")


Cardinality (Distinct values per column):
Inspection ID: 73165
DBA Name: 15191
AKA Name: 14587
License #: 19418
Facility Type: 226
Risk: 5
Address: 12777
City: 40
State: 5
Zip: 88
Inspection Date: 1037
Inspection Type: 17
Results: 7
Violations: 50106
Latitude: 14222
Longitude: 15038
Location: 14952


In [0]:
print("Summary Statistics:")
numeric_cols = [f.name for f in df1.schema.fields if str(f.dataType) in ['IntegerType', 'DoubleType', 'LongType', 'FloatType']]
df1.select(numeric_cols).describe().show()

Summary Statistics:
+-------+
|summary|
+-------+
|  count|
|   mean|
| stddev|
|    min|
|    max|
+-------+



In [0]:
print("Top Values in Categorical Columns:")
cat_cols = [f.name for f in df1.schema.fields if f.dataType.simpleString() == 'string']
for col_name in cat_cols:
    df1.groupBy(col_name).count().orderBy("count", ascending=False).show(5, truncate=False)


Top Values in Categorical Columns:
+-------------+-----+
|Inspection ID|count|
+-------------+-----+
|2564759      |2    |
|2567727      |2    |
|2559471      |2    |
|2546176      |2    |
|2568702      |2    |
+-------------+-----+
only showing top 5 rows

+-------------+-----+
|DBA Name     |count|
+-------------+-----+
|SUBWAY       |1594 |
|DUNKIN DONUTS|752  |
|MCDONALD'S   |334  |
|TACO BELL    |207  |
|WINGSTOP     |201  |
+-------------+-----+
only showing top 5 rows

+-------------+-----+
|AKA Name     |count|
+-------------+-----+
|SUBWAY       |1946 |
|DUNKIN DONUTS|707  |
|MCDONALD'S   |416  |
|7-ELEVEN     |388  |
|WENDY'S      |338  |
+-------------+-----+
only showing top 5 rows

+---------+-----+
|License #|count|
+---------+-----+
|0.0      |352  |
|2594606.0|48   |
|2470443.0|44   |
|2594633.0|44   |
|2163775.0|41   |
+---------+-----+
only showing top 5 rows

+-------------------------------+-----+
|Facility Type                  |count|
+----------------------------

In [0]:
print("Duplicate Rows:")
df1.groupBy(df1.columns).count().filter("count > 1").show()

Duplicate Rows:
+-------------+--------------------+--------------------+---------+-------------+---------------+--------------------+-------+-----+-------+---------------+--------------------+------------------+--------------------+------------------+--------------------+--------------------+-----+
|Inspection ID|            DBA Name|            AKA Name|License #|Facility Type|           Risk|             Address|   City|State|    Zip|Inspection Date|     Inspection Type|           Results|          Violations|          Latitude|           Longitude|            Location|count|
+-------------+--------------------+--------------------+---------+-------------+---------------+--------------------+-------+-----+-------+---------------+--------------------+------------------+--------------------+------------------+--------------------+--------------------+-----+
|      2546105|UNCLE JOES JERK C...|UNCLE JOES JERK C...|2684579.0|   Restaurant|  Risk 1 (High)|       205 W 87TH ST|CHICAGO|   