In [95]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat_ws, col, count, expr, trim, when


# Create (or reuse) the Spark session that backs every DataFrame in this notebook.
spark = SparkSession.builder.appName('BigDataProject').getOrCreate()


def _read_csv_with_header(path):
    """Read the CSV at `path`, using its first row as column names and letting Spark infer column types."""
    return spark.read.options(header=True, inferSchema=True).csv(path)


# Source files, all resolved relative to the notebook's working directory.
csvFilePath = './Businesses_Registered_with_EBR_Parish_20240221.csv'
csvBusinessLocationPath = './Street_Address_20240227.csv'
csvFilePathAddress = './Street_Address_Listing_20240226.csv'

# One DataFrame per source file.
dfBusiness = _read_csv_with_header(csvFilePath)
dfBusinessAddress = _read_csv_with_header(csvBusinessLocationPath)
dfAddress = _read_csv_with_header(csvFilePathAddress)

                                                                                

In [96]:
# Print the column names and the types Spark inferred for the street-address CSV.
dfBusinessAddress.printSchema()

root
 |-- BUSINESS NAME: string (nullable = true)
 |-- STREET ADDRESS: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- ZIP CODE: integer (nullable = true)
 |-- PHONE NUMBER: string (nullable = true)
 |-- WEBSITE: string (nullable = true)
 |-- RESOURCE TYPE: string (nullable = true)
 |-- SUB-RESOURCE TYPE: string (nullable = true)
 |-- SHELTER LOCATION: string (nullable = true)
 |-- ADDRESS AUTHORITY: string (nullable = true)
 |-- SUBDIVISION: string (nullable = true)
 |-- NAICS CODE: integer (nullable = true)
 |-- SERVICE TYPE: string (nullable = true)
 |-- SERVICE NAME: string (nullable = true)
 |-- STREET PREFIX DIRECTION: string (nullable = true)
 |-- STREET NAME: string (nullable = true)
 |-- STREET SUFFIX TYPE: string (nullable = true)
 |-- STREET SUFFIX DIRECTION: string (nullable = true)
 |-- STREET EXTENSION: string (nullable = true)
 |-- PRIVATE STREET: string (nullable = true)
 |-- WARD NUMBER: integer (nullable = true)
 |-- TAX SECTION: integer (nullable = 

In [108]:
# Keep only businesses whose physical address is in Baton Rouge, LA.
filteredDfBusiness = dfBusiness.filter(
    (col("PHYSICAL ADDRESS - CITY") == "BATON ROUGE") &
    (col("PHYSICAL ADDRESS - STATE") == "LA")
)

# Discard address rows that have an empty business name, since the name is the join key below.
dfBusinessAddress = dfBusinessAddress.filter(
    (col("BUSINESS NAME") != "")
)
print(dfBusinessAddress.count())

# Inner-join the two sources on exact equality between the business account name and the
# address record's business name, then keep only the columns needed for the analysis.
# Rows without a match on either side are dropped by the inner join.
joined_df = filteredDfBusiness.join(
        dfBusinessAddress,
        filteredDfBusiness["ACCOUNT NAME"] == dfBusinessAddress["BUSINESS NAME"],
        "inner"
    ).select(
        col("LEGAL NAME").alias("BUSINESS NAME"),
        col("BUSINESS OPEN DATE"),
        col("BUSINESS STATUS"),
        col("BUSINESS CLOSE DATE"),
        # Qualified with the source frame: the address frame also has a NAICS CODE column.
        filteredDfBusiness["NAICS CODE"],
        filteredDfBusiness["NAICS CATEGORY"],
        filteredDfBusiness["NAICS GROUP"],
        col("FULL STREET NAME"),
        col("PHYSICAL ADDRESS - CITY").alias("CITY"),
        col("PHYSICAL ADDRESS - STATE").alias("STATE"),
        col("GEOMETRY"),
    )

# Row counts before and after the join.
print(filteredDfBusiness.count())
print(joined_df.count())
# printSchema() writes the schema to stdout itself and returns None, so it must not be
# wrapped in print() (that would emit an extra "None" line).
joined_df.printSchema()

24006
46096
17407
root
 |-- BUSINESS NAME: string (nullable = true)
 |-- BUSINESS OPEN DATE: string (nullable = true)
 |-- BUSINESS STATUS: string (nullable = true)
 |-- BUSINESS CLOSE DATE: string (nullable = true)
 |-- NAICS CODE: integer (nullable = true)
 |-- NAICS CATEGORY: string (nullable = true)
 |-- NAICS GROUP: string (nullable = true)
 |-- FULL STREET NAME: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- STATE: string (nullable = true)
 |-- GEOMETRY: string (nullable = true)

None


In [105]:
# Count businesses per (street, NAICS category) pair, split by business status.
# Despite the variable name, the grouping keys are FULL STREET NAME and NAICS CATEGORY.
streetCategoryGroups = joined_df.groupBy("FULL STREET NAME", "NAICS CATEGORY")

# Every row in the group.
totalCount = count("*").alias("TOTAL COUNT")
# Rows whose BUSINESS STATUS is 'O'.
activeCount = expr(
    "sum(case when `BUSINESS STATUS` = 'O' then 1 else 0 end)"
).alias("ACTIVE COUNT")
# Rows whose BUSINESS STATUS is 'C'.
closedCount = expr(
    "sum(case when `BUSINESS STATUS` = 'C' then 1 else 0 end)"
).alias("CLOSED COUNT")

distributionByState = streetCategoryGroups.agg(totalCount, activeCount, closedCount)


In [106]:
# Display the first 100 aggregated rows. show() prints the table itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
distributionByState.show(100)



+--------------------+--------------------+-----------+------------+------------+
|    FULL STREET NAME|      NAICS CATEGORY|TOTAL COUNT|ACTIVE COUNT|CLOSED COUNT|
+--------------------+--------------------+-----------+------------+------------+
|            CADDO ST|Personal & Laundr...|          1|           1|           0|
|  BELLE FOUNTAINE CT|Miscellaneous Oth...|          2|           2|           0|
|      ONE CALAIS AVE|Ambulatory Health...|          2|           2|           0|
|          NORTH BLVD|Administrative & ...|          2|           2|           0|
|         MALLORY AVE|  Nonstore Retailers|          1|           1|           0|
|  BAYOU FOUNTAIN AVE|Specialty Trade C...|          4|           4|           0|
|         HIGHLAND RD|Professional, Sci...|         34|          20|          14|
|     OLD HAMMOND HWY|  Nonstore Retailers|          7|           7|           0|
|       OAK ARBOR AVE|Miscellaneous Oth...|          1|           1|           0|
|GENERAL BEAUREG

                                                                                

In [4]:
# Per-(category, city, state) business counts that the roll-ups below aggregate.
# `distribution` was previously never defined in an executed cell, so this cell failed with
# NameError on a fresh kernel; it is now built here so the cell is self-contained.
# NOTE(review): the unfiltered dfBusiness frame is assumed to be the intended source
# (the earlier commented-out draft used an undefined `dfNew`) - confirm.
distribution = dfBusiness.groupBy(
    "NAICS CATEGORY",
    "PHYSICAL ADDRESS - CITY",
    "PHYSICAL ADDRESS - STATE",
).count()

# Optionally, aggregate the counts by state or by city to get a broader view of the distribution.
state_distribution = distribution.groupBy("PHYSICAL ADDRESS - STATE", "NAICS CATEGORY").sum("count")
city_distribution = distribution.groupBy("PHYSICAL ADDRESS - CITY", "NAICS CATEGORY").sum("count")
