In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat_ws, col, count, expr, to_date


# Reuse the Spark session if one is already active in this kernel; otherwise start one.
spark = (
    SparkSession.builder
    .appName('BigDataProject')
    .getOrCreate()
)

# Input CSV exports, resolved relative to the notebook's working directory.
csvFilePath = './Businesses_Registered_with_EBR_Parish_20240221.csv'
csvBusinessLocationPath = './Street_Address_20240227.csv'

# Both files are read with their first row as the header and with column types
# inferred by Spark rather than declared up front.
dfBusiness = spark.read.options(header=True, inferSchema=True).csv(csvFilePath)
dfBusinessAddress = spark.read.options(header=True, inferSchema=True).csv(csvBusinessLocationPath)

# csvFilePathAddress = './Street_Address_Listing_20240226.csv'
# dfAddress = spark.read.csv(csvFilePathAddress, header=True, inferSchema=True)

In [12]:
# Keep only businesses whose physical address is in Baton Rouge, LA.
filteredDfBusiness = dfBusiness.filter(
    (col("PHYSICAL ADDRESS - CITY") == "BATON ROUGE") &
    (col("PHYSICAL ADDRESS - STATE") == "LA")
)

# Drop address rows without a usable business name. Rows whose name is null are
# dropped too, because the comparison evaluates to null rather than true.
dfBusinessAddress = dfBusinessAddress.filter(
    (col("BUSINESS NAME") != "")
)

# Match each registered business to its address record by name (the registry's
# ACCOUNT NAME against the address file's BUSINESS NAME) and keep only the
# columns used by the analysis below. The output "BUSINESS NAME" column is the
# registry's LEGAL NAME, not the address file's column of the same name.
# NOTE(review): matching on name equality is presumably not guaranteed to be
# one-to-one, so a business may appear on several rows -- confirm before
# reading the counts below as numbers of distinct businesses.
#
# cache(): joined_df is lazy, so without it every action below (three count()
# calls) and every later cell that aggregates it would re-read both CSV files
# and repeat the join. The cached rows are identical to the uncached result.
joined_df = filteredDfBusiness.join(
        dfBusinessAddress,
        filteredDfBusiness["ACCOUNT NAME"] == dfBusinessAddress["BUSINESS NAME"],
        "inner"
    ).select(
        col("LEGAL NAME").alias("BUSINESS NAME"),
        col("BUSINESS OPEN DATE"),
        col("BUSINESS STATUS"),
        col("BUSINESS CLOSE DATE"),
        filteredDfBusiness["NAICS CODE"],
        filteredDfBusiness["NAICS CATEGORY"],
        filteredDfBusiness["NAICS GROUP"],
        col("FULL STREET NAME"),
        col("PHYSICAL ADDRESS - CITY").alias("CITY"),
        col("PHYSICAL ADDRESS - STATE").alias("STATE"),
        col("GEOMETRY"),
    ).cache()

# Distinct NAICS categories and groups present after the join
# (a null value, if any, counts as one distinct entry).
distinct_categories = joined_df.select(col("NAICS CATEGORY")).distinct()
distinct_groups = joined_df.select(col("NAICS GROUP")).distinct()

# Sanity-check sizes: number of categories, number of groups, joined row count.
print(distinct_categories.count())
print(distinct_groups.count())

print(joined_df.count())

60
19
17407


In [7]:
# For every (street, NAICS category) pair, count the joined rows in total and
# split them by BUSINESS STATUS: 'O' rows are reported as active, 'C' rows as closed.
# Note: despite its name, distributionByState is keyed by street and category,
# not by state.
total_count = count("*").alias("TOTAL COUNT")
active_count = expr("sum(case when `BUSINESS STATUS` = 'O' then 1 else 0 end)").alias("ACTIVE COUNT")
closed_count = expr("sum(case when `BUSINESS STATUS` = 'C' then 1 else 0 end)").alias("CLOSED COUNT")

distributionByState = (
    joined_df
    .groupBy("FULL STREET NAME", "NAICS CATEGORY")
    .agg(total_count, active_count, closed_count)
)


In [8]:
# Display the first 100 rows of the street/category distribution.
# show() prints the table itself and returns None, so it is called directly:
# wrapping it in print() would append a stray "None" line to the cell output.
distributionByState.show(100)

[Stage 77:>                                                         (0 + 7) / 7]

+--------------------+--------------------+-----------+------------+------------+
|    FULL STREET NAME|      NAICS CATEGORY|TOTAL COUNT|ACTIVE COUNT|CLOSED COUNT|
+--------------------+--------------------+-----------+------------+------------+
|            CADDO ST|Personal & Laundr...|          1|           1|           0|
|  BELLE FOUNTAINE CT|Miscellaneous Oth...|          2|           2|           0|
|      ONE CALAIS AVE|Ambulatory Health...|          2|           2|           0|
|          NORTH BLVD|Administrative & ...|          2|           2|           0|
|         MALLORY AVE|  Nonstore Retailers|          1|           1|           0|
|  BAYOU FOUNTAIN AVE|Specialty Trade C...|          4|           4|           0|
|         HIGHLAND RD|Professional, Sci...|         34|          20|          14|
|     OLD HAMMOND HWY|  Nonstore Retailers|          7|           7|           0|
|       OAK ARBOR AVE|Miscellaneous Oth...|          1|           1|           0|
|GENERAL BEAUREG

                                                                                

In [15]:
# Parse the open/close date columns (formatted as MM/dd/yyyy) into real dates,
# replacing the original columns in place.
date_format = "MM/dd/yyyy"
economy = (
    joined_df
    .withColumn("BUSINESS OPEN DATE", to_date(col("BUSINESS OPEN DATE"), date_format))
    .withColumn("BUSINESS CLOSE DATE", to_date(col("BUSINESS CLOSE DATE"), date_format))
)

# Per street: count rows that have an open date (OPENINGS) and rows that have a
# close date (CLOSURES). count(column) skips nulls, so rows with no date are not counted.
openings = count("BUSINESS OPEN DATE").alias("OPENINGS")
closures = count("BUSINESS CLOSE DATE").alias("CLOSURES")
location_trends = economy.groupBy("FULL STREET NAME").agg(openings, closures)

In [16]:
# Preview the per-street openings/closures table (first 20 rows, long cell values truncated).
location_trends.show(n=20, truncate=True)



+--------------------+--------+--------+
|    FULL STREET NAME|OPENINGS|CLOSURES|
+--------------------+--------+--------+
|       W WOODGATE CT|       1|       0|
|     HONEYSUCKLE AVE|       1|       0|
|    JEAN LAFITTE AVE|       4|       0|
|            EATON ST|       6|       1|
|       LEESVILLE AVE|       1|       0|
|GULF STATES UTILI...|       2|       1|
|     SAINT PETER AVE|       1|       0|
|     W SPRINGWIND CT|       3|       0|
|            75TH AVE|       6|       2|
|        CLAIRMONT DR|       1|       0|
|       ENGELSWOOD ST|       2|       0|
|         CHATAWA AVE|       2|       0|
|          RAPIDES ST|       3|       0|
|       SOMERUELOS ST|       4|       0|
|          STUART AVE|       5|       0|
|     KENILWORTH PKWY|       3|       1|
|        S CHOCTAW DR|     110|      21|
|  INNOVATION PARK DR|      20|       2|
|        CITIPLACE CT|      46|       7|
|       MC ILHENNY DR|       3|       0|
+--------------------+--------+--------+
only showing top

                                                                                