In [71]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat_ws, col, count, expr, to_date

# Path to a locally downloaded connector JAR. Only needed for the offline
# alternative noted below the session builder.
sparkCassandraConnectorFile = "./spark-cassandra-connector_2.12-3.5.0.jar"

# "spark.jars.packages" expects Maven coordinates in the form
# groupId:artifactId:version. A package URL ("pkg:maven/...@version") is not
# accepted and makes dependency resolution fail when a new session is launched.
# Note that getOrCreate() returns an already-running session if there is one.
spark = (
    SparkSession.builder
    .appName('BigDataProject')
    .config("spark.jars.packages", "com.datastax.spark:spark-cassandra-connector_2.12:3.5.0")
    .getOrCreate()
)
# Offline alternative to "spark.jars.packages" (add before getOrCreate()):
#     .config("spark.jars", sparkCassandraConnectorFile)


# Businesses registered with EBR Parish (header row present, column types inferred).
csvFilePath = './Businesses_Registered_with_EBR_Parish_20240221.csv'
dfBusiness = spark.read.csv(csvFilePath, header=True, inferSchema=True)

# Street address records that are later joined to the businesses by name.
csvBusinessLocationPath = './Street_Address_20240227.csv'
dfBusinessAddress = spark.read.csv(csvBusinessLocationPath, header=True, inferSchema=True)

# csvFilePathAddress = './Street_Address_Listing_20240226.csv'
# dfAddress = spark.read.csv(csvFilePathAddress, header=True, inferSchema=True)

In [72]:
# Keep only the businesses whose physical address is in Baton Rouge, LA.
filteredDfBusiness = dfBusiness.where(
    (col("PHYSICAL ADDRESS - CITY") == "BATON ROUGE")
    & (col("PHYSICAL ADDRESS - STATE") == "LA")
)

# Address rows with an empty business name cannot be matched to a business.
dfBusinessAddress = dfBusinessAddress.where(col("BUSINESS NAME") != "")

# A business is paired with an address row when its account name equals the
# business name recorded on the address row.
business_matches_address = (
    filteredDfBusiness["ACCOUNT NAME"] == dfBusinessAddress["BUSINESS NAME"]
)

# Columns carried into the joined frame. The NAICS columns are taken explicitly
# from the business side of the join.
report_columns = [
    col("LEGAL NAME").alias("BUSINESS NAME"),
    col("BUSINESS OPEN DATE"),
    col("BUSINESS STATUS"),
    col("BUSINESS CLOSE DATE"),
    filteredDfBusiness["NAICS CODE"],
    filteredDfBusiness["NAICS CATEGORY"],
    filteredDfBusiness["NAICS GROUP"],
    col("FULL STREET NAME"),
    col("PHYSICAL ADDRESS - CITY").alias("CITY"),
    col("PHYSICAL ADDRESS - STATE").alias("STATE"),
    col("GEOMETRY"),
]

joined_df = filteredDfBusiness.join(
    dfBusinessAddress,
    on=business_matches_address,
    how="inner",
).select(*report_columns)

distinct_categories = joined_df.select("NAICS CATEGORY").distinct()
distinct_groups = joined_df.select("NAICS GROUP").distinct()

# Report, in order: number of distinct categories, number of distinct groups,
# and number of joined rows.
for counted_frame in (distinct_categories, distinct_groups, joined_df):
    print(counted_frame.count())

60
19
17407


In [73]:
# For every (FULL STREET NAME, NAICS CATEGORY) pair, compute:
#   TOTAL COUNT  - all joined rows in the group
#   ACTIVE COUNT - rows whose BUSINESS STATUS is 'O'
#   CLOSED COUNT - rows whose BUSINESS STATUS is 'C'
street_category_metrics = [
    count("*").alias("TOTAL COUNT"),
    expr("sum(case when `BUSINESS STATUS` = 'O' then 1 else 0 end)").alias("ACTIVE COUNT"),
    expr("sum(case when `BUSINESS STATUS` = 'C' then 1 else 0 end)").alias("CLOSED COUNT"),
]

distributionByState = joined_df.groupBy(
    "FULL STREET NAME", "NAICS CATEGORY"
).agg(*street_category_metrics)


In [74]:
# DataFrame.show() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
distributionByState.show(10)

+--------------------+--------------------+-----------+------------+------------+
|    FULL STREET NAME|      NAICS CATEGORY|TOTAL COUNT|ACTIVE COUNT|CLOSED COUNT|
+--------------------+--------------------+-----------+------------+------------+
|            CADDO ST|Personal & Laundr...|          1|           1|           0|
|  BELLE FOUNTAINE CT|Miscellaneous Oth...|          2|           2|           0|
|      ONE CALAIS AVE|Ambulatory Health...|          2|           2|           0|
|          NORTH BLVD|Administrative & ...|          2|           2|           0|
|         MALLORY AVE|  Nonstore Retailers|          1|           1|           0|
|  BAYOU FOUNTAIN AVE|Specialty Trade C...|          4|           4|           0|
|         HIGHLAND RD|Professional, Sci...|         34|          20|          14|
|     OLD HAMMOND HWY|  Nonstore Retailers|          7|           7|           0|
|       OAK ARBOR AVE|Miscellaneous Oth...|          1|           1|           0|
|GENERAL BEAUREG

In [75]:
# The open/close dates arrive as strings; parse them with this pattern.
date_pattern = "MM/dd/yyyy"

economy = (
    joined_df
    .withColumn("BUSINESS OPEN DATE", to_date(col("BUSINESS OPEN DATE"), date_pattern))
    .withColumn("BUSINESS CLOSE DATE", to_date(col("BUSINESS CLOSE DATE"), date_pattern))
)

# Per street: count(column) only counts non-null values, so OPENINGS is the
# number of rows with a parsed open date and CLOSURES the number of rows with a
# parsed close date.
location_trends = economy.groupBy("FULL STREET NAME").agg(
    count(col("BUSINESS OPEN DATE")).alias("OPENINGS"),
    count(col("BUSINESS CLOSE DATE")).alias("CLOSURES"),
)

In [76]:
# Preview the per-street opening/closure counts (show()'s defaults, spelled out).
location_trends.show(n=20, truncate=True)

+--------------------+--------+--------+
|    FULL STREET NAME|OPENINGS|CLOSURES|
+--------------------+--------+--------+
|       W WOODGATE CT|       1|       0|
|     HONEYSUCKLE AVE|       1|       0|
|    JEAN LAFITTE AVE|       4|       0|
|            EATON ST|       6|       1|
|       LEESVILLE AVE|       1|       0|
|GULF STATES UTILI...|       2|       1|
|     SAINT PETER AVE|       1|       0|
|     W SPRINGWIND CT|       3|       0|
|            75TH AVE|       6|       2|
|        CLAIRMONT DR|       1|       0|
|       ENGELSWOOD ST|       2|       0|
|         CHATAWA AVE|       2|       0|
|          RAPIDES ST|       3|       0|
|       SOMERUELOS ST|       4|       0|
|          STUART AVE|       5|       0|
|     KENILWORTH PKWY|       3|       1|
|        S CHOCTAW DR|     110|      21|
|  INNOVATION PARK DR|      20|       2|
|        CITIPLACE CT|      46|       7|
|       MC ILHENNY DR|       3|       0|
+--------------------+--------+--------+
only showing top

In [77]:
# Print the joined frame's schema: each column's name, type and nullability.
joined_df.printSchema()

root
 |-- BUSINESS NAME: string (nullable = true)
 |-- BUSINESS OPEN DATE: string (nullable = true)
 |-- BUSINESS STATUS: string (nullable = true)
 |-- BUSINESS CLOSE DATE: string (nullable = true)
 |-- NAICS CODE: integer (nullable = true)
 |-- NAICS CATEGORY: string (nullable = true)
 |-- NAICS GROUP: string (nullable = true)
 |-- FULL STREET NAME: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- STATE: string (nullable = true)
 |-- GEOMETRY: string (nullable = true)



In [78]:
# Display the joined frame's columns as (column name, type string) pairs.
joined_df.dtypes

[('BUSINESS NAME', 'string'),
 ('BUSINESS OPEN DATE', 'string'),
 ('BUSINESS STATUS', 'string'),
 ('BUSINESS CLOSE DATE', 'string'),
 ('NAICS CODE', 'int'),
 ('NAICS CATEGORY', 'string'),
 ('NAICS GROUP', 'string'),
 ('FULL STREET NAME', 'string'),
 ('CITY', 'string'),
 ('STATE', 'string'),
 ('GEOMETRY', 'string')]

In [79]:
from cassandra.cluster import Cluster
# from cassandra.auth import PlainTextAuthProvider
# import pandas as pd

# Connection settings: the node(s) to contact and the keyspace to work in.
contact_points = ['127.0.0.1']
keyspace = 'bigdata_keyspace'

# No auth provider is passed here. If the cluster has authentication enabled,
# build a cassandra.auth.PlainTextAuthProvider(username=..., password=...) and
# pass it to Cluster via auth_provider=... (do not hardcode the credentials).
cluster = Cluster(contact_points)

# Open a session, then make `keyspace` its default for subsequent statements.
session = cluster.connect()
session.set_keyspace(keyspace)

In [84]:

# Prepare insert query
# table_name = f'{keyspace}.business_ebr'
# cassandra_column_names = [col.lower().replace(' ', '_') for col in joined_df.columns]
# column_names = ', '.join(cassandra_column_names)
# value_placeholders = ', '.join(['%s' for _ in joined_df.columns])
# insert_query = f"INSERT INTO {table_name} ({column_names}) VALUES ({value_placeholders})"

# for row in joined_df.collect():
#     # Execute the INSERT statement
#     session.execute(insert_query, tuple(row))

# Close the session and cluster connection
# session.shutdown()
# cluster.shutdown()
