In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, current_date, expr, regexp_extract, round, to_date

# Start (or reuse) the Spark session shared by every cell in this notebook.
spark = SparkSession.builder.appName('DataSetPreprocessingAndStreaming').getOrCreate()

# Locations of the two raw CSV inputs.
csvFilePath = './datasets/Businesses_Registered_with_EBR_Parish_20240221.csv'
csvBusinessLocationPath = './datasets/Street_Address_20240227.csv'

# Both files are read with their first row as the header and with column
# types inferred from the data.
dfBusiness = spark.read.options(header=True, inferSchema=True).csv(csvFilePath)
dfBusinessAddress = spark.read.options(header=True, inferSchema=True).csv(csvBusinessLocationPath)

In [6]:
# Row counts of the two raw inputs: business registrations first, then addresses.
for raw_frame in (dfBusiness, dfBusinessAddress):
    print(raw_frame.count())

127842
203824


In [3]:
# Keep only businesses whose physical address is in Baton Rouge, LA and that
# have a non-empty legal name and NAICS group. Rows where any of these columns
# is null are dropped as well, because a comparison against null is never true.
filteredDfBusiness = dfBusiness.filter(
    (col("PHYSICAL ADDRESS - CITY") == "BATON ROUGE") &
    (col("PHYSICAL ADDRESS - STATE") == "LA") &
    (col('LEGAL NAME') != '') &
    (col('NAICS GROUP') != '')
)

# Attach address information by matching the business account name against
# the business name in the address file, then keep only the columns that are
# published downstream (renaming the legal name and the city/state columns).
joined_df = filteredDfBusiness.join(
        dfBusinessAddress,
        filteredDfBusiness["ACCOUNT NAME"] == dfBusinessAddress["BUSINESS NAME"],
        "inner"
    ).select(
        col("LEGAL NAME").alias("BUSINESS NAME"),
        col("BUSINESS OPEN DATE"),
        col("BUSINESS STATUS"),
        col("BUSINESS CLOSE DATE"),
        filteredDfBusiness["NAICS CODE"],
        filteredDfBusiness["NAICS CATEGORY"],
        filteredDfBusiness["NAICS GROUP"],
        col("FULL STREET NAME"),
        col("PHYSICAL ADDRESS - CITY").alias("CITY"),
        col("PHYSICAL ADDRESS - STATE").alias("STATE"),
        col("GEOMETRY"),
    )

# Regular expression that captures the two coordinates of a "POINT (x y)" value.
# In WKT the first coordinate (x) is the longitude and the second (y) is the
# latitude, so group 1 -> LONGITUDE and group 2 -> LATITUDE.
# NOTE(review): assumes GEOMETRY holds standard WKT points - confirm against the data.
pattern = r'POINT \(([^ ]+) ([^ ]+)\)'

# LATITUDE is added before LONGITUDE so the output column order stays the same.
# regexp_extract yields an empty string when GEOMETRY does not match the pattern.
joined_df = joined_df.withColumn('LATITUDE', regexp_extract(col('GEOMETRY'), pattern, 2)) \
                        .withColumn('LONGITUDE', regexp_extract(col('GEOMETRY'), pattern, 1))
joined_df = joined_df.drop('GEOMETRY')

In [7]:
# Size of the joined, preprocessed dataset.
joined_row_total = joined_df.count()
print(joined_row_total)

                                                                                

17392


In [8]:
# Kafka destination: the topic to publish to and the broker address used to
# bootstrap the connection.
kafka_topic, bootstrap_servers = "big_data", "localhost:9092"

In [23]:
# Show the column names and types of the joined dataset. A bare
# `joined_df.columns` before this call displayed nothing (only the last
# expression of a cell is echoed), and printSchema() lists every column anyway.
joined_df.printSchema()

root
 |-- BUSINESS NAME: string (nullable = true)
 |-- BUSINESS OPEN DATE: string (nullable = true)
 |-- BUSINESS STATUS: string (nullable = true)
 |-- BUSINESS CLOSE DATE: string (nullable = true)
 |-- NAICS CODE: integer (nullable = true)
 |-- NAICS CATEGORY: string (nullable = true)
 |-- NAICS GROUP: string (nullable = true)
 |-- FULL STREET NAME: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- STATE: string (nullable = true)
 |-- LATITUDE: string (nullable = true)
 |-- LONGITUDE: string (nullable = true)



In [19]:
from kafka import KafkaProducer

# Create a Kafka producer connected to the configured broker(s)
producer = KafkaProducer(bootstrap_servers=bootstrap_servers)

# Serialise every row of the joined DataFrame to a JSON string
# (all columns of joined_df are published; no further selection happens here)
json_df = joined_df.toJSON()

# Send each JSON record to the Kafka topic as UTF-8 bytes.
# toLocalIterator() brings rows to the driver one partition at a time, so the
# whole dataset is never held in driver memory at once as it would be with collect().
for record in json_df.toLocalIterator():
    producer.send(kafka_topic, record.encode('utf-8'))

                                                                                

In [20]:
# send() only queues records for asynchronous delivery; flush() blocks until
# everything queued so far has actually been sent to the broker.
producer.flush()

In [22]:
# writing the preprocessed DF to a csv file
newCSVPath = "./datasets/final_dataset"

joined_df.coalesce(1).write.csv(newCSVPath, header=True, sep=",", mode="overwrite")

                                                                                