In [15]:
from pyspark.sql import SparkSession
# Obtain the notebook's Spark session under the application name 'BigDataProject'.
spark = (
    SparkSession.builder
    .appName('BigDataProject')
    .getOrCreate()
)

In [16]:
# Businesses dataset: the first CSV row supplies the column names and
# Spark infers each column's type from the data.
csvFilePath = './Businesses_Registered_with_EBR_Parish_20240221.csv'
df = spark.read.options(header=True, inferSchema=True).csv(csvFilePath)

# Street address dataset, loaded with the same reader options.
addressCsvFilePath = './Street_Address_Listing_20240226.csv'
dfAddress = spark.read.options(header=True, inferSchema=True).csv(addressCsvFilePath)


In [17]:
# Kafka destination used when publishing the records
bootstrap_servers = "localhost:9092"
kafka_topic = "bigdata"

# Serialize each DataFrame row to a JSON string (one string per row)
df_json, df_address_json = df.toJSON(), dfAddress.toJSON()

In [18]:
from kafka import KafkaProducer

# Publish every business record, then every address record, to the Kafka topic.
# Each record is one JSON string, sent as UTF-8 bytes.
producer = KafkaProducer(bootstrap_servers=bootstrap_servers)

for json_records in (df_json, df_address_json):
    # toLocalIterator() pulls the records to the driver incrementally instead of
    # materializing the whole dataset in driver memory the way collect() does.
    for record in json_records.toLocalIterator():
        producer.send(kafka_topic, record.encode('utf-8'))

# send() is asynchronous: it only queues the record in the producer's buffer.
# flush() blocks until all queued records have actually been handed to the broker,
# so nothing is silently dropped if the kernel stops right after this cell.
producer.flush()


                                                                                

In [19]:
# Display the first 5 business records (JSON strings) from the RDD.
# The sample is kept at 5 to match the address preview; each record is a wide row
# that includes contact names and addresses, so a larger dump adds noise, not insight.
for record in df_json.take(5):
    print(record)

print("--------------------------------------")



{"ACCOUNT NO":"00940829","ACCOUNT NAME":"CADIENNE CREATIONS","LEGAL NAME":"CADIENNE CREATIONS LLC","ACCOUNT LOCATION CODE":9,"ACCOUNT LOCATION":"Outside EBR Parish","CONTACT PERSON":"CARISSA MABINI","BUSINESS OPEN DATE":"11/12/2023","BUSINESS STATUS":"O","OWNERSHIP TYPE":"LLC","ACCOUNT TYPE CODE":9999,"ACCOUNT TYPE":"No Occupational License","HOME-BASED BUSINESS":0,"NAICS CODE":454000,"NAICS CATEGORY":"Nonstore Retailers","NAICS GROUP":"RETAIL TRADE (G)","ABC STATUS CODE":0,"ABC STATUS":"Not  a  ABC vendor","FILING FREQUENCY":"M","MAILING ADDRESS - LINE 2":"23001 ELTON DRIVE","MAILING ADDRESS - CITY":"JENNINGS","MAILING ADDRESS - STATE":"LA","MAILING ADDRESS - ZIP CODE":"70546","PHYSICAL ADDRESS - LINE 2":"23001 ELTON DRIVE","PHYSICAL ADDRESS - CITY":"JENNINGS","PHYSICAL ADDRESS - STATE":"LA","PHYSICAL ADDRESS - ZIP CODE":"70546"}
{"ACCOUNT NO":"00049822","ACCOUNT NAME":"THAD S BROUSSARD","LEGAL NAME":"THAD BROUSSARD","ACCOUNT LOCATION CODE":2,"ACCOUNT LOCATION":"EBR Parish (EBR School

In [20]:
# Display the first 5 address records (JSON strings) from the address RDD
address_preview = df_address_json.take(5)
for address_record in address_preview:
    print(address_record)

{"ADDRESS NO":"7353 STE B 282","STREET NAME":"HIGHLAND","STREET SUFFIX TYPE":"RD","FULL ADDRESS":"7353 HIGHLAND RD, STE B 282","CITY":"BATON ROUGE","ZIP":70808,"COUNCIL DISTRICT NO":12,"COUNCILPERSON NAME":"Jennifer Racca","JURISDICTION":"BATON ROUGE"}
{"ADDRESS NO":"9007 STE 9","STREET NAME":"HIGHLAND","STREET SUFFIX TYPE":"RD","FULL ADDRESS":"9007 HIGHLAND RD, STE 9","CITY":"BATON ROUGE","ZIP":70810,"COUNCIL DISTRICT NO":12,"COUNCILPERSON NAME":"Jennifer Racca","JURISDICTION":"BATON ROUGE"}
{"ADDRESS NO":"5830 STE A6","STREET PREFIX DIRECTION":"S","STREET NAME":"SHERWOOD FOREST","STREET SUFFIX TYPE":"BLVD","FULL ADDRESS":"5830 S SHERWOOD FOREST BLVD, STE A6","CITY":"BATON ROUGE","ZIP":70816,"COUNCIL DISTRICT NO":8,"COUNCILPERSON NAME":"Denise Amoroso","JURISDICTION":"PARISH"}
{"ADDRESS NO":"4520 STE 103","STREET PREFIX DIRECTION":"S","STREET NAME":"SHERWOOD FOREST","STREET SUFFIX TYPE":"BLVD","FULL ADDRESS":"4520 S SHERWOOD FOREST BLVD, STE 103","CITY":"BATON ROUGE","ZIP":70816,"COUN

In [21]:
# Print the schema (column names, inferred types, nullability) of the businesses DataFrame
df.printSchema()

root
 |-- ACCOUNT NO: string (nullable = true)
 |-- ACCOUNT NAME: string (nullable = true)
 |-- LEGAL NAME: string (nullable = true)
 |-- ACCOUNT LOCATION CODE: integer (nullable = true)
 |-- ACCOUNT LOCATION: string (nullable = true)
 |-- CONTACT PERSON: string (nullable = true)
 |-- BUSINESS OPEN DATE: string (nullable = true)
 |-- BUSINESS STATUS: string (nullable = true)
 |-- BUSINESS CLOSE DATE: string (nullable = true)
 |-- OWNERSHIP TYPE: string (nullable = true)
 |-- ACCOUNT TYPE CODE: integer (nullable = true)
 |-- ACCOUNT TYPE: string (nullable = true)
 |-- HOME-BASED BUSINESS: integer (nullable = true)
 |-- NAICS CODE: integer (nullable = true)
 |-- NAICS CATEGORY: string (nullable = true)
 |-- NAICS GROUP: string (nullable = true)
 |-- ABC STATUS CODE: integer (nullable = true)
 |-- ABC STATUS: string (nullable = true)
 |-- CONSOLIDATED FILER: integer (nullable = true)
 |-- FILING FREQUENCY: string (nullable = true)
 |-- MAILING ADDRESS - LINE 1: string (nullable = true)
 |

In [22]:
# Print the schema (column names, inferred types, nullability) of the address DataFrame
dfAddress.printSchema()

root
 |-- ADDRESS NO: string (nullable = true)
 |-- STREET PREFIX DIRECTION: string (nullable = true)
 |-- STREET PREFIX TYPE: string (nullable = true)
 |-- STREET NAME: string (nullable = true)
 |-- STREET SUFFIX TYPE: string (nullable = true)
 |-- STREET SUFFIX DIRECTION: string (nullable = true)
 |-- STREET EXTENSION: string (nullable = true)
 |-- FULL ADDRESS: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- ZIP: integer (nullable = true)
 |-- COUNCIL DISTRICT NO: integer (nullable = true)
 |-- COUNCILPERSON NAME: string (nullable = true)
 |-- JURISDICTION: string (nullable = true)



In [23]:
# Geospatial distribution: number of business rows for each combination of
# physical city, physical state and business open date.
geospatial_distribution = (
    df.groupBy(
        "PHYSICAL ADDRESS - CITY",
        "PHYSICAL ADDRESS - STATE",
        "BUSINESS OPEN DATE",
    )
    .count()
)

In [24]:
# Economic growth/decline: count of account numbers per physical state and
# business open date.
economic_growth_decline = (
    df.groupBy(
        "PHYSICAL ADDRESS - STATE",
        "BUSINESS OPEN DATE",
    )
    .agg({"ACCOUNT NO": "count"})
)

In [25]:
# Business suitability by region: count of account numbers per physical state
# and NAICS category.
business_region_analysis = (
    df.groupBy(
        "PHYSICAL ADDRESS - STATE",
        "NAICS CATEGORY",
    )
    .agg({"ACCOUNT NO": "count"})
)