## Re-reading the files that were generated by the previous ipynb file

In [None]:
# Importing all the libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit, lower, isnan, count
from geopy.geocoders import Nominatim

In [None]:
# Start (or reuse) the Spark session used for this data-cleaning stage
spark = (
    SparkSession.builder
    .appName("DataCleaning4")
    .getOrCreate()
)

In [None]:
# Load the CSV output of the previous stage; the first row of each file is the header
df = (
    spark.read
    .option("header", True)
    .csv("Intermediate3")
)

## 9 E. Filling values for ZIP CODE, BOROUGH -> Using a Local Nominatim Instance to Fill the Data

**🚨 DO NOT RUN THE FOLLOWING CELL 🚨**  
⚠️ This operation requires a local Nominatim instance running in a Docker container to make queries. ⚠️  


In [None]:
# geopy Nominatim client pointed at http://localhost:8080; get_location_info uses it
# to turn latitude/longitude into a ZIP code and borough
geolocator = Nominatim(
    user_agent="geoapiExercises",
    domain="localhost:8080",
    scheme="http",
)

def get_location_info(lat, lon):
    """Reverse-geocode a coordinate pair into ``(postcode, borough)``.

    Uses the module-level ``geolocator`` to look up the address at
    ``(lat, lon)``.

    Args:
        lat: Latitude of the point to look up.
        lon: Longitude of the point to look up.

    Returns:
        ``(postcode, borough)`` when the geocoder result contains a postcode;
        ``borough`` falls back to ``'Unknown'`` when the address has no
        borough entry. ``(None, None)`` when no location is found, the result
        has no postcode, or the lookup raises.
    """
    try:
        location = geolocator.reverse((lat, lon), exactly_one=True, timeout=20)
        # A missing result or a result without an address block is "no data".
        address = location.raw.get("address", {}) if location else {}
        if "postcode" not in address:
            return None, None
        return address["postcode"], address.get("borough", "Unknown")
    except Exception:
        # Deliberately best-effort: one failed lookup must not abort the whole
        # job, but it is logged so an unreachable geocoder is not mistaken for
        # "no postcode at this location".
        import logging

        logging.getLogger(__name__).warning(
            "Reverse geocoding failed for (%s, %s)", lat, lon, exc_info=True
        )
        return None, None

In [None]:
# Now apply the function to all the data
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def get_zipcode(lat, lon):
    """Return the postcode half of ``get_location_info(lat, lon)``."""
    postcode, _ = get_location_info(lat, lon)
    return postcode

def get_borough(lat, lon):
    """Return the borough half of ``get_location_info(lat, lon)``."""
    _, borough = get_location_info(lat, lon)
    return borough

# Wrap the Python lookups as Spark UDFs; both return strings
get_zipcode_udf = udf(get_zipcode, StringType())
get_borough_udf = udf(get_borough, StringType())

# Rows to geocode: ZIP CODE is the placeholder 0 and both coordinates are present
needs_lookup = (
    (col("ZIP CODE") == 0)
    & col("LATITUDE").isNotNull()
    & col("LONGITUDE").isNotNull()
)

# BOROUGH is filled BEFORE ZIP CODE on purpose: the condition reads ZIP CODE, so
# overwriting ZIP CODE first would leave no row matching `ZIP CODE == 0` and
# BOROUGH would never be updated.
df = df.withColumn(
    "BOROUGH",
    when(needs_lookup, get_borough_udf(col("LATITUDE"), col("LONGITUDE"))).otherwise(col("BOROUGH")),
)
df = df.withColumn(
    "ZIP CODE",
    when(needs_lookup, get_zipcode_udf(col("LATITUDE"), col("LONGITUDE"))).otherwise(col("ZIP CODE")),
)

In [None]:
# Replace remaining nulls: ZIP CODE becomes 0 and BOROUGH becomes "Unknown"
df = df.na.fill({"ZIP CODE": 0, "BOROUGH": "Unknown"})

In [None]:
# Normalize every BOROUGH value to lowercase
df = df.withColumn(
    "BOROUGH",
    lower(df["BOROUGH"]),
)

In [None]:
# Correct the known bad BOROUGH spelling: "the bronx" becomes "bronx"
df = df.withColumn(
    "BOROUGH",
    when(col("BOROUGH") == "the bronx", lit("bronx"))
    .otherwise(col("BOROUGH")),
)

In [None]:
# Reset ZIP CODE to 0 for rows whose ZIP CODE is outside 10000-12000,
# i.e. rows that do not lie in New York
df = df.withColumn(
    "ZIP CODE",
    when(~col("ZIP CODE").between(10000, 12000), 0)
    .otherwise(col("ZIP CODE")),
)

In [None]:
# Write the fully cleaned data, with a header row, to the "final_data" folder,
# overwriting any previous output there
(
    df.write
    .mode("overwrite")
    .option("header", True)
    .csv("final_data")
)