## Re-reading the files written by the previous notebook

In [None]:
# Importing the spark tools and libraries and some helper functions
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit, udf
import requests
import time

In [None]:
# Second cleaning phase: obtain a Spark session for it, registered under the app name "DataCleaning2"
spark = (
    SparkSession.builder
    .appName("DataCleaning2")
    .getOrCreate()
)

In [None]:
# Read back the cleaned CSV data saved in the "Intermediate" folder, using the first line of each file as the header
df = spark.read.csv("Intermediate", header=True)

## 9 A. Filling missing latitude and longitude using external library

**🚨 DO NOT RUN THE FOLLOWING CELL 🚨**  
⚠️ This operation takes approximately **21 days** to complete when run on a single PC! Threading is not allowed against this API, nor is `requests.Session()`. ⚠️  


In [None]:
# Here we define the function to get latitude, longitude and then location name from the address using the NOMINATIM API
def fetch_location(address):
    """Geocode a free-form address with the Nominatim search API.

    Args:
        address: Free-form address text to look up.

    Returns:
        A ``(lat, lon, display_name)`` tuple taken from the first search
        result, or ``(None, None, None)`` when the address is missing or
        blank, the request fails or times out, the HTTP status is not 200,
        the body is not valid JSON, or the search returned no results.
    """
    # Nothing to look up: do not spend a rate-limited request on None / "".
    if address is None or not str(address).strip():
        return None, None, None

    url = "https://nominatim.openstreetmap.org/search"
    # Passing the query through `params` lets requests URL-encode it, so
    # addresses containing '&', '#', '+' or non-ASCII text are sent intact.
    params = {"format": "json", "q": address}
    # NOTE(review): this is a browser-style User-Agent; check it against the
    # Nominatim usage policy, which asks clients to identify the application.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Brave/91.0.4472.124'
    }
    try:
        # The timeout keeps one stalled connection from hanging a long run.
        response = requests.get(url, params=params, headers=headers, timeout=30)
        if response.status_code != 200:
            return None, None, None
        # Parse the body once instead of once per use.
        results = response.json()
    except (requests.RequestException, ValueError):
        # Network failures and malformed JSON follow the same "not found"
        # contract as a non-200 status rather than aborting the caller.
        return None, None, None

    if not isinstance(results, list) or not results:
        return None, None, None
    result = results[0]
    return result['lat'], result['lon'], result['display_name']

# Spark UDF wrapper around fetch_location; none of the cells shown here call it.
# NOTE(review): no returnType is passed, so PySpark's documented default (string) applies
# even though fetch_location returns a 3-tuple -- confirm before relying on this UDF.
fetch_location_udf = udf(fetch_location)

In [None]:
# We add 3 empty columns that will hold the data fetched from the API
df = df.withColumn("NLat", lit(None).cast("string")) \
       .withColumn("NLong", lit(None).cast("string")) \
       .withColumn("Location", lit(None).cast("string"))

# Now we keep only the rows where at least one of latitude, longitude or ZIP code is missing
df_filtered = df.filter(~(col("LATITUDE").isNotNull() & col("LONGITUDE").isNotNull() & col("ZIP CODE").isNotNull()))

# Geocode every distinct address exactly once. Rows without an address are
# skipped: they could never match the equality test on "Addresses" anyway.
addresses_to_fetch = [
    row["Addresses"]
    for row in df_filtered.select("Addresses").distinct().collect()
    if row["Addresses"] is not None
]

geocoded_rows = []
for address in addresses_to_fetch:
    lat, lon, location = fetch_location(address)
    # Store everything as strings (or None) so the rows match the string schema below.
    geocoded_rows.append(
        (address,) + tuple(None if value is None else str(value) for value in (lat, lon, location))
    )
    time.sleep(1)  # We need to sleep for 1 second as this is the rate limit from the server

# Attach the results with a single left join instead of stacking three
# when/otherwise expressions per fetched row, which makes the query plan
# grow with the number of rows. The explicit schema also covers the case
# where nothing had to be fetched.
geocoded = spark.createDataFrame(
    geocoded_rows,
    schema="Addresses string, NLat string, NLong string, Location string",
)

# Remember the column order (original columns, then NLat, NLong, Location) so
# the join does not move "Addresses" to the front; back-ticks keep column
# names containing spaces or dots resolvable.
output_columns = [col(f"`{name}`") for name in df.columns]
df = df.drop("NLat", "NLong", "Location") \
       .join(geocoded, on="Addresses", how="left") \
       .select(*output_columns)

In [None]:
# Persist the updated data (with the new fields) as CSV with a header row in the folder "Intermediate2",
# overwriting whatever a previous run left there
df.write.csv("Intermediate2", mode="overwrite", header=True)

---  
**End Of DataCleaning2_PySpark.ipynb file**