# Step5-download taxi zone data & geo plot

## 1. Import necessary modules & start a Spark session

In [None]:
# Import necessary modules
from pyspark.sql import SparkSession
from urllib.request import urlretrieve
import pandas as pd
import geopandas as gpd
import folium
import zipfile
import os

In [None]:
# Create a Spark session
spark = (
    SparkSession.builder.appName('ADS_project_1.py')
    # Render DataFrames eagerly when they are the last expression of a cell
    .config('spark.sql.repl.eagerEval.enabled', True)
    .config('spark.sql.parquet.cacheMetadata', 'true')
    # Use UTC as the session time zone
    .config('spark.sql.session.timeZone', 'Etc/UTC')
    .config('spark.driver.memory', '16g')
    # Fixed key: it was misspelled 'spark.executer.memory', which is not a
    # Spark property, so the executor memory setting never took effect
    .config('spark.executor.memory', '16g')
    .getOrCreate()
)

## 2. Taxi zones data download, import & overview

### 2.1 Create folders for taxi zones data

In [None]:
directory = '../data/taxi_zones'
# Create the folder (and any missing parents). exist_ok=True makes re-running
# the cell a no-op and avoids the race between checking and creating.
os.makedirs(directory, exist_ok=True)

### 2.2 Download taxi zones data to the directory `data/taxi_zones/`

In [None]:
# Fetch the zipped taxi zone shapefile
specific_data_url = 'https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip'
output_path = '../data/taxi_zones/taxi_zones.zip'
urlretrieve(url=specific_data_url, filename=output_path)

# Fetch the zone lookup table; this call stays last so the cell still echoes its result
specific_data_url = 'https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv'
output_path = '../data/taxi_zones/taxi+_zone_lookup.csv'
urlretrieve(url=specific_data_url, filename=output_path)

### 2.3 Extract `taxi_zones.zip` into the folder `data/taxi_zones/`

In [None]:
zip_file_path = '../data/taxi_zones/taxi_zones.zip'
output_dir = '../data/taxi_zones/'

# Unpack every member of the archive into output_dir; the context manager
# closes the archive afterwards
with zipfile.ZipFile(zip_file_path) as archive:
    archive.extractall(path=output_dir)

### 2.4 Import taxi zones data from directory `data/taxi_zones/`

In [None]:
# Load the extracted zone shapefile (with geopandas) and the zone lookup CSV (with pandas)
shape_path = '../data/taxi_zones/taxi_zones.shp'
lookup_path = '../data/taxi_zones/taxi+_zone_lookup.csv'

taxi_zones_shape_file = gpd.read_file(shape_path)
taxi_zones = pd.read_csv(lookup_path)

### 2.5 Show taxi zones data

In [None]:
# Preview the first rows of the shapefile data
taxi_zones_shape_file.head()

In [None]:
# Preview the first rows of the zone lookup table
taxi_zones.head()

## 3. Preparation before plotting choropleth map

### 3.1 Merge to get `taxi_zones`

Convert the values in feature 'geometry' of the shape file `taxi_zones_shape_file` to latitude and longitude

In [None]:
# Reproject the whole GeoDataFrame to WGS84 longitude/latitude.
# GeoDataFrame.to_crs transforms the active geometry column and updates the
# frame's CRS in one step, so the geometries and their CRS metadata cannot
# get out of sync (which overwriting only the column could allow).
taxi_zones_shape_file = taxi_zones_shape_file.to_crs(
    "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
)
taxi_zones_shape_file.head()

Merge `taxi_zones_shape_file` & `taxi_zones` to be `taxi_zones`, based on the shared feature 'LocationID'

In [None]:
# Inner-join the shapefile rows with the lookup table on their shared
# 'LocationID' key, then wrap the result as a GeoDataFrame
zones_joined = pd.merge(
    left=taxi_zones_shape_file,
    right=taxi_zones,
    on='LocationID',
    how='inner',
)
taxi_zones = gpd.GeoDataFrame(zones_joined)

taxi_zones.head()

### 3.2 Create aggregated data

#### 3.2.1 Import `merged_data` from `data/merged_data/`

In [None]:
# Load the merged trip data into a pandas DataFrame; the aggregations in this
# notebook read its 'up_location_id', 'off_location_id' and 'trip_duration' columns.
# NOTE(review): the trailing slash suggests a directory of parquet part files — confirm.
merged_data_path = '../data/merged_data/merged_data.parquet/'
merged_data = pd.read_parquet(merged_data_path)

#### 3.2.2 Create aggregated data `pickup_location_avg_duration` for 'up_location_id'

Group by 'up_location_id' and compute the total 'trip_duration' and the number of trips ('#trips') for each location

In [None]:
# Group the trips by pickup location
pickup_groups = (
    merged_data[['up_location_id', 'trip_duration']]
    .groupby('up_location_id')
)

# Per location: total trip duration, and number of trips (count of the key column)
pickup_location_avg_duration = pickup_groups.agg(
    {
        'trip_duration': 'sum',     # total trip duration within the location
        'up_location_id': 'count',  # number of trips within the location
    }
).rename(columns={'up_location_id': '#trips'})

pickup_location_avg_duration.head()

Calculate the average trip duration within each location, and let it be a new feature 'average_duration'

In [None]:
# Average duration = total duration / number of trips, per pickup location
pickup_total_duration = pickup_location_avg_duration['trip_duration']
pickup_trip_count = pickup_location_avg_duration['#trips']
pickup_location_avg_duration['average_duration'] = pickup_total_duration / pickup_trip_count

# Move 'up_location_id' from the index into a column and order the
# locations from longest to shortest average duration
pickup_location_avg_duration = (
    pickup_location_avg_duration
    .reset_index()
    .sort_values('average_duration', ascending=False)
)

In [None]:
# Preview the pickup locations with the highest average duration (sorted descending)
pickup_location_avg_duration.head()

#### 3.2.3 Create aggregated data `dropoff_location_avg_duration` for 'off_location_id' (same thing as 3.2.2)

In [None]:
# Group the trips by dropoff location
dropoff_groups = (
    merged_data[['off_location_id', 'trip_duration']]
    .groupby('off_location_id')
)

# Per location: total trip duration, and number of trips (count of the key column)
dropoff_location_avg_duration = dropoff_groups.agg(
    {
        'trip_duration': 'sum',      # total trip duration within the location
        'off_location_id': 'count',  # number of trips within the location
    }
).rename(columns={'off_location_id': '#trips'})

# Average duration = total duration / number of trips, per dropoff location
dropoff_total_duration = dropoff_location_avg_duration['trip_duration']
dropoff_trip_count = dropoff_location_avg_duration['#trips']
dropoff_location_avg_duration['average_duration'] = dropoff_total_duration / dropoff_trip_count

# Move 'off_location_id' from the index into a column and order the
# locations from longest to shortest average duration
dropoff_location_avg_duration = (
    dropoff_location_avg_duration
    .reset_index()
    .sort_values('average_duration', ascending=False)
)

In [None]:
# Preview the dropoff locations with the highest average duration (sorted descending)
dropoff_location_avg_duration.head()

### 3.3 Merge aggregated data & `taxi_zones`

#### 3.3.1 Merge `pickup_location_avg_duration` & `taxi_zones`

 Merge to be `pickup_location_avg_duration`, based on 'up_location_id'

In [None]:
# taxi_zones can hold more than one row for a LocationID (the GeoJSON step of
# this notebook de-duplicates on it as well). Keep one row per zone before
# joining; otherwise a pickup location would be repeated in the merged table
# and could appear more than once among the top-10 markers.
pickup_zone_lookup = (
    taxi_zones[['LocationID', 'zone', 'geometry']]
    .drop_duplicates('LocationID')
)

# Attach the zone name and geometry to each pickup location, then drop the
# redundant join key coming from the zone table
pickup_location_avg_duration = (
    pickup_location_avg_duration
    .merge(pickup_zone_lookup, left_on='up_location_id', right_on='LocationID')
    .drop('LocationID', axis=1)
)

In [None]:
# Preview the pickup aggregates with the zone name and geometry attached
pickup_location_avg_duration.head()

#### 3.3.2 Merge `dropoff_location_avg_duration` & `taxi_zones` (same thing as 3.3.1)

 Merge to be `dropoff_location_avg_duration`, based on 'off_location_id'

In [None]:
# taxi_zones can hold more than one row for a LocationID (the GeoJSON step of
# this notebook de-duplicates on it as well). Keep one row per zone before
# joining; otherwise a dropoff location would be repeated in the merged table
# and could appear more than once among the top-10 markers.
dropoff_zone_lookup = (
    taxi_zones[['LocationID', 'zone', 'geometry']]
    .drop_duplicates('LocationID')
)

# Attach the zone name and geometry to each dropoff location, then drop the
# redundant join key coming from the zone table
dropoff_location_avg_duration = (
    dropoff_location_avg_duration
    .merge(dropoff_zone_lookup, left_on='off_location_id', right_on='LocationID')
    .drop('LocationID', axis=1)
)

In [None]:
# Preview the dropoff aggregates with the zone name and geometry attached
dropoff_location_avg_duration.head()

## 4. Plot choropleth map based on aggregated data

### 4.1 Initialise the `geoJSON` file for choropleth map

In [None]:
# Serialise the zone geometries to a GeoJSON string, one feature per LocationID
zone_shapes = taxi_zones[['LocationID', 'geometry']]
geoJSON = zone_shapes.drop_duplicates(subset='LocationID').to_json()

### 4.2 Plot choropleth map, based on the aggregated data `pickup_location_avg_duration`

#### 4.2.1 Plot choropleth map for 'up_location_id'

In [None]:
# Base map centred on (40.66, -73.94)
m = folium.Map(
    location=[40.66, -73.94],
    tiles="cartodb positron",
    zoom_start=10,
)

# Shade each zone by the average trip duration of trips picked up there;
# zones without a matching value are filled black
c = folium.Choropleth(
    geo_data=geoJSON,
    name='choropleth',
    data=pickup_location_avg_duration.reset_index(),  # data source
    columns=['up_location_id', 'average_duration'],  # key column, value column
    key_on='properties.LocationID',  # GeoJSON property matched against the key column
    fill_color='YlOrRd',
    nan_fill_color='black',
    legend_name='Average Trip Duration (s)',
).add_to(m)

m

#### 4.2.2 Mark the 10 'up_location_id' with the highest average duration

In [None]:
def _pickup_lat_lon_centroid(geom):
    """Return the centroid of *geom* as a (y, x) tuple, the order folium markers expect."""
    centre = geom.centroid
    return (centre.y, centre.x)


# Centroid of every pickup zone, stored as a (y, x) tuple
pickup_location_avg_duration['centroid'] = (
    pickup_location_avg_duration['geometry'].apply(_pickup_lat_lon_centroid)
)

# The 10 pickup locations with the highest average trip duration
top10_pickup_location_avg_duration = pickup_location_avg_duration.nlargest(
    10, 'average_duration'
)
top10_pickup_location_avg_duration[['up_location_id', 'zone', 'centroid']].head()

In [None]:
# Add one marker per top-10 pickup location, placed at the zone centroid and
# labelled with the zone name. Iterating the two columns directly avoids
# building a Series per row and the unused row index.
for zone_name, coord in zip(
    top10_pickup_location_avg_duration['zone'],
    top10_pickup_location_avg_duration['centroid'],
):
    folium.Marker(location=coord, popup=zone_name).add_to(m)

# Save the map to an HTML file. Create the output folder first so the save
# does not fail with FileNotFoundError when '../plots' does not exist yet.
os.makedirs('../plots', exist_ok=True)
m.save('../plots/top10_avg_trip_duration_for_pickup_location.html')
m

### 4.3 Plot choropleth map, based on the aggregated data `dropoff_location_avg_duration`
(same thing as 4.2)

#### 4.3.1 Plot choropleth map for 'off_location_id'

In [None]:
# Fresh base map centred on (40.66, -73.94) for the dropoff view
m = folium.Map(
    location=[40.66, -73.94],
    tiles="cartodb positron",
    zoom_start=10,
)

# Shade each zone by the average trip duration of trips dropped off there;
# zones without a matching value are filled black
c = folium.Choropleth(
    geo_data=geoJSON,
    name='choropleth',
    data=dropoff_location_avg_duration.reset_index(),  # data source
    columns=['off_location_id', 'average_duration'],  # key column, value column
    key_on='properties.LocationID',  # GeoJSON property matched against the key column
    fill_color='YlOrRd',
    nan_fill_color='black',
    legend_name='Average Trip Duration (s)',
).add_to(m)

m

#### 4.3.2 Mark the 10 'off_location_id' with the highest average duration

In [None]:
def _dropoff_lat_lon_centroid(geom):
    """Return the centroid of *geom* as a (y, x) tuple, the order folium markers expect."""
    centre = geom.centroid
    return (centre.y, centre.x)


# Centroid of every dropoff zone, stored as a (y, x) tuple
dropoff_location_avg_duration['centroid'] = (
    dropoff_location_avg_duration['geometry'].apply(_dropoff_lat_lon_centroid)
)

# The 10 dropoff locations with the highest average trip duration
top10_dropoff_location_avg_duration = dropoff_location_avg_duration.nlargest(
    10, 'average_duration'
)
top10_dropoff_location_avg_duration[['off_location_id', 'zone', 'centroid']].head()

In [None]:
# Add one marker per top-10 dropoff location, placed at the zone centroid and
# labelled with the zone name. Iterating the two columns directly avoids
# building a Series per row and the unused row index.
for zone_name, coord in zip(
    top10_dropoff_location_avg_duration['zone'],
    top10_dropoff_location_avg_duration['centroid'],
):
    folium.Marker(location=coord, popup=zone_name).add_to(m)

# Save the map to an HTML file. Create the output folder first so the save
# does not fail with FileNotFoundError when '../plots' does not exist yet.
os.makedirs('../plots', exist_ok=True)
m.save('../plots/top10_avg_trip_duration_for_dropoff_location.html')
m

## 5. Stop spark session

In [None]:
# Shut down the Spark session created at the top of the notebook
spark.stop()