In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Filter out deprecated warnings
warnings.filterwarnings("ignore")

# 1. Loading Data

In [None]:
# Green-cab trip records for February, March and April 2016, one frame per month
green2, green3, green4 = (
    pd.read_csv(f'../raw_data/green_tripdata_2016-{month:02d}.csv')
    for month in (2, 3, 4)
)

In [None]:
# Remaining 2016 months (January and May-December), one frame per monthly file.
# NOTE(review): none of these frames is referenced again in the visible notebook -
# confirm they are needed before paying the load time and memory.
(green1, green5, green6, green7, green8,
 green9, green10, green11, green12) = (
    pd.read_csv(f'../raw_data/green_tripdata_2016-{month:02d}.csv')
    for month in (1, 5, 6, 7, 8, 9, 10, 11, 12)
)

In [None]:
# Combine the three monthly frames into one dataframe.
# pd.concat replaces DataFrame.append, which was deprecated and then removed in
# pandas 2.0. ignore_index=True gives the combined frame a unique 0..n-1 index
# instead of repeating each month's row labels.
green_cabs = pd.concat([green2, green3, green4], ignore_index=True)

In [None]:
# Collision records; note the file name on disk literally ends in '?accessType=DOWNLOAD'
traffic_collisions = pd.read_csv('../raw_data/rows.csv?accessType=DOWNLOAD')

# 2. Cleaning Data

In [None]:
# Normalise every column name to lower case so later cells can rely on one spelling
green_cabs = green_cabs.rename(columns=str.lower)

1. In green_cabs, Ehail_fee consists of only 'nan', therefore should be removed. 
2. There is 'nan' in trip type, and trip type does not contribute much to our study.

In [None]:
# Drop the two columns discussed above; 'trip_type ' is spelled with a trailing space here
green_cabs = green_cabs.drop(['ehail_fee', 'trip_type '], axis=1)

In [None]:
# Fixed column order for the trip frame. faulty_meters() below reads the dropoff and
# pickup timestamps by position (4 and 5), so this order matters.
green_index = [
    'dropoff_latitude', 'dropoff_longitude',
    'pickup_latitude', 'pickup_longitude',
    'lpep_dropoff_datetime', 'lpep_pickup_datetime',
    'extra', 'fare_amount', 'mta_tax', 'passenger_count', 'payment_type',
    'ratecodeid', 'store_and_fwd_flag', 'tip_amount', 'tolls_amount',
    'total_amount', 'trip_distance', 'vendorid', 'improvement_surcharge',
]

green_cabs = green_cabs.loc[:, green_index]

In [None]:
def set_boundaries(df):
    """ Keep only the rows whose pickup and dropoff coordinates all fall inside
        the New York City bounding box (bounds inclusive).

        Rows with a missing coordinate are removed as well, because a missing
        value never satisfies the range test. Returns the filtered dataframe. """

    lat_bounds = (40.47739894, 40.91617849)
    lon_bounds = (-74.25909008, -73.70018092)

    inside_box = (
        df['pickup_latitude'].between(*lat_bounds)
        & df['pickup_longitude'].between(*lon_bounds)
        & df['dropoff_latitude'].between(*lat_bounds)
        & df['dropoff_longitude'].between(*lon_bounds)
    )

    return df[inside_box]

# Discard trips that start or end outside the NYC bounding box
green_cabs = set_boundaries(green_cabs)

In [None]:
# Columns whose values must never be negative
positive_features = ['passenger_count', 'trip_distance', 'fare_amount', 'tip_amount', 'tolls_amount', 'total_amount']

# Keep a row only if it is >= 0 in every one of those columns
# (a missing value fails the test, so such rows are dropped too)
non_negative_rows = (green_cabs[positive_features] >= 0).all(axis=1)
green_cabs = green_cabs[non_negative_rows]

1. Fare amount negative due to refund by the company to the passenger
2. 0 trip distance due to booking without attendance

1. There are rows where trip_distance is 0 but fare_amount is not zero. 
2. There are also rows where the dropoff time is the same as the pickup time, but the fare_amount is not zero <br>
These might be caused by a faulty taxi meter.

In [None]:
def faulty_meters(df):
    """ Flag trips that look like the product of a faulty taxi meter.

        The pickup and dropoff timestamp columns are located by position
        (columns 5 and 4 respectively) and converted to datetime.

        Returns a tuple (flagged, converted):
          flagged   - rows with a non-zero fare but zero trip_distance, followed by
                      rows with a non-zero fare whose pickup and dropoff times are
                      equal. A row matching both conditions appears twice.
          converted - a copy of df with the two timestamp columns as datetimes.
                      The frame passed in is left untouched. """
    PICKUP_DATETIME = 5
    DROPOFF_DATETIME = 4

    # Work on a copy and assign by column name: positional .iloc assignment on a
    # filtered slice may leave the column as text and triggers SettingWithCopyWarning.
    df = df.copy()
    pickup_col = df.columns[PICKUP_DATETIME]
    dropoff_col = df.columns[DROPOFF_DATETIME]
    df[pickup_col] = pd.to_datetime(df[pickup_col])
    df[dropoff_col] = pd.to_datetime(df[dropoff_col])

    fare_paid = df['fare_amount'] != 0

    # the meter doesn't move but fare is paid
    no_distance = df[fare_paid & (df['trip_distance'] == 0.0)]

    # time doesn't change but fare is paid, probably faulty meter
    no_duration = df[(df[pickup_col] == df[dropoff_col]) & fare_paid]

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    new_df = pd.concat([no_distance, no_duration])
    return new_df, df

# Flagged rows go to green_faulty_meters; green_cabs comes back with datetime timestamp columns
green_faulty_meters, green_cabs = faulty_meters(green_cabs)

In [None]:
# Remove the flagged rows from green_cabs: the flagged frame is concatenated twice, so
# every flagged row is duplicated and drop_duplicates(keep=False) discards all copies.
# NOTE(review): keep=False also discards any rows of green_cabs that are exact
# duplicates of each other, flagged or not - confirm that is intended.
green_cabs = pd.concat([green_cabs, green_faulty_meters, green_faulty_meters]).drop_duplicates(keep=False)

In [None]:
# Renumber rows 0..n-1 after the removals above
green_cabs = green_cabs.reset_index(drop=True)

In [None]:
# Fare against trip distance, to eyeball outliers before any are removed
green_cabs.plot.scatter(x='fare_amount', y='trip_distance')

plt.show()

In [None]:
# Row labels of trips whose trip_distance exceeds 800, then remove those rows
too_far = green_cabs['trip_distance'] > 800
trip_outlier = green_cabs.index[too_far].tolist()
green_cabs = green_cabs.drop(index=trip_outlier)

In [None]:
# Same scatter after dropping trips with trip_distance above 800
green_cabs.plot.scatter(x='fare_amount', y='trip_distance')

plt.show()

In [None]:
# Inspect the trips that still have trip_distance above 200
green_cabs.loc[green_cabs['trip_distance'] > 200]

In [None]:
# Remove the implausibly long trip(s) inspected above by condition rather than by the
# hard-coded row label 2368028, which only exists for one specific data load and
# raises KeyError otherwise. ~(... > 200) keeps rows with a missing distance;
# those are handled by the dropna() that follows.
green_cabs = green_cabs[~(green_cabs['trip_distance'] > 200)]

In [None]:
# Final check of fare against trip distance once the remaining outlier is gone
green_cabs.plot.scatter(x='fare_amount', y='trip_distance')

plt.show()

In [None]:
# Drop every row that has a missing value in any column
green_cabs = green_cabs.dropna()

In [None]:
# Renumber rows 0..n-1; find_zone() below looks rows up by the labels 0..len-1
green_cabs = green_cabs.reset_index(drop=True)

# 3. PreProcessing

In [None]:
# Derive day of week (Monday=0 ... Sunday=6) and hour of day from the pickup time.
# The vectorised .dt accessor replaces a per-row Python lambda. pd.to_datetime leaves
# an already-converted column unchanged and guards against it still being text.
pickup_dt = pd.to_datetime(green_cabs['lpep_pickup_datetime'])
green_cabs['weekday'] = pickup_dt.dt.weekday
green_cabs['hour'] = pickup_dt.dt.hour

In [None]:
import geopandas as gpd

# sf stands for shape file
sf = gpd.read_file("../raw_data/taxi_zones/taxi_zones.shp")
zone = pd.read_csv("../raw_data/taxi_zones/taxi+_zone_lookup.csv")

# Convert the geometry shapes to latitude and longitude (WGS84)
# Please attribute this if you are using it
sf['geometry'] = sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")

In [None]:
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

zones = list(sf.borough.unique())
zone_dict = {}

# One geocoder client for the whole loop (it used to be rebuilt per zone), throttled
# to one request per second with the RateLimiter that was imported but never used.
locator = Nominatim(user_agent = "myGeocoder")
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)

for area in zones:
    location = geocode(area)
    if location is None:
        # Fail with the offending name instead of an AttributeError on None
        raise ValueError("Nominatim returned no result for zone: " + repr(area))
    # Stored as returned; find_zone() reads it as [min_lat, max_lat, min_lon, max_lon]
    zone_dict[area] = location.raw['boundingbox']
    

In [None]:
# Coordinates between JFK Airport pickup and dropoff zone
JFK_LAT = 40.644456
JFK_LON = -73.782875

# Half-size of the JFK box, in degrees. Each unit of `distance` adds 0.0005 degrees of
# latitude to the half-height, as in the original formula.
# The previous version passed the coordinates (in degrees) straight to np.cos, which
# expects radians. The box size and even which edge was "min" or "max" then depended
# on the accidental sign of that cosine.
# A degree of longitude is shorter on the ground than a degree of latitude by
# cos(latitude), so the longitude half-width is widened by 1 / cos(latitude).
distance = 10
lat_angle = 0.001 * (distance / 2.0)
lon_angle = lat_angle / np.cos(np.radians(JFK_LAT))
lat_min = str(JFK_LAT - lat_angle)
lat_max = str(JFK_LAT + lat_angle)
lon_min = str(JFK_LON - lon_angle)
lon_max = str(JFK_LON + lon_angle)
# Same [min_lat, max_lat, min_lon, max_lon] string layout that find_zone() parses
zone_dict['JFK'] = [lat_min, lat_max, lon_min, lon_max]

In [None]:
# Rebuild zone_dict in a fixed priority order: find_zone() returns the first zone
# whose box contains a point, so earlier names win where boxes overlap
wanted_order = ['JFK', 'EWR', 'Manhattan', 'Staten Island', 'Queens', 'Brooklyn', 'Bronx']

zone_dict = dict((name, zone_dict[name]) for name in wanted_order)

In [None]:
def find_zone(df, ride_type, zones=None):
    """ Find the zone of every row of df from its latitude and longitude.

        df        - dataframe holding the coordinate columns.
        ride_type - "" to read 'LATITUDE'/'LONGITUDE', otherwise a prefix such as
                    'pickup' to read '<prefix>_latitude'/'<prefix>_longitude'.
        zones     - optional mapping of zone name to
                    [min_lat, max_lat, min_lon, max_lon] (numbers or numeric strings);
                    defaults to the notebook-level zone_dict.

        Returns a list with one entry per row, in row order: the name of the first
        zone (in the mapping's order) whose box contains the point, bounds inclusive,
        or nan when no zone matches or a coordinate is missing. """

    if zones is None:
        zones = zone_dict

    if ride_type == "":
        lat_col, lon_col = 'LATITUDE', 'LONGITUDE'
    else:
        lat_col, lon_col = ride_type + '_latitude', ride_type + '_longitude'

    # Positional arrays: no dependence on the frame having a 0..n-1 index, and no
    # per-row label lookups.
    lat = df[lat_col].to_numpy(dtype=float)
    lon = df[lon_col].to_numpy(dtype=float)

    # insert nan when the coordinate is not in any zone
    zone_of_row = np.full(len(df), np.nan, dtype=object)
    unassigned = np.ones(len(df), dtype=bool)

    for zone, bounds in zones.items():
        # Bounds are parsed once per zone instead of once per row
        min_lat, max_lat, min_lon, max_lon = (float(b) for b in bounds[:4])

        in_box = (lat >= min_lat) & (lat <= max_lat) & (lon >= min_lon) & (lon <= max_lon)
        newly_matched = unassigned & in_box

        zone_of_row[newly_matched] = zone
        # A row keeps the first zone that matched it
        unassigned &= ~newly_matched

    return zone_of_row.tolist()

# Label each trip's pickup and dropoff point with a zone name (nan when outside every zone box)
green_cabs['pickup_zone'] = find_zone(green_cabs, 'pickup')
green_cabs['dropoff_zone'] = find_zone(green_cabs, 'dropoff')

In [None]:
# Preview the first rows only rather than rendering the whole trip frame
green_cabs.head()

# External Datasets

### Preprocessing

In [None]:
# Keep collisions that have coordinates and happened in February-April 2016.
# 'CRASH DATE' is matched as text: the year at the end, the two-digit month at the start.
traffic_collisions = traffic_collisions.dropna(subset=['LATITUDE', 'LONGITUDE'])
in_2016 = traffic_collisions['CRASH DATE'].str.endswith('2016')
collisions_2016 = traffic_collisions[in_2016]
in_feb_to_apr = collisions_2016['CRASH DATE'].str[:2].isin(['02', '03', '04'])
collisions_2016 = collisions_2016[in_feb_to_apr]

In [None]:
# Columns kept from the collision data: when, where, and the casualty counts
wanted = [
    'CRASH DATE', 'CRASH TIME', 'BOROUGH', 'LONGITUDE', 'LATITUDE',
    'NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED',
    'NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED',
    'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED',
    'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED',
]

# Renumber 0..n-1 so find_zone() can look rows up by those labels
collisions = collisions_2016.loc[:, wanted].reset_index(drop=True)

In [None]:
# Overwrite 'BOROUGH' for every collision with the zone found from its coordinates,
# then drop rows that still have a missing value in any column
collision_zone = find_zone(collisions, "")
collisions = collisions.assign(BOROUGH=collision_zone).dropna()

In [None]:
# Restrict collisions to the same NYC bounding box used for the trips (bounds inclusive)
MIN_LAT = 40.47739894
MAX_LAT = 40.91617849
MIN_LONG = -74.25909008
MAX_LONG = -73.70018092

lat_in_box = collisions['LATITUDE'].between(MIN_LAT, MAX_LAT)
long_in_box = collisions['LONGITUDE'].between(MIN_LONG, MAX_LONG)
collisions = collisions[lat_in_box & long_in_box]

In [None]:
# Build one crash timestamp from the date and time text columns.
# Note: ':00' is appended to 'CRASH TIME' in place, so re-running this cell appends it again.
collisions['CRASH TIME'] += ':00'
crash_stamp_text = collisions['CRASH DATE'].str.cat(collisions['CRASH TIME'], sep=' ')
collisions['DATETIME'] = crash_stamp_text
collisions['DATETIME'] = pd.to_datetime(collisions['DATETIME'])

In [None]:
# Each collision is assumed to affect traffic for one hour after it happens
AFFECTED_PERIOD = pd.Timedelta(1, unit='h')
collisions['ENDTIME'] = collisions['DATETIME'] + AFFECTED_PERIOD

## The numbers of people killed and injured do not intersect with green_cabs

In [None]:
# Keep only the columns used to match collisions to trips: the affected time window
# and the zone. (No NaN filling happens here.)
final_wanted = ['DATETIME', 'ENDTIME', 'BOROUGH']
collisions_df = collisions[final_wanted]

In [None]:
# Preview the first rows only rather than rendering the whole frame
collisions_df.head()

In [None]:
# Preview the first rows only rather than rendering the whole trip frame
green_cabs.head()

In [None]:
def affected(ori_df, ext_df):
    """ Flag the trips of ori_df whose pickup or dropoff coincides with a collision
        in ext_df.

        Returns two lists with one entry per trip, in the order
        (pickup flags, dropoff flags), as produced by drop_pick_affected(). """

    # Pull the four trip columns out as plain lists up front
    needed_columns = (
        'lpep_pickup_datetime', 'pickup_zone',
        'lpep_dropoff_datetime', 'dropoff_zone',
    )
    pickup_time, pickup_zone, dropoff_time, dropoff_zone = (
        list(ori_df[column]) for column in needed_columns
    )

    # The same matcher serves both ends of the trip
    flags_for_pickups = drop_pick_affected(ext_df, pickup_zone, pickup_time)
    flags_for_dropoffs = drop_pick_affected(ext_df, dropoff_zone, dropoff_time)

    return flags_for_pickups, flags_for_dropoffs

def drop_pick_affected(ext_df, zone, time):
    """ Flag the pickups (or dropoffs) that coincide with a collision.

        ext_df - collisions, with columns 'DATETIME' (start), 'ENDTIME' and 'BOROUGH'.
        zone   - list with the zone of each pickup/dropoff.
        time   - list with the time of each pickup/dropoff, same order as zone.

        Returns a list with one entry per element of time: 1 when some collision in
        the same zone satisfies DATETIME < time <= ENDTIME, otherwise 0.

        The earlier version broke out of the collision loop unconditionally after
        the first collision, so only that one collision was ever compared. Every
        collision is considered here, through a per-zone sorted index so the cost
        stays near (trips + collisions) * log(collisions) rather than trips * collisions. """
    from bisect import bisect_left

    # Group the collision windows by zone. Windows with a missing zone or timestamp
    # can never satisfy the comparison, so they are left out.
    windows_by_zone = {}
    for start, end, crash_zone in zip(ext_df['DATETIME'], ext_df['ENDTIME'], ext_df['BOROUGH']):
        if pd.isna(start) or pd.isna(end) or pd.isna(crash_zone):
            continue
        windows_by_zone.setdefault(crash_zone, []).append((start, end))

    # Per zone: start times in ascending order, plus the latest end time seen among
    # the windows up to and including each position.
    index_by_zone = {}
    for crash_zone, windows in windows_by_zone.items():
        windows.sort(key=lambda window: window[0])
        starts = [start for start, _ in windows]
        latest_end = []
        running_end = None
        for _, end in windows:
            if running_end is None or end > running_end:
                running_end = end
            latest_end.append(running_end)
        index_by_zone[crash_zone] = (starts, latest_end)

    affected = []

    for when, where in zip(time, zone):

        # 1 is given if a row is affected
        # 0 when a row is unaffected
        affected_bool = 0

        if not pd.isna(where) and not pd.isna(when) and where in index_by_zone:
            starts, latest_end = index_by_zone[where]
            # Number of collisions in this zone that started strictly before `when`
            started_before = bisect_left(starts, when)
            # Affected if any of those is still within its affected period
            if started_before and latest_end[started_before - 1] >= when:
                affected_bool = 1

        affected.append(affected_bool)

    return affected

In [None]:
# One 0/1 flag per trip for the pickup end and for the dropoff end
pickup_affected, dropoff_affected = affected(green_cabs, collisions_df)

In [None]:
# Attach the collision flags to the trips (the lists are in trip row order)
green_cabs['pickup_affected_by_collisions'] = pickup_affected
green_cabs['dropoff_affected_by_collisions'] = dropoff_affected

In [None]:
# Renumber rows 0..n-1 before the visualisation section
green_cabs = green_cabs.reset_index(drop=True)

# 4. Geospatial Visualisation

In [None]:
import folium
from folium.plugins import HeatMap
from bokeh.tile_providers import get_provider

# Column pairs holding the (latitude, longitude) of each end of a trip
PICKUP_COORD = ['pickup_latitude', 'pickup_longitude']
DROPOFF_COORD = ['dropoff_latitude', 'dropoff_longitude']

# mid_coord = the middle coordinates for the map (median pickup latitude and longitude)
mid_coord = green_cabs[PICKUP_COORD].quantile(0.5).values

# axis ranges: the extent of the pickup coordinates
pickup_lon = green_cabs['pickup_longitude']
pickup_lat = green_cabs['pickup_latitude']
x_Range = [pickup_lon.min(), pickup_lon.max()]
y_Range = [pickup_lat.min(), pickup_lat.max()]

# Background tiles shared by every bokeh map below
TILE = get_provider("STAMEN_TERRAIN_RETINA")

In [None]:
# Empty base map centred on mid_coord, written to HTML and shown as the cell output
nyc_m = folium.Map(location=mid_coord, tiles="Stamen Terrain", zoom_start=11)

nyc_m.save('../mast30034_2021_s2_project_1-YourTeacher23/plots/folium_nyc.html')

nyc_m

In [None]:
import folium
from folium.plugins import FastMarkerCluster
from bokeh.plotting import figure, show, output_file, save
from bokeh.tile_providers import  Vendors
from bokeh.models import ColorBar, LinearColorMapper
from bokeh.palettes import all_palettes

# to display bokeh plots inside jupyter, we need to use output_notebook
from bokeh.io import reset_output, output_notebook

reset_output()
output_notebook()
# note below that it says "BokehJS 1.4.0 successfully loaded."

In [None]:
""" This code is taken from the Python Stream Workshop Repository, 
    https://github.com/akiratwang/MAST30034_Python/blob/main/tutorials/Lab1_Python.ipynb """

def latitude_to_mercator(coords):
    """ Convert an iterable of latitudes (in degrees) into a list of
        mercator y coordinates, one per input value and in the same order """
    scale = 6378137
    return [np.log(np.tan((90 + lat) * np.pi/360.0)) * scale for lat in coords]

def longitude_to_mercator(coords):
    """
    Convert an iterable of longitudes (in degrees) into a list of
    mercator x coordinates, one per input value and in the same order
    """
    scale = 6378137
    return [lon * (scale * np.pi/180.0) for lon in coords]

In [None]:
def convert_mercator(df):
    """ Add the mercator columns pickupX, pickupY, dropoffX and dropoffY to df,
        computed from its pickup/dropoff longitude and latitude columns.
        df is modified in place and also returned. """

    # (new column, source column, converter), in the order the columns are added
    conversions = (
        ('pickupX', 'pickup_longitude', longitude_to_mercator),
        ('pickupY', 'pickup_latitude', latitude_to_mercator),
        ('dropoffX', 'dropoff_longitude', longitude_to_mercator),
        ('dropoffY', 'dropoff_latitude', latitude_to_mercator),
    )

    for target, source, to_mercator in conversions:
        # The converters take a sequence, so each value is wrapped and unwrapped
        df[target] = df[source].apply(
            lambda value, to_mercator=to_mercator: to_mercator([value])[0]
        )

    return df

# Add the mercator X/Y columns that the bokeh maps below plot
green_cabs = convert_mercator(green_cabs)

In [None]:
from bokeh.io import curdoc
from bokeh.models import Model

def clear_doc(p):
    """ Clears doc memory for plots.

        Clears the current bokeh document, then detaches every Model found in
        plot p from the document it was attached to. Returns None. """
    curdoc().clear()
    # Walk every bokeh Model reachable through p.select
    for model in p.select({'type': Model}):
        prev_doc = model.document
        # Drop the model's link to its document (sets a private bokeh attribute)
        model._document = None
        if prev_doc:
            prev_doc.remove_root(model)
    return

In [None]:
def scatter_map(df, state): 
    """ Draw the `state` points of df ("pickup" or "dropoff") as small circles on a
        tiled map of NYC and show the figure. Reads the columns state+'X' and
        state+'Y'. Returns None. """

    # (outline colour, fill colour) per state; any other state gets "random" for both
    colours_by_state = {
        "pickup": ("white", "blue"),
        "dropoff": ("pink", "red"),
    }
    outline_colour, fill_colour = colours_by_state.get(state, ("random", "random"))

    m = figure(x_range=longitude_to_mercator(x_Range), y_range=latitude_to_mercator(y_Range),
           x_axis_type="mercator", y_axis_type="mercator")
    m.add_tile(TILE)
    m.title.text = state + " in NYC"

    # one small circle per row, positioned by the mercator columns for this state
    x_col = state + 'X'
    y_col = state + 'Y'
    m.circle(x=x_col, y=y_col,
             size=5, color=outline_colour, fill_color=fill_colour, fill_alpha=0.5,
             source=df[[x_col, y_col]])

    clear_doc(m)
    show(m)

    return

In [None]:
# Mercator coordinates of each collision, for plotting on the bokeh map
collisions['X'] = collisions['LONGITUDE'].map(lambda lon: longitude_to_mercator([lon])[0])
collisions['Y'] = collisions['LATITUDE'].map(lambda lat: latitude_to_mercator([lat])[0])

In [None]:
# Bokeh figure covering the pickup extent, with the shared background tiles
mercator_x_range = longitude_to_mercator(x_Range)
mercator_y_range = latitude_to_mercator(y_Range)
m = figure(x_range=mercator_x_range, y_range=mercator_y_range,
       x_axis_type="mercator", y_axis_type="mercator")
m.add_tile(TILE)
m.title.text = "Collisions in NYC"

# for every source value, draw a small circle denoting a collision
# NOTE(review): show(m) is never called in this cell, so the figure itself is not
# displayed - confirm whether that is intended.
m.circle(x='X', y='Y',
         size=5, color='pink', fill_color='red', fill_alpha=0.5,
         source=collisions[['X','Y']])

In [None]:
def cluster_map(df, state, note):
    """ Save a folium marker-cluster map of the `state` ("pickup"/"dropoff")
        coordinates of df as '<note><state>Cluster.html' in the plots folder.
        Returns None. """

    # interactive base map centred on the notebook-level mid_coord
    cluster = folium.Map(location=mid_coord, tiles="Stamen Terrain", zoom_start=10)

    # folium's built-in clustering groups nearby points into hotspot markers
    coord_columns = [state+"_latitude", state+"_longitude"]
    markers = FastMarkerCluster(data=df[coord_columns].values)
    cluster.add_child(markers)

    # write the map to disk; it is not displayed inline
    out_path = '../mast30034_2021_s2_project_1-YourTeacher23/plots/' + note + state + 'Cluster.html'
    cluster.save(out_path)

    return

In [None]:
# Cluster map of all dropoffs (empty note, so the file is 'dropoffCluster.html')
cluster_map(green_cabs, 'dropoff', "")

In [None]:
# Cluster map of all pickups (empty note, so the file is 'pickupCluster.html')
cluster_map(green_cabs, 'pickup', "")

In [None]:
# Marker-cluster map of the collisions, built the same way as cluster_map() but from
# the collisions' 'LATITUDE'/'LONGITUDE' columns, and saved to the plots folder
cluster = folium.Map(location=mid_coord, tiles="Stamen Terrain", zoom_start=10)

collision_points = collisions[["LATITUDE", "LONGITUDE"]].values
cluster.add_child(FastMarkerCluster(data=collision_points))
cluster.save('../mast30034_2021_s2_project_1-YourTeacher23/plots/CollisionsCluster.html')

In [None]:
def heat_map(df, state, note):
    """ Save a folium heat map of the `state` ("pickup"/"dropoff") coordinates of
        df as '<note><state>Heatmap.html' in the plots folder. Returns None. """

    heatmap = folium.Map(location=mid_coord, tiles="Stamen Terrain", zoom_start=10)

    # one heat contribution per row, from its latitude/longitude pair
    points = df[[state+"_latitude", state+"_longitude"]].values
    heatmap.add_child(HeatMap(points, radius=10))

    # write the map to disk; it is not displayed inline
    out_path = '../mast30034_2021_s2_project_1-YourTeacher23/plots/' + note + state + 'Heatmap.html'
    heatmap.save(out_path)
    return

In [None]:
def hex_map(df, state, note):
    """ Draw a hexbin density map of the `state` ("pickup"/"dropoff") points of df
        on a tiled map of NYC, show it, and save it as '<note><state>HexMap.html'
        in the plots folder. Reads the mercator columns state+'X' and state+'Y'.
        Returns None. """

    # create bokeh figure, where x_range and y_range are in mercator
    hexmap = figure(x_range=longitude_to_mercator(x_Range), y_range=latitude_to_mercator(y_Range),
               x_axis_type="mercator", y_axis_type="mercator")

    # add map tile
    hexmap.add_tile(TILE)
    # change title
    hexmap.title.text = note + " " + state + " in NYC"

    palette = all_palettes['Magma'][256][::-1]
    r, bins = hexmap.hexbin(x=df[state+'X'], y=df[state+'Y'], size=250, palette=palette)

    # Scale the colour bar to this plot's own bin counts. It used to be fixed at
    # 1..1449 whatever data was passed in, so the legend did not match the tiles.
    # hexbin colours its tiles linearly from 0 up to the largest bin count.
    color_mapper = LinearColorMapper(palette=palette, low=0, high=int(bins.counts.max()))
    color_bar = ColorBar(color_mapper=color_mapper, label_standoff=12)

    hexmap.add_layout(color_bar, 'right')

    clear_doc(hexmap)
    show(hexmap)
    save(hexmap, '../mast30034_2021_s2_project_1-YourTeacher23/plots/' + note + state + 'HexMap.html')

    return

In [None]:
# One heat map per trip end. The second call used to repeat "pickup", which rewrote
# the same file and never produced the dropoff map.
heat_map(green_cabs, "pickup", "")
heat_map(green_cabs, "dropoff", "")

In [None]:
# Hexbin density maps of all pickups and all dropoffs
hex_map(green_cabs, "pickup", "")
hex_map(green_cabs, "dropoff", "")

In [None]:
# Trips whose pickup hour is 6 (6am) and 23 (11pm), compared in the maps below
pickup_hour = green_cabs['hour']
six_am = green_cabs.loc[pickup_hour == 6]
eleven_pm = green_cabs.loc[pickup_hour == 23]

In [None]:
# Cluster maps for the 6am trips; files are prefixed with '6am'
cluster_map(six_am, "pickup", "6am")
cluster_map(six_am, "dropoff", "6am")

In [None]:
# Cluster maps for the 11pm trips; files are prefixed with '11pm'
cluster_map(eleven_pm, "pickup", "11pm")
cluster_map(eleven_pm, "dropoff", "11pm")

In [None]:
# Pickup density at 6am versus 11pm
hex_map(six_am, "pickup", "6am")
hex_map(eleven_pm, "pickup", "11pm")

In [None]:
# Dropoff density at 6am versus 11pm
hex_map(six_am, "dropoff", "6am")
hex_map(eleven_pm, "dropoff", "11pm")

### The plots show that there are more rides at 11pm than at 6am, even though the fare per mile is higher at 6am than at 11pm; this suggests that the fare per mile does not affect the number of rides at any given time.

In [None]:
# Plot only the trips whose dropoff was flagged as affected by a collision
dropoff_flagged = green_cabs['dropoff_affected_by_collisions'] == 1
affected_rides = green_cabs.loc[dropoff_flagged]
scatter_map(affected_rides, "dropoff")

# Analysis

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Numeric trip attributes included in the correlation matrix
CORR_ = [
    "passenger_count", "trip_distance", "fare_amount", "extra",
    "mta_tax", "tip_amount", "tolls_amount", "improvement_surcharge",
    "total_amount", "weekday", "hour",
]

In [None]:
# Pairwise correlation of the selected attributes, drawn as a heat map
corr_matrix = green_cabs[CORR_].corr()
sns.heatmap(corr_matrix)

plt.show()

In [None]:
# Overall ratio: total fare collected divided by total distance travelled
avg_fare_per_mile = green_cabs.fare_amount.sum() / green_cabs.trip_distance.sum()
print(f"Average $USD/Mile : {avg_fare_per_mile:0.2f}")

In [None]:
# Per-trip fare per mile, followed by its summary statistics
green_cabs['fare/mile'] = green_cabs['fare_amount'].div(green_cabs['trip_distance'])
green_cabs['fare/mile'].describe()

In [None]:
# Fare per mile aggregated by pickup hour (pivot_table's default aggregation), as a line plot
fare_by_hour = green_cabs.pivot_table(values='fare/mile', index='hour')
fare_by_hour.plot(figsize=(14, 6))
plt.ylabel('Fare $USD / mile');

### This plot shows that fare_per_mile is highest around 0500 and 1600, and lowest around midnight

In [None]:
# Fare per mile aggregated by weekday (pivot_table's default aggregation), as a line plot
fare_by_weekday = green_cabs.pivot_table(values='fare/mile', index='weekday')
fare_by_weekday.plot(figsize=(14, 6))
plt.ylabel('Fare $USD / mile');

### This shows that weekdays have higher fare_per_mile, peaking at 7.9 on Tuesdays, while weekends have lower fare per mile, with the lowest on Saturdays at a little over 7.0

In [None]:
# Fare per mile by pickup zone, without error bars
pickup_vs_fare = sns.barplot(data=green_cabs, x="pickup_zone", y="fare/mile", ci=None)

In [None]:
# Fare per mile by dropoff zone, without error bars
dropoff_vs_fare = sns.barplot(data=green_cabs, x="dropoff_zone", y="fare/mile", ci=None)

In [None]:
# Fare per mile for pickups flagged (1) versus not flagged (0) as affected by a collision
pickup_collisions_vs_fare = sns.barplot(
    data=green_cabs, x="pickup_affected_by_collisions", y="fare/mile", ci=None
)

In [None]:
# Fare per mile by rate code (the codes are listed in the text below)
rate_code_vs_fare = sns.barplot(data=green_cabs, x="ratecodeid", y="fare/mile", ci=None)

1= Standard rate <br>
2=JFK (Airport Fare) <br>
3=Newark (Airport Fare) <br>
4=Nassau or Westchester (Further from downtown) <br>
5=Negotiated fare <br>
6=Group ride <br>

In [None]:
# Fare per mile by pickup zone (same chart as pickup_vs_fare above)
zone_vs_fare = sns.barplot(data=green_cabs, x="pickup_zone", y="fare/mile", ci=None)

In [None]:
# Non-null counts of every column per weekday; the weekday is copied from the index
# into a column so it can be used as a plot axis
ride_per_day = green_cabs.groupby("weekday").count()
ride_per_day = ride_per_day.assign(weekday=ride_per_day.index)

In [None]:
# Rides per weekday; the 'pickupX' count stands in for the number of rides
rides_vs_day = sns.barplot(data=ride_per_day, x="weekday", y="pickupX", ci=None)

In [None]:
# Non-null counts of every column per pickup hour; the hour is copied from the index
# into a column so it can be used as a plot axis
ride_per_hour = green_cabs.groupby("hour").count()
ride_per_hour = ride_per_hour.assign(hour=ride_per_hour.index)

In [None]:
# Rides per pickup hour, also written to rides_vs_hour.png
rides_vs_hour = sns.barplot(data=ride_per_hour, x="hour", y="pickupX", ci=None)
rides_vs_hour.get_figure().savefig('rides_vs_hour.png', dpi=200)

In [None]:
# Non-null counts of every column per pickup zone; the zone name is copied from the
# index into a 'zone' column so it can be used as a plot axis
ride_zone = green_cabs.groupby("pickup_zone").count()
ride_zone = ride_zone.assign(zone=ride_zone.index)

In [None]:
# Rides per pickup zone, also written to rides_vs_zone.png
rides_vs_zone = sns.barplot(data=ride_zone, x="zone", y="pickupX", ci=None)
rides_vs_zone.get_figure().savefig('rides_vs_zone.png', dpi=200)

##### COLLISIONS

In [None]:
# Per-weekday totals of the numeric columns; the collision flags sum to the number of
# affected trips. numeric_only=True keeps the datetime and text columns out of the
# sum: pandas 2+ raises on summing datetimes and concatenates text columns.
crash_day = green_cabs.groupby(by=["weekday"]).sum(numeric_only=True)
crash_day['day'] = crash_day.index
# Notice that all collisions happen on Friday

In [None]:
# Number of collision-affected pickups per weekday
crash_vs_day = sns.barplot(data=crash_day, x="day", y="pickup_affected_by_collisions", ci=None)

In [None]:
# Per-hour totals of the numeric columns; the collision flags sum to the number of
# affected trips. numeric_only=True keeps the datetime and text columns out of the
# sum: pandas 2+ raises on summing datetimes and concatenates text columns.
crash_hour = green_cabs.groupby(by=["hour"]).sum(numeric_only=True)
crash_hour['hour'] = crash_hour.index
# Notice that all collisions happen around 2pm and 3pm

In [None]:
# Number of collision-affected pickups per pickup hour
crash_vs_hour = sns.barplot(data=crash_hour, x="hour", y="pickup_affected_by_collisions", ci=None)

In [None]:
# Per-pickup-zone totals of the numeric columns; the collision flags sum to the number
# of affected trips. numeric_only=True keeps the datetime and text columns out of the
# sum: pandas 2+ raises on summing datetimes and concatenates text columns.
crash_zone = green_cabs.groupby(by=["pickup_zone"]).sum(numeric_only=True)
crash_zone['zone'] = crash_zone.index
# Note that most collisions happen in Brooklyn

In [None]:
# Number of collision-affected pickups per pickup zone
crash_vs_zone = sns.barplot(data=crash_zone, x="zone", y="pickup_affected_by_collisions", ci=None)