In [None]:
import os
import glob
import json
import pandas as pd
from datetime import datetime

import pandas as pd
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, hour
from pyspark.sql.types import IntegerType, LongType

# Analysis

In this part, we retrieve, process and display the data in order to draw some conclusions. Most of these analyses concern the monthly evolution of certain quantities (total number of trips, ...). To avoid rewriting the plotting code for each analysis, we write a single plotting function.

For each dataset, we compute the data and store them in a JSON file; the function then reads the data from the different JSON files to plot the graph.

In [None]:
def plot_monthly(data_dir, data_label, title):
    """Plot one monthly series per JSON file and save the figure as a PNG.

    Every file matching ``plot_data/<data_dir>/*.json`` must contain a dict
    with a ``'months'`` list (strings made of a four-digit year, one separator
    character and the month number) and a ``'values'`` list. Each file is
    drawn in its own subplot, stacked vertically, and the figure is written to
    ``figures/<data_dir>.png``.

    Parameters
    ----------
    data_dir : str
        Sub-directory of ``plot_data`` to read; also the name of the PNG.
    data_label : str
        Label of the y axis of every subplot.
    title : str
        Title of the whole figure.

    Raises
    ------
    ValueError
        If no JSON file is found for ``data_dir``.
    """
    filenames = sorted(glob.glob("plot_data/{}/*.json".format(data_dir)))

    n = len(filenames)
    if n == 0:
        # plt.subplots(0, 1) would fail with a far less explicit message
        raise ValueError("no JSON file found in plot_data/{}".format(data_dir))

    # squeeze=False always yields a 2-D array of axes, so a single file
    # needs no special case
    fig, axes = plt.subplots(n, 1, figsize=(20, n*5), squeeze=False)
    fig.autofmt_xdate()

    for a, filename in zip(axes[:, 0], filenames):
        # the file is only needed while parsing; close it before plotting
        with open(filename) as f:
            data = json.load(f)

        months = [datetime(year=int(month[0:4]),
                           month=int(month[5:]), day=1)
                  for month in data['months']]
        series = pd.Series(data['values'], index=months)

        series.plot(style='-', ax=a)
        a.set_ylabel(data_label)
        # subplot title = file name without its '.json' extension
        a.set_title('{} dataset'.format(os.path.basename(filename)[:-5]))
        # autofmt_xdate hides the tick labels of all but the last subplot
        a.tick_params(labelbottom=True)

    fig.suptitle(title, fontsize=26)

    os.makedirs('figures', exist_ok=True)
    fig.savefig('figures/{}.png'.format(data_dir))
    plt.close(fig)


In [None]:
# Locations of the taxi trip data.
# NOTE(review): the user id differs between the two paths ('hpda000034' vs
# 'hpda00034') - confirm which spelling is correct.
# HDFS directory of the cleaned datasets (not referenced by the cells visible
# in this part of the notebook).
hdfs_path = 'hdfs://public00:8020/user/hpda000034/infoh600/clean'
# Local directory that the analysis cells glob to obtain the monthly file names.
local_path = '/home/hpda00034/infoh600/sampled'

In [None]:
# Tell Spark where to find the Hadoop/YARN client configuration.
os.environ['HADOOP_CONF_DIR']="/etc/hadoop/conf"

# python configuration: same interpreter for the driver and the executors
os.environ['PYSPARK_PYTHON']="/usr/local/anaconda3/bin/python"
os.environ['PYSPARK_DRIVER_PYTHON']="/usr/local/anaconda3/bin/python"

from pyspark.sql import SparkSession
from pyspark import SparkFiles, SQLContext


# remove old spark session (left over when this cell is re-run in the same kernel)
try:
    spark
except NameError:
    # fresh kernel: nothing to stop
    pass
else:
    print("Spark application already started. Terminating existing application and starting new one")
    try:
        spark.stop()
    except Exception as exc:
        # best effort: a failed stop must not prevent creating the new session,
        # but it should not be hidden either
        print("Could not stop the existing Spark application: {}".format(exc))

# Create a new spark session, with YARN as resource manager, requesting 4 worker nodes.
spark = SparkSession \
    .builder \
    .master("yarn") \
    .config("spark.executor.instances","4") \
    .appName("project_ceci18") \
    .getOrCreate()

# Create the SQL context used by the analysis cells to read the CSV files
sc=spark.sparkContext
sqlContext = SQLContext(sc)

### 4.1. Monthly total number of trips (per dataset type)

In [None]:
dataset = "yellow"

# The monthly file names are listed from the local sample directory; the files
# themselves are read by Spark from ./clean/<dataset>/.
filenames = sorted(glob.glob("{}/{}_*.csv".format(local_path, dataset)))
filenames = [os.path.basename(filename) for filename in filenames]

n_trips = []
months = []
for filename in filenames:
    print(filename)
    # Only the number of rows is needed, so schema inference (which costs an
    # extra pass over the file) is skipped.
    trips = sqlContext.read.csv("./clean/{}/{}".format(dataset, filename),
                                header=True)
    # count the rows
    n_trips.append(trips.count())
    # the month is the 7 characters that precede the '.csv' extension
    months.append(filename[-11:-4])

data = {
    'months': months,
    'values': n_trips
}

# make sure the output directory exists before writing
os.makedirs('plot_data/4_1', exist_ok=True)
with open('plot_data/4_1/{}.json'.format(dataset), 'w') as f:
    json.dump(data, f)

In [None]:
# One subplot per dataset found in plot_data/4_1; saved as figures/4_1.png.
plot_monthly(data_dir='4_1',
             data_label='number of trips',
             title='Monthly total number of trips')

<img src="figures/4_1.png" width="1000" align="left"/>

### 4.2. Monthly total number of trips in Manhattan and Brooklyn (per dataset type)

In [None]:
# recuperate the zone ids for Manhattan and Brooklyn
zones = pd.read_csv('shape_files/zones.csv')

man = [loc_id-1 for loc_id in 
       list(zones.loc[zones['Borough']=='Manhattan', 'LocationID'])]
bro = [loc_id-1 for loc_id in 
       list(zones.loc[zones['Borough']=='Brooklyn', 'LocationID'])]

In [None]:
dataset = "yellow"
filenames = sorted(glob.glob("{}/{}_*.csv".format(local_path, dataset)))
filenames = [os.path.basename(filename) for filename in filenames]

n_trips = []
months = []
for filename in filenames:
    print(filename)
    # load the dataframe from csv; it is cached because two counts are run on it
    trips = sqlContext.read.csv("./clean/{}/{}".format(dataset, filename),
                                header=True,
                                inferSchema=True).persist()

    pickup_zone = trips.pulocationid.cast(IntegerType())
    dropoff_zone = trips.dolocationid.cast(IntegerType())

    # count trips that both start and end in Manhattan
    count_man = trips.filter(pickup_zone.isin(man)) \
                     .filter(dropoff_zone.isin(man)) \
                     .count()

    # count trips that both start and end in Brooklyn
    count_bro = trips.filter(pickup_zone.isin(bro)) \
                     .filter(dropoff_zone.isin(bro)) \
                     .count()

    # release the cache: otherwise every month stays cached until the session ends
    trips.unpersist()

    n_trips.append(count_man + count_bro)
    # the month is the 7 characters that precede the '.csv' extension
    months.append(filename[-11:-4])

data = {
    'months': months,
    'values': n_trips
}

# make sure the output directory exists before writing
os.makedirs('plot_data/4_2', exist_ok=True)
with open('plot_data/4_2/{}.json'.format(dataset), 'w') as f:
    json.dump(data, f)

In [None]:
# One subplot per dataset found in plot_data/4_2; saved as figures/4_2.png.
plot_monthly(data_dir='4_2',
             data_label='number of trips',
             title='Monthly total number of trips (Man et Bro)')

<img src="figures/4_2.png" width="1000" align="left"/>

NB: For the FHV dataset, the graph stays at 0 for a long period because the drop-off location was not recorded during that time.

### 4.3. Monthly total receipts (per dataset type)

In [None]:
# Columns added together to obtain the receipt of one trip, per dataset.
receipt_column = dict(
    green=['fare_amount', 'extra', 'mta_tax', 'tolls_amount', 'ehail_fee'],
    yellow=['fare_amount', 'extra', 'mta_tax', 'tolls_amount'],
)

In [None]:
dataset = "yellow"
filenames = sorted(glob.glob("{}/{}_*.csv".format(local_path, dataset)))
filenames = [os.path.basename(filename) for filename in filenames]

total_receipts = []
months = []
for filename in filenames:
    print(filename)
    # load the dataframe from csv (not cached: a single aggregation is run on it)
    trips = sqlContext.read.csv("./clean/{}/{}".format(dataset, filename),
                                header=True,
                                inferSchema=True)

    # compute total receipt in a new column
    # NOTE(review): a null in any component makes the receipt of that row
    # null, and such rows are then left out of the sum - confirm the cleaned
    # data has no nulls in these columns.
    trips = trips.withColumn('receipt',sum(trips[x] for x in receipt_column[dataset]))
    # sum on the new receipt column
    total_receipt = trips.agg(F.sum('receipt')).collect()[0][0]

    total_receipts.append(total_receipt)
    # the month is the 7 characters that precede the '.csv' extension
    months.append(filename[-11:-4])

data = {
    'months': months,
    'values': total_receipts
}

# make sure the output directory exists before writing
os.makedirs('plot_data/4_3', exist_ok=True)
with open('plot_data/4_3/{}.json'.format(dataset), 'w') as f:
    json.dump(data, f)

In [None]:
# One subplot per dataset found in plot_data/4_3; saved as figures/4_3.png.
plot_monthly(data_dir='4_3',
             data_label='total receipt ($)',
             title='Monthly total receipts')

<img src="figures/4_3.png" width="1000" align="left"/>

It is the yellow taxi company that makes the most revenue, mainly because it makes many more trips, as can be seen in section 4.1.

### 4.4. Monthly average receipt (per dataset type)

In [None]:
# Columns added together to obtain the receipt of one trip, per dataset.
# (The comma separating the two entries was missing, which made this cell a
# SyntaxError.)
receipt_column = {'green': ['fare_amount', 'extra', 'mta_tax',
                            'tolls_amount', 'ehail_fee'],
                  'yellow': ['fare_amount', 'extra', 'mta_tax',
                             'tolls_amount']}

In [None]:
dataset = "yellow"
filenames = sorted(glob.glob("{}/{}_*.csv".format(local_path, dataset)))
filenames = [os.path.basename(filename) for filename in filenames]

average_receipts = []
months = []
for filename in filenames:
    print(filename)
    # load the dataframe from csv (not cached: it is scanned only once below)
    trips = sqlContext.read.csv("./clean/{}/{}".format(dataset, filename),
                                header=True,
                                inferSchema=True)

    # receipt of one trip = sum of its fare components
    receipt = sum(trips[x] for x in receipt_column[dataset])

    # total receipt and number of trips, computed in a single pass
    totals = trips.agg(F.sum(receipt).alias('total_receipt'),
                       F.count(F.lit(1)).alias('n_trips')).collect()[0]
    total_receipt = totals['total_receipt']
    n_trips = totals['n_trips']

    # an empty month has no average; store null instead of failing on the division
    if n_trips and total_receipt is not None:
        average_receipt = total_receipt/n_trips
    else:
        average_receipt = None
    average_receipts.append(average_receipt)
    # the month is the 7 characters that precede the '.csv' extension
    months.append(filename[-11:-4])

data = {
    'months': months,
    'values': average_receipts
}

# make sure the output directory exists before writing
os.makedirs('plot_data/4_4', exist_ok=True)
with open('plot_data/4_4/{}.json'.format(dataset), 'w') as f:
    json.dump(data, f)

In [None]:
# One subplot per dataset found in plot_data/4_4; saved as figures/4_4.png.
plot_monthly(data_dir='4_4',
             data_label='average receipt ($)',
             title='Monthly average receipts')

<img src="figures/4_4.png" width="1000" align="left"/>

### 4.5. Monthly average cost per in-progress-minute (per dataset type)

In [None]:
# Columns added together to obtain the receipt of one trip, per dataset:
# four components shared by both datasets, plus 'ehail_fee' for green taxis.
receipt_column = {
    name: ['fare_amount', 'extra', 'mta_tax', 'tolls_amount'] + extra_columns
    for name, extra_columns in (('green', ['ehail_fee']), ('yellow', []))
}

In [None]:
dataset = "yellow"
filenames = sorted(glob.glob("{}/{}_*.csv".format(local_path, dataset)))
filenames = [os.path.basename(filename) for filename in filenames]

# name of the pickup / dropoff timestamp column of each dataset
pickup_datetime = {'yellow': 'tpep_pickup_datetime',
                   'green': 'lpep_pickup_datetime'}

dropoff_datetime = {'yellow': 'tpep_dropoff_datetime',
                    'green': 'lpep_dropoff_datetime'}


def time_delta(end,start):
    """Return a Column holding the time elapsed from ``start`` to ``end``, in minutes.

    ``end`` and ``start`` are Spark Columns, not Python values: both are cast
    to long (presumably whole seconds since the epoch for timestamp columns)
    and their difference is divided by 60. Because it builds a Column
    expression, this function must be called directly and must not be wrapped
    in a ``udf``.
    """
    end = end.cast(LongType())
    start = start.cast(LongType())
    delta = (end-start)/60
    return delta


average_receipts = []
months = []
for filename in filenames:
    print(filename)
    # load the dataframe from csv (not cached: it is scanned only once below)
    trips = sqlContext.read.csv("./clean/{}/{}".format(dataset, filename),
                                header=True,
                                inferSchema=True)\
                      .fillna({pickup_datetime[dataset]:0,
                               dropoff_datetime[dataset]:0})

    # filter empty datetime
    trips = trips.filter(trips[pickup_datetime[dataset]].cast(LongType()) != 0) \
                 .filter(trips[dropoff_datetime[dataset]].cast(LongType()) != 0)

    # duration (minutes) and receipt of one trip
    duration = time_delta(trips[dropoff_datetime[dataset]],
                          trips[pickup_datetime[dataset]])
    receipt = sum(trips[x] for x in receipt_column[dataset])

    # total time and total receipt of the month, computed in a single pass
    totals = trips.agg(F.sum(duration).alias('total_time'),
                       F.sum(receipt).alias('total_receipt')).collect()[0]
    total_time = totals['total_time']
    total_receipt = totals['total_receipt']

    # no usable trip in the month: store null instead of failing on the division
    if total_time and total_receipt is not None:
        average_receipt = total_receipt/total_time
    else:
        average_receipt = None
    average_receipts.append(average_receipt)
    # the month is the 7 characters that precede the '.csv' extension
    months.append(filename[-11:-4])

data = {
    'months': months,
    'values': average_receipts
}

# make sure the output directory exists before writing
os.makedirs('plot_data/4_5', exist_ok=True)
with open('plot_data/4_5/{}.json'.format(dataset), 'w') as f:
    json.dump(data, f)

In [None]:
# One subplot per dataset found in plot_data/4_5; saved as figures/4_5.png.
plot_monthly(data_dir='4_5',
             data_label='cost per minutes ($)',
             title='Average cost per in progress minute')

<img src="figures/4_5.png" width="1000" align="left"/>

Prior to 2016, the price per minute was equivalent for yellow and green taxis.

After 2016, green taxis became cheaper per minute.

### 4.6. Monthly average tip (per dataset type)

In [None]:
dataset = "yellow"
filenames = sorted(glob.glob("{}/{}_*.csv".format(local_path, dataset)))
filenames = [os.path.basename(filename) for filename in filenames]

average_tips = []
months = []
for filename in filenames:
    print(filename)
    # load the dataframe from csv (not cached: it is scanned only once below)
    trips = sqlContext.read.csv("./clean/{}/{}".format(dataset, filename),
                                header=True,
                                inferSchema=True)

    # total of the tips and number of trips, computed in a single pass
    totals = trips.agg(F.sum('tip_amount').alias('total_tip'),
                       F.count(F.lit(1)).alias('n_trips')).collect()[0]
    total_tip = totals['total_tip']
    n_trips = totals['n_trips']

    # an empty month has no average; store null instead of failing on the division
    if n_trips and total_tip is not None:
        average_tip = total_tip/n_trips
    else:
        average_tip = None
    average_tips.append(average_tip)
    # the month is the 7 characters that precede the '.csv' extension
    months.append(filename[-11:-4])

data = {
    'months': months,
    'values': average_tips
}

# make sure the output directory exists before writing
os.makedirs('plot_data/4_6', exist_ok=True)
with open('plot_data/4_6/{}.json'.format(dataset), 'w') as f:
    json.dump(data, f)

In [None]:
# One subplot per dataset found in plot_data/4_6; saved as figures/4_6.png.
plot_monthly(data_dir='4_6',
             data_label='average tip ($)',
             title='Monthly average tip')

<img src="figures/4_6.png" width="1000" align="left"/>

On average, yellow taxi drivers get the biggest tips.

### 4.7. Median monthly average trip speed (per borough, per dataset)

In [None]:
# get the index for each borough
zones = pd.read_csv('shape_files/zones.csv')

manhattan = [loc_id-1 for loc_id in 
             list(zones.loc[zones['Borough']=='Manhattan', 'LocationID'])]
brooklyn = [loc_id-1 for loc_id in 
            list(zones.loc[zones['Borough']=='Brooklyn', 'LocationID'])]
queens = [loc_id-1 for loc_id in 
          list(zones.loc[zones['Borough']=='Queens', 'LocationID'])]
bronx = [loc_id-1 for loc_id in 
         list(zones.loc[zones['Borough']=='Bronx', 'LocationID'])]
staten = [loc_id-1 for loc_id in 
          list(zones.loc[zones['Borough']=='Staten Island', 'LocationID'])]

boroughs = {
    'Manhattan': manhattan,
    'Brooklyn': brooklyn,
    'Queens': queens,
    'Bronx': bronx,
    'Staten Island': staten
}


In [None]:
dataset = "yellow"
filenames = sorted(glob.glob("{}/{}_*.csv".format(local_path, dataset)))
filenames = [os.path.basename(filename) for filename in filenames]

# name of the pickup / dropoff timestamp column of each dataset
pickup_datetime = {'yellow': 'tpep_pickup_datetime',
                    'green': 'lpep_pickup_datetime'}

dropoff_datetime = {'yellow': 'tpep_dropoff_datetime',
                    'green': 'lpep_dropoff_datetime'}


def time_delta(end,start):
    """Return a Column holding the time elapsed from ``start`` to ``end``, in minutes.

    ``end`` and ``start`` are Spark Columns, not Python values: both are cast
    to long (presumably whole seconds since the epoch for timestamp columns)
    and their difference is divided by 60. Because it builds a Column
    expression, this function must be called directly and must not be wrapped
    in a ``udf``.
    """
    end = end.cast(LongType())
    start = start.cast(LongType())
    delta = (end-start)/60
    return delta


months = []
median_speeds = []
for filename in filenames:
    print(filename)
    # The month is loaded, cleaned and cached ONCE, then reused for every
    # borough (it used to be re-read and re-cached for each of them).
    trips = sqlContext.read.csv("./clean/{}/{}".format(dataset, filename),
                                header=True,
                                inferSchema=True)\
                      .fillna({pickup_datetime[dataset]:0,
                               dropoff_datetime[dataset]:0})

    # filter empty datetime
    trips = trips.filter(trips[pickup_datetime[dataset]].cast(LongType()) != 0) \
                 .filter(trips[dropoff_datetime[dataset]].cast(LongType()) != 0)

    # compute the duration of each trip (minutes)
    trips = trips.withColumn('time',time_delta(trips[dropoff_datetime[dataset]],
                                               trips[pickup_datetime[dataset]]))

    # compute speed (trip_distance per minute)
    # NOTE(review): trips with a zero duration presumably get a null speed
    # (Spark's default division-by-zero behavior) - confirm they are meant
    # to be left out of the median.
    trips = trips.withColumn('speed',trips['trip_distance']/trips['time']) \
                 .persist()

    boroughs_median_speed = {}
    for borough, indexes in boroughs.items():
        print("   -{}".format(borough))
        # keep the trips that both start and end in the borough
        borough_trips = trips.filter(trips.pulocationid.cast(IntegerType()).isin(indexes)) \
                             .filter(trips.dolocationid.cast(IntegerType()).isin(indexes))

        # compute median speed (relative error 0 = exact quantile)
        median_speed = borough_trips.approxQuantile('speed', [0.5], 0)
        if len(median_speed) != 0:
            boroughs_median_speed[borough] = median_speed[0]
        else:
            # no trip in this borough for this month
            boroughs_median_speed[borough] = None

    # release the cache before moving on to the next month
    trips.unpersist()

    median_speeds.append(boroughs_median_speed)
    # the month is the 7 characters that precede the '.csv' extension
    months.append(filename[-11:-4])

data = {
    'months': months,
    'values': median_speeds
}

# make sure the output directory exists before writing
os.makedirs('plot_data/4_7', exist_ok=True)
with open('plot_data/4_7/{}.json'.format(dataset), 'w') as f:
    json.dump(data, f)

In [127]:
# plot the result: one subplot per dataset, one line per borough

# boroughs in the order used for the lines / legend
borough_names = ['Manhattan', 'Brooklyn', 'Queens', 'Bronx', 'Staten Island']

filenames = sorted(glob.glob("plot_data/4_7/*.json"))

n = len(filenames)
if n == 0:
    # plt.subplots(0, 1) would fail with a far less explicit message
    raise ValueError("no JSON file found in plot_data/4_7")

# squeeze=False always yields a 2-D array of axes, so the cell also works
# when a single dataset has been computed
fig, ax = plt.subplots(n, 1, figsize=(20, n*5), squeeze=False)
fig.autofmt_xdate()

for a, filename in zip(ax[:, 0], filenames):
    # the file is only needed while parsing; close it before plotting
    with open(filename) as f:
        data = json.load(f)

    months = [datetime(year=int(month[0:4]),
                       month=int(month[5:]), day=1) for month in data['months']]

    # one row per month, one column per borough; columns are selected by
    # name rather than renamed by position
    series = pd.DataFrame(data['values'], index=months) \
               .reindex(columns=borough_names)

    series.plot(style='-', ax=a)
    a.set_ylabel('speed (miles/minutes)')
    # subplot title = file name without its '.json' extension
    a.set_title('{} dataset'.format(os.path.basename(filename)[:-5]))
    # autofmt_xdate hides the tick labels of all but the last subplot
    a.tick_params(labelbottom=True)

fig.suptitle("Median monthly average trip speed (per borough)", fontsize=26)

fig.savefig('./figures/4_7.png')
plt.close(fig)

<img src="figures/4_7.png" width="1000" align="left"/>

### 4.8.  How long does it take to get to a NYC airport ? 

In [None]:
# Manhattan midtown zone ids
man_mid = [161, 162, 163, 164]

# look up the id of each airport in the zone table
zones = pd.read_csv('shape_files/zones.csv')

# first LocationID whose 'Zone' matches the airport name
# (IndexError if the zone is missing from the table)
airports = {
    zone_name: list(zones.loc[zones['Zone'] == zone_name, 'LocationID'])[0]
    for zone_name in ('Newark Airport', 'JFK Airport', 'LaGuardia Airport')
}

# the individual ids remain available under their own names
new = airports['Newark Airport']
jfk = airports['JFK Airport']
lg = airports['LaGuardia Airport']

# the 24 hours of the day: 0, 1, ..., 23
hours = [*range(24)]

In [None]:
dataset = "yellow"
filenames = sorted(glob.glob("{}/{}_*.csv".format(local_path, dataset)))
filenames = [os.path.basename(filename) for filename in filenames]

# name of the pickup / dropoff timestamp column of each dataset
pickup_datetime = {'yellow': 'tpep_pickup_datetime',
                    'green': 'lpep_pickup_datetime'}

dropoff_datetime = {'yellow': 'tpep_dropoff_datetime',
                    'green': 'lpep_dropoff_datetime'}

months = []
monthly_trips = []
for filename in filenames:
    print(filename)
    # load the dataframe from csv
    trips = sqlContext.read.csv("./clean/{}/{}".format(dataset, filename),
                                header=True,
                                inferSchema=True)\
                                .fillna({pickup_datetime[dataset]:0,
                                         dropoff_datetime[dataset]:0})

    # keep the trips that depart from Manhattan midtown
    trips = trips.filter(trips.pulocationid.cast(IntegerType()).isin(man_mid))

    # filter empty datetime
    trips = trips.filter(trips[pickup_datetime[dataset]].cast(LongType()) != 0) \
                 .filter(trips[dropoff_datetime[dataset]].cast(LongType()) != 0) \
                 .persist()

    monthly_trips.append(trips)
    # record which months contributed (7 characters before the '.csv' extension)
    months.append(filename[-11:-4])

if not monthly_trips:
    # fail clearly instead of reusing a stale man_mid_trips from a previous run
    raise ValueError("no {} file found in {}".format(dataset, local_path))

# union on all the files of the dataset
# NOTE(review): union matches columns by position - this assumes every
# monthly file has the same columns in the same order; confirm.
man_mid_trips = monthly_trips[0]
for trips in monthly_trips[1:]:
    man_mid_trips = man_mid_trips.union(trips)


median_times = {}
for airport, index in airports.items():
    print(airport)
    # keep the trips that arrive at the current airport
    airport_trips = man_mid_trips.filter(man_mid_trips.dolocationid.cast(IntegerType()) == index)\
                                 .persist()

    median_times_by_hour = {}
    for h in hours:
        print("  -{}".format(h))
        # keep the trips that depart during the current hour
        h_trips = airport_trips.filter(hour(airport_trips[pickup_datetime[dataset]]) == h)

        # compute the duration of the trips (minutes); time_delta is defined
        # in the cells of sections 4.5 and 4.7, which must have been run
        timed_trips = h_trips.withColumn('time',time_delta(h_trips[dropoff_datetime[dataset]],
                                                           h_trips[pickup_datetime[dataset]]))

        # compute the median time on the remaining rows (relative error 0 = exact)
        median_time = timed_trips.approxQuantile('time', [0.5], 0)

        if len(median_time) != 0:
            median_times_by_hour[str(h)] = median_time[0]
        else:
            # no trip to this airport during this hour
            median_times_by_hour[str(h)] = None

    # release the per-airport cache
    airport_trips.unpersist()

    median_times[airport] = median_times_by_hour

# release the per-month caches
for trips in monthly_trips:
    trips.unpersist()

data = {
    'months': months,
    'values': median_times
}

# make sure the output directory exists before writing
os.makedirs('plot_data/4_8', exist_ok=True)
with open('plot_data/4_8/{}.json'.format(dataset), 'w') as f:
    json.dump(data, f)


In [125]:
from pandas.plotting import table

# load data from json: {airport: {hour (as a string): median minutes or null}}
with open('./plot_data/4_8/yellow.json') as f:
    data = json.load(f)
    data = data['values']

# build dataframe: one row per hour, one column per airport.
# Rows are aligned on the hour keys instead of relying on their order in the file.
df = pd.DataFrame(data)
df.index = df.index.astype(int)
# missing medians (null) become NaN, so rounding cannot fail on them
df = df.sort_index().astype(float).round(2)

# plot table
fig, ax = plt.subplots(1,1, figsize=(6,5))
ax.axis('off')
table(ax, df, loc='upper right')

fig.savefig('./figures/4_8.png')
plt.close(fig)

<img src="figures/4_8.png" width="600" align="left"/>

This table shows the median travel times (in minutes) from Manhattan midtown to the different airports for each hour of the day (for example, 1 corresponds to the period 1:00:00 to 1:59:59).

In [None]:
# Stop spark (best effort: a failure here must not break the notebook)
try:
    spark.stop()
except NameError:
    # no session was created in this kernel
    pass
except Exception as exc:
    # report the problem instead of silently discarding it
    print("Could not stop the Spark application: {}".format(exc))