In [3]:
import os
import glob
import json
import pandas as pd
from datetime import datetime

import pandas as pd
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, LongType

# Analysis

In this part, we retrieve, process and display the data in order to draw some conclusions. Most of these analyses concern the monthly evolution of certain quantities (total number of trips, ...). To avoid rewriting the plotting code for each analysis, we first define a helper function.

In [None]:
def plot_monthly(data_dir, data_label, title):
    """Plot the monthly series stored in ``plot_data/<data_dir>/`` and save the figure.

    Every ``*.json`` file found in the directory is loaded and drawn in its own
    subplot (stacked vertically, in sorted filename order). Each file must hold
    two parallel lists under the keys ``'months'`` and ``'values'``; a month
    string is parsed as a 4-character year, one separator character, then the
    month number. The figure is written to ``figures/<data_dir>.png``.

    Parameters
    ----------
    data_dir : str
        Name of the sub-directory of ``plot_data`` to read, also used as the
        base name of the saved PNG.
    data_label : str
        Label of the y axis of every subplot.
    title : str
        Overall title of the figure.

    Raises
    ------
    ValueError
        If no JSON file is found for ``data_dir``.
    """
    filenames = sorted(glob.glob("plot_data/{}/*.json".format(data_dir)))
    if not filenames:
        # Fail with an explicit message instead of letting matplotlib choke on
        # a grid with zero rows.
        raise ValueError(
            "no JSON file found in plot_data/{}/".format(data_dir))

    n = len(filenames)
    # squeeze=False always returns a 2-D array of axes, so a single file needs
    # no special case.
    fig, axes = plt.subplots(n, 1, figsize=(20, n * 8), squeeze=False)
    try:
        fig.suptitle(title)
        fig.autofmt_xdate()

        for a, filename in zip(axes[:, 0], filenames):
            # Only the read needs the file handle; close it before plotting.
            with open(filename) as f:
                data = json.load(f)

            months = [datetime(year=int(month[0:4]),
                               month=int(month[5:]), day=1)
                      for month in data['months']]
            series = pd.Series(data['values'], index=months)

            series.plot(style='-', ax=a)
            a.set_ylabel(data_label)
            # The file name without its ".json" extension names the dataset.
            a.set_title('{} dataset'.format(os.path.basename(filename)[:-5]))
            # autofmt_xdate hides the x tick labels of all but the last
            # subplot; show them again on every subplot.
            a.tick_params(labelbottom=True)

        os.makedirs('figures', exist_ok=True)
        fig.savefig('figures/{}.png'.format(data_dir))
    finally:
        # Always release the figure, even when loading or plotting fails.
        plt.close(fig)


In [4]:
# Data locations: an HDFS directory and a local directory of sampled CSV files.
# NOTE(review): the user id differs between the two paths ('hpda000034' vs
# 'hpda00034') — confirm which one is correct.
hdfs_path = 'hdfs://public00:8020/user/hpda000034/infoh600/clean'
local_path = '/home/hpda00034/infoh600/sampled'

In [5]:
# Point Spark at the Hadoop configuration directory.
os.environ['HADOOP_CONF_DIR']="/etc/hadoop/conf"

# python configuration
os.environ['PYSPARK_PYTHON']="/usr/local/anaconda3/bin/python"
os.environ['PYSPARK_DRIVER_PYTHON']="/usr/local/anaconda3/bin/python"

from pyspark.sql import SparkSession
from pyspark import SparkFiles, SQLContext


# Remove an old spark session left over from a previous run of this cell.
# Only the "no session yet" case (NameError) is silently ignored; a failing
# stop() is reported instead of being swallowed by a bare except.
try:
    spark
except NameError:
    pass
else:
    print("Spark application already started. Terminating existing application and starting new one")
    try:
        spark.stop()
    except Exception as exc:
        print("Could not stop the existing Spark session cleanly: {}".format(exc))

# Create a new spark session, with YARN as resource manager, requesting 4 executors.
spark = SparkSession \
    .builder \
    .master("yarn") \
    .config("spark.executor.instances","4") \
    .appName("project_ceci18") \
    .getOrCreate()

# Create spark context
sc=spark.sparkContext
sqlContext = SQLContext(sc)

### 4.1. Monthly total number of trips (per dataset type)

In [None]:
# Dataset type processed by this cell; it selects both the input folder and
# the name of the JSON file written at the end.
dataset = "fhvhv"

# The monthly files are listed from the local sampled directory, but only
# their base names are kept: the rows themselves are read below from
# ./integrated/<dataset>/ through Spark.
filenames = sorted(glob.glob("{}/{}_*.csv".format(local_path, dataset)))
filenames = [os.path.basename(filename) for filename in filenames]

n_trips = []
months = []
for filename in filenames:
    print(filename)
    df = sqlContext.read.csv("./integrated/{}/{}".format(dataset, filename), header=True)
    # One row per trip, so the row count is the number of trips of the month.
    count = df.count()
    n_trips.append(count)
    # The 7 characters preceding the ".csv" extension are the month label
    # (plot_monthly parses them as a 4-digit year, a separator and a month).
    months.append(filename[-11:-4])

# Parallel lists, in the layout read back by plot_monthly.
data = {
    'months': months,
    'values': n_trips
}
print(data)

with open('plot_data/4_1/{}.json'.format(dataset), 'w') as f:
    json.dump(data, f)

In [None]:
# Draw the JSON series found in plot_data/4_1/ and save them to figures/4_1.png.
plot_monthly('4_1', 'number of trips', 'Monthly total number of trips')

<img src="figures/4_1.png" width="1000" align="left"/>

### 4.2. Monthly total number of trips in Manhattan and Brooklyn (per dataset type)

In [None]:
# Zone lookup table; its 'Borough' and 'LocationID' columns are used below.
zones = pd.read_csv('shape_files/zones.csv')
# NOTE(review): this is not the last expression of the cell, so the preview is
# presumably never displayed.
zones.head()

# Location ids of the Manhattan and Brooklyn zones, each shifted down by one.
# NOTE(review): presumably the integrated data uses 0-based location ids —
# confirm against the integration step.
man = [loc_id-1 for loc_id in 
       list(zones.loc[zones['Borough']=='Manhattan', 'LocationID'])]
bro = [loc_id-1 for loc_id in 
       list(zones.loc[zones['Borough']=='Brooklyn', 'LocationID'])]

In [1]:
# For every monthly file of the dataset, count the trips that both start and
# end in Manhattan plus the trips that both start and end in Brooklyn.
dataset = "fhvhv"
filenames = sorted(glob.glob("{}/{}_*.csv".format(local_path, dataset)))
filenames = [os.path.basename(filename) for filename in filenames]

n_trips = []
months = []
for filename in filenames:
    print(filename)
    # Cached because two separate counts are run on the same frame.
    trips = sqlContext.read.csv("./integrated/{}/{}".format(dataset, filename), header=True).persist()
    try:
        man_trips = trips.filter(trips.pulocationid.cast(IntegerType()).isin(man)) \
                         .filter(trips.dolocationid.cast(IntegerType()).isin(man))
        bro_trips = trips.filter(trips.pulocationid.cast(IntegerType()).isin(bro)) \
                         .filter(trips.dolocationid.cast(IntegerType()).isin(bro))
        count_man = man_trips.count()
        count_bro = bro_trips.count()
    finally:
        # Release the cache before moving on to the next month; otherwise
        # every month stays cached until the session is stopped.
        trips.unpersist()
    n_trips.append(count_man + count_bro)
    # The 7 characters preceding the ".csv" extension are the month label.
    months.append(filename[-11:-4])
    
# Parallel lists, in the layout read back by plot_monthly.
data = {
    'months': months,
    'values': n_trips
}

with open('plot_data/4_2/{}.json'.format(dataset), 'w') as f:
    json.dump(data, f)

NameError: name 'glob' is not defined

In [None]:
# Draw the JSON series found in plot_data/4_2/ and save them to figures/4_2.png.
plot_monthly('4_2', 'number of trips', 'Monthly number of trips (Man et Bro)')

<img src="figures/4_2.png" width="1000" align="left"/>

### 4.3. Monthly total receipts (per dataset type)

In [None]:
# Columns that are added together, trip by trip, to obtain the receipt of a trip.
receipt_column = ['fare_amount', 'extra', 'mta_tax', 'tolls_amount', 'ehail_fee']

In [None]:
# For every monthly file of the dataset, sum the receipt columns over all trips.
dataset = "green"
filenames = sorted(glob.glob("{}/{}_*.csv".format(local_path, dataset)))
filenames = [os.path.basename(filename) for filename in filenames]

total_receipts = []
months = []
for filename in filenames:
    print(filename)
    # No persist() here: a single action is run on the frame, so a cache would
    # never be reused and would only pile up month after month.
    trips = sqlContext.read.csv("./clean/{}/{}".format(dataset, filename), 
                                header=True,
                                inferSchema=True).fillna({'fare_amount':0, 'mta_tax':0, 
                                                          'extra':0, 'tolls_amount':0,
                                                          'ehail_fee': 0})
    # Missing amounts were replaced by 0 above so that one empty field does
    # not turn the whole receipt of a trip into null.
    trips = trips.withColumn('receipt',sum(trips[x] for x in receipt_column))
    total_receipt = trips.agg(F.sum('receipt')).collect()[0][0]
    total_receipts.append(total_receipt)
    # The 7 characters preceding the ".csv" extension are the month label.
    months.append(filename[-11:-4])
    
# Parallel lists, in the layout read back by plot_monthly.
data = {
    'months': months,
    'values': total_receipts
}

with open('plot_data/4_3/{}.json'.format(dataset), 'w') as f:
    json.dump(data, f)

In [None]:
# Draw the JSON series found in plot_data/4_3/ and save them to figures/4_3.png.
plot_monthly('4_3', 'total receipt', 'Monthly total receipts')

<img src="figures/4_3.png" width="1000" align="left"/>

### 4.4. Monthly average receipt (per dataset type)

In [None]:
# For every monthly file of the dataset, divide the summed receipts by the
# number of trips to obtain the average receipt of the month.
dataset = "green"
filenames = sorted(glob.glob("{}/{}_*.csv".format(local_path, dataset)))
filenames = [os.path.basename(filename) for filename in filenames]


average_receipts = []
months = []
for filename in filenames:
    print(filename)
    # Cached because two actions (the sum and the count) run on the same frame.
    trips = sqlContext.read.csv("./clean/{}/{}".format(dataset, filename), 
                                header=True,
                                inferSchema=True).fillna({'fare_amount':0, 'mta_tax':0, 
                                                          'extra':0, 'tolls_amount':0,
                                                          'ehail_fee': 0}).persist()
    try:
        total_receipt = trips.withColumn('receipt',sum(trips[x] for x in receipt_column)) \
                             .agg(F.sum('receipt')).collect()[0][0]
        n_trips = trips.count()
    finally:
        # Release the cache before moving on to the next month; otherwise
        # every month stays cached until the session is stopped.
        trips.unpersist()
    average_receipt = total_receipt/n_trips
    average_receipts.append(average_receipt)
    # The 7 characters preceding the ".csv" extension are the month label.
    months.append(filename[-11:-4])
    
# Parallel lists, in the layout read back by plot_monthly.
data = {
    'months': months,
    'values': average_receipts
}

with open('plot_data/4_4/{}.json'.format(dataset), 'w') as f:
    json.dump(data, f)

In [None]:
# Draw the JSON series found in plot_data/4_4/ and save them to figures/4_4.png.
# The title names the average receipt (it was copy-pasted from the total-receipts plot).
plot_monthly('4_4', 'average receipt', 'Monthly average receipt')

<img src="figures/4_4.png" width="1000" align="left"/>

### 4.5. Monthly average cost per in-progress-minute (per dataset type)

In [None]:
# Columns that are added together, trip by trip, to obtain the receipt of a trip.
receipt_column = ['fare_amount', 'extra', 'mta_tax', 'tolls_amount', 'ehail_fee']

dataset = "green"
filenames = sorted(glob.glob("{}/{}_*.csv".format(local_path, dataset)))
filenames = [os.path.basename(filename) for filename in filenames]

def time_delta(end,start):
    """Return the column expression ``(end - start) / 60``.

    Both arguments are Spark columns; each is cast to a long before the
    subtraction. With timestamp columns this presumably yields a duration in
    minutes (epoch seconds divided by 60) — confirm the inferred column types.
    """
    end = end.cast(LongType())
    start = start.cast(LongType())
    delta = (end-start)/60
    return delta

# time_delta builds a column expression and is applied to columns directly; it
# must not be wrapped in udf(), which would call it on plain Python values.


average_receipts = []
months = []
for filename in filenames:
    print(filename)
    trips = sqlContext.read.csv("./clean/{}/{}".format(dataset, filename), 
                                header=True,
                                inferSchema=True).fillna({'fare_amount':0, 'mta_tax':0, 
                                                          'extra':0, 'tolls_amount':0,
                                                          'ehail_fee': 0})
    # Only the filtered frame is cached: it is the one used by the two
    # aggregations below, whereas the unfiltered frame is read just once.
    trips = trips.filter(trips.lpep_dropoff_datetime.cast(LongType()) != 0) \
                 .filter(trips.lpep_pickup_datetime.cast(LongType()) != 0) \
                 .persist()
    try:
        total_time = trips.withColumn('time',time_delta(trips['lpep_dropoff_datetime'], 
                                                        trips['lpep_pickup_datetime'])) \
                          .agg(F.sum('time')).collect()[0][0]
        total_receipt = trips.withColumn('receipt',sum(trips[x] for x in receipt_column)) \
                             .agg(F.sum('receipt')).collect()[0][0]
    finally:
        # Release the cache before moving on to the next month; otherwise
        # every month stays cached until the session is stopped.
        trips.unpersist()
    # Total receipts over total trip time: the average cost of one time unit.
    average_receipt = total_receipt/total_time
    average_receipts.append(average_receipt)
    # The 7 characters preceding the ".csv" extension are the month label.
    months.append(filename[-11:-4])
    
# Parallel lists, in the layout read back by plot_monthly.
data = {
    'months': months,
    'values': average_receipts
}

with open('plot_data/4_5/{}.json'.format(dataset), 'w') as f:
    json.dump(data, f)

In [None]:
# Draw the JSON series found in plot_data/4_5/ and save them to figures/4_5.png.
plot_monthly('4_5', 'cost per minutes', 'Average cost per in progress minute')

<img src="figures/4_5.png" width="1000" align="left"/>

### 4.6. Monthly average tip (per dataset type)

In [None]:
# For every monthly file of the dataset, divide the summed tips by the number
# of trips to obtain the average tip of the month.
dataset = "green"
filenames = sorted(glob.glob("{}/{}_*.csv".format(local_path, dataset)))
filenames = [os.path.basename(filename) for filename in filenames]

average_tips = []
months = []
for filename in filenames:
    print(filename)
    # Cached because two actions (the sum and the count) run on the same frame.
    trips = sqlContext.read.csv("./clean/{}/{}".format(dataset, filename), 
                                header=True,
                                inferSchema=True).fillna({'tip_amount':0}).persist()
    try:
        total_tip = trips.agg(F.sum('tip_amount')).collect()[0][0]
        n_trips = trips.count()
    finally:
        # Release the cache before moving on to the next month; otherwise
        # every month stays cached until the session is stopped.
        trips.unpersist()
    average_tip = total_tip/n_trips
    average_tips.append(average_tip)
    # The 7 characters preceding the ".csv" extension are the month label.
    months.append(filename[-11:-4])
    
# Parallel lists, in the layout read back by plot_monthly.
data = {
    'months': months,
    'values': average_tips
}

with open('plot_data/4_6/{}.json'.format(dataset), 'w') as f:
    json.dump(data, f)

In [None]:
# Draw the JSON series found in plot_data/4_6/ and save them to figures/4_6.png.
# The y axis shows the average tip per trip, so it is labelled as such.
plot_monthly('4_6', 'average tip', 'Average tip')

<img src="figures/4_6.png" width="1000" align="left"/>

### 4.7. Median monthly average trip speed (per borough)

In [None]:
# For every monthly file of the dataset, compute the median over all trips of
# the trip speed (trip distance divided by trip duration).
dataset = "green"
filenames = sorted(glob.glob("{}/{}_*.csv".format(local_path, dataset)))
filenames = [os.path.basename(filename) for filename in filenames]


def time_delta(end,start):
    """Return the column expression ``(end - start) / 60``.

    Both arguments are Spark columns; each is cast to a long before the
    subtraction. With timestamp columns this presumably yields a duration in
    minutes (epoch seconds divided by 60) — confirm the inferred column types.
    """
    end = end.cast(LongType())
    start = start.cast(LongType())
    delta = (end-start)/60
    return delta

# time_delta builds a column expression and is applied to columns directly; it
# must not be wrapped in udf(), which would call it on plain Python values.


median_speeds = []
months = []
for filename in filenames:
    print(filename)
    trips = sqlContext.read.csv("./clean/{}/{}".format(dataset, filename), 
                                header=True,
                                inferSchema=True)\
                      .na.fill(0)
    trips = trips.filter(trips.lpep_dropoff_datetime.cast(LongType()) != 0) \
                 .filter(trips.lpep_pickup_datetime.cast(LongType()) != 0) \
                 .withColumn('time',time_delta(trips['lpep_dropoff_datetime'], 
                                                    trips['lpep_pickup_datetime'])) 

    # Speed is expressed in distance units per unit of 'time' as returned by
    # time_delta.
    trips = trips.withColumn('speed',trips['trip_distance']/trips['time']) 
    # A relative error of 0 asks for the exact median.
    median_speed = trips.approxQuantile('speed', [0.5], 0)[0]
    median_speeds.append(median_speed)
    # The 7 characters preceding the ".csv" extension are the month label.
    months.append(filename[-11:-4])
    
# Parallel lists, in the layout read back by plot_monthly.
data = {
    'months': months,
    'values': median_speeds
}

with open('plot_data/4_7/{}.json'.format(dataset), 'w') as f:
    json.dump(data, f)

In [74]:
# Draw the JSON series found in plot_data/4_7/ and save them to figures/4_7.png.
plot_monthly('4_7', 'speed', 'Average speed')

### 4.8. How long does it take to get to a NYC airport?

In [6]:
# Stop spark
try: 
    spark.stop()
except NameError:
    # No session was created in this kernel: there is nothing to stop.
    pass
except Exception as exc:
    # Shutting down stays best-effort, but the failure is reported instead of
    # being silently swallowed.
    print("Could not stop the Spark session cleanly: {}".format(exc))