### Exploration of NYC Dataset Using Bokeh By Kehinde Adebisi

Dataset Used


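NYC TLC Yellow Taxi trip records for January 2022 (`yellow_tripdata_2022-01.parquet`), published on the NYC TLC Trip Record Data page. A local copy was used here: http://localhost:8888/edit/Downloads/yellow_tripdata_2022-01.parquet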

Importing the needed libraries

In [1]:
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource
from bokeh.palettes import Category20c
from bokeh.transform import cumsum
from bokeh.io import curdoc
from bokeh.transform import factor_cmap
import pyarrow.parquet as pq
import pandas as pd
import datetime as dt
from math import pi

Reading the file, which is in Parquet format.

In [2]:
# Read the Parquet file into a pyarrow Table.
parquet_path = 'yellow_tripdata_2022-01.parquet'
trips = pq.read_table(parquet_path)

In [3]:
# Convert the pyarrow Table into a pandas DataFrame.
# The name `trips` is rebound from Table to DataFrame, so guard the conversion:
# without the check, re-running this cell on its own raises AttributeError
# because a DataFrame has no `to_pandas` method.
if not isinstance(trips, pd.DataFrame):
    trips = trips.to_pandas()

## Data Exploration
Exploring the data: its shape, columns, data types, and null values.

In [4]:
# (number of rows, number of columns) of the trips DataFrame
trips.shape

(2463931, 19)

In [5]:
# Preview the first rows to see what the columns and values look like
trips.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0


In [6]:
# Summary of the frame: index range, column names, and the dtype of each column
trips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2463931 entries, 0 to 2463930
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [7]:
# Count the null values in each column
trips.isnull().sum()

VendorID                     0
tpep_pickup_datetime         0
tpep_dropoff_datetime        0
passenger_count          71503
trip_distance                0
RatecodeID               71503
store_and_fwd_flag       71503
PULocationID                 0
DOLocationID                 0
payment_type                 0
fare_amount                  0
extra                        0
mta_tax                      0
tip_amount                   0
tolls_amount                 0
improvement_surcharge        0
total_amount                 0
congestion_surcharge     71503
airport_fee              71503
dtype: int64

A forward fill is used for the missing values, since this is just a quick exploration of the data.

In [8]:
# Forward-fill missing values: each null takes the last valid value above it
# in the same column.
# `fillna('ffill')` does NOT forward-fill; it writes the literal string
# 'ffill' into every gap, which also turns the numeric columns into object dtype.
# Nulls at the very top of a column have nothing to copy from and stay null.
trips = trips.ffill()

In [9]:
# Confirm that no null values remain after the fill (every count should be 0)
trips.isnull().sum()

VendorID                 0
tpep_pickup_datetime     0
tpep_dropoff_datetime    0
passenger_count          0
trip_distance            0
RatecodeID               0
store_and_fwd_flag       0
PULocationID             0
DOLocationID             0
payment_type             0
fare_amount              0
extra                    0
mta_tax                  0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
total_amount             0
congestion_surcharge     0
airport_fee              0
dtype: int64

## Visualizations Using Bokeh

### Visualizing the Categorical Data

Exploring the Payment Types Using BarChart

To get the `x_range` and `top` values, the payment-type value counts are calculated and converted to a DataFrame with two columns: the payment type (used as the `x_range`) and the counts (used as the `top`).

In [10]:
# Number of trips per payment type, ordered from least to most frequent.
payment_type_frequencies = trips['payment_type'].value_counts()
payment_count = payment_type_frequencies.sort_values()

In [11]:
# Turn the counts Series into a two-column frame: the Series index (payment
# type codes) becomes 'Payment Type' and the Series values become 'Counts'.
# pd.DataFrame(series, columns=[...]) selects columns by name, and since the
# Series is named neither 'Payment Type' nor 'Counts' that yields an empty frame.
payment_count_df = payment_count.rename_axis('Payment Type').reset_index(name='Counts')

# A Bokeh categorical x_range only accepts string factors, and the payment
# type codes are integers, so store them as strings.
payment_count_df['Payment Type'] = payment_count_df['Payment Type'].astype(str)

In [12]:
# Bar chart of trip counts per payment type.
# A categorical x_range only accepts string factors, so coerce explicitly; the
# same labels are reused for the bar positions so axis and bars always agree.
payment_labels = payment_count_df['Payment Type'].astype(str).tolist()

p = figure(x_range=payment_labels, height=350, title="Counts of Payment Types",
           toolbar_location=None, tools="",
           x_axis_label="Payment Type", y_axis_label="Counts")

p.vbar(x=payment_labels, top=payment_count_df['Counts'], width=1, color="purple")

p.xgrid.grid_line_color = None  # vertical grid lines add nothing on a categorical axis
p.y_range.start = 0             # bars start at zero instead of an auto-padded minimum

In [13]:
# Render the figure currently bound to `p`
show(p)

### Visualizing the Amount Made Over Time Using a Line Chart

Grouping the trips by pickup date and summing `fare_amount` for each day

In [14]:
# Sum fare_amount for each calendar day of pickup.
# NOTE(review): the output below shows a few pickups dated outside January
# 2022 (2008, 2009, 2021); they stretch the date range of any plot built from
# this series. Consider filtering them out.
pickup_day = trips['tpep_pickup_datetime'].dt.date
trips_2 = trips.groupby(pickup_day)['fare_amount'].sum()

In [15]:
# Preview the first few daily totals
trips_2.head()

tpep_pickup_datetime
2008-12-31       106.00
2009-01-01       154.50
2021-12-31       294.50
2022-01-01    927316.08
2022-01-02    934810.76
Name: fare_amount, dtype: float64

In [16]:
# Convert the daily totals to a DataFrame so each column can be passed to the plot.
# The Series index (pickup date) becomes 'Date' and the values become 'Amount'.
# pd.DataFrame(series, columns=[...]) selects columns by name and would give
# an empty frame here, because the Series is named neither 'Date' nor 'Amount'.
trips_df = trips_2.rename_axis('Date').reset_index(name='Amount')

# The group keys are datetime.date objects; store them as datetime64 so they
# plot as proper timestamps.
trips_df['Date'] = pd.to_datetime(trips_df['Date'])

In [17]:
# Figure for the daily totals. The line is drawn with Date on x and Amount on
# y, so the axis labels must follow that order (they were swapped), and the
# x axis is declared as datetime so ticks are shown as dates.
p = figure(title="Amount Made Each Day", x_axis_type="datetime",
           x_axis_label='Date', y_axis_label='Total Amount')

In [18]:
# Apply the 'light_minimal' theme to the current Bokeh document
curdoc().theme = 'light_minimal'

In [19]:
# Draw the daily fare totals. The legend label describes the plotted series;
# the previous "Temp." label did not match this data.
p.line(trips_df['Date'], trips_df['Amount'], legend_label="Total fare amount", line_width=2)

In [20]:
# Render the figure currently bound to `p`
show(p)

### Visualizing the Store and Forward Flag Using a Pie Chart

In [21]:
# Frequency of each store_and_fwd_flag value.
flag_values = trips['store_and_fwd_flag']
flag = flag_values.value_counts()

In [22]:
# Build the wedge table: one row per flag value with its count, angle and colour.
# rename_axis names the index explicitly, so the 'flag' column exists whether
# or not value_counts() left the index named.
data = flag.rename_axis('flag').reset_index(name='value')

# Each wedge's angle is its share of the total, scaled to a full circle.
data['angle'] = data['value'] / data['value'].sum() * 2 * pi

# One colour per wedge. Category20c only defines palettes for 3 to 20 colours,
# so clamp the requested size and cycle through the palette; this also covers
# fewer than 3 or more than 20 categories. (Previously this column held an
# integer row count, which is not a colour.)
n_slices = len(data)
palette = Category20c[max(3, min(n_slices, 20))]
data['color'] = [palette[i % len(palette)] for i in range(n_slices)]

p = figure(height=350, title="Pie Chart", toolbar_location=None,
           tools="hover", tooltips="@flag: @value", x_range=(-0.5, 1.0))

# cumsum turns the per-row angles into running start/end angles so the wedges
# are laid out one after another around the circle.
p.wedge(x=0, y=1, radius=0.4,
        start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
        line_color="white", fill_color='color', legend_field='flag', source=data)

# Axes and grid carry no meaning for a pie chart, so hide them.
p.axis.axis_label = None
p.axis.visible = False
p.grid.grid_line_color = None

show(p)

### Visualizing the Trip Distance By Amount Using Bar Chart

In [23]:
# Total fare_amount for each distinct trip_distance value.
fares_by_distance = trips.groupby(trips['trip_distance'])['fare_amount']
trips_3 = fares_by_distance.sum()
trips_3

trip_distance
0.00         681737.60
0.01          23743.42
0.02          12831.01
0.03          10851.93
0.04           9220.26
               ...    
193150.52        29.98
201283.16        11.03
250984.47        28.58
274658.81        13.74
306159.28        14.96
Name: fare_amount, Length: 4305, dtype: float64

In [24]:
# Turn the per-distance totals into a two-column frame: the Series index
# (trip distance) becomes 'Distance' and the values become 'Amount'.
# pd.DataFrame(series, columns=[...]) selects columns by name and would give
# an empty frame, because the Series is named neither 'Distance' nor 'Amount'.
trips3_df = trips_3.rename_axis('Distance').reset_index(name='Amount')

# The bar chart uses 'Distance' as a categorical x_range, which only accepts
# string factors, so store the distances as strings (row order stays ascending).
trips3_df['Distance'] = trips3_df['Distance'].astype(str)

In [25]:
# Bar chart of total fare amount per distinct trip distance.
# A categorical x_range only accepts string factors, so coerce explicitly; the
# same labels are reused for the bar positions so axis and bars always agree.
# NOTE(review): there are thousands of distinct distances (Length: 4305 in the
# groupby output), so the tick labels will overlap. Binning the distances
# would give a more readable chart.
distance_labels = trips3_df['Distance'].astype(str).tolist()

p = figure(x_range=distance_labels, height=350, title="Trip Distance by Fare Amount",
           toolbar_location=None, tools="",
           x_axis_label="Trip Distance", y_axis_label="Fare Amount")

p.vbar(x=distance_labels, top=trips3_df['Amount'], width=0.9)

p.xgrid.grid_line_color = None  # vertical grid lines add nothing on a categorical axis
p.y_range.start = 0             # bars start at zero instead of an auto-padded minimum

show(p)