In [None]:
import pandas as pd 
from bokeh.plotting import figure, output_notebook, show
from sklearn.model_selection import train_test_split
from bokeh.models import NumeralTickFormatter

# Route Bokeh output to inline notebook cells (a stray trailing "f" made this line a syntax error).
output_notebook()

In [None]:
# Parse the pickup/drop-off timestamps while loading: later cells use the `.dt`
# accessor and subtract these columns, which fails on plain string columns.
df = pd.read_csv(
    'yellow_trip_trimmed.csv',
    parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'],
)

In [None]:
# Quick sanity check of the loaded frame: (number of rows, number of columns).
df.shape

In [None]:
# Day of the month (1-31) on which each trip started.
df['pickup_day'] = df.tpep_pickup_datetime.dt.day

In [None]:
# Trips per day of month, as a two-column frame ordered by day.
day_count = (
    df['pickup_day']
    .value_counts()
    .reset_index()
    .set_axis(['day', 'count'], axis=1)
    .sort_values(by='day')
    .reset_index(drop=True)
)

In [None]:
from bokeh.plotting import figure, show

# Series to plot: day of month on x, number of trips on y.
x = day_count['day']
y = day_count['count']

# create a new plot with a title and axis labels
p = figure(width=800, height=400, title="No of Trips by day",
           x_axis_label="Days Of The Month", y_axis_label="Count Of Trips")

# add multiple renderers
p.line(x, y, legend_label="Trip", color="blue", line_width=2)
# scatter(marker="square") replaces the marker-specific square() method,
# which is deprecated in Bokeh 3.4 and emits a deprecation warning.
p.scatter(x, y, marker="square", legend_label="day", size=5, line_color="green")
# formatting the x-axis ticks as normal numbers
p.xaxis.formatter = NumeralTickFormatter(format='0,0')
p.legend.location = "top_left"

# show the results
show(p)

![Line chart of the number of trips by day of the month](line1.png)

In [None]:
# Pickup hour of every trip, kept as a one-column frame.
time_tab = df['tpep_pickup_datetime'].dt.hour.to_frame()

# Number of trips per pickup hour, ordered by hour.
time_tab2 = (
    time_tab
    .value_counts()
    .sort_index()
    .to_frame()
    .reset_index()
)
time_tab2.columns = ['hour', 'count']

In [None]:
# Hours as string labels, trip counts as the matching values.
x = time_tab2['hour'].astype(str)
y = time_tab2['count']

In [None]:
from bokeh.plotting import figure, show
    
# Hour labels drive the categorical x-range; trip counts set the bar heights.
fruits, counts = x, y

p = figure(
    title="Count Of Trips By Hour",
    x_range=fruits,
    width=750,
    height=300,
    tools="",
    toolbar_location=None,
)
p.vbar(x=fruits, top=counts, width=0.9)

# Bars start at zero and the vertical grid lines are hidden.
p.y_range.start = 0
p.xgrid.grid_line_color = None

# formatting the x-axis ticks as normal numbers
p.xaxis.formatter = NumeralTickFormatter(format='0,0')

show(p)


![Bar chart of the count of trips by pickup hour](hist.png)

In [None]:
# Name of the month in which each trip started.
df['trip_month'] = df.tpep_pickup_datetime.dt.month_name()

In [None]:
from bokeh.models import ColumnDataSource, Whisker
from bokeh.transform import factor_cmap

# Single-category frame so the group-by-"kind" box-plot recipe can be applied to fares.
df3 = df['fare_amount'].to_frame()
df3['kind'] = 'fare_amount'
kinds = df3.kind.unique()

# Quartiles per kind: one summary row per category.
qs = df3.groupby("kind").fare_amount.quantile([0.25, 0.5, 0.75])
qs = qs.unstack().reset_index()
qs.columns = ['kind', 'q1', 'q2', 'q3']

# compute IQR outlier bounds
iqr = qs.q3 - qs.q1
qs["upper"] = qs.q3 + 1.5*iqr
qs["lower"] = qs.q1 - 1.5*iqr

# Attach the per-kind statistics to every trip so outliers can be selected below.
df3 = pd.merge(df3, qs, on="kind", how="left")

# The whisker and boxes are drawn from the summary frame (one row per kind).
# Building the source from the per-trip frame would draw one identical box per
# trip and serialise the whole dataset into the notebook output.
source = ColumnDataSource(qs)

p = figure(x_range=kinds, tools="", toolbar_location=None,
           title="Fare distribution",
           background_fill_color="#eaefef", y_axis_label="Fare Amount")

# outlier range
whisker = Whisker(base="kind", upper="upper", lower="lower", source=source)
whisker.upper_head.size = whisker.lower_head.size = 20
p.add_layout(whisker)

# quantile boxes
p.vbar("kind", 0.1, "q2", "q3", source=source, color='blue', line_color="black")
p.vbar("kind", 0.1, "q1", "q2", source=source, color='black', line_color="black")

# outliers: trips whose fare falls outside the [lower, upper] fences
outliers = df3[~df3.fare_amount.between(df3.lower, df3.upper)]
p.scatter("kind", "fare_amount", source=outliers, size=6, color="red", alpha=0.3)
# formatting the y-axis ticks as normal numbers
p.yaxis.formatter = NumeralTickFormatter(format='0,0')

p.xgrid.grid_line_color = None
p.axis.major_label_text_font_size="14px"
p.axis.axis_label_text_font_size="12px"

show(p)

![Box plot of the fare amount distribution with outliers](boxplot.png)

In [None]:
# Trip duration in seconds. The vectorised `.dt.total_seconds()` replaces a
# per-row Python lambda and yields the same float values.
df['duration'] = (
    df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
).dt.total_seconds()

In [None]:
# Trips per vendor, with vendor ids converted to strings.
Vendor_count = df['VendorID'].value_counts().reset_index()
Vendor_count.columns = ['vendors', 'counts']
Vendor_count['vendors'] = Vendor_count['vendors'].astype(str)

In [None]:
# Hour of day (0-23) at which each trip started.
df['pickup_hour'] = df['tpep_pickup_datetime'].dt.hour

In [None]:
def period(hour):
    """Map an hour of the day to a named period.

    Returns "wee hours" for 0 < hour < 7, "morning" for 7 <= hour < 12,
    "afternoon" for 12 <= hour < 16, "evening" for 16 <= hour < 20 and
    "night" for everything else (including hour 0 and hours >= 20).
    """
    # Anything outside the open interval (0, 20) is night, midnight included.
    if not 0 < hour < 20:
        return "night"
    if hour < 7:
        return "wee hours"
    if hour < 12:
        return "morning"
    if hour < 16:
        return "afternoon"
    return "evening"
    

In [None]:
# Label every trip with the period of day of its pickup hour.
df['pickup_time_period'] = df['pickup_hour'].map(period)

In [None]:
# Trips per period of day, as a two-column frame.
time_period_count = df['pickup_time_period'].value_counts().reset_index()
time_period_count.columns = ['time_period', 'counts']
time_period_count

In [None]:
from math import pi
import numpy as np
import pandas as pd
from bokeh.palettes import Category20c
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, LabelSet, Legend, Plot
from bokeh.transform import cumsum

# Wedge angle (radians) and printable percentage for each period's share of trips.
time_period_count['angle'] = time_period_count['counts'] / time_period_count['counts'].sum() * 2 * pi
time_period_count['percentage'] = time_period_count['counts'] / time_period_count['counts'].sum() * 100
time_period_count['percentage'] = time_period_count['percentage'].apply(lambda x: str(round(x, 2)) + '%')

# One colour per period. Slicing keeps the assignment valid when the data
# contains fewer than five periods (a fixed 5-item list would raise a length error).
palette = ["seagreen", "tomato", "orchid", "firebrick", "skyblue"]
time_period_count['color'] = palette[:len(time_period_count)]

# Radius of the pie, in data units.
radius = 0.45

TOOLTIPS = [('Category', '@time_period'), ('Value', '@counts'), ('Percentage', '@percentage')]

fig = figure(
             width=500 ,
             height=550 ,
             tools='hover', tooltips=TOOLTIPS, x_range=(-0.5, 1.0))
source = ColumnDataSource(time_period_count)

# Each wedge spans from the running total of the previous angles to its own running total.
fig.wedge(x=0, y=1, radius=radius, start_angle=cumsum('angle', include_zero=True),
          end_angle=cumsum('angle'), line_color='white', fill_color='color',
          legend_field='time_period', source=source)

fig.title.text = "Percentage Of Trips By Period Of Day"
fig.title.text_font_size = '12pt'

# Percentage labels are anchored at the pie centre and rotated to each wedge's start angle.
labels = LabelSet(x=0, y=1, text='percentage', level='glyph', angle=cumsum('angle', include_zero=True),
                  source=source)
fig.add_layout(labels)

# A pie chart needs no axes or grid.
fig.axis.axis_label = None
fig.axis.visible = False
fig.grid.grid_line_color = None

show(fig)

![Pie chart of the percentage of trips by period of day](pie.png)

In [None]:
# Weekday name of each trip's pickup date.
df['day'] = df.tpep_pickup_datetime.dt.day_name()

In [None]:
import pandas as pd

from bokeh.models import ColumnDataSource, Whisker
from bokeh.plotting import figure, show
from bokeh.sampledata.autompg2 import autompg2
from bokeh.transform import factor_cmap

# Pickup hour of every trip together with its weekday name.
time_tab3 = df[['day','pickup_hour']]

# Weekday names in order of first appearance; used as the categorical x-range.
kinds = time_tab3.day.unique()

# compute quantiles: one summary row per weekday
qs = time_tab3.groupby("day").pickup_hour.quantile([0.25, 0.5, 0.75])
qs = qs.unstack().reset_index()
qs.columns = ["day", "q1", "q2", "q3"]

# compute IQR outlier bounds
iqr = qs.q3 - qs.q1
qs["upper"] = qs.q3 + 1.5*iqr
qs["lower"] = qs.q1 - 1.5*iqr

# Attach the per-day statistics to every trip so outliers can be selected below.
time_tab3 = pd.merge(time_tab3, qs, on="day", how="left")

# The whisker and boxes are drawn from the summary frame (one row per weekday).
# Building the source from the per-trip frame would draw one identical box per
# trip and serialise the whole dataset into the notebook output.
source = ColumnDataSource(qs)

p = figure(x_range=kinds, tools="", toolbar_location=None,
           title="pickup time distribution by days of the week",
           background_fill_color="#eaefef",width=850, y_axis_label="time of day")

# outlier range
whisker = Whisker(base="day", upper="upper", lower="lower", source=source)
whisker.upper_head.size = whisker.lower_head.size = 20
p.add_layout(whisker)

# quantile boxes
cmap = factor_cmap("day", "TolRainbow7", kinds)
p.vbar("day", 0.7, "q2", "q3", source=source, color=cmap, line_color="black")
p.vbar("day", 0.7, "q1", "q2", source=source, color=cmap, line_color="black")

# outliers: trips whose pickup hour falls outside the [lower, upper] fences
outliers = time_tab3[~time_tab3.pickup_hour.between(time_tab3.lower, time_tab3.upper)]
p.scatter("day", "pickup_hour", source=outliers, size=6, color="black", alpha=0.3)

p.xgrid.grid_line_color = None
p.axis.major_label_text_font_size="14px"
p.axis.axis_label_text_font_size="12px"

show(p)

![Box plots of the pickup hour distribution for each day of the week](box_p.png)

In [None]:
# Preview the merged frame. The previous rename of a non-existent "class" column
# changed nothing and its result was discarded, so it only displayed the table.
time_tab3.head()

In [None]:
def drop_outlier(data, var):
    """Return the rows of ``data`` whose ``var`` value lies strictly inside the IQR fences.

    The fences are ``q1 - 1.5 * iqr`` and ``q3 + 1.5 * iqr``, where ``q1`` and
    ``q3`` are the 25th and 75th percentiles of ``data[var]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.
    var : str
        Name of the numeric column used to detect outliers.

    Returns
    -------
    pandas.DataFrame
        The retained rows with a fresh 0..n-1 index. Rows whose ``var`` is
        missing are dropped, because they compare as outside both fences.
    """
    # Series.quantile skips missing values. np.percentile returns NaN as soon as
    # one value is missing, which made both fences NaN and dropped every row.
    q1, q3 = data[var].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    inside = (data[var] > lower) & (data[var] < upper)
    # Build a new frame rather than resetting the index of a slice in place.
    return data[inside].reset_index(drop=True)


In [None]:
# Keep only trips whose duration lies inside the IQR fences.
df = drop_outlier(df, var='duration')

In [None]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, HoverTool
import pandas as pd

import numpy as np

# Number of equal-width bins in the histogram.
N_BINS = 30

# `duration` holds seconds (it is built with total_seconds()); the chart is
# labelled in minutes, so convert before binning.
duration_minutes = df['duration'].dropna() / 60
bin_counts, bin_edges = np.histogram(duration_minutes, bins=N_BINS)

# quad() needs explicit left/right/top columns, one row per bin. The raw trip
# frame has none of them, so the histogram is computed here first.
hist_data = pd.DataFrame({
    'left': bin_edges[:-1],
    'right': bin_edges[1:],
    'top': bin_counts,
})

# Create a histogram plot using Bokeh
output_notebook()
source = ColumnDataSource(hist_data)
hist = figure(title='Trip Duration Histogram',
              x_axis_label='Duration (minutes)',
              y_axis_label='Count',
              tools='box_select')
hist.quad(top='top', bottom=0, left='left', right='right',
          source=source, fill_color='navy', line_color='white')
# A single HoverTool: also requesting 'hover' in `tools` would register two.
hist.add_tools(HoverTool(tooltips=[('Duration', '@left{0.0} - @right{0.0} mins'), ('Count', '@top')]))

# Show the plot
show(hist)
