## 2.6 Building Dashboards with Streamlit

### Importing libraries and data

In [None]:
# Import libraries
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from datetime import datetime as dt
from streamlit_keplergl import keplergl_static

In [None]:
# Import data
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype inference on large CSVs.
df = pd.read_csv('sampleset_citibike.csv', low_memory=False)

### Check data for any additional wrangling

In [None]:
# Inspect the column dtypes to see which columns still need converting
df.dtypes

In [None]:
# Drop unnecessary columns
# NOTE(review): 'Unnamed: 0' is presumably the row index written by an earlier
# DataFrame.to_csv() call — confirm against how the CSV was produced.
df = df.drop(columns=['Unnamed: 0'])

In [None]:
# Parse both trip timestamps as datetimes; values that cannot be parsed
# become NaT (errors='coerce') instead of raising.
for time_col in ('start_time', 'end_time'):
    df[time_col] = pd.to_datetime(df[time_col], errors='coerce')

In [None]:
# Rename columns to more descriptive names
df = df.rename(
    columns=dict(rideable_type='bike_type', member_casual='membership_type')
)

In [None]:
# Preview the first rows after the wrangling steps so far
df.head()

In [None]:
# Create the season column from the numeric start month, using meteorological
# seasons: Dec-Feb winter, Mar-May spring, Jun-Aug summer, Sep-Nov fall.
# Any value not matched by the three conditions (including NaN) is labelled
# "fall" by the default branch.
months = df['start_month']
df['season'] = np.select(
    [
        months.isin([12, 1, 2]),
        months.between(3, 5),
        months.between(6, 8),
    ],
    ['winter', 'spring', 'summer'],
    default='fall',
)

In [None]:
# Spot-check the season labels against their source months
print(df[['season', 'start_month']].head())

In [None]:
# Check the number of rows and columns
df.shape

In [None]:
# Getting rid of outliers in the ride_duration column
# First (Q1) and third (Q3) quartiles of the ride durations
Q1, Q3 = (df['ride_duration'].quantile(q) for q in (0.25, 0.75))

In [None]:
# Calculate IQR (interquartile range: the distance between Q1 and Q3)
IQR = Q3 - Q1

In [None]:
# Define lower and upper bounds with the 1.5 * IQR rule: values more than
# 1.5 IQRs below Q1 or above Q3 are treated as outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [None]:
# Keep only rides whose duration lies within [lower_bound, upper_bound]
# (both ends inclusive); rows with a missing duration are excluded too.
# NOTE(review): the cells that follow keep working on the unfiltered `df`, so
# `df_filtered` is not used in the visible part of this notebook — confirm
# whether the daily counts and charts were meant to exclude outliers.
df_filtered = df[df['ride_duration'].between(lower_bound, upper_bound)]

In [None]:
# Count the rides on each date; the result has one row per date with the
# count in the 'ride_id' column
df_group = df.groupby('date')['ride_id'].count().reset_index()

In [None]:
# Preview the per-date ride counts
df_group.head()

In [None]:
# Name the per-day count column, then attach it to every ride row by date
df_group.rename(columns = {'ride_id':'bike_rides_daily'}, inplace = True)
# indicator=True adds a '_merge' column recording whether each row came from
# the left frame, the right frame, or both
df = df.merge(df_group, on = "date", how = 'outer', indicator = True)
print(df['_merge'].value_counts(dropna = False))
# NOTE(review): these checks compare 'date' with plain strings, so they
# presumably rely on 'date' being stored as text — confirm the dtype
print("Shape of January 1st is", df[df['date'] == '2022-01-01'].shape) # Check 
print("Shape of January 2nd is", df[df['date'] == '2022-01-02'].shape) # Second check 

# Copy of the merged data indexed by date
df_temp = df.set_index('date')

print(df_temp.columns)

In [None]:
# Continue with the date-indexed frame
df = df_temp

In [None]:
# Move the index back into an ordinary column (modifies df in place)
df.reset_index(inplace=True)

In [None]:
# List the current columns
df.columns

In [None]:
# Drop the '_merge' column in place
# ('_merge' is the indicator column created by merge(..., indicator=True))
df.drop(columns=['_merge'], inplace=True)

In [None]:
# Save the wrangled sample to disk.
# NOTE(review): to_csv() also writes the row index by default, which comes
# back as an 'Unnamed: 0' column when the file is re-read — pass index=False
# if downstream readers do not expect it.
df.to_csv('dataset_wrangledsample.csv')

### Create the Plotly charts

In [None]:
# Count rows per start station and keep the 20 stations with the most rows.
# Summing a constant 1 per row gives the number of rows in each group.
df['value'] = 1
df_groupby_bar = df.groupby('start_station_name', as_index=False)['value'].sum()
top20 = df_groupby_bar.nlargest(n=20, columns='value')

In [None]:
# Plain bar chart of the top 20 start stations
fig = go.Figure(
    data=go.Bar(x=top20['start_station_name'], y=top20['value'])
)
fig.show()

In [None]:
# Same bar chart, with each bar shaded by its value on the 'Blues' scale
fig = go.Figure(
    data=go.Bar(
        x=top20['start_station_name'],
        y=top20['value'],
        marker=dict(color=top20['value'], colorscale='Blues'),
    )
)
fig.show()

In [None]:
# Switch the bar shading to the 'Purples' colour scale
fig.update_traces(marker_colorscale='Purples')

# Title, axis labels, figure size and background colours
fig.update_layout(
    title='Top 20 most popular bike stations in New York',
    xaxis_title='Start stations',
    yaxis_title='Sum of trips',
    width=900,
    height=600,
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(255, 255, 255, 0.8)',
)

fig.show()

In [None]:
# List the columns available for the line chart
df.columns

In [None]:
# Setting up line chart code in separate executions as to not overflow memory
def setup_plot():
    """Create the empty figure for the dual-axis line chart.

    Returns:
        A Plotly figure with a single subplot that has a secondary y-axis,
        so two series on different scales can share one x-axis.
    """
    # make_subplots comes from the file-level import; no function-scope
    # imports are needed here.
    return make_subplots(specs=[[{"secondary_y": True}]])

In [None]:
# Adding Bike Rides Trace
def add_bike_rides_trace(fig, df):
    """Add the daily bike-ride count to `fig` as a blue line on the primary y-axis.

    Args:
        fig: Figure that was created with a secondary y-axis.
        df: DataFrame with 'date' and 'bike_rides_daily' columns.

    Returns:
        None. `fig` is modified in place.
    """
    # `df` can repeat the same (date, count) pair on many rows. Plotting each
    # distinct point once keeps the figure small; for date-ordered data the
    # drawn line is unchanged.
    points = df[['date', 'bike_rides_daily']].drop_duplicates()
    fig.add_trace(
        go.Scatter(
            x=points['date'],
            y=points['bike_rides_daily'],
            name='Daily bike rides',
            line=dict(color='blue'),  # Set the color for the bike rides trace
        ),
        secondary_y=False,
    )

In [None]:
# Adding Temperature Trace
def add_temperature_trace(fig, df):
    """Add the temperature series to `fig` as a red line on the secondary y-axis.

    Args:
        fig: Figure that was created with a secondary y-axis.
        df: DataFrame with 'date' and 'avgTemp' columns.

    Returns:
        None. `fig` is modified in place.
    """
    # Only exact duplicate (date, avgTemp) pairs are removed, so every distinct
    # reading is still plotted; for date-ordered data the drawn line is
    # unchanged while the figure gets much smaller.
    points = df[['date', 'avgTemp']].drop_duplicates()
    fig.add_trace(
        go.Scatter(
            x=points['date'],
            y=points['avgTemp'],
            name='Daily temperature',
            line=dict(color='red'),  # Set the color for the temperature trace
        ),
        secondary_y=True,
    )

In [None]:
# Finalizing Plot
def finalize_plot(fig):
    """Apply the title, axis labels and size to `fig`, then display it.

    Args:
        fig: Figure exposing `update_layout(**kwargs)` and `show()`.

    Returns:
        None. `fig` is modified in place and shown.
    """
    layout = {
        'title': 'Daily Bike Rides and Temperature',
        'xaxis_title': 'Date',
        'yaxis_title': 'Bike Rides',
        'yaxis2_title': 'Temperature',  # label for the secondary y-axis
        'width': 900,
        'height': 600,
    }
    fig.update_layout(**layout)
    fig.show()

In [None]:
# Main function to execute all chunks
def main():
    """Build the dual-axis chart: create the figure, add both traces, show it.

    Reads the module-level `df` and returns None.
    """
    fig = setup_plot()
    # Each helper adds one trace to the figure, in this order
    for add_trace in (add_bike_rides_trace, add_temperature_trace):
        add_trace(fig, df)
    finalize_plot(fig)

In [None]:
# Build and display the bike rides vs. temperature chart
main()

In [None]:
import gc # this is a garbage collector
# Run a full collection now to release objects that are no longer referenced
gc.collect()

In [None]:
# Save the top 20 stations as a csv file 
# NOTE(review): to_csv() writes the row index as the first column by default;
# a reader needs index_col=0 or will get an extra 'Unnamed: 0' column.

top20.to_csv('top20.csv')