# Bike Share Analysis in Python
I learned a lot working on the bike share data in R for the capstone project of my Google Data Analytics certificate. I want to do the same project again, this time in Python, to experience the struggle, frustration, and success all over again.



In [None]:
# Set up environment and import libraries
from matplotlib import pyplot as plt 
import pandas as pd
import seaborn as sns
import numpy as np
import glob
from pandas.api.types import CategoricalDtype

# Full output for each cell: echo the value of every top-level expression,
# not only the last one. Later cells rely on this to show several results
# (e.g. two value_counts() calls, or max() followed by min()) from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Read and merge CSV files to dataframe.
# glob.glob() returns paths in arbitrary, filesystem-dependent order, so sort
# them to get the same row order on every run and every machine.
csv_paths = sorted(glob.glob("data/*.csv"))

# ignore_index=True gives the combined frame one unique 0..n-1 index; without it
# every file contributes its own 0..k index and the row labels are duplicated.
df = pd.concat(map(pd.read_csv, csv_paths), ignore_index=True)

In [None]:
# Display the merged DataFrame for a first look at its rows and columns
df

In [None]:
# Summarise the merged frame (column names and dtypes) before any type conversion
df.info()

In [None]:
# Parse the trip start/end columns from "YYYY-MM-DD HH:MM:SS" strings into datetimes
TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'
for col in ('started_at', 'ended_at'):
    df[col] = pd.to_datetime(df[col], format=TIMESTAMP_FORMAT)


In [None]:
# Work on an independent copy of the raw frame. A plain `df1 = df` only creates a
# second name for the same object, so every column added to (or dropped from)
# df1 would silently change df as well.
df1 = df.copy()

# Add columns for year, month, day, and day of the week.
started = df1['started_at']
df1['year'] = started.dt.year
df1['month'] = started.dt.month
df1['day'] = started.dt.day

# Order days of the week (Sunday first) with an ordered categorical, so that
# sorting, grouping and plotting follow this order rather than alphabetical order.
cats = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
cat_type = CategoricalDtype(categories=cats, ordered=True)
df1['day_of_week'] = started.dt.day_name().astype(cat_type)


In [None]:
# Ride length = trip end minus trip start, stored as a float number of seconds
trip_duration = df1['ended_at'] - df1['started_at']
df1['ride_length'] = trip_duration.dt.total_seconds()

In [None]:
# Preview ride lengths. The column already holds float seconds (it was converted
# with .dt.total_seconds() when it was created), so the .dt accessor must not be
# used here: it only exists on datetime-like Series and raises AttributeError on floats.
df1['ride_length'].head()


In [None]:
# Remove irrelevant columns and rows with a missing start or end station.
# Both steps return a new frame, so the result must be assigned back: a bare
# df1.dropna(...) only displays the filtered frame and leaves df1 unchanged.
# errors='ignore' keeps the cell re-runnable once the coordinate columns are gone.
df1 = (
    df1
    .drop(columns=['start_lat', 'start_lng', 'end_lat', 'end_lng'], errors='ignore')
    .dropna(subset=['start_station_name', 'end_station_name'])
)


In [None]:
# Look for repeated ride IDs: select every row whose ride_id has already
# appeared earlier in the frame (the first occurrence is not flagged).
df_dupes = df1.loc[df1['ride_id'].duplicated(keep='first')]
print(df_dupes)

In [None]:
# Check for inconsistent data (i.e. more than 2 member types).
# value_counts() lists every distinct label with its frequency, so misspelled
# or unexpected categories stand out.
# NOTE(review): value_counts() skips missing values by default; pass
# dropna=False if NaNs should be counted as well.
df1['member_casual'].value_counts()
df1['rideable_type'].value_counts()

In [None]:
# Check data is within date range: show the latest, then the earliest, trip start time
df1['started_at'].max()
df1['started_at'].min()

In [None]:
# Check for negative ride durations.
# ride_length already holds float seconds, so compare it with 0 directly; calling
# .dt.total_seconds() on a float column raises AttributeError because the .dt
# accessor only exists for datetime-like values.
negative_ride_length = df1[df1['ride_length'] < 0]
print(negative_ride_length)

In [None]:
# Drop docked_bike rows and rides whose length is not strictly positive
# (negative and zero-second rides are both removed).
# ride_length is a float number of seconds, so it is compared directly; the .dt
# accessor is not available on float columns and would raise AttributeError.
df1 = df1[(df1['rideable_type'] != 'docked_bike') & (df1['ride_length'] > 0)]

In [None]:
# Summary statistics of ride length (in seconds) for the rides kept by the filter above
df1['ride_length'].describe()

In [None]:
# Group by day of week and member type, then aggregate average ride length.
# day_of_week is an ordered categorical, so the result is listed Sunday..Saturday.
# observed=False is spelled out to keep every weekday category in the result:
# recent pandas releases emit a FutureWarning when it is omitted for categorical
# keys, because the default is changing to observed=True.
df2 = df1.groupby(['day_of_week', 'member_casual'], observed=False)['ride_length'].mean()
print(df2)

In [None]:
# Bar chart of mean ride length per weekday, one bar per rider type.
# seaborn's barplot aggregates ride_length itself (its default estimator is the mean).
plt.figure(figsize=(15, 14))
axes = plt.gca()
sns.barplot(x='day_of_week', y='ride_length', hue='member_casual', data=df1, ax=axes)
axes.set_title('Average Ride Length vs. Day of the Week')
axes.set_xlabel('Day of the Week')
axes.set_ylabel('Average Ride Length (seconds)')

In [None]:
#Plot number of rides vs. day of the week
plt.figure(figsize=[15, 14])
sns.countplot(data = df1, x = 'day_of_week', hue = 'member_casual')
# Title and axis labels, matching the average-ride-length chart, so the figure
# can be read on its own.
plt.title('Number of Rides vs. Day of the Week')
plt.xlabel('Day of the Week')
plt.ylabel('Number of Rides')