In [1]:
# importing dependencies
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

In [2]:
# Load the Austin B-Cycle trip records from the CSV file into a dataframe
trips_csv_path = "resources/austin_B-Cycle_Trips.csv"
df_bike = pd.read_csv(trips_csv_path)

# Preview the first five rows of the raw data
df_bike.head(5)

Unnamed: 0,Trip ID,Membership Type,Bicycle ID,Checkout Date,Checkout Time,Checkout Kiosk ID,Checkout Kiosk,Return Kiosk ID,Return Kiosk,Trip Duration Minutes,Month,Year
0,9900285854,Annual (San Antonio B-cycle),207.0,10/26/2014,13:12:00,2537.0,West & 6th St.,2707.0,Rainey St @ Cummings,76,10.0,2014.0
1,9900285855,24-Hour Kiosk (Austin B-cycle),969.0,10/26/2014,13:12:00,2498.0,Convention Center / 4th St. @ MetroRail,2566.0,Pfluger Bridge @ W 2nd Street,58,10.0,2014.0
2,9900285856,Annual Membership (Austin B-cycle),214.0,10/26/2014,13:12:00,2537.0,West & 6th St.,2496.0,8th & Congress,8,10.0,2014.0
3,9900285857,24-Hour Kiosk (Austin B-cycle),745.0,10/26/2014,13:12:00,,Zilker Park at Barton Springs & William Barton...,,Zilker Park at Barton Springs & William Barton...,28,10.0,2014.0
4,9900285858,24-Hour Kiosk (Austin B-cycle),164.0,10/26/2014,13:12:00,2538.0,Bullock Museum @ Congress & MLK,,Convention Center/ 3rd & Trinity,15,10.0,2014.0


In [3]:
## Non-null entries per column: a column whose count is below the others has missing values
df_bike.notna().sum()

Trip ID                  991271
Membership Type          984960
Bicycle ID               990548
Checkout Date            991271
Checkout Time            991271
Checkout Kiosk ID        968117
Checkout Kiosk           991271
Return Kiosk ID          966858
Return Kiosk             991271
Trip Duration Minutes    991271
Month                    618479
Year                     618479
dtype: int64

In [4]:
# Replace every missing value with 0 in a new dataframe (the raw df_bike is left untouched).
# The fill applies to all columns, so text columns such as Membership Type also receive 0.
df_bike_na = df_bike.fillna(value=0)

# Every column should now report the same non-null count
df_bike_na.notna().sum()

Trip ID                  991271
Membership Type          991271
Bicycle ID               991271
Checkout Date            991271
Checkout Time            991271
Checkout Kiosk ID        991271
Checkout Kiosk           991271
Return Kiosk ID          991271
Return Kiosk             991271
Trip Duration Minutes    991271
Month                    991271
Year                     991271
dtype: int64

In [5]:
## Number of trips without a bicycle ID (the ID is 0 after the missing values were filled)

unlabeled_bike_mask = df_bike_na['Bicycle ID'].eq(0)
print(unlabeled_bike_mask.sum())

723


In [6]:
## Number of rows whose year / month is missing (0 after the missing values were filled)
missing_year_mask = df_bike_na['Year'].eq(0)
missing_month_mask = df_bike_na['Month'].eq(0)
print(missing_year_mask.sum())
print(missing_month_mask.sum())

372792
372792


In [7]:
## Rebuild the Month and Year columns (and add day-of-month / weekday columns)
## from the Checkout Date, which is populated on every row.
# Parse the date strings once; re-running the cell on already-parsed values is harmless.
df_bike_na['Checkout Date'] = pd.to_datetime(df_bike_na['Checkout Date'])
checkout_dates = df_bike_na['Checkout Date'].dt

df_bike_na['Year'] = checkout_dates.year
df_bike_na['Month'] = checkout_dates.month
# Day of the month (1-31); month names would be available via .dt.month_name() if needed
df_bike_na['Trip Date'] = checkout_dates.day
# .dt.weekday_name was removed from pandas; .dt.day_name() returns the same English names
df_bike_na['Trip Day of Week'] = checkout_dates.day_name()

df_bike_na.head(1)

Unnamed: 0,Trip ID,Membership Type,Bicycle ID,Checkout Date,Checkout Time,Checkout Kiosk ID,Checkout Kiosk,Return Kiosk ID,Return Kiosk,Trip Duration Minutes,Month,Year,Trip Date,Trip Day of Week
0,9900285854,Annual (San Antonio B-cycle),207.0,2014-10-26,13:12:00,2537.0,West & 6th St.,2707.0,Rainey St @ Cummings,76,10,2014,26,Sunday


In [8]:
## Rename the kiosk columns to "station" and prefix the month / year columns with "Trip"
column_renames = {
    "Checkout Kiosk ID": "Checkout Station ID",
    "Checkout Kiosk": "Checkout Station",
    "Return Kiosk ID": "Return Station ID",
    "Return Kiosk": "Return Station",
    "Month": "Trip Month",
    "Year": "Trip Year",
}
df_bike_na = df_bike_na.rename(columns=column_renames)
df_bike_na.head(1)


Unnamed: 0,Trip ID,Membership Type,Bicycle ID,Checkout Date,Checkout Time,Checkout Station ID,Checkout Station,Return Station ID,Return Station,Trip Duration Minutes,Trip Month,Trip Year,Trip Date,Trip Day of Week
0,9900285854,Annual (San Antonio B-cycle),207.0,2014-10-26,13:12:00,2537.0,West & 6th St.,2707.0,Rainey St @ Cummings,76,10,2014,26,Sunday


In [9]:
## Add a Trip Hour column holding the hour of day at which the bike was checked out
# The time strings are parsed without an explicit date, so the resulting timestamps carry
# a date that is not part of the data; only the hour component is used from them.
checkout_timestamps = pd.to_datetime(df_bike_na['Checkout Time'])
df_bike_na['Checkout Time'] = checkout_timestamps
df_bike_na['Trip Hour'] = checkout_timestamps.dt.hour
df_bike_na.head(1)

Unnamed: 0,Trip ID,Membership Type,Bicycle ID,Checkout Date,Checkout Time,Checkout Station ID,Checkout Station,Return Station ID,Return Station,Trip Duration Minutes,Trip Month,Trip Year,Trip Date,Trip Day of Week,Trip Hour
0,9900285854,Annual (San Antonio B-cycle),207.0,2014-10-26,2018-08-29 13:12:00,2537.0,West & 6th St.,2707.0,Rainey St @ Cummings,76,10,2014,26,Sunday,13


In [10]:
## Count the trips whose return station is recorded as "Stolen"
is_stolen = df_bike_na["Return Station"].eq("Stolen")
df_bike_stolen = df_bike_na.loc[is_stolen]
number_bike_stolen = df_bike_stolen["Return Station"].count()
number_bike_stolen

23

In [11]:
## Count the trips whose return station is recorded as "Missing"
is_missing = df_bike_na["Return Station"].eq("Missing")
df_bike_missing = df_bike_na.loc[is_missing]
number_bike_missing = df_bike_missing["Return Station"].count()
number_bike_missing

25

In [12]:
## Count the trips with a duration of zero minutes (treated as faulty data)
has_zero_duration = df_bike_na["Trip Duration Minutes"].eq(0)
df_bike_trip_minutes_zero = df_bike_na.loc[has_zero_duration]
number_bike_trip_minutes_zero = df_bike_trip_minutes_zero["Trip ID"].count()
number_bike_trip_minutes_zero

19033

In [13]:
## Count the trips whose Checkout Station ID is blank (0 after the missing values were filled)
has_blank_checkout_id = df_bike_na["Checkout Station ID"].eq(0)
df_bike_checkout_id_blank = df_bike_na.loc[has_blank_checkout_id]
number_df_bike_checkout_id_blank = df_bike_checkout_id_blank["Trip ID"].count()
number_df_bike_checkout_id_blank

23154

In [14]:
## Break the blank-ID trips down by checkout station name (most frequent first)
blank_checkout_id_mask = df_bike_na["Checkout Station ID"].eq(0)
df_bike_checkout_id_blank = df_bike_na.loc[blank_checkout_id_mask]
df_bike_checkout_id_blank["Checkout Station"].value_counts()

Zilker Park at Barton Springs & William Barton Drive    11534
Dean Keeton & Speedway                                   3825
ACC - West & 12th                                        2462
Convention Center/ 3rd & Trinity                         1292
Mobile Station                                           1183
East 11th Street at Victory Grill                        1030
Red River @ LBJ Library                                   584
Mobile Station @ Bike Fest                                516
Main Office                                               300
Bullock Museum @ Congress & MLK                           172
State Capitol @ 14th & Colorado                           111
MapJam at Pan Am Park                                      32
MapJam at French Legation                                  27
MapJam at Hops & Grain Brewery                             19
Repair Shop                                                15
MapJam at Scoot Inn                                        11
Shop    

In [15]:
## Number of distinct stations that appear on at least one trip with a blank checkout ID
df_bike_checkout_id_blank["Checkout Station"].nunique()

24

In [16]:
## Number of distinct checkout station names
len(df_bike_na["Checkout Station"].unique())

104

In [17]:
## Number of stations that never appear with a blank (zero) checkout station ID:
## all distinct stations minus the stations that have at least one blank-ID trip.
## Computed from the data instead of hard-coding the two counts (104 - 24).
## Note: a station that has both a blank and a real ID is excluded from this figure.
stations_total = df_bike_na["Checkout Station"].nunique()
stations_with_blank_id = df_bike_na.loc[
    df_bike_na["Checkout Station ID"] == 0, "Checkout Station"
].nunique()
stations_total - stations_with_blank_id

80

In [18]:
## Number of distinct Checkout Station ID values (the 0 used for blanks counts as one of them)
len(df_bike_na["Checkout Station ID"].unique())

84

In [19]:
## Distinct checkout station IDs, ordered from the most to the least frequently used
checkout_station_id_counts = df_bike_na["Checkout Station ID"].value_counts()
checkout_station_id_list = checkout_station_id_counts.index
checkout_station_id_list

## Excluding the zero used for blanks, there are 83 unique checkout station IDs

Float64Index([3798.0, 2575.0, 2499.0, 2494.0, 2501.0, 2707.0, 2495.0, 2498.0,
              2563.0, 2497.0, 2566.0,    0.0, 2552.0, 2548.0, 2549.0, 2567.0,
              2574.0, 2711.0, 2502.0, 2503.0, 2547.0, 2570.0, 2539.0, 2572.0,
              2496.0, 2504.0, 3841.0, 2537.0, 3792.0, 2542.0, 3377.0, 2565.0,
              3390.0, 2571.0, 2538.0, 3793.0, 3838.0, 2550.0, 2569.0, 3794.0,
              2562.0, 3795.0, 3513.0, 2540.0, 3797.0, 2822.0, 2564.0, 3619.0,
              3621.0, 2561.0, 3799.0, 2536.0, 3455.0, 2544.0, 3292.0, 2568.0,
              2541.0, 3687.0, 1007.0, 1008.0, 3291.0, 3684.0, 3293.0, 3686.0,
              3660.0, 2712.0, 2823.0, 2576.0, 3294.0, 3685.0, 2546.0, 2545.0,
              3635.0, 1006.0, 1002.0, 3464.0, 3790.0, 1003.0, 3381.0, 2500.0,
              3791.0, 3456.0, 1005.0, 1001.0],
             dtype='float64')

In [20]:
## There are more distinct non-zero checkout station IDs than stations without blank IDs,
## which suggests that some checkout stations carry more than one checkout station ID.

# Build a dictionary mapping each checkout station name to the set of checkout station IDs
# recorded for it, to see which stations have more than one ID.
# groupby takes every row into account (including the first row of each station) and avoids
# a slow Python-level loop over the whole dataframe; sort=False keeps the stations in order
# of first appearance.
Checkout_station_id = (
    df_bike_na
    .groupby('Checkout Station', sort=False)['Checkout Station ID']
    .apply(set)
    .to_dict()
)
Checkout_station_id

{'West & 6th St.': {2537.0},
 'Convention Center / 4th St. @ MetroRail': {2498.0},
 'Zilker Park at Barton Springs & William Barton Drive': {0.0},
 'Bullock Museum @ Congress & MLK': {0.0, 2538.0},
 '8th & Congress': {2496.0},
 'East 11th St. & San Marcos': {2569.0},
 'South Congress & Elizabeth': {2504.0},
 'Pfluger Bridge @ W 2nd Street': {2566.0},
 'Riverside @ S. Lamar': {2575.0},
 '2nd & Congress': {2494.0},
 'Convention Center/ 3rd & Trinity': {0.0},
 'East 6th at Robert Martinez': {2822.0},
 'East 6th & Pedernales St.': {2544.0},
 'Davis at Rainey Street': {2563.0},
 'UT West Mall @ Guadalupe': {2548.0},
 'East 11th Street at Victory Grill': {0.0},
 'Palmer Auditorium': {2567.0},
 'State Capitol Visitors Garage @ San Jacinto & 12th': {2561.0},
 'Rainey St @ Cummings': {2707.0},
 '5th & Bowie': {2501.0},
 'Long Center @ South 1st & Riverside': {2549.0},
 '17th & Guadalupe': {2540.0},
 'Red River & 8th Street': {2571.0},
 'Barton Springs Pool': {2572.0},
 'State Capitol @ 14th & C

In [21]:
## Number of distinct Trip IDs
len(df_bike_na["Trip ID"].unique())

## The Trip ID data is clean, so we can do analysis on it

991271

In [22]:
# Inspect the data type of every column before converting the ID and date-part columns
df_bike_na.dtypes

Trip ID                           int64
Membership Type                  object
Bicycle ID                      float64
Checkout Date            datetime64[ns]
Checkout Time            datetime64[ns]
Checkout Station ID             float64
Checkout Station                 object
Return Station ID               float64
Return Station                   object
Trip Duration Minutes             int64
Trip Month                        int64
Trip Year                         int64
Trip Date                         int64
Trip Day of Week                 object
Trip Hour                         int64
dtype: object

In [23]:
# Store the trip year / month / date and the ID columns as plain integers.
# The ID columns are float only because they contained missing values; those were filled
# with 0 earlier, so the cast cannot fail on NaN.
# Each column is converted from its OWN values: the station ID columns must not be
# overwritten with the bicycle IDs.
integer_columns = [
    'Trip Year',
    'Trip Month',
    'Trip Date',
    'Bicycle ID',
    'Return Station ID',
    'Checkout Station ID',
]
df_bike_na = df_bike_na.astype({column: int for column in integer_columns})
df_bike_na.dtypes

Trip ID                           int64
Membership Type                  object
Bicycle ID                        int32
Checkout Date            datetime64[ns]
Checkout Time            datetime64[ns]
Checkout Station ID               int32
Checkout Station                 object
Return Station ID                 int32
Return Station                   object
Trip Duration Minutes             int64
Trip Month                        int32
Trip Year                         int32
Trip Date                         int32
Trip Day of Week                 object
Trip Hour                         int64
dtype: object

In [None]:
## TODO: add a comment explaining that analysis on the trip year and date columns is also sound
# because those data are clean

In [None]:
## For the popular-stations analysis we only consider the top six checkout stations, so filling
# missing values with zero has no effect on the analysis and we don't need to remove those rows.
# (TODO: expand on this.)