# April 2018 vs. April 2019 Map Data

Both April 2018 and April 2019 Datasets include the following information: 

Trip Duration (seconds), 
Start Time and Date, 
Stop Time and Date, 
Start Station Name, 
End Station Name, 
Station ID, 
Station Lat/Long, 
Bike ID, 
User Type (Customer = 24-hour pass or 3-day pass user; Subscriber = Annual Member), 
Gender (Zero=unknown; 1=male; 2=female), 
Year of Birth

In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
# Import April 2018 & April 2019 data
april_2018_csv = ("Resources/JC-201904-citibike-tripdata.csv")
april_2019_csv = ("Resources/JC-201804-citibike-tripdata.csv")

In [3]:
# Convert to dataframes
april_2018_df = pd.read_csv(april_2018_csv)
april_2019_df = pd.read_csv(april_2019_csv)

In [4]:
# Confirm no null values
april_2018_df.count()

tripduration               33056
starttime                  33056
stoptime                   33056
start station id           33056
start station name         33056
start station latitude     33056
start station longitude    33056
end station id             33056
end station name           33056
end station latitude       33056
end station longitude      33056
bikeid                     33056
usertype                   33056
birth year                 33056
gender                     33056
dtype: int64

In [5]:
# Confirm no null values
april_2019_df.count()

tripduration               23634
starttime                  23634
stoptime                   23634
start station id           23634
start station name         23634
start station latitude     23634
start station longitude    23634
end station id             23634
end station name           23634
end station latitude       23634
end station longitude      23634
bikeid                     23634
usertype                   23634
birth year                 23634
gender                     23634
dtype: int64

In [6]:
# Describe 2018 data
april_2018_df.describe()

Unnamed: 0,tripduration,start station id,start station latitude,start station longitude,end station id,end station latitude,end station longitude,bikeid,birth year,gender
count,33056.0,33056.0,33056.0,33056.0,33056.0,33056.0,33056.0,33056.0,33056.0,33056.0
mean,713.9957,3277.874667,40.722867,-74.046403,3271.563166,40.722414,-74.045908,28477.660576,1980.998064,1.148233
std,7526.086,151.301819,0.007244,0.01087,157.57411,0.007081,0.01091,1530.401023,10.207287,0.503074
min,61.0,3183.0,40.709651,-74.083639,224.0,40.709651,-74.083639,20543.0,1888.0,0.0
25%,224.0,3192.0,40.718355,-74.050656,3187.0,40.717733,-74.050389,26288.0,1975.0,1.0
50%,332.0,3207.0,40.721525,-74.043845,3205.0,40.721124,-74.043117,29285.0,1984.0,1.0
75%,558.0,3275.0,40.727224,-74.038051,3273.0,40.727224,-74.038051,29540.0,1989.0,1.0
max,1047271.0,3694.0,40.748716,-74.032108,3694.0,40.813358,-73.956461,37987.0,2003.0,2.0


In [7]:
# Describe 2019 data
april_2019_df.describe()

Unnamed: 0,tripduration,start station id,start station latitude,start station longitude,end station id,end station latitude,end station longitude,bikeid,birth year,gender
count,23634.0,23634.0,23634.0,23634.0,23634.0,23634.0,23634.0,23634.0,23634.0,23634.0
mean,669.285098,3249.767411,40.722664,-74.045977,3245.752137,40.72229,-74.045539,29487.990268,1979.570915,1.130109
std,5017.25552,119.369448,0.007265,0.010689,123.465209,0.007094,0.010702,2754.82975,10.261424,0.504996
min,61.0,3183.0,40.69264,-74.096937,327.0,40.679331,-74.096937,15271.0,1888.0,0.0
25%,221.0,3187.0,40.718211,-74.050389,3186.0,40.717733,-74.049968,26279.25,1972.0,1.0
50%,326.0,3203.0,40.721525,-74.043845,3203.0,40.721124,-74.043117,29524.0,1982.0,1.0
75%,548.0,3269.0,40.727224,-74.038051,3269.0,40.727224,-74.038051,31811.0,1988.0,1.0
max,571578.0,3681.0,40.748716,-74.032108,3681.0,40.760875,-73.975195,33670.0,2002.0,2.0


In [8]:
# Get datatypes for 2018 dataframe
april_2018_df.dtypes

tripduration                 int64
starttime                   object
stoptime                    object
start station id             int64
start station name          object
start station latitude     float64
start station longitude    float64
end station id               int64
end station name            object
end station latitude       float64
end station longitude      float64
bikeid                       int64
usertype                    object
birth year                   int64
gender                       int64
dtype: object

In [9]:
# Get datatypes for 2019 dataframe
april_2019_df.dtypes

tripduration                 int64
starttime                   object
stoptime                    object
start station id             int64
start station name          object
start station latitude     float64
start station longitude    float64
end station id               int64
end station name            object
end station latitude       float64
end station longitude      float64
bikeid                       int64
usertype                    object
birth year                   int64
gender                       int64
dtype: object

## Create 2018 Start Stations Dataframe

In [10]:
# Create Start Stations 2018 dataframe
start_stations_2018_df = april_2018_df[["start station id", "start station name"]].copy()

# Organize and calculate total number of starts for each unique station
start_stations_2018_df = start_stations_2018_df.groupby("start station name").count()
start_stations_2018_df = start_stations_2018_df.sort_values(by = "start station id", ascending = False)

# Rename column Number of Starts column and reset index
start_stations_2018_df = start_stations_2018_df.rename(columns = {"start station id": "Number of Starts 2018"})
start_stations_2018_df = start_stations_2018_df.reset_index()

In [11]:
# Rename Station column
start_stations_2018_df = start_stations_2018_df.rename(columns = {"start station name": "Station Name"})

# Calculate Percentage of Total Starts 2018
start_stations_2018_df["Percentage of Total Starts 2018"] = round(start_stations_2018_df["Number of Starts 2018"]/start_stations_2018_df["Station Name"].count(),2)

start_stations_2018_df.head()

Unnamed: 0,Station Name,Number of Starts 2018,Percentage of Total Starts 2018
0,Grove St PATH,3728,73.1
1,Hamilton Park,1754,34.39
2,Exchange Place,1571,30.8
3,Sip Ave,1509,29.59
4,Newport PATH,1458,28.59


## Create 2019 Start Station Dataframe

In [12]:
# Create Start Stations 2019 dataframe
start_stations_2019_df = april_2019_df[["start station id", "start station name"]].copy()

# Organize and calculate total number of starts for each unique station
start_stations_2019_df = start_stations_2019_df.groupby("start station name").count()
start_stations_2019_df = start_stations_2019_df.sort_values(by = "start station id", ascending = False)

# Rename column Number of Starts column and reset index
start_stations_2019_df = start_stations_2019_df.rename(columns = {"start station id": "Number of Starts 2019"})
start_stations_2019_df = start_stations_2019_df.reset_index()

In [13]:
# Rename Station column
start_stations_2019_df = start_stations_2019_df.rename(columns = {"start station name": "Station Name"})

# Calculate Percentage of Total Starts 2019
start_stations_2019_df["Percentage of Total Starts 2019"] = round(start_stations_2019_df["Number of Starts 2019"]/start_stations_2019_df["Station Name"].count(),2)

start_stations_2019_df.head()

Unnamed: 0,Station Name,Number of Starts 2019,Percentage of Total Starts 2019
0,Grove St PATH,2876,49.59
1,Hamilton Park,1424,24.55
2,Exchange Place,1276,22.0
3,Sip Ave,1235,21.29
4,Newport PATH,1116,19.24


## Create 2018 End Stations Dataframe

In [14]:
# Create End Stations 2018 dataframe
end_stations_2018_df = april_2018_df[["end station id", "end station name"]].copy()

# Organize and calculate total number of etarts for each unique station
end_stations_2018_df = end_stations_2018_df.groupby("end station name").count()
end_stations_2018_df = end_stations_2018_df.sort_values(by = "end station id", ascending = False)

# Rename column Number of Ends column and reset index
end_stations_2018_df = end_stations_2018_df.rename(columns = {"end station id": "Number of Ends 2018"})
end_stations_2018_df = end_stations_2018_df.reset_index()

In [15]:
# Rename Station column
end_stations_2018_df = end_stations_2018_df.rename(columns = {"end station name": "Station Name"})

# Calculate Percentage of Total Ends 2018
end_stations_2018_df["Percentage of Total Ends 2018"] = round(end_stations_2018_df["Number of Ends 2018"]/end_stations_2018_df["Station Name"].count(),2)
end_stations_2018_df.head()

Unnamed: 0,Station Name,Number of Ends 2018,Percentage of Total Ends 2018
0,Grove St PATH,4577,73.82
1,Exchange Place,2006,32.35
2,Hamilton Park,1663,26.82
3,Sip Ave,1435,23.15
4,Harborside,1381,22.27


In [16]:
# Create End Stations 2019 dataframe
end_stations_2019_df = april_2019_df[["end station id", "end station name"]].copy()

# Organize and calculate total number of etarts for each unique station
end_stations_2019_df = end_stations_2019_df.groupby("end station name").count()
end_stations_2019_df = end_stations_2019_df.sort_values(by = "end station id", ascending = False)

# Rename column Number of Ends column and reset index
end_stations_2019_df = end_stations_2019_df.rename(columns = {"end station id": "Number of Ends 2019"})
end_stations_2019_df = end_stations_2019_df.reset_index()

In [17]:
# Rename Station column
end_stations_2019_df = end_stations_2019_df.rename(columns = {"end station name": "Station Name"})

# Calculate Percentage of Total Ends 2019
end_stations_2019_df["Percentage of Total Ends 2019"] = round(end_stations_2019_df["Number of Ends 2019"]/end_stations_2019_df["Station Name"].count(),2)
end_stations_2019_df.head()

Unnamed: 0,Station Name,Number of Ends 2019,Percentage of Total Ends 2019
0,Grove St PATH,3389,52.95
1,Exchange Place,1484,23.19
2,Hamilton Park,1438,22.47
3,Sip Ave,1108,17.31
4,Newport PATH,1098,17.16


## Determine Start and End Stations count/percentages in 2018 and 2019

In [18]:
# Confirm how many 2018 start stations there are
total_start_stations_2018 = start_stations_2018_df["Station Name"].nunique()
total_start_stations_2018

51

In [19]:
# Confirm how many 2018 end stations there are
total_end_stations_2018 = end_stations_2018_df["Station Name"].nunique()
total_end_stations_2018

62

In [20]:
# Confirm how many 2019 start stations there are
total_start_stations_2019 = start_stations_2019_df["Station Name"].nunique()
total_start_stations_2019

58

In [21]:
# Confirm how many 2019 end stations there are
total_end_stations_2019 = end_stations_2019_df["Station Name"].nunique()
total_end_stations_2019

64

In [22]:
start_station_growth_percent = round((total_start_stations_2019 - total_start_stations_2018)/(total_start_stations_2019)*100, 2)
start_station_growth_percent

12.07

In [23]:
end_station_growth_percent = round((total_end_stations_2019 - total_end_stations_2018)/(total_end_stations_2019)*100, 2)
end_station_growth_percent

3.12

## Merge 2018 Start and End Dataframes

In [24]:
complete_2018_stations_df = pd.merge(start_stations_2018_df, end_stations_2018_df, how = "right",
                                    on = ["Station Name"])

# Confirm the total amount of rows equals the total number of end stations
complete_2018_stations_df.shape

(62, 5)

In [25]:
complete_2018_stations_df.head()

Unnamed: 0,Station Name,Number of Starts 2018,Percentage of Total Starts 2018,Number of Ends 2018,Percentage of Total Ends 2018
0,Grove St PATH,3728.0,73.1,4577,73.82
1,Hamilton Park,1754.0,34.39,1663,26.82
2,Exchange Place,1571.0,30.8,2006,32.35
3,Sip Ave,1509.0,29.59,1435,23.15
4,Newport PATH,1458.0,28.59,1334,21.52


## Merge 2019 Start and End Dataframes

In [26]:
complete_2019_stations_df = pd.merge(start_stations_2019_df, end_stations_2019_df, how = "right",
                                    on = ["Station Name"])

# Confirm the total amount of rows equals the total number of end stations
complete_2019_stations_df.shape

(64, 5)

In [27]:
complete_2019_stations_df.head()

Unnamed: 0,Station Name,Number of Starts 2019,Percentage of Total Starts 2019,Number of Ends 2019,Percentage of Total Ends 2019
0,Grove St PATH,2876.0,49.59,3389,52.95
1,Hamilton Park,1424.0,24.55,1438,22.47
2,Exchange Place,1276.0,22.0,1484,23.19
3,Sip Ave,1235.0,21.29,1108,17.31
4,Newport PATH,1116.0,19.24,1098,17.16


## Calculate Number of Bikes and Percentage of Growth

In [28]:
total_bikes_2018 = april_2018_df["bikeid"].nunique()
total_bikes_2018

493

In [29]:
total_bikes_2019 = april_2019_df["bikeid"].nunique()
total_bikes_2019

554

In [30]:
bike_growth_percent = round((total_bikes_2019 - total_bikes_2018)/(total_bikes_2019)*100, 2)
bike_growth_percent

11.01

## Calculate Total Number of Trips in 2018 vs. 2019

In [34]:
total_trips_2018 = april_2018_df["tripduration"].count()
total_trips_2018

33056

In [35]:
total_trips_2019 = april_2019_df["tripduration"].count()
total_trips_2019

23634

In [37]:
trip_growth_percentage = round((total_trips_2019 - total_trips_2018)/(total_trips_2019)*100,2)
trip_growth_percentage

-39.87

## Calculate Length of Trips in 2018 vs. 2019

In [40]:
average_trip_length_2018 = april_2018_df["tripduration"].sum()/total_trips_2018
average_trip_length_2018

713.9956740077445

## Change Column Names

In [None]:
# Print column names
list(april_2018_df.columns.values) 

# Create list of new column names
new_column_names = ["Trip Duration in Seconds", 
                    "Start Time",
                   "Stop Time",
                   "Start Station ID",
                    "Start Station Name",
                   "Start Station Latitude",
                   "Start Station Longitude",
                    "Stop Station ID",
                    "Stop Station Name",
                   "Stop Station Latitude",
                   "Stop Station Longitude",
                    "Bike ID",
                    "User Type",
                    "Birth Year",
                    "Gender"]
# Set both dataframes with new column names
april_2018_df.columns= [new_column_names]
april_2019_df.columns= [new_column_names]