In [58]:
import pandas as pd 

# Directory that holds the Blue Bikes trip data exports
prefix = '../Data/BlueBikesData/'

# One monthly export per season of 2023 (April, July, October, December)
files = [
    '202304-bluebikes-tripdata.csv',
    '202307-bluebikes-tripdata.csv',
    '202310-bluebikes-tripdata.csv',
    '202312-bluebikes-tripdata.csv'
]

# Load every monthly file into its own dataframe, keeping the order of `files`
dataframes = []
for file in files:
    dataframes.append(pd.read_csv(prefix + file))

# Tag each dataframe with the season of its month; labels follow the order of `files`
for frame, season_label in zip(dataframes, ['spring', 'summer', 'fall', 'winter']):
    frame['season'] = season_label

In [59]:
# Stack the seasonal dataframes row-wise into one frame with a fresh 0..n-1 index
trip_data_2023 = pd.concat(dataframes, axis=0, ignore_index=True)

In [60]:
# Dimensions of the combined frame as (rows, columns)
trip_data_2023.shape

(1313047, 14)

In [61]:
# Count the missing values in every column
trip_data_2023.isna().sum()

ride_id                  0
rideable_type            0
started_at               0
ended_at                 0
start_station_name       8
start_station_id         8
end_station_name      7343
end_station_id        7343
start_lat                0
start_lng                0
end_lat               7264
end_lng               7264
member_casual            0
season                   0
dtype: int64

In [62]:
# Discard every row that has at least one missing value. In this run that is
# 7,346 of 1,313,047 rows (about 0.6%), mostly trips without a recorded end station.
trip_data_2023 = trip_data_2023.dropna(axis=0, how='any')

In [63]:
import numpy as np

# Haversine function to compute distance from start to end station
def haversine(lat1, lon1, lat2, lon2, radius=3956):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point, in decimal degrees.
    radius : float, default 3956
        Radius of the sphere. The result is expressed in the same unit;
        the default is the earth radius in miles.

    Returns
    -------
    float or array-like
        Great circle distance(s) in the unit of `radius` (miles by default).
        Array-like inputs are processed element-wise.
    """

    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally above 1 for (nearly)
    # antipodal points, which would make arcsin return NaN; clamp it to 1.
    a = np.minimum(a, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return c * radius

In [65]:
# Display the trips table (pandas elides the middle rows of a large frame)
trip_data_2023

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,season
0,0093AA5E7E3E0158,docked_bike,2023-04-13 13:49:59,2023-04-13 13:55:04,Innovation Lab - 125 Western Ave at Batten Way,A32011,Soldiers Field Park - 111 Western Ave,A32006,42.363713,-71.124598,42.364263,-71.118276,member,spring
1,BFA8B88E063688F4,docked_bike,2023-04-25 09:44:38,2023-04-25 09:51:28,Museum of Science,M32045,One Broadway / Kendall Sq at Main St / 3rd St,M32003,42.367690,-71.071163,42.362242,-71.083111,member,spring
2,A9C51FA200C31A81,docked_bike,2023-04-24 18:39:31,2023-04-24 18:58:05,New Balance - 20 Guest St,D32001,HMS/HSPH - Avenue Louis Pasteur at Longwood Ave,B32003,42.357329,-71.146735,42.337417,-71.102861,casual,spring
3,0C1D451797FF0871,docked_bike,2023-04-04 19:25:31,2023-04-04 19:32:14,Museum of Science,M32045,Gore Street at Lambert Street,M32081,42.367690,-71.071163,42.373080,-71.086342,member,spring
4,DDDCD0A2D2EE7A37,docked_bike,2023-04-11 08:36:14,2023-04-11 08:52:39,Museum of Science,M32045,Columbus Ave at W. Canton St,C32077,42.367690,-71.071163,42.344742,-71.076482,member,spring
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1313042,4079F85781F2FB07,classic_bike,2023-12-20 08:22:55,2023-12-20 08:41:17,Lesley University,M32039,EF - North Point Park,M32034,42.386748,-71.119019,42.369885,-71.069957,member,winter
1313043,A2411C8176C88993,classic_bike,2023-12-13 08:22:31,2023-12-13 08:39:35,Lesley University,M32039,EF - North Point Park,M32034,42.386748,-71.119019,42.369885,-71.069957,member,winter
1313044,C3D217A504FC1222,classic_bike,2023-12-05 22:10:12,2023-12-05 22:35:39,Congress St at Boston City Hall,D32009,MIT Vassar St,M32042,42.360417,-71.057522,42.355601,-71.103945,casual,winter
1313045,105A4D5A6FB5CE19,classic_bike,2023-12-23 12:16:56,2023-12-23 12:41:35,Lesley University,M32039,MIT Vassar St,M32042,42.386748,-71.119019,42.355601,-71.103945,member,winter


In [66]:
# Compute distance of each bike trip (in miles) with one vectorized call.
# haversine is built from NumPy ufuncs, so it accepts whole columns at once;
# this avoids the Python-level function call per row that apply(axis=1) makes.
trip_data_2023['trip_distance'] = haversine(
    trip_data_2023['start_lat'],
    trip_data_2023['start_lng'],
    trip_data_2023['end_lat'],
    trip_data_2023['end_lng'],
)

In [67]:
# Summary statistics for the numeric columns, including trip_distance
trip_data_2023.describe()

Unnamed: 0,start_lat,start_lng,end_lat,end_lng,trip_distance
count,1305701.0,1305701.0,1305701.0,1305701.0,1305701.0
mean,42.3594,-71.09001,42.35933,-71.08972,1.236565
std,0.02066682,0.02952048,0.02068335,0.02957417,0.8813457
min,42.16723,-71.24776,42.16723,-71.24776,0.0
25%,42.34871,-71.10734,42.34871,-71.10729,0.6231077
50%,42.35957,-71.09116,42.35957,-71.09051,1.025209
75%,42.36895,-71.07119,42.36895,-71.07119,1.653843
max,42.53467,-70.87021,42.53467,-70.87021,15.62984


In [68]:
# Confirm that no column still holds missing values
trip_data_2023.isna().sum()

ride_id               0
rideable_type         0
started_at            0
ended_at              0
start_station_name    0
start_station_id      0
end_station_name      0
end_station_id        0
start_lat             0
start_lng             0
end_lat               0
end_lng               0
member_casual         0
season                0
trip_distance         0
dtype: int64

In [71]:
# Write the trips table (including trip_distance) to CSV.
# NOTE(review): `index` is left at its default, so the row index is written as an
# unnamed first column — confirm downstream readers expect it.
trip_data_2023.to_csv("../Data/BlueBikesData/trips_data_2023.csv")