In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Monthly Citi Bike trip exports to combine
citibike_data = ["Resources/202312-citibike-tripdata.csv", "Resources/202306-citibike-tripdata.csv"]

# One DataFrame per input file, collected here and concatenated once below
df_citibike_data = []

# Read each file into a DataFrame.
# - started_at / ended_at are parsed as timestamps at load time.
# - Station IDs (e.g. "5484.09") are identifiers, not numbers, so they are
#   read as strings to keep them exactly as written in the file.
for file in citibike_data:
    df = pd.read_csv(
        file,
        parse_dates=['started_at', 'ended_at'],
        dtype={'start_station_id': str, 'end_station_id': str},
    )
    df_citibike_data.append(df)

# Combine the data into a single dataset.
# ignore_index=True gives the result a fresh 0..N-1 index; without it every
# file contributes its own 0..n-1 labels and the combined frame ends up with
# duplicate row labels, which makes label-based lookups ambiguous.
df_citibike = pd.concat(df_citibike_data, axis=0, ignore_index=True)

# Show dataframe
df_citibike.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,FB18F431791D6F97,classic_bike,2023-12-07 12:40:22,2023-12-07 12:47:09,Allen St & Stanton St,5484.09,Carmine St & 6 Ave,5763.03,40.721818,-73.989172,40.730386,-74.00215,member
1,73DF56B794079C50,classic_bike,2023-12-29 13:47:27,2023-12-29 13:54:02,Carlton Ave & Dean St,4199.12,Union St & 4 Ave,4175.15,40.680974,-73.97101,40.677274,-73.98282,member
2,E3BA5AF851CC1CF0,classic_bike,2023-12-14 19:57:46,2023-12-14 20:15:12,W 84 St & Amsterdam Ave,7409.04,W 48 St & Rockefeller Plaza,6626.11,40.786249,-73.975446,40.757769,-73.979294,member
3,8F2CBCCB503B0398,electric_bike,2023-12-20 16:55:15,2023-12-20 17:04:03,E 85 St & York Ave,7146.04,Central Park West & W 85 St,7354.01,40.775369,-73.948034,40.78476,-73.969862,member
4,A28FFC9585DE8CC5,classic_bike,2023-12-30 14:43:15,2023-12-30 14:56:33,W 84 St & Amsterdam Ave,7409.04,E 58 St & 3 Ave,6762.02,40.786238,-73.975478,40.760958,-73.967245,member


In [3]:
# Check the column dtypes after loading: the timestamp columns should be
# datetime64 and the station ID columns should be object (strings).
df_citibike.dtypes

ride_id                       object
rideable_type                 object
started_at            datetime64[ns]
ended_at              datetime64[ns]
start_station_name            object
start_station_id              object
end_station_name              object
end_station_id                object
start_lat                    float64
start_lng                    float64
end_lat                      float64
end_lng                      float64
member_casual                 object
dtype: object

In [4]:
# Trip-duration bounds, in minutes. Trips at or outside these bounds are
# treated as outliers / false data (e.g. accidental undocks, bikes never docked).
MIN_TRIP_MINUTES = 2
MAX_TRIP_MINUTES = 24 * 60  # 24 hours

# Build the cleaned frame in one non-mutating chain. The original code called
# dropna(inplace=True) on a filtered frame, which can emit
# SettingWithCopyWarning on some pandas versions; a chain avoids that and keeps
# the cell safe to re-run.
df_citibike = (
    df_citibike
    .assign(
        # Ensure start and end times are datetimes (no-op if already parsed)
        started_at=lambda d: pd.to_datetime(d['started_at']),
        ended_at=lambda d: pd.to_datetime(d['ended_at']),
        # Trip duration in minutes
        duration_min=lambda d: (d['ended_at'] - d['started_at']).dt.total_seconds() / 60,
    )
    # Keep only trips strictly longer than the minimum and strictly shorter
    # than the maximum duration
    .loc[lambda d: (d['duration_min'] > MIN_TRIP_MINUTES) & (d['duration_min'] < MAX_TRIP_MINUTES)]
    # Drop rows with any missing values
    .dropna()
    # Renumber rows 0..N-1 after filtering
    .reset_index(drop=True)
)

In [5]:
# Show the first rows of the cleaned dataframe; the new duration_min column
# should appear as the last column.
df_citibike.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,duration_min
0,FB18F431791D6F97,classic_bike,2023-12-07 12:40:22,2023-12-07 12:47:09,Allen St & Stanton St,5484.09,Carmine St & 6 Ave,5763.03,40.721818,-73.989172,40.730386,-74.00215,member,6.783333
1,73DF56B794079C50,classic_bike,2023-12-29 13:47:27,2023-12-29 13:54:02,Carlton Ave & Dean St,4199.12,Union St & 4 Ave,4175.15,40.680974,-73.97101,40.677274,-73.98282,member,6.583333
2,E3BA5AF851CC1CF0,classic_bike,2023-12-14 19:57:46,2023-12-14 20:15:12,W 84 St & Amsterdam Ave,7409.04,W 48 St & Rockefeller Plaza,6626.11,40.786249,-73.975446,40.757769,-73.979294,member,17.433333
3,8F2CBCCB503B0398,electric_bike,2023-12-20 16:55:15,2023-12-20 17:04:03,E 85 St & York Ave,7146.04,Central Park West & W 85 St,7354.01,40.775369,-73.948034,40.78476,-73.969862,member,8.8
4,A28FFC9585DE8CC5,classic_bike,2023-12-30 14:43:15,2023-12-30 14:56:33,W 84 St & Amsterdam Ave,7409.04,E 58 St & 3 Ave,6762.02,40.786238,-73.975478,40.760958,-73.967245,member,13.3


In [6]:
# Check data types after cleaning; duration_min is expected to be float64
# and the timestamp columns should still be datetime64.
df_citibike.dtypes

ride_id                       object
rideable_type                 object
started_at            datetime64[ns]
ended_at              datetime64[ns]
start_station_name            object
start_station_id              object
end_station_name              object
end_station_id                object
start_lat                    float64
start_lng                    float64
end_lat                      float64
end_lng                      float64
member_casual                 object
duration_min                 float64
dtype: object

In [7]:
# Write the cleaned, combined dataset to a single CSV; the row index is
# omitted from the file.
output_path = Path('Resources/citibike_data.csv')
df_citibike.to_csv(output_path, index=False)