## Setup

In [1]:
# imports
import pandas as pd
import glob
import os

In [2]:
# find the csv files
# glob returns paths in filesystem-dependent order, so sort them: the row order of
# the combined frame (and which copy a later keep='first' de-duplication retains)
# must not change between machines or runs
files_list = sorted(glob.glob(os.path.join('Resources', '*.csv')))

# fail with a clear message instead of pd.concat's "No objects to concatenate"
if not files_list:
    raise FileNotFoundError("No CSV files found in the 'Resources' directory")

# combine them: read each file, then stack all frames under a fresh 0..n-1 index
dfs = []

for file in files_list:
    df = pd.read_csv(file)
    dfs.append(df)

citibike_df = pd.concat(dfs, ignore_index=True)

# display df
citibike_df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,0744109F13385D1D,electric_bike,2024-01-15 15:18:07,2024-01-15 15:32:44,Morris Canal,JC072,Oakland Ave,JC022,40.712297,-74.038185,40.737604,-74.052478,member
1,B1488BFEF9118000,classic_bike,2024-01-13 15:32:50,2024-01-13 15:36:18,JC Medical Center,JC110,Grove St PATH,JC115,40.715391,-74.049692,40.71941,-74.04309,member
2,95A2FE8E51B4C836,classic_bike,2024-01-19 13:11:00,2024-01-19 13:14:44,Morris Canal,JC072,Exchange Pl,JC116,40.712419,-74.038526,40.716366,-74.034344,member
3,95D9AFF6A1652DC1,classic_bike,2024-01-23 07:03:49,2024-01-23 07:07:11,Morris Canal,JC072,Exchange Pl,JC116,40.712419,-74.038526,40.716366,-74.034344,member
4,5F7408988A83B1B3,classic_bike,2024-01-01 16:46:10,2024-01-01 16:50:31,Morris Canal,JC072,Harborside,JC104,40.712419,-74.038526,40.719252,-74.034234,member


## Data Check

#### Update data types

In [4]:
# check data types: the dtype pandas inferred for each column when the CSVs were read
citibike_df.dtypes

ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object

In [5]:
# count the missing values in each column
citibike_df.isna().sum()

ride_id                  0
rideable_type            0
started_at               0
ended_at                 0
start_station_name     104
start_station_id       104
end_station_name      3061
end_station_id        3338
start_lat                0
start_lng                0
end_lat                353
end_lng                353
member_casual            0
dtype: int64

In [None]:
# update dates to datetime
# The timestamps appear both with and without fractional seconds
# (e.g. '2024-01-15 15:18:07' and '2024-12-28 09:45:30.704'). Both are ISO 8601,
# so parse with format='ISO8601': it accepts either variant but raises on a
# non-ISO string, whereas format='mixed' guesses a format for every element.
date_format = 'ISO8601'
citibike_df['started_at'] = pd.to_datetime(citibike_df['started_at'], format=date_format)
citibike_df['ended_at'] = pd.to_datetime(citibike_df['ended_at'], format=date_format)

In [9]:
# confirm the conversion: keep only the columns whose dtype is datetime64[ns]
citibike_df.select_dtypes(include=['datetime64[ns]'])

Unnamed: 0,started_at,ended_at
0,2024-01-15 15:18:07.000,2024-01-15 15:32:44.000
1,2024-01-13 15:32:50.000,2024-01-13 15:36:18.000
2,2024-01-19 13:11:00.000,2024-01-19 13:14:44.000
3,2024-01-23 07:03:49.000,2024-01-23 07:07:11.000
4,2024-01-01 16:46:10.000,2024-01-01 16:50:31.000
...,...,...
1052446,2024-12-28 09:45:30.704,2024-12-28 09:48:02.706
1052447,2024-12-12 16:21:50.427,2024-12-12 16:26:34.069
1052448,2024-12-11 19:23:24.109,2024-12-11 19:25:07.612
1052449,2024-12-12 20:48:40.471,2024-12-12 20:52:41.722


#### Check unique values

In [3]:
# number of (rows, columns) in citibike_df
citibike_df.shape

(1052451, 13)

In [13]:
# check number of unique values per column (missing values are not counted by default);
# a key column such as ride_id should have as many unique values as there are rows
citibike_df.nunique()

ride_id               1052427
rideable_type               2
started_at            1043629
ended_at              1044175
start_station_name        205
start_station_id          205
end_station_name          519
end_station_id            520
start_lat              104008
start_lng              110731
end_lat                   541
end_lng                   547
member_casual               2
dtype: int64

##### Look into why `ride_id` isn't unique → rides that run from one month into the next are counted in both months (unnecessary duplicates)

In [14]:
# collect every row whose ride_id occurs more than once (keep=False flags all copies),
# ordered by ride_id so the copies of each ride sit next to each other
dupes_df = (
    citibike_df
    .loc[citibike_df.duplicated(subset='ride_id', keep=False)]
    .sort_values('ride_id')
)

# display
dupes_df

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
420821,09D67B6866E802DA,classic_bike,2024-05-31 23:54:11.503,2024-06-01 00:19:31.421,Willow Ave & 12 St,HB505,Willow Ave & 12 St,HB505,40.751867,-74.030377,40.751867,-74.030377,casual
342531,09D67B6866E802DA,classic_bike,2024-05-31 23:54:11.000,2024-06-01 00:19:31.000,Willow Ave & 12 St,HB505,Willow Ave & 12 St,HB505,40.751867,-74.030377,40.751867,-74.030377,casual
360447,15413FAB9CC9F156,electric_bike,2024-05-31 23:51:10.638,2024-06-01 00:02:52.559,Hilltop,JC019,York St & Marin Blvd,JC097,40.731119,-74.057494,40.716615,-74.042412,casual
283209,15413FAB9CC9F156,electric_bike,2024-05-31 23:51:10.000,2024-06-01 00:02:52.000,Hilltop,JC019,York St & Marin Blvd,JC097,40.731119,-74.057494,40.716615,-74.042412,casual
256909,1AB4E466DCCC4147,classic_bike,2024-05-31 23:38:55.000,2024-06-01 05:20:10.000,South Waterfront Walkway - Sinatra Dr & 1 St,HB103,Hoboken Ave at Monmouth St,JC105,40.736982,-74.027781,40.735208,-74.046964,casual
414293,1AB4E466DCCC4147,classic_bike,2024-05-31 23:38:55.590,2024-06-01 05:20:10.327,South Waterfront Walkway - Sinatra Dr & 1 St,HB103,Hoboken Ave at Monmouth St,JC105,40.736982,-74.027781,40.735208,-74.046964,casual
363069,269CAE1A61663378,classic_bike,2024-05-31 23:55:31.802,2024-06-01 00:00:50.426,Hoboken Terminal - River St & Hudson Pl,HB102,Madison St & 1 St,HB402,40.736068,-74.029127,40.73879,-74.0393,member
273924,269CAE1A61663378,classic_bike,2024-05-31 23:55:31.000,2024-06-01 00:00:50.000,Hoboken Terminal - River St & Hudson Pl,HB102,Madison St & 1 St,HB402,40.736068,-74.029127,40.73879,-74.0393,member
395980,2FBED42B501D1159,electric_bike,2024-05-31 23:57:51.452,2024-06-01 00:08:06.554,Adams St & 12 St,HB610,City Hall - Washington St & 1 St,HB105,40.751833,-74.033343,40.73736,-74.03097,casual
305232,2FBED42B501D1159,electric_bike,2024-05-31 23:57:51.000,2024-06-01 00:08:06.000,Adams St & 12 St,HB610,City Hall - Washington St & 1 St,HB105,40.751833,-74.033343,40.73736,-74.03097,casual


In [15]:
# drop duplicate rows, keeping the first occurrence of each ride_id
citibike_df = citibike_df.drop_duplicates(subset=['ride_id'], keep='first')

# show number of unique ride id rows versus count; the two must match after de-duplication
# (outer double quotes: reusing the same quote type inside an f-string's braces
# is a SyntaxError before Python 3.12)
print(f"Total: {citibike_df['ride_id'].count()}")
print(f"Unique: {citibike_df['ride_id'].nunique()}")

Total: 1052427
Unique: 1052427
