In [40]:
# Dependencies and Setup
import pandas as pd
import glob

In [41]:
# List all Citi Bike CSV files in the working directory
citi_bike_csv_files = [file for file in glob.glob('Resources/*tripdata.csv')]
citi_bike_csv_files

['Resources/JC-201902-citibike-tripdata.csv',
 'Resources/JC-201908-citibike-tripdata.csv',
 'Resources/JC-202002-citibike-tripdata.csv',
 'Resources/JC-201910-citibike-tripdata.csv',
 'Resources/JC-201906-citibike-tripdata.csv',
 'Resources/JC-201912-citibike-tripdata.csv',
 'Resources/JC-201904-citibike-tripdata.csv',
 'Resources/JC-201903-citibike-tripdata.csv',
 'Resources/JC-201909-citibike-tripdata.csv',
 'Resources/JC-201901-citibike-tripdata.csv',
 'Resources/JC-202003-citibike-tripdata.csv',
 'Resources/JC-201907-citibike-tripdata.csv',
 'Resources/JC-201911-citibike-tripdata.csv',
 'Resources/JC-201905-citibike-tripdata.csv',
 'Resources/JC-202001-citibike-tripdata.csv']

In [42]:
# Read and consolidate all CSV files into one Data Frame
all_citi_bike_data = pd.concat([pd.read_csv(file) for file in citi_bike_csv_files])
all_citi_bike_data

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,142,2019-02-01 15:35:02.0820,2019-02-01 15:37:24.1360,3183,Exchange Place,40.716247,-74.033459,3639,Harborside,40.719252,-74.034234,29677,Subscriber,1963,1
1,223,2019-02-01 17:00:46.8900,2019-02-01 17:04:30.5500,3183,Exchange Place,40.716247,-74.033459,3681,Grand St,40.715178,-74.037683,26234,Subscriber,1992,2
2,106,2019-02-01 17:08:01.3260,2019-02-01 17:09:47.4400,3183,Exchange Place,40.716247,-74.033459,3184,Paulus Hook,40.714145,-74.033552,29588,Subscriber,1960,1
3,370,2019-02-01 17:09:31.2100,2019-02-01 17:15:41.6550,3183,Exchange Place,40.716247,-74.033459,3211,Newark Ave,40.721525,-74.046305,29250,Subscriber,1976,1
4,315,2019-02-01 17:19:53.2490,2019-02-01 17:25:09.1400,3183,Exchange Place,40.716247,-74.033459,3273,Manila & 1st,40.721651,-74.042884,29586,Subscriber,1980,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26015,544,2020-01-31 23:29:29.3910,2020-01-31 23:38:33.6910,3213,Van Vorst Park,40.718489,-74.047727,3194,McGinley Square,40.725340,-74.067622,29659,Subscriber,1989,1
26016,122,2020-01-31 23:30:59.3670,2020-01-31 23:33:01.6870,3792,Columbus Dr at Exchange Pl,40.716870,-74.032810,3639,Harborside,40.719252,-74.034234,42361,Subscriber,1991,1
26017,201,2020-01-31 23:42:34.8460,2020-01-31 23:45:55.8780,3273,Manila & 1st,40.721651,-74.042884,3209,Brunswick St,40.724176,-74.050656,42368,Subscriber,1988,1
26018,300,2020-01-31 23:45:00.6800,2020-01-31 23:50:00.8740,3185,City Hall,40.717733,-74.043845,3267,Morris Canal,40.712419,-74.038526,42257,Subscriber,1981,2


In [43]:
# Drop any rows with at least one null value
all_citi_bike_data_clean = all_citi_bike_data.dropna()
all_citi_bike_data_clean

print("Old data frame length:", len(all_citi_bike_data)) 
print("New data frame length:", len(all_citi_bike_data_clean)) 
print("Number of rows with at least 1 NA value: ", (len(all_citi_bike_data)-len(all_citi_bike_data_clean))) 

Old data frame length: 471648
New data frame length: 471648
Number of rows with at least 1 NA value:  0


In [44]:
# Verify data types
all_citi_bike_data_clean.dtypes

tripduration                 int64
starttime                   object
stoptime                    object
start station id             int64
start station name          object
start station latitude     float64
start station longitude    float64
end station id               int64
end station name            object
end station latitude       float64
end station longitude      float64
bikeid                       int64
usertype                    object
birth year                   int64
gender                       int64
dtype: object

In [45]:
# Verify gender data (0=unknown; 1=male; 2=female)
all_citi_bike_data_clean.gender.value_counts()

1    333427
2    108178
0     30043
Name: gender, dtype: int64

In [46]:
# Change data type of gender column to String
all_citi_bike_data_clean['gender'] = all_citi_bike_data_clean['gender'].astype(str)

# Replace gender values (0=>unknown; 1=>male; 2=>female)
all_citi_bike_data_clean['gender'].replace({'0':'Unknown', '1': 'Male', '2': 'Female'}, inplace=True)

# Verify gender data after replacement
all_citi_bike_data_clean.gender.value_counts()

Male       333427
Female     108178
Unknown     30043
Name: gender, dtype: int64

In [47]:
# Verify user type data
all_citi_bike_data_clean.usertype.value_counts()

Subscriber    423002
Customer       48646
Name: usertype, dtype: int64

In [48]:
# Check for any duplicate rows in the DataFra,e except first occurrence based on all columns
duplicate_rows_df = all_citi_bike_data_clean[all_citi_bike_data_clean.duplicated()]
 
print(f"Duplicate Rows except first occurrence based on all columns are: {len(duplicate_rows_df)}")
print()

Duplicate Rows except first occurrence based on all columns are: 0



In [51]:
all_citi_bike_data_clean["start station name"].value_counts()

Grove St PATH                 53425
Hamilton Park                 27545
Sip Ave                       22868
Harborside                    20884
Newport PATH                  20036
Marin Light Rail              16586
Newport Pkwy                  13112
Newark Ave                    12424
City Hall                     12124
Columbus Dr at Exchange Pl    11734
Warren St                     11569
Morris Canal                  11535
Brunswick & 6th               11520
Washington St                 11270
Jersey & 3rd                  10680
Paulus Hook                   10442
Jersey & 6th St               10414
Liberty Light Rail            10119
Columbus Drive                10036
Van Vorst Park                10016
Brunswick St                   9864
Monmouth and 6th               9603
Manila & 1st                   8889
Journal Square                 8679
McGinley Square                8628
Dixon Mills                    7966
JC Medical Center              7891
Essex Light Rail            

In [26]:
all_citi_bike_data_clean["start station name"].counts()

AttributeError: 'Series' object has no attribute 'counts'

In [50]:
# Convert the consolidated DataFrame into a csv file and export
all_citi_bike_data_clean.to_csv("Resources/citibike-tripdata-consolidated-201901-202003.csv", index=False, encoding="utf-8")