In [20]:
"""
Processes the raw bike trip data to get information on bike dock locations
and when the docks were placed in those locations.

Desired output columns:

id | first | last | name | lat | lon | rides

where
- id is the station's id
- first is the earliest trip date for the station id
- last is the latest trip date for the station id (included in case docks are removed)
- name is the station's name
- lat and lon are the latitude and longitude of the station's location
- rides is a count of the number of rides found in the data for the station id -- it is used to remove dummy stations in the data.
    only stations with at least RIDES_COUNT_THRESHOLD rides are included in output

"""
from datetime import datetime
import math
import os

import pandas as pd
from zipfile import ZipFile


# Minimum number of rides a station id needs in the trip data to be kept in the
# output; station ids with fewer rides are treated as dummy/test stations.
RIDES_COUNT_THRESHOLD = 100


def get_filepath(city):
    """
    Return the relative directory holding the bike data for `city`.

    The result always ends with a '/', so a file name can be appended to it
    directly.
    """
    return ''.join(['../data/', city, '-bike/'])


def transform_date(date):
    """
    Normalize a trip timestamp string to a 'YYYY-MM-DD' date string.

    Only the part before the first space is parsed. It is tried as
    month/day/year first and, if that fails, as year-month-day. A ValueError
    is raised when neither format matches.
    """
    day_part = date.split(' ')[0]
    try:
        parsed = datetime.strptime(day_part, '%m/%d/%Y')
    except ValueError:
        # The raw files are not consistent: some of them use ISO dates instead.
        parsed = datetime.strptime(day_part, '%Y-%m-%d')

    return parsed.strftime('%Y-%m-%d')

In [23]:

# Input file column names for indexing the trip data with.
# They are written lowercase and without spaces because the loading loop
# lowercases every CSV header and strips spaces from it before using them.
start_station_id = 'startstationid'
start_station_name = 'startstationname'
start_station_latitude = 'startstationlatitude'
start_station_longitude = 'startstationlongitude'
starttime = 'starttime'

# Output file column names; all but ID also serve as the keys of each
# per-station record dict.
ID = 'id'
NAME = 'name'
LAT = 'lat'
LON = 'lon'
FIRST = 'first'
LAST = 'last'
RIDES = 'rides'


"""
make a dict like 
{"id": {"name": "", "lat": "", "lon": "", "first": "", "last": ""}}
where there is one entry for each id
and where the start time is always the earliest found

and then later transform it into a dict like

{'id': [id1, id2, id3], 'col_2': ['a', 'b', 'c', 'd']}

to then make into a dataframe and save as a CSV
"""



def stations_dict_to_df(stations_dict):
    """
    Flatten the per-station records into a DataFrame with one row per station.

    `stations_dict` maps a station id to a record dict keyed by NAME, LAT, LON,
    FIRST, LAST and RIDES. The returned frame has the columns ID, NAME, LAT,
    LON, FIRST, LAST, RIDES (in that order), with rows in the dict's iteration
    order.
    """
    record_fields = (NAME, LAT, LON, FIRST, LAST, RIDES)
    records = list(stations_dict.values())

    # Build one list per output column; the dict keys become the ID column.
    columns = {ID: list(stations_dict.keys())}
    for field in record_fields:
        columns[field] = [record[field] for record in records]

    return pd.DataFrame.from_dict(columns)
    


# Walk every monthly trip archive for the city and accumulate, per start
# station id: its name and location (taken from the first trip seen), the
# earliest and latest trip date, and the total number of rides.
stations_dict = dict()

directory = get_filepath('nyc')

files_count = 0
for zipfilename in os.listdir(directory):
    if not zipfilename.endswith(".zip"):
        continue

    files_count += 1
    print(files_count, ': handling file', zipfilename)

    fullzipfilename = directory + zipfilename
    # Open the archive and its CSV member as context managers so both handles
    # are closed as soon as the trips have been loaded.
    with ZipFile(fullzipfilename) as archive:
        # Because someone dropped some gnarly mac osx files into their zips:
        # take the first .csv member, skipping the __MACOSX/ resource-fork
        # entries (their names end in .csv too but they are not CSV data).
        csvfilename = [
            f.filename for f in archive.infolist()
            if f.filename.endswith('.csv') and not f.filename.startswith('__MACOSX')
        ][0]
        with archive.open(csvfilename) as csvfile:
            trips_df = pd.read_csv(csvfile)

    # Because someone can't make data files with uniform column names:
    # lowercase every header and drop the spaces. regex=False is explicit
    # because the default of `regex` is not the same across pandas versions.
    trips_df.columns = trips_df.columns.str.lower().str.replace(' ', '', regex=False)
    # transform the dates into 'YYYY-MM-DD' strings; in that format plain
    # string comparison orders them chronologically
    trips_df[starttime] = trips_df[starttime].apply(transform_date)

    unique_station_ids = trips_df[start_station_id].unique()
    for station_id in unique_station_ids:
        # Trips without a start station id cannot be attributed to a dock.
        if pd.isna(station_id):
            print('found nan')
            continue
        station_df = trips_df[trips_df[start_station_id] == station_id]
        # Earliest / latest trip date of THIS station within the current file.
        first_date = station_df[starttime].min()
        last_date = station_df[starttime].max()

        if station_id not in stations_dict:
            stations_dict[station_id] = {
                NAME: station_df[start_station_name].iloc[0],
                LAT: station_df[start_station_latitude].iloc[0],
                LON: station_df[start_station_longitude].iloc[0],
                FIRST: first_date,
                LAST: last_date,
                RIDES: 0,
            }
        station = stations_dict[station_id]
        station[RIDES] += len(station_df.index)
        # Widen the recorded date range using the station's own trips. (The
        # dates must come from station_df, not from the whole file's frame,
        # otherwise another station's trip date ends up stored here.)
        if first_date < station[FIRST]:
            station[FIRST] = first_date
        if last_date > station[LAST]:
            station[LAST] = last_date


stations_df = stations_dict_to_df(stations_dict)
stations_df.head()

1 : handling file 201309-citibike-tripdata.zip
2 : handling file 201608-citibike-tripdata.zip
3 : handling file 201807-citibike-tripdata.csv.zip
4 : handling file 201708-citibike-tripdata.csv.zip
5 : handling file 201905-citibike-tripdata.csv.zip
6 : handling file 201510-citibike-tripdata.zip
7 : handling file 201405-citibike-tripdata.zip
8 : handling file 201812-citibike-tripdata.csv.zip
found nan
9 : handling file 201506-citibike-tripdata.zip
10 : handling file 201712-citibike-tripdata.csv.zip
11 : handling file 201902-citibike-tripdata.csv.zip
found nan
12 : handling file 201512-citibike-tripdata.zip
13 : handling file 201411-citibike-tripdata.zip
14 : handling file 201808-citibike-tripdata.csv.zip
found nan
15 : handling file 201407-citibike-tripdata.zip
16 : handling file 201504-citibike-tripdata.zip
17 : handling file 201707-citibike-tripdata.csv.zip
18 : handling file 201602-citibike-tripdata.zip
19 : handling file 201702-citibike-tripdata.csv.zip
20 : handling file 201612-citib

Unnamed: 0,id,name,lat,lon,first,last,rides
0,254.0,W 11 St & 6 Ave,40.735324,-73.998004,2013-06-01,2019-07-31,168076
1,151.0,Cleveland Pl & Spring St,40.721816,-73.997203,2013-06-01,2019-07-31,445639
2,352.0,W 56 St & 6 Ave,40.763406,-73.977225,2013-06-01,2016-12-31,122203
3,490.0,8 Ave & W 33 St,40.751551,-73.993934,2013-06-01,2019-07-31,480463
4,236.0,St Marks Pl & 2 Ave,40.728419,-73.98714,2013-06-01,2019-07-31,292217


In [24]:
# Transform the stations_df

# Drop dummy stations (the data contains test stations): every station whose
# ride count is below RIDES_COUNT_THRESHOLD is split off into bad_stations_df
# and the remaining stations are kept in stations_df.
bad_stations_df = stations_df.loc[stations_df[RIDES] < RIDES_COUNT_THRESHOLD]
print(
    'removing %d bad stations that each have less than %d rides from stations data'
    % (len(bad_stations_df.index), RIDES_COUNT_THRESHOLD)
)
stations_df = stations_df.loc[stations_df[RIDES] >= RIDES_COUNT_THRESHOLD]

removing 22 bad stations that each have less than 100 rides from stations data


In [26]:
# Show the first ten removed stations as a sanity check on what was dropped.
bad_stations_df.head(10)

Unnamed: 0,id,name,lat,lon,first,last,rides
499,3250.0,NYCBS Depot - PIT,40.71691,-73.983838,2016-07-01,2019-03-31,48
532,3385.0,2 Ave & E 105 St,40.789817,-73.942961,2016-08-12,2016-08-12,1
587,3040.0,SSP Tech Workshop,40.646678,-74.016263,2016-07-01,2019-05-31,49
597,3036.0,8D OPS 01,40.662908,-73.999722,2016-07-01,2018-07-18,28
675,3485.0,NYCBS Depot - RIS,40.725208,-73.974724,2017-08-01,2018-07-31,45
839,3683.0,Smart Cities Conference,40.710155,-73.984535,2018-05-01,2018-07-03,91
856,3470.0,Expansion Tech Station,40.669802,-73.994905,2017-06-01,2017-08-18,9
857,3488.0,8D QC Station 01,45.506364,-73.569463,2017-08-22,2018-06-30,19
932,3685.0,Prospect Park - 5 Year Anniversary Celebration,40.660652,-73.96459,2018-05-27,2018-05-27,72
941,3215.0,JSQ Don't Use,0.0,0.0,2017-10-01,2017-10-31,8


In [27]:
# Save the data to CSV
# The file is written into the same directory the trip archives were read from.
# NOTE(review): to_csv is called without an `index` argument, so by pandas'
# default the DataFrame index is written as an extra unnamed first column --
# confirm that readers of stations.csv expect that column.
save_to_csvfilename = directory + 'stations.csv'
stations_df.to_csv(save_to_csvfilename)
print('wrote data to ', save_to_csvfilename)

In [30]:
# Save the data to JSON that will be used in web app
import json

# One record per station. The ride count is deliberately left out: it was only
# needed to filter out the dummy stations.
stations = []
for _, row in stations_df.iterrows():
    stations.append({
        # ids are stored as floats in the frame; emit them as plain integers
        ID: int(row[ID]),
        NAME: row[NAME],
        LAT: row[LAT],
        LON: row[LON],
        FIRST: transform_date(row[FIRST]),
        LAST: transform_date(row[LAST]),
    })

save_to_jsonfilename = directory + 'stations.json'
with open(save_to_jsonfilename, 'w') as f:
    # Serialize straight into the file. The result is not bound to the name
    # `json`, which would shadow the module and make this cell fail when it
    # is run a second time.
    json.dump(stations, f)
print("Data written to stations.json")

Data written to stations.json


In [31]:
# Preview the first rows of the filtered stations table.
stations_df.head()

Unnamed: 0,id,name,lat,lon,first,last,rides
0,254.0,W 11 St & 6 Ave,40.735324,-73.998004,2013-06-01,2019-07-31,168076
1,151.0,Cleveland Pl & Spring St,40.721816,-73.997203,2013-06-01,2019-07-31,445639
2,352.0,W 56 St & 6 Ave,40.763406,-73.977225,2013-06-01,2016-12-31,122203
3,490.0,8 Ave & W 33 St,40.751551,-73.993934,2013-06-01,2019-07-31,480463
4,236.0,St Marks Pl & 2 Ave,40.728419,-73.98714,2013-06-01,2019-07-31,292217
