In [1]:
"""
Processes the raw bike trip data to get information on bike dock locations
and when the docks were placed in those locations.

Desired output columns:

id | first | last | name | lat | lon | rides

where
- id is the station's id
- first is the earliest trip date for the station id
- last is the latest trip date for the station id (included in case docks are removed)
- name is the station's name
- lat and lon are the latitude and longitude of the station's location
- rides is a count of the number of rides found in the data -- it is used to remove dummy stations in the data.
    only stations with more than RIDES_COUNT_THRESHOLD are included in output

This script is abstracte to apply to multiple cities.
DON'T FORGET: update the 'CITY' variable

"""
from datetime import datetime
import math
import os

import pandas as pd
from zipfile import ZipFile

CITY = 'nyc'
# CITY = 'nyc'
print('city', CITY)


RIDES_COUNT_THRESHOLD = 100


def get_filepath(city):
    return '../data/' + city + '-bike/'


def transform_date(date):
    try:
        dt = datetime.strptime(date.split(' ')[0], '%m/%d/%Y')
    except ValueError:
        # this dataset is so frustrating lol
        dt = datetime.strptime(date.split(' ')[0], '%Y-%m-%d')
        
    return dt.strftime('%Y-%m-%d')

def open_zipfile(zipfilename):
    # Because someone dropped some gnarly mac osx files into their zips
    zipfile = ZipFile(zipfilename)
    filenames = [f.filename for f in zipfile.infolist()]
    # Return the first file that can be opened  - not all of them have .csv suffix
    for filename in filenames:
        try:
            df = pd.read_csv(zipfile.open(filename))
            if df.empty:
                print(filename,'has an empty file')
                continue
            else:
                return df
        except:
            print('failed to open filename from zip', zipfilename, ': ', filename)
            pass
    raise Exception('unable to read a csv from zipfile %s' % zipfilename)


city nyc


In [2]:
"""
make a dict like 
{"id": {"name": "", "lat": "", "lon": "", "first": "", "last": ""}}
where there is one entry for each id
and where the start time is always the earliest found

and then later transform it into a dict like

{'id': [id1, id2, id3], 'col_2': ['a', 'b', 'c', 'd']}

to then make into a dataframe and save as a CSV
"""

# input file column names for indexing data with
start_station_id = 'startstationid'
start_station_name = 'startstationname'
start_station_latitude = 'startstationlatitude'
start_station_longitude = 'startstationlongitude'
starttime = 'starttime'


    
# output file column names
ID = 'id'
NAME = 'name'
LAT = 'lat'
LON = 'lon'
FIRST = 'first'
LAST = 'last'
RIDES = 'rides'


In [3]:

def preprocess_stations_df(df):
    # Because someone can't make data files with uniform column names
    df.columns = df.columns.str.replace('number', 'id')  # 'Station Number' vs Station ID
    df.columns = df.columns.str.replace('date', 'time')  # 'Start Date' vs 'Start Time'
    df.columns = map(str.lower, df.columns)
    df.columns = df.columns.str.replace('[\ ]', '')
    # transform the dates
    df[starttime] = df[starttime].apply(transform_date)
    if CITY == "boston":
        df = preprocess_boston_stations_df(df)
    return df


# Some of the earlier bostons stations data does not include lat,lon coordinates.
# These files contains the lat,lon coordinates (and other data) for station IDs
hubway_stations_locations_filenames = [
    "Hubway_Stations_as_of_July_2017.csv",
    "previous_Hubway_Stations_as_of_July_2017.csv"
]

def get_hubway_stations_locations_df():
    df = pd.DataFrame()
    filenames = [get_filepath(CITY) + fname for fname in hubway_stations_locations_filenames]
    for filename in filenames:
        new_df = pd.read_csv(filename)    
        hubway_stations_locations_column_names = {
            "Station ID": start_station_id,
            "Latitude": start_station_latitude,
            "Longitude": start_station_longitude,
        }
        # Rename the column names to match the rides data that the locations data will be joined with
        new_df.rename(columns=hubway_stations_locations_column_names, inplace=True)
        df = new_df if df.empty else df.append(new_df)
    df.drop_duplicates(subset=[start_station_id], inplace=True)
    return df

hubway_stations_locations_df = None
if CITY == "boston":
    hubway_stations_locations_df = get_hubway_stations_locations_df()


def preprocess_boston_stations_df(df):
    if start_station_latitude in df.columns:
        return df
    # Otherwise this is one of the datasets that is lacking lat, lon info.
    # Add the lat,lon info
    return hubway_stations_locations_df.merge(df, on=start_station_id)


#print('hubwaystations shape', hubway_stations_locations_df.shape)
#hubway_stations_locations_df.head()

In [4]:
FILENAMES_TO_IGNORE = ['stations.csv'] + ['stations.json'] # + hubway_stations_locations_filenames + [more bad filenames here]


def stations_dict_to_df(stations_dict):
    new_dict = {
        ID: [],
        NAME: [],
        LAT: [],
        LON: [],
        FIRST: [],
        LAST: [],
        RIDES: []
    }
    for station_id, station_dict in stations_dict.items():
        new_dict[ID].append(station_id)
        new_dict[NAME].append(station_dict[NAME])
        new_dict[LAT].append(station_dict[LAT])
        new_dict[LON].append(station_dict[LON])
        new_dict[FIRST].append(station_dict[FIRST])
        new_dict[LAST].append(station_dict[LAST])
        new_dict[RIDES].append(station_dict[RIDES])
    
    return pd.DataFrame.from_dict(new_dict)
    


stations_dict = dict()

directory = get_filepath(CITY)
files_count = 0

for filename in os.listdir(directory):
    if filename in FILENAMES_TO_IGNORE:
        continue
        
    fullfilename = directory + filename
    print(files_count, ': handling file', filename)
    files_count+=1
    
    if filename.endswith(".csv"):
        stations_df = pd.read_csv(fullfilename)
    elif filename.endswith(".zip"):
        stations_df = open_zipfile(fullfilename)
    else:
        continue
    
    stations_df = preprocess_stations_df(stations_df)

    unique_station_ids = stations_df[start_station_id].unique()
    for station_id in unique_station_ids:
        station_df = stations_df[stations_df[start_station_id] == station_id]
        
        if station_id not in stations_dict:
            try:
                stations_dict[station_id] = {
                    NAME: station_df[start_station_name].iloc[0], 
                    LAT: station_df[start_station_latitude].iloc[0],
                    LON: station_df[start_station_longitude].iloc[0], 
                    FIRST: station_df[starttime].iloc[0], 
                    LAST: station_df[starttime].iloc[0],
                    RIDES: 0,
                }
            except IndexError as e:
                #print('caught index error', e)
                #print('station id', station_id)
                print ('in',filename,'caught index error',e)
                #print(station_df.head())
                continue
        rides_count = len(station_df.index)
        stations_dict[station_id][RIDES] += rides_count
        station_df = station_df.sort_values(by=[starttime])
        if (station_df[starttime].iloc[0] < stations_dict[station_id][FIRST]):
            stations_dict[station_id][FIRST] = stations_df[starttime].iloc[0]
        if (station_df[starttime].iloc[-1] > stations_dict[station_id][LAST]):
            stations_dict[station_id][LAST] = stations_df[starttime].iloc[-1]


stations_df = stations_dict_to_df(stations_dict)
stations_df.head()

0 : handling file 201309-citibike-tripdata.zip
1 : handling file 201608-citibike-tripdata.zip
2 : handling file 201807-citibike-tripdata.csv.zip
3 : handling file 201708-citibike-tripdata.csv.zip
4 : handling file 201910-citibike-tripdata.csv.zip
5 : handling file 201905-citibike-tripdata.csv.zip
6 : handling file 201510-citibike-tripdata.zip
7 : handling file 201405-citibike-tripdata.zip
8 : handling file 201812-citibike-tripdata.csv.zip
in 201812-citibike-tripdata.csv.zip caught index error single positional indexer is out-of-bounds
9 : handling file 201506-citibike-tripdata.zip
10 : handling file 201712-citibike-tripdata.csv.zip
11 : handling file 201902-citibike-tripdata.csv.zip
in 201902-citibike-tripdata.csv.zip caught index error single positional indexer is out-of-bounds
12 : handling file 201512-citibike-tripdata.zip
13 : handling file 201411-citibike-tripdata.zip
14 : handling file 201808-citibike-tripdata.csv.zip
in 201808-citibike-tripdata.csv.zip caught index error single 

Unnamed: 0,id,name,lat,lon,first,last,rides
0,254.0,W 11 St & 6 Ave,40.735324,-73.998004,2013-06-01,2019-10-31,185436
1,151.0,Cleveland Pl & Spring St,40.721816,-73.997203,2013-06-01,2019-10-31,475766
2,352.0,W 56 St & 6 Ave,40.763406,-73.977225,2013-06-01,2016-12-31,122203
3,490.0,8 Ave & W 33 St,40.751551,-73.993934,2013-06-01,2019-10-31,511839
4,236.0,St Marks Pl & 2 Ave,40.728419,-73.98714,2013-06-01,2019-10-31,311470


In [5]:
# Transform the stations_df
if CITY=='nyc':
    RIDES_COUNT_THRESHOLD=80
# Remove dummy stations (there are test stations in the data)
# Remove stations with less than RIDES_COUNT_THRESHOLD rides
bad_stations_df = stations_df[stations_df[RIDES] < RIDES_COUNT_THRESHOLD]
print('removing %d bad stations that each have less than %d rides from stations data' % (bad_stations_df.shape[0], RIDES_COUNT_THRESHOLD))
stations_df = stations_df[stations_df[RIDES] >= RIDES_COUNT_THRESHOLD]

removing 40 bad stations that each have less than 80 rides from stations data


In [6]:
bad_stations_df.head(10)

Unnamed: 0,id,name,lat,lon,first,last,rides
499,3250.0,NYCBS Depot - PIT,40.71691,-73.983838,2016-07-01,2019-03-31,48
532,3385.0,2 Ave & E 105 St,40.789817,-73.942961,2016-08-12,2016-08-12,1
587,3040.0,SSP Tech Workshop,40.646678,-74.016263,2016-07-01,2019-05-31,49
597,3036.0,8D OPS 01,40.662908,-73.999722,2016-07-01,2018-07-18,28
675,3485.0,NYCBS Depot - RIS,40.725208,-73.974724,2017-08-01,2018-07-31,45
856,3470.0,Expansion Tech Station,40.669802,-73.994905,2017-06-01,2017-08-18,9
857,3488.0,8D QC Station 01,45.506364,-73.569463,2017-08-22,2018-06-30,19
940,3849.0,Suydam St & St. Nicholas Ave,40.70636,-73.91945,2019-10-04,2019-10-31,78
953,3847.0,Cedar St & Evergreen Ave,40.69671,-73.92807,2019-10-11,2019-10-31,54
954,3851.0,Metropolitan Ave & Stewart Ave,40.71401,-73.92793,2019-10-12,2019-10-31,50


In [7]:
"""For the boston hubway/blue bikes data there will be duplicates because
when management changed from hubway to Bluebikes, the data fromat did too
This includes the station id/numbers and names AND lat/lon!
Task: deduplicate stations

Idea to understand data: sort the stations so the potential duplicates are next to each other
when merging/deduping data make sure to keep the earliest first and the latest last.

approach to deduplicating stations:
- normalize names and add new temporary column with normalized name
- get list of unique normalized names
- for each name:
    make a df for that name, sorted by [first, last]
    update main df to replace entries with that name with:
        first first
        last last
        last name
        rides as sum of rides
    sort main df by [name, first] and drop duplicates (duplicates on normalized name)
    remove normalized name column
"""

import re

NORMALIZED_NAME = 'normalized_name'

def normalized_station_name(name):
    normalized_name = name.lower()
    normalized_name = normalized_name.replace("former",  "").replace(" ", "")
    normalized_name = re.sub(r'[^a-z0-9]','', normalized_name)
    return normalized_name

if CITY == 'nyc':
    stations_df[NORMALIZED_NAME] = stations_df[NAME].apply(normalized_station_name)
    normalized_names = stations_df[NORMALIZED_NAME]
    print(normalized_names.shape[0], ' names')
    unique_normalized_names = stations_df[NORMALIZED_NAME].unique()
    print(unique_normalized_names.shape[0], ' unique normalized names') #, unique_normalized_names)


    n = 0
    for normalized_name in unique_normalized_names:
        print(n, 'handling name', normalized_name)
        n+=1
        name_df = stations_df[stations_df[NORMALIZED_NAME] == normalized_name]
        name_df.sort_values(by=[FIRST, LAST], inplace=True)
        first = name_df[FIRST].iloc[0]
        last = name_df[LAST].iloc[-1]
        name = name_df[NAME].iloc[-1]
        rides = name_df[RIDES].sum()
        update_condition = (stations_df[NORMALIZED_NAME] == normalized_name)
        stations_df.loc[update_condition, [FIRST, LAST, NAME, RIDES]] = first, last, name, rides

    stations_dropped_duplicates_df = stations_df.drop_duplicates(subset=[NORMALIZED_NAME])
    print('dropped %s rows based on duplicate names' % (int(stations_df.shape[0]) - int(stations_dropped_duplicates_df.shape[0])))
    stations_dropped_duplicates_df.drop(labels=[NORMALIZED_NAME], axis=1, inplace=True)
    
    stations_df = stations_dropped_duplicates_df


978  names
953  unique normalized names
0 handling name w11st6ave
1 handling name clevelandplspringst
2 handling name w56st6ave
3 handling name 8avew33st
4 handling name stmarkspl2ave
5 handling name frontstmaidenln
6 handling name frontstwashingtonst
7 handling name broadwayw39st
8 handling name e2stavenueb
9 handling name clermontaveparkave
10 handling name harrisonsthudsonst
11 handling name wytheavemetropolitanave
12 handling name pearlsthanoversquare
13 handling name w25st6ave
14 handling name w26st8ave
15 handling name fultonstwaverlyave
16 handling name broadwayw58st
17 handling name 2avee58st
18 handling name cliffstfultonst
19 handling name 6avew33st
20 handling name w20st7ave
21 handling name universityple14st
22 handling name e16st5ave
23 handling name e15st3ave
24 handling name e53stlexingtonave
25 handling name veseyplriverterrace
26 handling name sprucestnassaust
27 handling name e3st1ave
28 handling name stmarkspl1ave
29 handling name e55st2ave
30 handling name e43st2ave

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


 handling name e7stavenuea
34 handling name parkplchurchst
35 handling name greenwichave8ave
36 handling name w18st6ave
37 handling name 9avew22st
38 handling name wattsstgreenwichst
39 handling name broadwayw53st
40 handling name 9avew45st
41 handling name broadwayberryst
42 handling name henrystatlanticave
43 handling name broadwaye22st
44 handling name perrystbleeckerst
45 handling name leffertsplfranklinave
46 handling name centrestchambersst
47 handling name w17st8ave
48 handling name fdrdrivee35st
49 handling name broadwaye14st
50 handling name w52st5ave
51 handling name henrystgrandst
52 handling name w16stthehighline
53 handling name s4stwytheave
54 handling name washingtonstgansevoortst
55 handling name w4st7aves
56 handling name w52st9ave
57 handling name e11st2ave
58 handling name 1avee15st
59 handling name w31st7ave
60 handling name w34st11ave
61 handling name w41st8ave
62 handling name broadstbridgest
63 handling name washingtonpl6ave
64 handling name pershingsquares
65 ha

313 handling name williamstpinest
314 handling name bedfordaves9thst
315 handling name maconstnostrandave
316 handling name dekalbaveskillmanst
317 handling name maidenlnpearlst
318 handling name waterwhitehallplaza
319 handling name fultonstgrandave
320 handling name hanoverpllivingstonst
321 handling name greenwichavecharlesst
322 handling name stjamesplpearlst
323 handling name nassaustnavyst
324 handling name railroadavekayave
325 handling name 7avefarragutst
326 handling name flushingavecarltonave
327 handling name dekalbavehudsonave
328 handling name fultonstwilliamst
329 handling name shevchenkople6st
330 handling name n12stbedfordave
331 handling name tompkinsavehopkinsst
332 handling name mcguinnessblvdeaglest
333 handling name e84stparkave
334 handling name w13sthudsonst
335 handling name broadwaywhipplest
336 handling name riversidedrw78st
337 handling name n6stbedfordave
338 handling name normanaveleonardst
339 handling name 5avee73st
340 handling name 9avew28st
341 handlin

556 handling name clintonst4place
557 handling name berkeleypl6ave
558 handling name 5ave3st
559 handling name kanestclintonst
560 handling name 5st6ave
561 handling name 6ave9st
562 handling name 14st7ave
563 handling name 10st7ave
564 handling name 6st7ave
565 handling name 7st5ave
566 handling name courtststatest
567 handling name 6ave12st
568 handling name 12st4ave
569 handling name columbiastlorrainest
570 handling name 10st5ave
571 handling name henrystbayst
572 handling name carrollst5ave
573 handling name 3ave14st
574 handling name presidentsthenryst
575 handling name warrenstcourtst
576 handling name hoytstwarrenst
577 handling name degrawstsmithst
578 handling name clintonstcentrest
579 handling name deansthoytst
580 handling name prospectparkwest8st
581 handling name carrollst6ave
582 handling name carrollstsmithst
583 handling name courtstnelsonst
584 handling name 1plclintonst
585 handling name wolcottstdwightst
586 handling name columbiastw9st
587 handling name reedstvanb

836 handling name greenwichsthubertst
837 handling name schermerhornst3ave
838 handling name lexingtonavee36st
839 handling name w50st9ave
840 handling name bushwickavemckibbinst
841 handling name greenstmcguinnessblvd
842 handling name cadmanplazaejohnsonst
843 handling name cedarstmyrtleave
844 handling name 36ave31st
845 handling name avenuece18st
846 handling name centralparkwestw82st
847 handling name mckibbinstbogartst
848 handling name e11stavenueb
849 handling name e58st1avenwcorner
850 handling name willoughbyavewyckoffave
851 handling name 6avew34st
852 handling name froststmeekerave
853 handling name whitestmoorest
854 handling name 31stnewtownave
855 handling name w35stdyerave
856 handling name staggstmorganave
857 handling name georgestwilsonave
858 handling name centralavestarrstreet
859 handling name stantonstnorfolkst
860 handling name e12st4av
861 handling name greeneavmyrtleav
862 handling name 2avee72st
863 handling name pierrepontstmonroepl
864 handling name divisio

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [8]:
stations_df = stations_dropped_duplicates_df
stations_df.head(30)

Unnamed: 0,id,name,lat,lon,first,last,rides
0,254.0,W 11 St & 6 Ave,40.735324,-73.998004,2013-06-01,2019-10-31,185436
1,151.0,Cleveland Pl & Spring St,40.721816,-73.997203,2013-06-01,2019-10-31,475766
2,352.0,W 56 St & 6 Ave,40.763406,-73.977225,2013-06-01,2016-12-31,122203
3,490.0,8 Ave & W 33 St,40.751551,-73.993934,2013-06-01,2019-10-31,511839
4,236.0,St Marks Pl & 2 Ave,40.728419,-73.98714,2013-06-01,2019-10-31,311470
5,351.0,Front St & Maiden Ln,40.70531,-74.006126,2013-06-01,2019-10-31,144733
6,2000.0,Front St & Washington St,40.702551,-73.989402,2013-06-01,2019-10-31,153386
7,533.0,Broadway & W 39 St,40.752996,-73.987216,2013-06-01,2019-10-31,205178
8,301.0,E 2 St & Avenue B,40.722174,-73.983688,2013-06-01,2019-10-31,262958
9,421.0,Clermont Ave & Park Ave,40.695734,-73.971297,2013-06-01,2019-10-31,38168


In [9]:
# Save the data to CSV
save_to_csvfilename = directory + 'stations.csv'
stations_df.to_csv(save_to_csvfilename)
print('wrote data to ', save_to_csvfilename)

wrote data to  ../data/nyc-bike/stations.csv


In [10]:
# Save the data to JSON that will be used in web app
import json

stations = []
for index, row in stations_df.iterrows():
    # Transform the date
    date = row[5]
    
    stations.append({
        ID: str(row[ID]),
        NAME: row[NAME],
        LAT: row[LAT],
        LON: row[LON],
        FIRST: transform_date(row[FIRST]),
        LAST: transform_date(row[LAST]),
    })

json = json.dumps(stations)

save_to_jsonfilename = directory + 'stations.json'
with open(save_to_jsonfilename, 'w') as f:
    f.write(json)
print("Data written to stations.json")

Data written to stations.json


In [69]:
stations_df.head()

Unnamed: 0,id,name,lat,lon,first,last,rides
111,1,18 Dorrance Warehouse,42.387151,-71.075978,2015-03-01,2019-07-31,578
378,378,191 Beacon St,42.380323,-71.108786,2018-12-06,2019-07-31,3237
263,330,30 Dane St.,42.381001,-71.104025,2018-10-23,2019-07-31,5153
259,286,30 Dane St. (former),42.381123,-71.1041,2018-10-12,2018-10-31,170
425,M32026,359 Broadway - Broadway at Fayette Street,42.370803,-71.104412,2013-12-31,2014-07-01,13424
