In [7]:
"""
Processes the raw bike trip data to get information on bike dock locations
and when the docks were placed in those locations.

Desired output columns:

id | first | last | name | lat | lon | rides

where
- id is the station's id
- first is the earliest trip date for the station id
- last is the latest trip date for the station id (included in case docks are removed)
- name is the station's name
- lat and lon are the latitude and longitude of the station's location
- rides is a count of the number of rides found in the data -- it is used to remove dummy stations in the data.
    only stations with more than RIDES_COUNT_THRESHOLD are included in output

This script is abstracte to apply to multiple cities.
DON'T FORGET: update the 'CITY' variable

"""
from datetime import datetime
import math
import os

import pandas as pd
from zipfile import ZipFile

CITY = 'boston'
# CITY = 'nyc'
print('city', CITY)


RIDES_COUNT_THRESHOLD = 100


def get_filepath(city):
    return '../data/' + city + '-bike/'


def transform_date(date):
    try:
        dt = datetime.strptime(date.split(' ')[0], '%m/%d/%Y')
    except ValueError:
        # this dataset is so frustrating lol
        dt = datetime.strptime(date.split(' ')[0], '%Y-%m-%d')
        
    return dt.strftime('%Y-%m-%d')

def open_zipfile(zipfilename):
    # Because someone dropped some gnarly mac osx files into their zips
    zipfile = ZipFile(zipfilename)
    filenames = [f.filename for f in zipfile.infolist()]
    # Return the first file that can be opened  - not all of them have .csv suffix
    for filename in filenames:
        try:
            df = pd.read_csv(zipfile.open(filename))
            return df
        except:
            print('failed to open filename from zip', zipfilename, ': ', filename)
            pass
    raise Exception('unable to read a csv from zipfile %s' % zipfilename)


city boston


In [8]:
"""
make a dict like 
{"id": {"name": "", "lat": "", "lon": "", "first": "", "last": ""}}
where there is one entry for each id
and where the start time is always the earliest found

and then later transform it into a dict like

{'id': [id1, id2, id3], 'col_2': ['a', 'b', 'c', 'd']}

to then make into a dataframe and save as a CSV
"""

# input file column names for indexing data with
start_station_id = 'startstationid'
start_station_name = 'startstationname'
start_station_latitude = 'startstationlatitude'
start_station_longitude = 'startstationlongitude'
starttime = 'starttime'


    
# output file column names
ID = 'id'
NAME = 'name'
LAT = 'lat'
LON = 'lon'
FIRST = 'first'
LAST = 'last'
RIDES = 'rides'


In [48]:

def preprocess_stations_df(df):
    # Because someone can't make data files with uniform column names
    df.columns = df.columns.str.replace('number', 'id')  # 'Station Number' vs Station ID
    df.columns = df.columns.str.replace('date', 'time')  # 'Start Date' vs 'Start Time'
    df.columns = map(str.lower, df.columns)
    df.columns = df.columns.str.replace('[\ ]', '')
    # transform the dates
    df[starttime] = df[starttime].apply(transform_date)
    if CITY == "boston":
        df = preprocess_boston_stations_df(df)
    return df


# Some of the earlier bostons stations data does not include lat,lon coordinates.
# These files contains the lat,lon coordinates (and other data) for station IDs
hubway_stations_locations_filenames = [
    "Hubway_Stations_as_of_July_2017.csv",
    "previous_Hubway_Stations_as_of_July_2017.csv"
]

def get_hubway_stations_locations_df():
    df = pd.DataFrame()
    filenames = [get_filepath(CITY) + fname for fname in hubway_stations_locations_filenames]
    for filename in filenames:
        new_df = pd.read_csv(filename)    
        hubway_stations_locations_column_names = {
            "Station ID": start_station_id,
            "Latitude": start_station_latitude,
            "Longitude": start_station_longitude,
        }
        # Rename the column names to match the rides data that the locations data will be joined with
        new_df.rename(columns=hubway_stations_locations_column_names, inplace=True)
        df = new_df if df.empty else df.append(new_df)
    df.drop_duplicates(subset=[start_station_id], inplace=True)
    return df

hubway_stations_locations_df = None
if CITY == "boston":
    hubway_stations_locations_df = get_hubway_stations_locations_df()


def preprocess_boston_stations_df(df):
    if start_station_latitude in df.columns:
        return df
    # Otherwise this is one of the datasets that is lacking lat, lon info.
    # Add the lat,lon info
    return hubway_stations_locations_df.merge(df, on=start_station_id)


print('hubwaystations shape', hubway_stations_locations_df.shape)
hubway_stations_locations_df.head()

hubwaystations shape (283, 7)


Unnamed: 0,# of Docks,Municipality,Station,publiclyExposed,startstationid,startstationlatitude,startstationlongitude
0,18,Boston,175 N Harvard St,1.0,A32019,42.363796,-71.129164
1,15,Somerville,191 Beacon St,1.0,S32035,42.380323,-71.108786
2,15,Somerville,30 Dane St.,1.0,S32023,42.381001,-71.104025
3,23,Cambridge,359 Broadway - Broadway at Fayette Street,1.0,M32026,42.370803,-71.104412
4,25,Cambridge,699 Mt Auburn St,1.0,M32054,42.375002,-71.148716


In [53]:
FILENAMES_TO_IGNORE = hubway_stations_locations_filenames  # + [more bad filenames here]


def stations_dict_to_df(stations_dict):
    new_dict = {
        ID: [],
        NAME: [],
        LAT: [],
        LON: [],
        FIRST: [],
        LAST: [],
        RIDES: []
    }
    for station_id, station_dict in stations_dict.items():
        new_dict[ID].append(station_id)
        new_dict[NAME].append(station_dict[NAME])
        new_dict[LAT].append(station_dict[LAT])
        new_dict[LON].append(station_dict[LON])
        new_dict[FIRST].append(station_dict[FIRST])
        new_dict[LAST].append(station_dict[LAST])
        new_dict[RIDES].append(station_dict[RIDES])
    
    return pd.DataFrame.from_dict(new_dict)
    


stations_dict = dict()

directory = get_filepath(CITY)

for filename in os.listdir(directory):
    if filename in FILENAMES_TO_IGNORE:
        continue
        
    fullfilename = directory + filename
    print(files_count, ': handling file', filename)
    
    if filename.endswith(".csv"):
        stations_df = pd.read_csv(fullfilename)
    elif filename.endswith(".zip"):
        stations_df = open_zipfile(fullfilename)
    else:
        continue
    
    stations_df = preprocess_stations_df(stations_df)

    unique_station_ids = stations_df[start_station_id].unique()
    for station_id in unique_station_ids:
        station_df = stations_df[stations_df[start_station_id] == station_id]
        
        if station_id not in stations_dict:
            stations_dict[station_id] = {
                NAME: station_df[start_station_name].iloc[0], 
                LAT: station_df[start_station_latitude].iloc[0],
                LON: station_df[start_station_longitude].iloc[0], 
                FIRST: station_df[starttime].iloc[0], 
                LAST: station_df[starttime].iloc[0],
                RIDES: 0,
            }
        rides_count = len(station_df.index)
        stations_dict[station_id][RIDES] += rides_count
        station_df = station_df.sort_values(by=[starttime])
        if (station_df[starttime].iloc[0] < stations_dict[station_id][FIRST]):
            stations_dict[station_id][FIRST] = stations_df[starttime].iloc[0]
        if (station_df[starttime].iloc[-1] > stations_dict[station_id][LAST]):
            stations_dict[station_id][LAST] = stations_df[starttime].iloc[-1]


stations_df = stations_dict_to_df(stations_dict)
stations_df.head()

0 : handling file 201512-hubway-tripdata.zip
preprocess_boston_stations_df
1 : handling file 201508-hubway-tripdata.zip
preprocess_boston_stations_df
2 : handling file 201511-hubway-tripdata.zip
preprocess_boston_stations_df
3 : handling file 201509-hubway-tripdata.zip
preprocess_boston_stations_df
4 : handling file 201510-hubway-tripdata.zip
preprocess_boston_stations_df
5 : handling file 201712-hubway-tripdata.zip
preprocess_boston_stations_df
6 : handling file 201810-bluebikes-tripdata.zip
preprocess_boston_stations_df
7 : handling file .DS_Store
7 : handling file 201709-hubway-tripdata.zip
preprocess_boston_stations_df
8 : handling file 201710-hubway-tripdata.zip
preprocess_boston_stations_df
9 : handling file 201708-hubway-tripdata.zip
preprocess_boston_stations_df
10 : handling file 201711-hubway-tripdata.zip
preprocess_boston_stations_df
11 : handling file 201501-hubway-tripdata.zip
preprocess_boston_stations_df
12 : handling file 201705-hubway-tripdata.zip
preprocess_boston_sta

Unnamed: 0,id,name,lat,lon,first,last,rides
0,9,Agganis Arena - 925 Comm Ave.,42.351246,-71.115639,2015-04-01,2019-07-31,46389
1,41,Packard's Corner - Comm. Ave. at Brighton Ave.,42.352261,-71.123831,2015-04-01,2019-07-31,44520
2,75,Lafayette Square at Mass Ave / Main St / Colum...,42.363465,-71.100573,2015-01-01,2019-07-31,61988
3,68,Central Square at Mass Ave / Essex St,42.36507,-71.1031,2015-01-01,2019-07-31,134736
4,36,Boston Public Library - 700 Boylston St.,42.349673,-71.077303,2015-04-01,2019-07-31,92076


In [54]:
# Transform the stations_df

# Remove dummy stations (there are test stations in the data)
# Remove stations with less than RIDES_COUNT_THRESHOLD rides
bad_stations_df = stations_df[stations_df[RIDES] < RIDES_COUNT_THRESHOLD]
print('removing %d bad stations that each have less than %d rides from stations data' % (bad_stations_df.shape[0], RIDES_COUNT_THRESHOLD))
stations_df = stations_df[stations_df[RIDES] >= RIDES_COUNT_THRESHOLD]

removing 36 bad stations that each have less than 100 rides from stations data


In [55]:
bad_stations_df.head(10)

Unnamed: 0,id,name,lat,lon,first,last,rides
203,306,Blue Hill Ave at Southwood St (former),42.31784,-71.07796,2018-10-01,2018-10-31,17
233,257,Codman Square Library (former),42.287361,-71.071111,2018-08-01,2018-10-31,67
235,293,Bartlett St at John Elliot Sq (former),42.329463,-71.090158,2018-09-01,2018-10-31,55
241,274,Bennington St at Constitution Beach (former),42.385249,-71.010601,2018-09-01,2018-10-31,30
244,256,Washington St at Talbot Ave (former),42.290333,-71.071806,2018-08-01,2018-10-31,97
245,263,Mattapan Library (former),42.277389,-71.09325,2018-08-01,2018-10-31,67
246,246,Roslindale Village - Washington St (former),42.286153,-71.128374,2018-08-01,2018-10-31,70
247,254,Talbot Ave At Blue Hill Ave (former),42.294583,-71.087111,2018-09-01,2018-10-31,40
250,245,Blue Hill Ave at Almont St (former),42.274545,-71.09372,2018-08-01,2018-10-31,37
251,253,Thetford Ave at Norfolk St (former),42.286251,-71.079463,2018-09-01,2018-10-31,45


In [114]:
"""For the boston hubway/blue bikes data there will be duplicates because
when management changed from hubway to Bluebikes, the data fromat did too
This includes the station id/numbers and names AND lat/lon!
Task: deduplicate stations

Idea to understand data: sort the stations so the potential duplicates are next to each other
when merging/deduping data make sure to keep the earliest first and the latest last.

approach to deduplicating stations:
- normalize names and add new temporary column with normalized name
- get list of unique normalized names
- for each name:
    make a df for that name, sorted by [first, last]
    update main df to replace entries with that name with:
        first first
        last last
        last name
        rides as sum of rides
    sort main df by [name, first] and drop duplicates (duplicates on normalized name)
    remove normalized name column
"""

import re

NORMALIZED_NAME = 'normalized_name'

def normalized_station_name(name):
    normalized_name = name.lower()
    normalized_name = normalized_name.replace("former",  "").replace(" ", "")
    normalized_name = re.sub(r'[^a-z0-9]','', normalized_name)
    return normalized_name

if CITY == 'boston':
    stations_df[NORMALIZED_NAME] = stations_df[NAME].apply(normalized_station_name)
    normalized_names = stations_df[NORMALIZED_NAME]
    print(normalized_names.shape[0], ' names')
    unique_normalized_names = stations_df[NORMALIZED_NAME].unique()
    print(unique_normalized_names.shape[0], ' unique normalized names') #, unique_normalized_names)


    n = 0
    for normalized_name in unique_normalized_names:
        print(n, 'handling name', normalized_name)
        n+=1
        name_df = stations_df[stations_df[NORMALIZED_NAME] == normalized_name]
        name_df.sort_values(by=[FIRST, LAST], inplace=True)
        first = name_df[FIRST].iloc[0]
        last = name_df[LAST].iloc[-1]
        name = name_df[NAME].iloc[-1]
        rides = name_df[RIDES].sum()
        update_condition = (stations_df[NORMALIZED_NAME] == normalized_name)
        stations_df.loc[update_condition, [FIRST, LAST, NAME, RIDES]] = first, last, name, rides

    stations_dropped_duplicates_df = stations_df.drop_duplicates(subset=[NORMALIZED_NAME])
    print('dropped %s rows based on duplicate names' % (int(stations_df.shape[0]) - int(stations_dropped_duplicates_df.shape[0])))
    stations_dropped_duplicates_df.drop(labels=[NORMALIZED_NAME], axis=1, inplace=True)
    
    stations_df = stations_dropped_duplicates_df


465  names
304  unique normalized names
0 handling name 18dorrancewarehouse
1 handling name 191beaconst
2 handling name 30danest
3

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


 handling name 359broadwaybroadwayatfayettestreet
4 handling name 699mtauburnst
5 handling name 700huronave
6 handling name 75binneyst
7 handling name 84cambridgeparkdr
8 handling name agganisarena925commave
9 handling name airporttstopbremenstatbrooksst
10 handling name albanystatebrooklinest
11 handling name alewifembtaatsteelplace
12 handling name alewifestationatrussellfield
13 handling name allstongreendistrictcommonwealthavegriggsst
14 handling name amesstatbroadway
15 handling name amesstatmainst
16 handling name andrewstationdorchesteraveathumboldtpl
17 handling name aquariumstation200atlanticave
18 handling name archdalerdatwashingtonst
19 handling name assemblysquaret
20 handling name bucentral725commave
21 handling name bcbshingham
22 handling name bcbsquincy
23 handling name bidmcbrooklineatburlingtonst
24 handling name backbaysouthendstation
25 handling name ballsq
26 handling name bartlettstatjohnelliotsq
27 handling name beaconstmassave
28 handling name beaconstatwashing

198 handling name mavericksquarelewismall
199 handling name mayormartinjwalsh28statest
200 handling name medfordstatcharlestownbcyf
201 handling name milkstatindiast
202 handling name mtauburn
203 handling name mtpleasantavedudleytowncommon
204 handling name murphyskatingrink1880dayblvd
205 handling name museumofscience
206 handling name ncaaawalnutaveatcrawfordst
207 handling name nashuastreetatredauerbachway
208 handling name newbalance20guestst
209 handling name newbalancestoreboylstonatdartmouth
210 handling name newburystherefordst
211 handling name newmarketsquare
212 handling name normanstatkelvinst
213 handling name northeasternunorthparkinglot
214 handling name oaksquare615washingtonst
215 handling name onebrighamcircle
216 handling name onebroadwaykendallsqatmainst3rdst
217 handling name onekendallsquareathampshirestportlandst
218 handling name onememorialdrive
219 handling name orientheightststopbenningtonstatsaratogast
220 handling name overlandstatbrooklineave
221 handling

Unnamed: 0,id,name,lat,lon,first,last,rides
111,1,18 Dorrance Warehouse,42.387151,-71.075978,2015-03-01,2019-07-31,578
378,378,191 Beacon St,42.380323,-71.108786,2018-12-06,2019-07-31,3237
263,330,30 Dane St.,42.381001,-71.104025,2018-10-12,2019-07-31,5323
425,M32026,359 Broadway - Broadway at Fayette Street,42.370803,-71.104412,2013-12-31,2019-07-31,61378
266,333,699 Mt Auburn St,42.375002,-71.148716,2018-08-01,2019-07-31,3257


In [117]:
stations_df = stations_dropped_duplicates_df
stations_df.head(30)

Unnamed: 0,id,name,lat,lon,first,last,rides
111,1,18 Dorrance Warehouse,42.387151,-71.075978,2015-03-01,2019-07-31,578
378,378,191 Beacon St,42.380323,-71.108786,2018-12-06,2019-07-31,3237
263,330,30 Dane St.,42.381001,-71.104025,2018-10-12,2019-07-31,5323
425,M32026,359 Broadway - Broadway at Fayette Street,42.370803,-71.104412,2013-12-31,2019-07-31,61378
266,333,699 Mt Auburn St,42.375002,-71.148716,2018-08-01,2019-07-31,3257
312,371,700 Huron Ave,42.380788,-71.154129,2018-11-07,2019-07-31,1073
482,398,75 Binney St,42.365507,-71.080138,2019-07-02,2019-07-31,749
261,319,84 Cambridgepark Dr,42.3936,-71.143941,2018-10-18,2019-07-31,826
335,A32002,Agganis Arena - 925 Comm Ave.,42.351692,-71.119035,2011-11-30,2019-07-31,73214
189,214,Airport T Stop - Bremen St at Brooks St,42.375339,-71.031304,2016-09-01,2019-07-31,3909


In [118]:
# Save the data to CSV
save_to_csvfilename = directory + 'stations.csv'
stations_df.to_csv(save_to_csvfilename)
print('wrote data to ', save_to_csvfilename)

wrote data to  ../data/boston-bike/stations.csv


In [120]:
# Save the data to JSON that will be used in web app
import json

stations = []
for index, row in stations_df.iterrows():
    # Transform the date
    date = row[5]
    
    stations.append({
        ID: str(row[ID]),
        NAME: row[NAME],
        LAT: row[LAT],
        LON: row[LON],
        FIRST: transform_date(row[FIRST]),
        LAST: transform_date(row[LAST]),
    })

json = json.dumps(stations)

save_to_jsonfilename = directory + 'stations.json'
with open(save_to_jsonfilename, 'w') as f:
    f.write(json)
print("Data written to stations.json")

Data written to stations.json


In [69]:
stations_df.head()

Unnamed: 0,id,name,lat,lon,first,last,rides
111,1,18 Dorrance Warehouse,42.387151,-71.075978,2015-03-01,2019-07-31,578
378,378,191 Beacon St,42.380323,-71.108786,2018-12-06,2019-07-31,3237
263,330,30 Dane St.,42.381001,-71.104025,2018-10-23,2019-07-31,5153
259,286,30 Dane St. (former),42.381123,-71.1041,2018-10-12,2018-10-31,170
425,M32026,359 Broadway - Broadway at Fayette Street,42.370803,-71.104412,2013-12-31,2014-07-01,13424
