In [8]:
"""
Processes the raw bike trip data to get information on bike dock locations
and when the docks were placed in those locations.

Desired output columns:

id | first | last | name | lat | lon | rides

where
- id is the station's id
- first is the earliest trip date for the station id
- last is the latest trip date for the station id (included in case docks are removed)
- name is the station's name
- lat and lon are the latitude and longitude of the station's location
- rides is a count of the number of rides found in the data -- it is used to remove dummy stations in the data.
    only stations with more than RIDES_COUNT_THRESHOLD are included in output

This script is abstracte to apply to multiple cities.
DON'T FORGET: update the 'CITY' variable

"""
from datetime import datetime
import math
import os

import pandas as pd
from zipfile import ZipFile
# CITY = 'dc'
# CITY = 'boston'
# CITY = 'nyc'
CITY = 'chicago'
print('city', CITY)


RIDES_COUNT_THRESHOLD = 100


def get_filepath(city):
    return '../data/' + city + '-bike/'


def transform_date(date):
    try:
        dt = datetime.strptime(date.split(' ')[0], '%m/%d/%Y')
    except ValueError:
        # this dataset is so frustrating lol
        dt = datetime.strptime(date.split(' ')[0], '%Y-%m-%d')
        
    return dt.strftime('%Y-%m-%d')

def open_zipfile(zipfilename):
    # Because someone dropped some gnarly mac osx files into their zips
    zipfile = ZipFile(zipfilename)
    filenames = [f.filename for f in zipfile.infolist()]
    # Return the first file that can be opened  - not all of them have .csv suffix
    for filename in filenames:
        try:
            df = pd.read_csv(zipfile.open(filename))
            return df
        except:
            print('failed to open filename from zip', zipfilename, ': ', filename)
            pass
    raise Exception('unable to read a csv from zipfile %s' % zipfilename)

def open_zipfile_dc(zipfilename):
## DC bike files from 2012 to 2017 have 4 files for each quarter
## this generator yields each of those files

    zipfile = ZipFile(zipfilename)
    files = [f.filename for f in zipfile.infolist()]

    for filename in files:
        try:
            df = pd.read_csv(zipfile.open(filename))
            yield df
        except:
            continue

def open_zipfile_chicago(zipfilename):

    zipfile = ZipFile(zipfilename)
    files = [f.filename for f in zipfile.infolist()]
    for filename in files:
        if filename[:8] == '__MACOSX' or filename.endswith('txt') or 'Divvy_Stations_2' in filename:
            continue
        if '/' in filename:
            if '/Divvy_Stations' in filename:
                continue
        try:
            print(filename)
            df = pd.read_csv(zipfile.open(filename))
            yield df
        except:
            continue



city chicago


In [9]:
"""
make a dict like 
{"id": {"name": "", "lat": "", "lon": "", "first": "", "last": ""}}
where there is one entry for each id
and where the start time is always the earliest found

and then later transform it into a dict like

{'id': [id1, id2, id3], 'col_2': ['a', 'b', 'c', 'd']}

to then make into a dataframe and save as a CSV
"""

# input file column names for indexing data with
start_station_id = 'startstationid'
start_station_name = 'startstationname'
start_station_latitude = 'startstationlatitude'
start_station_longitude = 'startstationlongitude'
starttime = 'starttime'


    
# output file column names
ID = 'id'
NAME = 'name'
LAT = 'lat'
LON = 'lon'
FIRST = 'first'
LAST = 'last'
RIDES = 'rides'


In [10]:

def preprocess_stations_df(df):
    # Because someone can't make data files with uniform column names
    df.columns = df.columns.str.replace('number', 'id')  # 'Station Number' vs Station ID
    df.columns = df.columns.str.replace('date', 'time')  # 'Start Date' vs 'Start Time'
    
    
    df.columns = map(str.lower, df.columns)
    df.columns = df.columns.str.replace('[\ ]', '')
    
    # transform the dates
    df[starttime] = df[starttime].apply(transform_date)
    if CITY == "boston":
        df = preprocess_boston_stations_df(df)
    return df


# Some of the earlier bostons stations data does not include lat,lon coordinates.
# These files contains the lat,lon coordinates (and other data) for station IDs
hubway_stations_locations_filenames = [
    "Hubway_Stations_as_of_July_2017.csv",
    "previous_Hubway_Stations_as_of_July_2017.csv"
]

def get_hubway_stations_locations_df():
    df = pd.DataFrame()
    filenames = [get_filepath(CITY) + fname for fname in hubway_stations_locations_filenames]
    for filename in filenames:
        new_df = pd.read_csv(filename)    
        hubway_stations_locations_column_names = {
            "Station ID": start_station_id,
            "Latitude": start_station_latitude,
            "Longitude": start_station_longitude,
        }
        # Rename the column names to match the rides data that the locations data will be joined with
        new_df.rename(columns=hubway_stations_locations_column_names, inplace=True)
        df = new_df if df.empty else df.append(new_df)
    df.drop_duplicates(subset=[start_station_id], inplace=True)
    return df

hubway_stations_locations_df = None
if CITY == "boston":
    hubway_stations_locations_df = get_hubway_stations_locations_df()


def preprocess_boston_stations_df(df):
    if start_station_latitude in df.columns:
        return df
    # Otherwise this is one of the datasets that is lacking lat, lon info.
    # Add the lat,lon info
    return hubway_stations_locations_df.merge(df, on=start_station_id)

def choose_chicago_columns(filename):
    if filename == 'Divvy_Trips_2018_Q1.zip' or filename == 'Divvy_Trips_2019_Q2.zip':
        return ('03-rentalstartstationid','03-rentalstartstationname', '', '', '01-rentaldetailslocalstarttime' )
    if filename[:5] == 'Divvy' and filename != 'Divvy_Trips_2020_Q1.zip':
        if filename[:7] == 'Divvy_S' or int(filename[12:16]) < 2017:
            return ('from_station_id', 'from_station_name', '', '', 'starttime')
        return ('from_station_id', 'from_station_name', '', '', 'start_time')
    return ('start_station_id', 'start_station_name', 'start_lat', 'start_lng', 'started_at')



# hubway_stations_locations_df.head()

In [11]:
FILENAMES_TO_IGNORE = hubway_stations_locations_filenames + ['stations.csv'] + ['stations.json'] # + [more bad filenames here]


def stations_dict_to_df(stations_dict):
    new_dict = {
        ID: [],
        NAME: [],
        LAT: [],
        LON: [],
        FIRST: [],
        LAST: [],
        RIDES: []
    }
    for station_id, station_dict in stations_dict.items():
        new_dict[ID].append(station_id)
        new_dict[NAME].append(station_dict[NAME])
        new_dict[LAT].append(station_dict[LAT])
        new_dict[LON].append(station_dict[LON])
        new_dict[FIRST].append(station_dict[FIRST])
        new_dict[LAST].append(station_dict[LAST])
        new_dict[RIDES].append(station_dict[RIDES])
    
    return pd.DataFrame.from_dict(new_dict)
    


stations_dict = dict()
needs_lat_lon = set()
directory = get_filepath(CITY)
files_count = 0

for filename in os.listdir(directory):
    print(filename)
    if filename in FILENAMES_TO_IGNORE:# or filename[:6] not in ['202102']:
        continue
    if filename[:4] in ['2021', '2022'] and filename[:6] not in ['202101'] and CITY == 'boston' or (filename[:4] in ['2020', '2021', '2022'] and filename[:6] not in ['202001', '202002', '202003']) and CITY == 'dc' :
        start_station_id = 'start_station_id'
        start_station_name = 'start_station_name'
        start_station_latitude = 'start_lat'
        start_station_longitude = 'start_lng'
        starttime = 'started_at'
    elif CITY != 'dc' and CITY != 'chicago':
        start_station_id = 'startstationid'
        start_station_name = 'startstationname'
        start_station_latitude = 'startstationlatitude'
        start_station_longitude = 'startstationlongitude'
        starttime = 'starttime'
    elif CITY == 'chicago':
        start_station_id, start_station_name, start_station_latitude, start_station_longitude, starttime = choose_chicago_columns(filename)
    else:
        start_station_id = 'startstationid'
        start_station_name = 'startstation'
        start_station_latitude = ""
        start_station_longitude = ""
        starttime = "starttime"


    fullfilename = directory + filename
    print(files_count, ': handling file', filename)
    files_count+=1
    
    if filename.endswith(".csv"):
        stations_dfs = [pd.read_csv(fullfilename)]
    elif filename.endswith(".zip") and CITY == 'ny':
        stations_dfs = [open_zipfile(fullfilename)]
    elif filename.endswith(".zip") and CITY == 'dc':
        stations_dfs = [df for df in open_zipfile_dc(fullfilename)]
    elif filename.endswith(".zip") and CITY == 'chicago':
        stations_dfs = [df for df in open_zipfile_chicago(fullfilename)]
    else:
        continue
    
    for stations_df in stations_dfs:

        stations_df = preprocess_stations_df(stations_df)
        
        unique_station_ids = stations_df[start_station_id].unique()
        for station_id in unique_station_ids:
            station_df = stations_df[stations_df[start_station_id] == station_id]
            
            if station_id not in stations_dict:
                try:
                    stations_dict[station_id] = {
                        NAME: station_df[start_station_name].iloc[0], 
                        LAT: station_df[start_station_latitude].iloc[0],
                        LON: station_df[start_station_longitude].iloc[0], 
                        FIRST: station_df[starttime].iloc[0], 
                        LAST: station_df[starttime].iloc[0],
                        RIDES: 0,
                    }
                except Exception as e:
                    
                    if type(e).__name__ == 'KeyError':
                        
                        stations_dict[station_id] = {
                            NAME: station_df[start_station_name].iloc[0], 
                            LAT: 0,
                            LON: 0,
                            FIRST: station_df[starttime].iloc[0], 
                            LAST: station_df[starttime].iloc[0],
                            RIDES: 0,
                        }
                        needs_lat_lon.add(station_id)
                    else:    
                        continue
            if station_id in needs_lat_lon and starttime == 'started_at':
                stations_dict[station_id][LAT] = station_df[start_station_latitude].iloc[0]
                stations_dict[station_id][LON] = station_df[start_station_longitude].iloc[0]
                needs_lat_lon.remove(station_id)
            rides_count = len(station_df.index)
            stations_dict[station_id][RIDES] += rides_count
            station_df = station_df.sort_values(by=[starttime])
            if (station_df[starttime].iloc[0] < stations_dict[station_id][FIRST]):
                stations_dict[station_id][FIRST] = stations_df[starttime].iloc[0]
            if (station_df[starttime].iloc[-1] > stations_dict[station_id][LAST]):
                stations_dict[station_id][LAST] = stations_df[starttime].iloc[-1]


stations_df = stations_dict_to_df(stations_dict)
stations_df.head()

202004-divvy-tripdata.zip
0 : handling file 202004-divvy-tripdata.zip
202004-divvy-tripdata.csv


  df.columns = df.columns.str.replace('[\ ]', '')


202005-divvy-tripdata.zip
1 : handling file 202005-divvy-tripdata.zip
202005-divvy-tripdata.csv


  df.columns = df.columns.str.replace('[\ ]', '')


202006-divvy-tripdata.zip
2 : handling file 202006-divvy-tripdata.zip
202006-divvy-tripdata.csv


  df.columns = df.columns.str.replace('[\ ]', '')


202007-divvy-tripdata.zip
3 : handling file 202007-divvy-tripdata.zip
202007-divvy-tripdata.csv


  df.columns = df.columns.str.replace('[\ ]', '')


202008-divvy-tripdata.zip
4 : handling file 202008-divvy-tripdata.zip
202008-divvy-tripdata.csv


  df.columns = df.columns.str.replace('[\ ]', '')


202009-divvy-tripdata.zip
5 : handling file 202009-divvy-tripdata.zip
202009-divvy-tripdata.csv


  df.columns = df.columns.str.replace('[\ ]', '')


202010-divvy-tripdata.zip
6 : handling file 202010-divvy-tripdata.zip
202010-divvy-tripdata.csv


  df.columns = df.columns.str.replace('[\ ]', '')


202011-divvy-tripdata.zip
7 : handling file 202011-divvy-tripdata.zip
202011-divvy-tripdata.csv


  df.columns = df.columns.str.replace('[\ ]', '')


202012-divvy-tripdata.zip
8 : handling file 202012-divvy-tripdata.zip
202012-divvy-tripdata.csv


  df.columns = df.columns.str.replace('[\ ]', '')


202101-divvy-tripdata.zip
9 : handling file 202101-divvy-tripdata.zip
202101-divvy-tripdata.csv


  df.columns = df.columns.str.replace('[\ ]', '')


202102-divvy-tripdata.zip
10 : handling file 202102-divvy-tripdata.zip
202102-divvy-tripdata.csv


  df.columns = df.columns.str.replace('[\ ]', '')


202103-divvy-tripdata.zip
11 : handling file 202103-divvy-tripdata.zip
202103-divvy-tripdata.csv


  df.columns = df.columns.str.replace('[\ ]', '')


202104-divvy-tripdata.zip
12 : handling file 202104-divvy-tripdata.zip
202104-divvy-tripdata.csv


  df.columns = df.columns.str.replace('[\ ]', '')


202105-divvy-tripdata.zip
13 : handling file 202105-divvy-tripdata.zip
202105-divvy-tripdata.csv


  df.columns = df.columns.str.replace('[\ ]', '')


202106-divvy-tripdata.zip
14 : handling file 202106-divvy-tripdata.zip
202106-divvy-tripdata.csv


  df.columns = df.columns.str.replace('[\ ]', '')


202107-divvy-tripdata.zip
15 : handling file 202107-divvy-tripdata.zip
202107-divvy-tripdata.csv


  df.columns = df.columns.str.replace('[\ ]', '')


202108-divvy-tripdata.zip
16 : handling file 202108-divvy-tripdata.zip
202108-divvy-tripdata.csv


  df.columns = df.columns.str.replace('[\ ]', '')


202109-divvy-tripdata.zip
17 : handling file 202109-divvy-tripdata.zip
202109-divvy-tripdata.csv


  df.columns = df.columns.str.replace('[\ ]', '')


202110-divvy-tripdata.zip
18 : handling file 202110-divvy-tripdata.zip
202110-divvy-tripdata.csv


  df.columns = df.columns.str.replace('[\ ]', '')


202111-divvy-tripdata.zip
19 : handling file 202111-divvy-tripdata.zip
202111-divvy-tripdata.csv


  df.columns = df.columns.str.replace('[\ ]', '')


202112-divvy-tripdata.zip
20 : handling file 202112-divvy-tripdata.zip
202112-divvy-tripdata.csv


  df.columns = df.columns.str.replace('[\ ]', '')


202201-divvy-tripdata.zip
21 : handling file 202201-divvy-tripdata.zip
202201-divvy-tripdata.csv


  df.columns = df.columns.str.replace('[\ ]', '')


202202-divvy-tripdata.zip
22 : handling file 202202-divvy-tripdata.zip
202202-divvy-tripdata.csv


  df.columns = df.columns.str.replace('[\ ]', '')


202203-divvy-tripdata.zip
23 : handling file 202203-divvy-tripdata.zip
202203-divvy-tripdata.csv


  df.columns = df.columns.str.replace('[\ ]', '')


202204-divvy-tripdata.zip
24 : handling file 202204-divvy-tripdata.zip
202204-divvy-tripdata.csv


  df.columns = df.columns.str.replace('[\ ]', '')


202205-divvy-tripdata.zip
25 : handling file 202205-divvy-tripdata.zip
202205-divvy-tripdata.csv


  df.columns = df.columns.str.replace('[\ ]', '')


Divvy_Stations_Trips_2013.zip
26 : handling file Divvy_Stations_Trips_2013.zip
Divvy_Stations_Trips_2013/Divvy_Trips_2013.csv


  df = pd.read_csv(zipfile.open(filename))
  df.columns = df.columns.str.replace('[\ ]', '')


Divvy_Stations_Trips_2014_Q1Q2.zip
27 : handling file Divvy_Stations_Trips_2014_Q1Q2.zip
Divvy_Trips_2014_Q1Q2.csv


  df.columns = df.columns.str.replace('[\ ]', '')


Divvy_Stations_Trips_2014_Q3Q4.zip
28 : handling file Divvy_Stations_Trips_2014_Q3Q4.zip
Divvy_Stations_Trips_2014_Q3Q4/Divvy_Trips_2014-Q3-07.csv
Divvy_Stations_Trips_2014_Q3Q4/Divvy_Trips_2014-Q3-0809.csv
Divvy_Stations_Trips_2014_Q3Q4/Divvy_Trips_2014-Q4.csv


  df.columns = df.columns.str.replace('[\ ]', '')
  df.columns = df.columns.str.replace('[\ ]', '')
  df.columns = df.columns.str.replace('[\ ]', '')


Divvy_Trips_2015-Q1Q2.zip
29 : handling file Divvy_Trips_2015-Q1Q2.zip
Divvy_Trips_2015-Q1.csv
Divvy_Trips_2015-Q2.csv


  df.columns = df.columns.str.replace('[\ ]', '')
  df.columns = df.columns.str.replace('[\ ]', '')


Divvy_Trips_2015_Q3Q4.zip
30 : handling file Divvy_Trips_2015_Q3Q4.zip
Divvy_Trips_2015_Q4.csv
Divvy_Trips_2015_09.csv
Divvy_Trips_2015_08.csv
Divvy_Trips_2015_07.csv


  df.columns = df.columns.str.replace('[\ ]', '')
  df.columns = df.columns.str.replace('[\ ]', '')
  df.columns = df.columns.str.replace('[\ ]', '')
  df.columns = df.columns.str.replace('[\ ]', '')


Divvy_Trips_2016_Q1Q2.zip
31 : handling file Divvy_Trips_2016_Q1Q2.zip
Divvy_Trips_2016_Q1Q2/Divvy_Trips_2016_04.csv
Divvy_Trips_2016_Q1Q2/Divvy_Trips_2016_05.csv
Divvy_Trips_2016_Q1Q2/Divvy_Trips_2016_06.csv
Divvy_Trips_2016_Q1Q2/Divvy_Trips_2016_Q1.csv


  df.columns = df.columns.str.replace('[\ ]', '')
  df.columns = df.columns.str.replace('[\ ]', '')
  df.columns = df.columns.str.replace('[\ ]', '')
  df.columns = df.columns.str.replace('[\ ]', '')


Divvy_Trips_2016_Q3Q4.zip
32 : handling file Divvy_Trips_2016_Q3Q4.zip
Divvy_Trips_2016_Q3.csv
Divvy_Trips_2016_Q4.csv


  df.columns = df.columns.str.replace('[\ ]', '')
  df.columns = df.columns.str.replace('[\ ]', '')


Divvy_Trips_2017_Q1Q2.zip
33 : handling file Divvy_Trips_2017_Q1Q2.zip
Divvy_Trips_2017_Q1.csv
Divvy_Trips_2017_Q2.csv


  df.columns = df.columns.str.replace('[\ ]', '')
  df.columns = df.columns.str.replace('[\ ]', '')


Divvy_Trips_2017_Q3Q4.zip
34 : handling file Divvy_Trips_2017_Q3Q4.zip
Divvy_Trips_2017_Q3.csv
Divvy_Trips_2017_Q4.csv


  df.columns = df.columns.str.replace('[\ ]', '')
  df.columns = df.columns.str.replace('[\ ]', '')


Divvy_Trips_2018_Q1.zip
35 : handling file Divvy_Trips_2018_Q1.zip
Divvy_Trips_2018_Q1.csv


  df.columns = df.columns.str.replace('[\ ]', '')


Divvy_Trips_2018_Q2.zip
36 : handling file Divvy_Trips_2018_Q2.zip
Divvy_Trips_2018_Q2.csv


  df.columns = df.columns.str.replace('[\ ]', '')


Divvy_Trips_2018_Q3.zip
37 : handling file Divvy_Trips_2018_Q3.zip
Divvy_Trips_2018_Q3.csv


  df.columns = df.columns.str.replace('[\ ]', '')


Divvy_Trips_2018_Q4.zip
38 : handling file Divvy_Trips_2018_Q4.zip
Divvy_Trips_2018_Q4.csv


  df.columns = df.columns.str.replace('[\ ]', '')


Divvy_Trips_2019_Q1.zip
39 : handling file Divvy_Trips_2019_Q1.zip
Divvy_Trips_2019_Q1.csv


  df.columns = df.columns.str.replace('[\ ]', '')


Divvy_Trips_2019_Q2.zip
40 : handling file Divvy_Trips_2019_Q2.zip
Divvy_Trips_2019_Q2.csv


  df.columns = df.columns.str.replace('[\ ]', '')


Divvy_Trips_2019_Q3.zip
41 : handling file Divvy_Trips_2019_Q3.zip
Divvy_Trips_2019_Q3.csv


  df.columns = df.columns.str.replace('[\ ]', '')


Divvy_Trips_2019_Q4.zip
42 : handling file Divvy_Trips_2019_Q4.zip
Divvy_Trips_2019_Q4.csv


  df.columns = df.columns.str.replace('[\ ]', '')


Divvy_Trips_2020_Q1.zip
43 : handling file Divvy_Trips_2020_Q1.zip
Divvy_Trips_2020_Q1.csv


  df.columns = df.columns.str.replace('[\ ]', '')


stations.csv
stations.json


Unnamed: 0,id,name,lat,lon,first,last,rides
0,86,Eckhart Park,41.8964,-87.661,2013-06-27,2020-11-07,44827
1,503,Drake Ave & Fullerton Ave,41.9244,-87.7154,2015-06-30,2020-11-07,13529
2,142,McClurg Ct & Erie St,41.8945,-87.6179,2015-06-30,2020-11-07,86932
3,216,California Ave & Division St,41.903,-87.6975,2013-06-27,2020-11-07,19399
4,125,Rush St & Hubbard St,41.8902,-87.6262,2015-06-30,2020-11-07,62760


In [12]:
# Transform the stations_df

# Remove dummy stations (there are test stations in the data)
# Remove stations with less than RIDES_COUNT_THRESHOLD rides
bad_stations_df = stations_df[stations_df[RIDES] < RIDES_COUNT_THRESHOLD]
print('removing %d bad stations that each have less than %d rides from stations data' % (bad_stations_df.shape[0], RIDES_COUNT_THRESHOLD))
stations_df = stations_df[stations_df[RIDES] >= RIDES_COUNT_THRESHOLD]
## Remove stations that do not have a latitude and longitude measure
bad_stations_2 = stations_df[stations_df[LAT] == 0]
print(f'removing {bad_stations_2.shape[0]} more stations for not having latitude or longitude')
stations_df = stations_df[stations_df[LAT] != 0]

removing 370 bad stations that each have less than 100 rides from stations data
removing 16 more stations for not having latitude or longitude


In [13]:
bad_stations_df.head(10)

Unnamed: 0,id,name,lat,lon,first,last,rides
545,562.0,Racine Ave & 61st St,41.7832,-87.6544,2016-09-30,2020-10-31,59
572,651.0,Michigan Ave & 71st St,41.7653,-87.6217,2018-10-01,2020-06-06,24
593,648.0,Carpenter St & 63rd St,41.7799,-87.6509,2018-10-01,2020-11-07,40
603,652.0,Rhodes Ave & 71st St,41.766,-87.6117,2019-04-01,2020-11-07,74
609,647.0,Elizabeth St & 59th St,41.7867,-87.6559,2019-01-01,2020-10-31,36
611,665.0,South Chicago Ave & Elliot Ave,41.747363,-87.580046,2019-01-01,2020-11-07,44
612,681.0,Halsted St & 78th St,41.752487,-87.643902,2020-07-09,2020-11-07,54
613,674.0,Michigan Ave & 71st St,41.765286,-87.621748,2020-07-09,2020-11-07,92
614,677.0,Stewart Ave & 83rd St,41.743717,-87.634088,2020-07-09,2020-11-07,62
616,679.0,Ashland Ave & 73rd St,41.761225,-87.663361,2020-07-31,2020-11-07,33


In [None]:
"""For the boston hubway/blue bikes data there will be duplicates because
when management changed from hubway to Bluebikes, the data fromat did too
This includes the station id/numbers and names AND lat/lon!
Task: deduplicate stations

Idea to understand data: sort the stations so the potential duplicates are next to each other
when merging/deduping data make sure to keep the earliest first and the latest last.

approach to deduplicating stations:
- normalize names and add new temporary column with normalized name
- get list of unique normalized names
- for each name:
    make a df for that name, sorted by [first, last]
    update main df to replace entries with that name with:
        first first
        last last
        last name
        rides as sum of rides
    sort main df by [name, first] and drop duplicates (duplicates on normalized name)
    remove normalized name column
"""

import re

NORMALIZED_NAME = 'normalized_name'

def normalized_station_name(name):
    normalized_name = name.lower()
    normalized_name = normalized_name.replace("former",  "").replace(" ", "")
    normalized_name = re.sub(r'[^a-z0-9]','', normalized_name)
    return normalized_name

if CITY == 'boston':
    stations_df[NORMALIZED_NAME] = stations_df[NAME].apply(normalized_station_name)
    normalized_names = stations_df[NORMALIZED_NAME]
    print(normalized_names.shape[0], ' names')
    unique_normalized_names = stations_df[NORMALIZED_NAME].unique()
    print(unique_normalized_names.shape[0], ' unique normalized names') #, unique_normalized_names)


    n = 0
    for normalized_name in unique_normalized_names:
        print(n, 'handling name', normalized_name)
        n+=1
        name_df = stations_df[stations_df[NORMALIZED_NAME] == normalized_name]
        name_df.sort_values(by=[FIRST, LAST], inplace=True)
        first = name_df[FIRST].iloc[0]
        last = name_df[LAST].iloc[-1]
        name = name_df[NAME].iloc[-1]
        rides = name_df[RIDES].sum()
        update_condition = (stations_df[NORMALIZED_NAME] == normalized_name)
        stations_df.loc[update_condition, [FIRST, LAST, NAME, RIDES]] = first, last, name, rides

    stations_dropped_duplicates_df = stations_df.drop_duplicates(subset=[NORMALIZED_NAME])
    print('dropped %s rows based on duplicate names' % (int(stations_df.shape[0]) - int(stations_dropped_duplicates_df.shape[0])))
    stations_dropped_duplicates_df.drop(labels=[NORMALIZED_NAME], axis=1, inplace=True)
    
    stations_df = stations_dropped_duplicates_df


In [14]:
# Save the data to CSV
save_to_csvfilename = directory + 'stations.csv'
stations_df.to_csv(save_to_csvfilename)
print('wrote data to ', save_to_csvfilename)

wrote data to  ../data/chicago-bike/stations.csv


In [15]:
# Save the data to JSON that will be used in web app
import json

stations = []
for index, row in stations_df.iterrows():
    # Transform the date
    date = row[5]
    
    stations.append({
        ID: str(row[ID]),
        NAME: row[NAME],
        LAT: row[LAT],
        LON: row[LON],
        FIRST: transform_date(row[FIRST]),
        LAST: transform_date(row[LAST]),
    })

json = json.dumps(stations)

save_to_jsonfilename = directory + 'stations.json'
with open(save_to_jsonfilename, 'w') as f:
    f.write(json)
print("Data written to stations.json")

Data written to stations.json


In [None]:
stations_df.head()

Unnamed: 0,id,name,lat,lon,first,last,rides
0,31208,M St & New Jersey Ave SE,38.8763,-77.0037,2010-09-20,2022-05-29,96870
1,31209,1st & N St SE,38.8743,-77.0057,2010-09-20,2022-05-29,71502
2,31600,5th & K St NW,38.90304,-77.019027,2010-09-20,2022-05-29,271413
3,31100,19th St & Pennsylvania Ave NW,38.9003,-77.0429,2010-09-20,2022-05-29,106021
4,31109,7th & T St NW,38.9155,-77.0222,2010-09-20,2022-05-29,192171
