In [None]:
"""
Processes the raw bike trip data to get information on bike dock locations
and when the docks were placed in those locations.

Desired output columns:

id | first | last | name | lat | lon | rides

where
- id is the station's id
- first is the earliest trip date for the station id
- last is the latest trip date for the station id (included in case docks are removed)
- name is the station's name
- lat and lon are the latitude and longitude of the station's location
- rides is a count of the number of rides found in the data -- it is used to remove dummy stations in the data.
    only stations with more than RIDES_COUNT_THRESHOLD are included in output

This script is abstracte to apply to multiple cities.
DON'T FORGET: update the 'CITY' variable

"""
from datetime import datetime
import math
import os

import pandas as pd
from zipfile import ZipFile

# CITY = 'boston'
CITY = 'nyc'
print('city', CITY)


RIDES_COUNT_THRESHOLD = 100


def get_filepath(city):
    return '../data/' + city + '-bike/'


def transform_date(date):
    try:
        dt = datetime.strptime(date.split(' ')[0], '%m/%d/%Y')
    except ValueError:
        # this dataset is so frustrating lol
        dt = datetime.strptime(date.split(' ')[0], '%Y-%m-%d')
        
    return dt.strftime('%Y-%m-%d')

def open_zipfile(zipfilename):
    # Because someone dropped some gnarly mac osx files into their zips
    zipfile = ZipFile(zipfilename)
    filenames = [f.filename for f in zipfile.infolist()]
    # Return the first file that can be opened  - not all of them have .csv suffix
    for filename in filenames:
        try:
            df = pd.read_csv(zipfile.open(filename))
            return df
        except:
            print('failed to open filename from zip', zipfilename, ': ', filename)
            pass
    raise Exception('unable to read a csv from zipfile %s' % zipfilename)


In [None]:
"""
make a dict like 
{"id": {"name": "", "lat": "", "lon": "", "first": "", "last": ""}}
where there is one entry for each id
and where the start time is always the earliest found

and then later transform it into a dict like

{'id': [id1, id2, id3], 'col_2': ['a', 'b', 'c', 'd']}

to then make into a dataframe and save as a CSV
"""

# input file column names for indexing data with
start_station_id = 'startstationid'
start_station_name = 'startstationname'
start_station_latitude = 'startstationlatitude'
start_station_longitude = 'startstationlongitude'
starttime = 'starttime'


    
# output file column names
ID = 'id'
NAME = 'name'
LAT = 'lat'
LON = 'lon'
FIRST = 'first'
LAST = 'last'
RIDES = 'rides'


In [28]:

def preprocess_stations_df(df):
    # Because someone can't make data files with uniform column names
    df.columns = df.columns.str.replace('number', 'id')  # 'Station Number' vs Station ID
    df.columns = df.columns.str.replace('date', 'time')  # 'Start Date' vs 'Start Time'
    
    
    df.columns = map(str.lower, df.columns)
    df.columns = df.columns.str.replace('[\ ]', '')
    
    # transform the dates
    df[starttime] = df[starttime].apply(transform_date)
    if CITY == "boston":
        df = preprocess_boston_stations_df(df)
    return df


# Some of the earlier bostons stations data does not include lat,lon coordinates.
# These files contains the lat,lon coordinates (and other data) for station IDs
hubway_stations_locations_filenames = [
    "Hubway_Stations_as_of_July_2017.csv",
    "previous_Hubway_Stations_as_of_July_2017.csv"
]

def get_hubway_stations_locations_df():
    df = pd.DataFrame()
    filenames = [get_filepath(CITY) + fname for fname in hubway_stations_locations_filenames]
    for filename in filenames:
        new_df = pd.read_csv(filename)    
        hubway_stations_locations_column_names = {
            "Station ID": start_station_id,
            "Latitude": start_station_latitude,
            "Longitude": start_station_longitude,
        }
        # Rename the column names to match the rides data that the locations data will be joined with
        new_df.rename(columns=hubway_stations_locations_column_names, inplace=True)
        df = new_df if df.empty else df.append(new_df)
    df.drop_duplicates(subset=[start_station_id], inplace=True)
    return df

hubway_stations_locations_df = None
if CITY == "boston":
    hubway_stations_locations_df = get_hubway_stations_locations_df()


def preprocess_boston_stations_df(df):
    if start_station_latitude in df.columns:
        return df
    # Otherwise this is one of the datasets that is lacking lat, lon info.
    # Add the lat,lon info
    return hubway_stations_locations_df.merge(df, on=start_station_id)



# hubway_stations_locations_df.head()

In [31]:
FILENAMES_TO_IGNORE = hubway_stations_locations_filenames + ['stations.csv'] + ['stations.json'] # + [more bad filenames here]


def stations_dict_to_df(stations_dict):
    new_dict = {
        ID: [],
        NAME: [],
        LAT: [],
        LON: [],
        FIRST: [],
        LAST: [],
        RIDES: []
    }
    for station_id, station_dict in stations_dict.items():
        new_dict[ID].append(station_id)
        new_dict[NAME].append(station_dict[NAME])
        new_dict[LAT].append(station_dict[LAT])
        new_dict[LON].append(station_dict[LON])
        new_dict[FIRST].append(station_dict[FIRST])
        new_dict[LAST].append(station_dict[LAST])
        new_dict[RIDES].append(station_dict[RIDES])
    
    return pd.DataFrame.from_dict(new_dict)
    


stations_dict = dict()

directory = get_filepath(CITY)
files_count = 0

for filename in os.listdir(directory):
    print(filename)
    if filename in FILENAMES_TO_IGNORE:# or filename[:6] not in ['202102']:
        continue
    if filename[:4] in ['2021', '2022'] and filename[:6] not in ['202101']:
        start_station_id = 'start_station_id'
        start_station_name = 'start_station_name'
        start_station_latitude = 'start_lat'
        start_station_longitude = 'start_lng'
        starttime = 'started_at'
    else:
        start_station_id = 'startstationid'
        start_station_name = 'startstationname'
        start_station_latitude = 'startstationlatitude'
        start_station_longitude = 'startstationlongitude'
        starttime = 'starttime'


    fullfilename = directory + filename
    print(files_count, ': handling file', filename)
    files_count+=1
    
    if filename.endswith(".csv"):
        stations_df = pd.read_csv(fullfilename)
    elif filename.endswith(".zip"):
        stations_df = open_zipfile(fullfilename)
    else:
        continue
    
    
    stations_df = preprocess_stations_df(stations_df)
    
    unique_station_ids = stations_df[start_station_id].unique()
    for station_id in unique_station_ids:
        station_df = stations_df[stations_df[start_station_id] == station_id]
        
        if station_id not in stations_dict:
            try:
                stations_dict[station_id] = {
                    NAME: station_df[start_station_name].iloc[0], 
                    LAT: station_df[start_station_latitude].iloc[0],
                    LON: station_df[start_station_longitude].iloc[0], 
                    FIRST: station_df[starttime].iloc[0], 
                    LAST: station_df[starttime].iloc[0],
                    RIDES: 0,
                }
            except Exception as e:
                print(e)
                continue
        rides_count = len(station_df.index)
        stations_dict[station_id][RIDES] += rides_count
        station_df = station_df.sort_values(by=[starttime])
        if (station_df[starttime].iloc[0] < stations_dict[station_id][FIRST]):
            stations_dict[station_id][FIRST] = stations_df[starttime].iloc[0]
        if (station_df[starttime].iloc[-1] > stations_dict[station_id][LAST]):
            stations_dict[station_id][LAST] = stations_df[starttime].iloc[-1]


stations_df = stations_dict_to_df(stations_dict)
stations_df.head()

.gitignore
0 : handling file .gitignore
.keep
1 : handling file .keep
201306-citibike-tripdata.zip
2 : handling file 201306-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201307-citibike-tripdata.zip
3 : handling file 201307-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201308-citibike-tripdata.zip
4 : handling file 201308-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201309-citibike-tripdata.zip
5 : handling file 201309-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201310-citibike-tripdata.zip
6 : handling file 201310-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201311-citibike-tripdata.zip
7 : handling file 201311-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201312-citibike-tripdata.zip
8 : handling file 201312-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201401-citibike-tripdata.zip
9 : handling file 201401-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201402-citibike-tripdata.zip
10 : handling file 201402-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201403-citibike-tripdata.zip
11 : handling file 201403-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201404-citibike-tripdata.zip
12 : handling file 201404-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201405-citibike-tripdata.zip
13 : handling file 201405-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201406-citibike-tripdata.zip
14 : handling file 201406-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201407-citibike-tripdata.zip
15 : handling file 201407-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201408-citibike-tripdata.zip
16 : handling file 201408-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201409-citibike-tripdata.zip
17 : handling file 201409-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201410-citibike-tripdata.zip
18 : handling file 201410-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201411-citibike-tripdata.zip
19 : handling file 201411-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201412-citibike-tripdata.zip
20 : handling file 201412-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201501-citibike-tripdata.zip
21 : handling file 201501-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201502-citibike-tripdata.zip
22 : handling file 201502-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201503-citibike-tripdata.zip
23 : handling file 201503-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201504-citibike-tripdata.zip
24 : handling file 201504-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201505-citibike-tripdata.zip
25 : handling file 201505-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201506-citibike-tripdata.zip
26 : handling file 201506-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201507-citibike-tripdata.zip
27 : handling file 201507-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201508-citibike-tripdata.zip
28 : handling file 201508-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201509-citibike-tripdata.zip
29 : handling file 201509-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201510-citibike-tripdata.zip
30 : handling file 201510-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201511-citibike-tripdata.zip
31 : handling file 201511-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201512-citibike-tripdata.zip
32 : handling file 201512-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201601-citibike-tripdata.zip
33 : handling file 201601-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201602-citibike-tripdata.zip
34 : handling file 201602-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201603-citibike-tripdata.zip
35 : handling file 201603-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201604-citibike-tripdata.zip
36 : handling file 201604-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201605-citibike-tripdata.zip
37 : handling file 201605-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201606-citibike-tripdata.zip
38 : handling file 201606-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201607-citibike-tripdata.zip
39 : handling file 201607-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201608-citibike-tripdata.zip
40 : handling file 201608-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201609-citibike-tripdata.zip
41 : handling file 201609-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201610-citibike-tripdata.zip
42 : handling file 201610-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201611-citibike-tripdata.zip
43 : handling file 201611-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201612-citibike-tripdata.zip
44 : handling file 201612-citibike-tripdata.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201701-citibike-tripdata.csv.zip
45 : handling file 201701-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201702-citibike-tripdata.csv.zip
46 : handling file 201702-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201703-citibike-tripdata.csv.zip
47 : handling file 201703-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201704-citibike-tripdata.csv.zip
48 : handling file 201704-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201705-citibike-tripdata.csv.zip
49 : handling file 201705-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201706-citibike-tripdata.csv.zip
50 : handling file 201706-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201707-citibike-tripdata.csv.zip
51 : handling file 201707-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201708-citibike-tripdata.csv.zip
52 : handling file 201708-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201709-citibike-tripdata.csv.zip
53 : handling file 201709-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201710-citibike-tripdata.csv.zip
54 : handling file 201710-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201711-citibike-tripdata.csv.zip
55 : handling file 201711-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201712-citibike-tripdata.csv.zip
56 : handling file 201712-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201801-citibike-tripdata.csv.zip
57 : handling file 201801-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201802-citibike-tripdata.csv.zip
58 : handling file 201802-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201803-citibike-tripdata.csv.zip
59 : handling file 201803-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201804-citibike-tripdata.csv.zip
60 : handling file 201804-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201805-citibike-tripdata.csv.zip
61 : handling file 201805-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201806-citibike-tripdata.csv.zip
62 : handling file 201806-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201807-citibike-tripdata.csv.zip
63 : handling file 201807-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201808-citibike-tripdata.csv.zip
64 : handling file 201808-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


single positional indexer is out-of-bounds
201809-citibike-tripdata.csv.zip
65 : handling file 201809-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


single positional indexer is out-of-bounds
201810-citibike-tripdata.csv.zip
66 : handling file 201810-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


single positional indexer is out-of-bounds
201811-citibike-tripdata.csv.zip
67 : handling file 201811-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


single positional indexer is out-of-bounds
201812-citibike-tripdata.csv.zip
68 : handling file 201812-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


single positional indexer is out-of-bounds
201901-citibike-tripdata.csv.zip
69 : handling file 201901-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


single positional indexer is out-of-bounds
201902-citibike-tripdata.csv.zip
70 : handling file 201902-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


single positional indexer is out-of-bounds
201903-citibike-tripdata.csv.zip
71 : handling file 201903-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


single positional indexer is out-of-bounds
201904-citibike-tripdata.csv.zip
72 : handling file 201904-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201905-citibike-tripdata.csv.zip
73 : handling file 201905-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201906-citibike-tripdata.csv.zip
74 : handling file 201906-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201907-citibike-tripdata.csv.zip
75 : handling file 201907-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


single positional indexer is out-of-bounds
201908-citibike-tripdata.csv.zip
76 : handling file 201908-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


single positional indexer is out-of-bounds
201909-citibike-tripdata.csv.zip
77 : handling file 201909-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201910-citibike-tripdata.csv.zip
78 : handling file 201910-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201911-citibike-tripdata.csv.zip
79 : handling file 201911-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


201912-citibike-tripdata.csv.zip
80 : handling file 201912-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


202001-citibike-tripdata.csv.zip
81 : handling file 202001-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


202002-citibike-tripdata.csv.zip
82 : handling file 202002-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


202003-citibike-tripdata.csv.zip
83 : handling file 202003-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


202004-citibike-tripdata.csv.zip
84 : handling file 202004-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


202005-citibike-tripdata.csv.zip
85 : handling file 202005-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


202006-citibike-tripdata.csv.zip
86 : handling file 202006-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


202007-citibike-tripdata.csv.zip
87 : handling file 202007-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


202008-citibike-tripdata.csv.zip
88 : handling file 202008-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


202009-citibike-tripdata.csv.zip
89 : handling file 202009-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


202010-citibike-tripdata.csv.zip
90 : handling file 202010-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


202011-citibike-tripdata.csv.zip
91 : handling file 202011-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


202012-citibike-tripdata.csv.zip
92 : handling file 202012-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


202101-citibike-tripdata.csv.zip
93 : handling file 202101-citibike-tripdata.csv.zip


  df.columns = df.columns.str.replace('[\ ]', '')


202102-citibike-tripdata.csv.zip
94 : handling file 202102-citibike-tripdata.csv.zip


  df = pd.read_csv(zipfile.open(filename))
  df.columns = df.columns.str.replace('[\ ]', '')


202103-citibike-tripdata.csv.zip
95 : handling file 202103-citibike-tripdata.csv.zip


  df = pd.read_csv(zipfile.open(filename))
  df.columns = df.columns.str.replace('[\ ]', '')


202104-citibike-tripdata.csv.zip
96 : handling file 202104-citibike-tripdata.csv.zip


  df = pd.read_csv(zipfile.open(filename))
  df.columns = df.columns.str.replace('[\ ]', '')


single positional indexer is out-of-bounds
202105-citibike-tripdata.csv.zip
97 : handling file 202105-citibike-tripdata.csv.zip


  df = pd.read_csv(zipfile.open(filename))
  df.columns = df.columns.str.replace('[\ ]', '')


single positional indexer is out-of-bounds
202106-citibike-tripdata.csv.zip
98 : handling file 202106-citibike-tripdata.csv.zip


  df = pd.read_csv(zipfile.open(filename))
  df.columns = df.columns.str.replace('[\ ]', '')


single positional indexer is out-of-bounds
202107-citibike-tripdata.csv.zip
99 : handling file 202107-citibike-tripdata.csv.zip


  df = pd.read_csv(zipfile.open(filename))
  df.columns = df.columns.str.replace('[\ ]', '')


single positional indexer is out-of-bounds
202108-citibike-tripdata.csv.zip
100 : handling file 202108-citibike-tripdata.csv.zip


  df = pd.read_csv(zipfile.open(filename))
  df.columns = df.columns.str.replace('[\ ]', '')


single positional indexer is out-of-bounds
202109-citibike-tripdata.csv.zip
101 : handling file 202109-citibike-tripdata.csv.zip


  df = pd.read_csv(zipfile.open(filename))
  df.columns = df.columns.str.replace('[\ ]', '')


single positional indexer is out-of-bounds
202110-citibike-tripdata.csv.zip
102 : handling file 202110-citibike-tripdata.csv.zip


  df = pd.read_csv(zipfile.open(filename))
  df.columns = df.columns.str.replace('[\ ]', '')


single positional indexer is out-of-bounds
202111-citibike-tripdata.csv.zip
103 : handling file 202111-citibike-tripdata.csv.zip


  df = pd.read_csv(zipfile.open(filename))
  df.columns = df.columns.str.replace('[\ ]', '')


single positional indexer is out-of-bounds
202112-citibike-tripdata.csv.zip
104 : handling file 202112-citibike-tripdata.csv.zip


  df = pd.read_csv(zipfile.open(filename))
  df.columns = df.columns.str.replace('[\ ]', '')


202201-citibike-tripdata.csv.zip
105 : handling file 202201-citibike-tripdata.csv.zip


  df = pd.read_csv(zipfile.open(filename))
  df.columns = df.columns.str.replace('[\ ]', '')


202202-citibike-tripdata.csv.zip
106 : handling file 202202-citibike-tripdata.csv.zip


  df = pd.read_csv(zipfile.open(filename))
  df.columns = df.columns.str.replace('[\ ]', '')


single positional indexer is out-of-bounds
202203-citibike-tripdata.csv.zip
107 : handling file 202203-citibike-tripdata.csv.zip


  df = pd.read_csv(zipfile.open(filename))
  df.columns = df.columns.str.replace('[\ ]', '')


single positional indexer is out-of-bounds
202204-citibike-tripdata.csv.zip
108 : handling file 202204-citibike-tripdata.csv.zip


  df = pd.read_csv(zipfile.open(filename))
  df.columns = df.columns.str.replace('[\ ]', '')


stations.csv
stations.json


Unnamed: 0,id,name,lat,lon,first,last,rides
0,444,Broadway & W 24 St,40.742354,-73.989151,2013-06-01,2017-10-31,285239
1,406,Hicks St & Montague St,40.695128,-73.995951,2013-06-01,2021-01-31,129149
2,475,E 15 St & Irving Pl,40.735243,-73.987586,2013-06-01,2017-02-28,127196
3,2008,Little West St & 1 Pl,40.705693,-74.016777,2013-06-01,2021-01-31,331256
4,485,W 37 St & 5 Ave,40.75038,-73.98339,2013-06-01,2021-01-31,246676


In [32]:
# Transform the stations_df

# Remove dummy stations (there are test stations in the data)
# Remove stations with less than RIDES_COUNT_THRESHOLD rides
bad_stations_df = stations_df[stations_df[RIDES] < RIDES_COUNT_THRESHOLD]
print('removing %d bad stations that each have less than %d rides from stations data' % (bad_stations_df.shape[0], RIDES_COUNT_THRESHOLD))
stations_df = stations_df[stations_df[RIDES] >= RIDES_COUNT_THRESHOLD]

removing 141 bad stations that each have less than 100 rides from stations data


In [33]:
bad_stations_df.head(10)

Unnamed: 0,id,name,lat,lon,first,last,rides
334,3000,MLSWKiosk,40.755467,-73.986536,2013-06-06,2013-06-06,2
499,3252,Bike The Branches - Red Hook Branch,40.67535,-74.01002,2016-05-07,2016-05-07,20
512,3250,NYCBS Depot - PIT,40.71691,-73.983838,2016-07-11,2019-03-31,48
513,3036,8D OPS 01,40.684869,-74.02545,2016-07-12,2018-07-31,28
514,3040,SSP Tech Workshop,40.646678,-74.016263,2016-07-18,2019-05-31,49
516,3266,Kiosk in a box Deployment,40.708611,-73.928504,2016-07-28,2016-09-30,7
552,3385,2 Ave & E 105 St,40.789817,-73.942961,2016-08-12,2016-08-12,1
667,3446,NYCBS Depot - STY - Valet Scan,0.0,0.0,2017-01-30,2017-01-30,2
691,3470,Gowanus Tech Station,40.669802,-73.994905,2017-06-26,2017-08-28,9
692,3248,LPI Facility,0.0,0.0,2017-06-27,2017-06-27,2


In [34]:
"""For the boston hubway/blue bikes data there will be duplicates because
when management changed from hubway to Bluebikes, the data fromat did too
This includes the station id/numbers and names AND lat/lon!
Task: deduplicate stations

Idea to understand data: sort the stations so the potential duplicates are next to each other
when merging/deduping data make sure to keep the earliest first and the latest last.

approach to deduplicating stations:
- normalize names and add new temporary column with normalized name
- get list of unique normalized names
- for each name:
    make a df for that name, sorted by [first, last]
    update main df to replace entries with that name with:
        first first
        last last
        last name
        rides as sum of rides
    sort main df by [name, first] and drop duplicates (duplicates on normalized name)
    remove normalized name column
"""

import re

NORMALIZED_NAME = 'normalized_name'

def normalized_station_name(name):
    normalized_name = name.lower()
    normalized_name = normalized_name.replace("former",  "").replace(" ", "")
    normalized_name = re.sub(r'[^a-z0-9]','', normalized_name)
    return normalized_name

if CITY == 'boston':
    stations_df[NORMALIZED_NAME] = stations_df[NAME].apply(normalized_station_name)
    normalized_names = stations_df[NORMALIZED_NAME]
    print(normalized_names.shape[0], ' names')
    unique_normalized_names = stations_df[NORMALIZED_NAME].unique()
    print(unique_normalized_names.shape[0], ' unique normalized names') #, unique_normalized_names)


    n = 0
    for normalized_name in unique_normalized_names:
        print(n, 'handling name', normalized_name)
        n+=1
        name_df = stations_df[stations_df[NORMALIZED_NAME] == normalized_name]
        name_df.sort_values(by=[FIRST, LAST], inplace=True)
        first = name_df[FIRST].iloc[0]
        last = name_df[LAST].iloc[-1]
        name = name_df[NAME].iloc[-1]
        rides = name_df[RIDES].sum()
        update_condition = (stations_df[NORMALIZED_NAME] == normalized_name)
        stations_df.loc[update_condition, [FIRST, LAST, NAME, RIDES]] = first, last, name, rides

    stations_dropped_duplicates_df = stations_df.drop_duplicates(subset=[NORMALIZED_NAME])
    print('dropped %s rows based on duplicate names' % (int(stations_df.shape[0]) - int(stations_dropped_duplicates_df.shape[0])))
    stations_dropped_duplicates_df.drop(labels=[NORMALIZED_NAME], axis=1, inplace=True)
    
    stations_df = stations_dropped_duplicates_df


In [35]:
stations_df = stations_dropped_duplicates_df
stations_df.head(30)

NameError: name 'stations_dropped_duplicates_df' is not defined

In [36]:
# Save the data to CSV
save_to_csvfilename = directory + 'stations.csv'
stations_df.to_csv(save_to_csvfilename)
print('wrote data to ', save_to_csvfilename)

wrote data to  ../data/nyc-bike/stations.csv


In [37]:
# Save the data to JSON that will be used in web app
import json

stations = []
for index, row in stations_df.iterrows():
    # Transform the date
    date = row[5]
    
    stations.append({
        ID: str(row[ID]),
        NAME: row[NAME],
        LAT: row[LAT],
        LON: row[LON],
        FIRST: transform_date(row[FIRST]),
        LAST: transform_date(row[LAST]),
    })

json = json.dumps(stations)

save_to_jsonfilename = directory + 'stations.json'
with open(save_to_jsonfilename, 'w') as f:
    f.write(json)
print("Data written to stations.json")

Data written to stations.json


In [38]:
stations_df.head()

Unnamed: 0,id,name,lat,lon,first,last,rides
0,444,Broadway & W 24 St,40.742354,-73.989151,2013-06-01,2017-10-31,285239
1,406,Hicks St & Montague St,40.695128,-73.995951,2013-06-01,2021-01-31,129149
2,475,E 15 St & Irving Pl,40.735243,-73.987586,2013-06-01,2017-02-28,127196
3,2008,Little West St & 1 Pl,40.705693,-74.016777,2013-06-01,2021-01-31,331256
4,485,W 37 St & 5 Ave,40.75038,-73.98339,2013-06-01,2021-01-31,246676
