In [30]:
"""
Processes the raw bike trip data to get information on bike dock locations
and when the docks were placed in those locations.

Desired output columns:

id | first | last | name | lat | lon

where
- id is the station's id
- first is the earliest trip date for the station id
- last is the latest trip date for the station id (included in case docks are removed)
- name is the station's name
- lat and lon are the latitude and longitude of the station's location

"""

import os
from zipfile import ZipFile
import pandas as pd


def get_filepath(city):
    return '../data/' + city + '-bike/'

In [85]:

# input file column names for indexing data with
start_station_id = 'startstationid'
start_station_name = 'startstationname'
start_station_latitude = 'startstationlatitude'
start_station_longitude = 'startstationlongitude'
starttime = 'starttime'

# output file column names
ID = 'id'
NAME = 'name'
LAT = 'lat'
LON = 'lon'
FIRST = 'first'
LAST = 'last'


"""
make a dict like 
{"id": {"name": "", "lat": "", "lon": "", "first": "", "last": ""}}
where there is one entry for each id
and where the start time is always the earliest found

and then later transform it into a dict like

{'id': [id1, id2, id3], 'col_2': ['a', 'b', 'c', 'd']}

to then make into a dataframe and save as a CSV
"""



def stations_dict_to_df(stations_dict):
    new_dict = {
        ID: [],
        NAME: [],
        LAT: [],
        LON: [],
        FIRST: [],
        LAST: []
    }
    for station_id, station_dict in stations_dict.items():
        new_dict[ID].append(station_id)
        new_dict[NAME].append(station_dict[NAME])
        new_dict[LAT].append(station_dict[LAT])
        new_dict[LON].append(station_dict[LON])
        new_dict[FIRST].append(station_dict[FIRST])
        new_dict[LAST].append(station_dict[LAST])
    
    return pd.DataFrame.from_dict(new_dict)
    


stations_dict = dict()

directory = get_filepath('nyc')

for zipfilename in os.listdir(directory):
    if not zipfilename.endswith(".zip"):
        continue
        
    print('handling file', zipfilename)
    fullzipfilename = directory + zipfilename
    # Because someone dropped some gnarly mac osx files into their zips
    zipfile = ZipFile(fullzipfilename)
    csvfilename = [f.filename for f in zipfile.infolist() if f.filename.endswith('.csv')][0]
    
    stations_df = pd.read_csv(zipfile.open(csvfilename))
    # Because someone can't make data files with uniform column names
    stations_df.columns = map(str.lower, stations_df.columns)
    stations_df.columns = stations_df.columns.str.replace('[\ ]', '')

    unique_station_ids = stations_df[start_station_id].unique()
    for station_id in unique_station_ids:
        if math.isnan(station_id):
            print('found nan')
            continue
        station_df = stations_df[stations_df[start_station_id] == station_id]
        if station_id not in stations_dict:
            stations_dict[station_id] = {
                NAME: station_df[start_station_name].iloc[0], 
                LAT: station_df[start_station_latitude].iloc[0],
                LON: station_df[start_station_longitude].iloc[0], 
                FIRST: station_df[starttime].iloc[0], 
                LAST: station_df[starttime].iloc[0], 
            }
        station_df = station_df.sort_values(by=[starttime])
        if (station_df[starttime].iloc[0] < stations_dict[station_id]['first']):
            stations_dict[station_id]['first'] = stations_df[starttime].iloc[0]
        if (station_df[starttime].iloc[-1] > stations_dict[station_id]['last']):
            stations_dict[station_id]['last'] = stations_df[starttime].iloc[-1]


stations_df = stations_dict_to_df(stations_dict)
stations_df.head()

handling file 201309-citibike-tripdata.zip
handling file 201608-citibike-tripdata.zip
handling file 201807-citibike-tripdata.csv.zip
handling file 201708-citibike-tripdata.csv.zip
handling file 201905-citibike-tripdata.csv.zip
handling file 201510-citibike-tripdata.zip
handling file 201405-citibike-tripdata.zip
handling file 201812-citibike-tripdata.csv.zip
found nan
handling file 201506-citibike-tripdata.zip
handling file 201712-citibike-tripdata.csv.zip
handling file 201902-citibike-tripdata.csv.zip
found nan
handling file 201512-citibike-tripdata.zip
handling file 201411-citibike-tripdata.zip
handling file 201808-citibike-tripdata.csv.zip
found nan
handling file 201407-citibike-tripdata.zip
handling file 201504-citibike-tripdata.zip
handling file 201707-citibike-tripdata.csv.zip
handling file 201602-citibike-tripdata.zip
handling file 201702-citibike-tripdata.csv.zip
handling file 201612-citibike-tripdata.zip
handling file 201805-citibike-tripdata.csv.zip
handling file 201604-citibi

Unnamed: 0,id,name,lat,lon,first,last
0,254.0,W 11 St & 6 Ave,40.735324,-73.998004,1/1/2015 0:01,9/30/2015 23:59:57
1,151.0,Cleveland Pl & Spring St,40.721816,-73.997203,1/1/2015 0:01,9/30/2015 23:59:57
2,352.0,W 56 St & 6 Ave,40.763406,-73.977225,1/1/2015 0:01,9/30/2015 23:59:57
3,490.0,8 Ave & W 33 St,40.751551,-73.993934,1/1/2015 0:01,9/30/2015 23:59:57
4,236.0,St Marks Pl & 2 Ave,40.728419,-73.98714,1/1/2015 0:01,9/30/2015 23:59:57


In [86]:
# Save the data to CSV

save_to_csvfilename = directory + 'stations.csv'
stations_df.to_csv(save_to_csvfilename)

AttributeError: module 'pandas' has no attribute 'to_csv'