In [10]:
# Import packages
import datetime as dt
import os
import pickle
from datetime import timedelta

import numpy as np
import pandas as pd

In [11]:
# Get filenames
def get_filenames_year(which_year, filename_pre, file_ext=None):
    """Build the list of quarterly trip-file names for one year.

    Parameters
    ----------
    which_year : str or int
        Year to look up. Supported years are 2015, 2016, 2017 and 2018.
    filename_pre : str
        Text placed in front of the year in every file name.
    file_ext : str, optional
        File extension, including the leading dot. When omitted, the
        notebook-level ``file_format`` setting is used.

    Returns
    -------
    list of str
        One name per quarter, shaped ``<filename_pre><year>_q<N><file_ext>``.

    Raises
    ------
    ValueError
        If ``which_year`` is not one of the supported years. (Previously
        this surfaced as an UnboundLocalError at the return statement.)
    """
    # Quarters for which a file name is generated, per year.
    quarters_by_year = {
        '2015': (2, 3, 4),
        '2016': (1, 2, 3, 4),
        '2017': (1, 2, 3, 4),
        '2018': (1, 2, 3),
    }

    year = str(which_year)
    if year not in quarters_by_year:
        raise ValueError(
            'Unsupported year %r; expected one of %s'
            % (which_year, ', '.join(sorted(quarters_by_year)))
        )

    if file_ext is None:
        # Fall back to the setting defined in the configuration cell.
        file_ext = file_format

    return [filename_pre + year + '_q' + str(quarter) + file_ext
            for quarter in quarters_by_year[year]]

def create_and_clean_df(which_year, filenames, column_names, min_duration, max_duration_sd):
    """Load the trip files for one year and return a cleaned trips table.

    The files are read from the notebook-level ``data_dir`` and stacked.
    Rows are then removed when they have no duration, when their duration
    is outside the accepted range, or when a start/end station is missing.
    Integer station ids and hour-rounded start-time columns are added.

    Parameters
    ----------
    which_year : str
        Year label, used only in the printed summary.
    filenames : list of str
        File names (relative to ``data_dir``) to read with ``pd.read_csv``.
    column_names : list of str
        Columns expected in the result; they come first in the output and
        are created empty if no file provides them.
    min_duration : number
        Trips with ``duration <= min_duration`` are dropped.
    max_duration_sd : number
        Trips with ``duration >= mean + max_duration_sd * std`` are dropped.

    Returns
    -------
    pandas.DataFrame
        Cleaned trips with a fresh 0..n-1 index and the extra columns
        ``start_station_id``, ``end_station_id``, ``start_time_date`` and
        ``start_time_hour``; ``start_time`` is rounded to the hour.

    Raises
    ------
    ValueError
        If ``filenames`` is empty.
    """
    if len(filenames) == 0:
        raise ValueError('No trip files were given for year %s' % which_year)

    # Read everything first and concatenate once: DataFrame.append copied the
    # whole table on every iteration and no longer exists in pandas 2.x.
    frames = [pd.read_csv(os.path.join(data_dir, name)) for name in filenames]
    df = pd.concat(frames, ignore_index=True, sort=False)
    extra_columns = [col for col in df.columns if col not in column_names]
    df = df.reindex(columns=list(column_names) + extra_columns)

    # 1) Drop all trips for which there is no duration information
    df = df[df['duration'].notna()]

    # 2) Identify potentially anomalous trips (based on length)
    durations = df['duration']
    max_duration = durations.mean() + durations.std() * max_duration_sd
    total_trips = len(df)
    if total_trips:
        num_dropped_bottom = 100 * (durations <= min_duration).sum() / total_trips
        num_dropped_top = 100 * (durations >= max_duration).sum() / total_trips
    else:
        # Nothing left to measure against; avoid dividing by zero.
        num_dropped_bottom = num_dropped_top = 0.0

    # 3) Drop the outliers and print the proportions
    print('For year %s: ' %which_year)
    df = df[(df.duration < max_duration) & (df.duration > min_duration)]
    print("Percent of trips that are too short %0.1f." %num_dropped_bottom)
    print("Percent of trips that are too long %0.1f." %num_dropped_top)

    # 4) Drop all trips for which a station number is missing. Both ends are
    #    checked because a missing value cannot be cast to int below.
    #    The copy lets the new columns be assigned without a chained-assignment warning.
    df = df[df['start_station'].notna() & df['end_station'].notna()].copy()
    df['start_station_id'] = df['start_station'].astype(int)
    df['end_station_id'] = df['end_station'].astype(int)
    # NOTE(review): a range filter on the ids using the notebook-level
    # station_numbers was sketched here in comments but never enabled.

    # Round the start time to the nearest hour, then derive date and hour
    # from the rounded value (so a trip shortly before midnight is counted
    # in the first hour of the next day).
    df['start_time'] = pd.to_datetime(df['start_time']).dt.round('h')
    df['start_time_date'] = df['start_time'].dt.to_period('D')
    df['start_time_hour'] = df['start_time'].dt.hour
    return df

def manipulate_df(this_df, timeseries, which_station, which_year, file_ext=None):
    """Count one station's departures per timestamp and save them as CSV.

    Parameters
    ----------
    this_df : pandas.DataFrame
        Trips table with ``start_station_id`` and ``start_time`` columns.
    timeseries : pandas.DatetimeIndex or sequence of timestamps
        Timestamps to report on; timestamps without trips get a count of 0.
    which_station : number
        Station id to select; written into the file name as an integer.
    which_year : str or int
        Year label written into the file name.
    file_ext : str, optional
        File extension, including the leading dot. When omitted, the
        notebook-level ``file_format`` setting is used.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``timeseries`` with the columns ``count``,
        ``timestamp``, ``start_time_date`` and ``start_time_hour``. The same
        table is written to ``Station<id>-<year><ext>`` in the working directory.
    """
    station_trips = this_df[this_df['start_station_id'] == which_station]

    # Trips per start time, expanded to the full timestamp grid.
    hourly_counts = station_trips.groupby('start_time').size()
    hourly_counts = hourly_counts.reindex(timeseries, fill_value=0)
    df = hourly_counts.to_frame('count')

    # Expose the timestamp, its date and its hour as ordinary columns.
    stamps = pd.DatetimeIndex(df.index)
    df['timestamp'] = df.index
    df['start_time_date'] = stamps.to_period('D')
    df['start_time_hour'] = stamps.hour
    # drop=True discards the old index whatever it is called; dropping a
    # column literally named 'index' failed whenever the index had a name.
    df = df.reset_index(drop=True)

    if file_ext is None:
        # Fall back to the setting defined in the configuration cell.
        file_ext = file_format
    # str() lets the year be given as either text or a number.
    filename = 'Station' + str(int(which_station)) + '-' + str(which_year) + file_ext
    df.to_csv(filename)
    return df

def make_timeseries_old(timeseries, column_names):
    """Build a table holding the date and hour of each timestamp.

    According to the original notes this was written for the end-time side,
    to produce a frame that can be joined with the trips.

    Parameters
    ----------
    timeseries : pandas.DatetimeIndex or sequence of timestamps
        Timestamps to expand, one output row each.
    column_names : list of str
        Output column names. The first receives the daily period of each
        timestamp and the second its hour; any further names are created
        as empty columns. The list is not modified.

    Returns
    -------
    pandas.DataFrame
        Frame with a 0..n-1 index and exactly the columns in ``column_names``.

    Raises
    ------
    IndexError
        If ``column_names`` has fewer than two entries.
    """
    date_col, hour_col = column_names[0], column_names[1]
    # Use the argument that was passed in, not the notebook-level
    # timeseries_format variable.
    stamps = pd.DatetimeIndex(timeseries)
    # list(...) takes a copy so the caller's list is left untouched.
    t_df = pd.DataFrame(index=pd.RangeIndex(len(stamps)), columns=list(column_names))
    t_df[date_col] = stamps.to_period('D')
    t_df[hour_col] = stamps.hour
    return t_df

In [12]:
# Where the raw trip files live and how their names are put together
main_dir = os.getcwd()
data_dir = main_dir + '/data'
filename_pre, file_format = 'indego_trips_', '.csv'
year_range = [str(year) for year in range(2015, 2019)]

# Columns expected in the trip files, grouped by what they describe
column_names = (
    ['trip_id', 'duration', 'start_time', 'end_time']
    + ['start_station', 'start_lat', 'start_lon']
    + ['end_station', 'end_lat', 'end_lon']
    + ['bike_id', 'plan_duration', 'trip_route_category', 'passholder_type']
)

# Year to process in this run, and the files that belong to it
which_year = '2017'
filenames = get_filenames_year(which_year, filename_pre)

# Cleaning thresholds
station_numbers = [3000, 4000]  # not read by any active code visible in this section
min_duration = 1                # shortest duration kept (original note: 1 minute)
max_duration_sd = 2             # cut-off, in standard deviations above the mean

In [13]:
# Load the selected year's files and apply the cleaning rules; prints the share of trips removed
df = create_and_clean_df(
    which_year=which_year,
    filenames=filenames,
    column_names=column_names,
    min_duration=min_duration,
    max_duration_sd=max_duration_sd,
)

For year 2017: 
Percent of trips that are too short 1.3.
Percent of trips that are too long 1.2.


In [14]:
# make a timeseries array
# One timestamp per hour, from the start of the first trip date to 23:00 on the last one
first_day = df['start_time_date'].min().to_timestamp()
last_day = df['start_time_date'].max().to_timestamp() + timedelta(hours=23)
# 'h' is the hourly alias; the upper-case 'H' spelling is deprecated in recent pandas
timeseries_format = pd.date_range(first_day, last_day, freq='h')

In [15]:
# Identify most valuable stations
# Number of trips (non-null trip_id values) that started at each station
station_df = df.groupby(['start_station'])['trip_id'].count()
how_many = 10
most_valuable_stations = station_df.sort_values(ascending = False).head(how_many).index.tolist()
print(how_many,"most valuable stations in rank-order are: ", most_valuable_stations)
# Save the set of stations if it's for 2018 year.
# which_year is text, so compare it as text: the earlier comparison against
# the integer 2018 was never true and the file was never written.
if str(which_year) == '2018':
    # The with-block closes the file even if pickling fails.
    with open('most_valuable_stations.pckl', 'wb') as f:
        pickle.dump(most_valuable_stations, f)

10 most valuable stations in rank-order are:  [3023, 3010, 3021, 3045, 3032, 3054, 3020, 3012, 3022, 3057]


In [16]:
# Write one hourly-count CSV per top station for the selected year
for station in most_valuable_stations:
    manipulate_df(
        this_df=df,
        timeseries=timeseries_format,
        which_station=station,
        which_year=which_year,
    )

In [17]:
# Pick a single station and pull out its trips for a step-by-step walkthrough
which_station = 3010
this_df, timeseries = df, timeseries_format
from_this_station = this_df['start_station_id'] == which_station
# reset_index() keeps the previous row labels in a column named 'index'
temp_df = this_df.loc[from_this_station].reset_index()


In [18]:
# Hourly departure counts for the station selected in the previous cell.
# NOTE(review): this repeats the steps of manipulate_df inline and rebinds
# `df` (the cleaned trips up to this point) to the per-station table, so the
# earlier cells need the trips to be rebuilt before they can be run again.

# errors='ignore' keeps this cell re-runnable: after the first run the
# helper 'index' column is already gone from temp_df.
temp_df = temp_df.drop(columns='index', errors='ignore')

# Trips per start time, expanded to every timestamp of the series
hourly_counts = temp_df.groupby('start_time').size().reindex(timeseries, fill_value=0)
df = hourly_counts.to_frame('count')

# add date and time separately as columns
# make a column out of an index
df['timestamp'] = df.index
df['start_time_date'] = pd.to_datetime(df['timestamp']).dt.to_period('D')
df['start_time_hour'] = pd.DatetimeIndex(df['timestamp']).hour
# drop=True discards the timestamp index directly instead of turning it
# into an 'index' column that then has to be dropped
df = df.reset_index(drop=True)

# save into dataframe
filename = 'Station'+ str(int(which_station)) + '-' + which_year + file_format
df.to_csv(filename)