In [5]:
# Import packages
import numpy as np
import pandas as pd
import os
import datetime as dt

In [11]:
# Set directories and other file parameters
main_dir = os.getcwd() 
data_dir = main_dir + '/data'
file_format = '.csv'
year_range = ['2015','2016','2017','2018'] 
column_names = ['trip_id', 'duration', 'start_time', 'end_time','start_station','start_lat','start_lon', 
                    'end_station','end_lat','end_lon', 'bike_id', 'plan_duration', 'trip_route_category', 'passholder_type']
# Get filenames
def get_filenames_year(which_year):
    if which_year == '2018':
        filenames = ['indego_trips_2018_q1'+ file_format, 'indego_trips_2018_q2'+ file_format,
                 'indego_trips_2018_q3'+ file_format]
    elif which_year == '2017': 
        filenames = ['indego_trips_2017_q1'+ file_format, 'indego_trips_2017_q2'+ file_format,
                 'indego_trips_2017_q3'+ file_format, 'indego_trips_2017_q4'+ file_format]
    elif which_year == '2016': 
        filenames = ['indego_trips_2016_q1'+ file_format, 'indego_trips_2016_q2'+ file_format,
                 'indego_trips_2016_q3'+ file_format, 'indego_trips_2016_q4'+ file_format]
    elif which_year == '2015': 
        filenames = ['indego_trips_2015_q2'+ file_format,
                 'indego_trips_2015_q3'+ file_format, 'indego_trips_2015_q4'+ file_format]
    return filenames


In [None]:
# 1) Loop through all years
# 2) Concatinate all quarters for a given year into a single data frame
# 3) Clean up: drop rides too short or long. 


['indego_trips_2015_q2.csv',
 'indego_trips_2015_q3.csv',
 'indego_trips_2015_q4.csv']

In [10]:
filenames = get_filenames_year('2015')
def create_and_clean_df(which_year, filenames)    
df = pd.DataFrame(columns = column_names)
for x in range(len(filenames)): 
    current_file = os.path.join(data_dir + '/' + filenames[x])
    temp = pd.read_csv(current_file)
    df = df.append(temp)

    # 1) Drop all trips for which there is no duration information
    df = df[pd.notnull(df['duration'])]

    # 2) Identify potentially anomalous trips (based on length) 
    maxDuration = df["duration"].mean() + df["duration"].std()*2
    minDuration  = 1 #one minute
    numDroppedBottom = 100*(len(df[df['duration']<=minDuration]))/(len(df['duration']))
    numDroppedTop = 100*(len(df[df['duration']>=maxDuration]))/(len(df['duration']))

    # 3) Drop outliers within a q(and print proportions)
    df = df[(df.duration < maxDuration) & (df.duration > minDuration)]
    print("Percent of trips that are too short %0.1f." %numDroppedBottom)
    print("Percent of trips that are too long %0.1f." %numDroppedTop)    

Percent of trips that are too short 1.6.
Percent of trips that are too long 1.2.
Percent of trips that are too short 1.1.
Percent of trips that are too long 1.2.
Percent of trips that are too short 0.9.
Percent of trips that are too long 1.0.


In [None]:
# Clean up based on start station missing data

# 1) Drop all trips for which there is no duration information
df = df[pd.notnull(df['start_station'])]

# Drop all trips for which we don't have the station number. 
# Identify unknown stations and drop them
df['start_station_id'] = df['start_station'].astype(int)
df['end_station_id'] = df['end_station'].astype(int)

df = df[(df.start_station_id > 3000) & (df.start_station_id < 4000)]
df = df[(df.end_station_id > 3000) & (df.end_station_id < 4000)]

In [None]:
# Reformat the starttime, so that its rounder per hour
df['start_time'] = pd.to_datetime(df['start_time'])
df['start_time'] = df['start_time'].dt.round("H")
# Extract date and hour from the start date to end date range
df['start_time_date'] = pd.to_datetime(df['start_time']).dt.to_period('D')
df['start_time_hour'] = pd.DatetimeIndex(df['start_time']).hour

# Reformat the endtime, so that its rounder per hour
df['end_time'] = pd.to_datetime(df['end_time'])
df['end_time'] = df['end_time'].dt.round("H")
# Extract date and hour from the start date to end date range
df['end_time_date'] = pd.to_datetime(df['end_time']).dt.to_period('D')
df['end_time_hour'] = pd.DatetimeIndex(df['end_time']).hour

first_day = df['start_time_date'].min().to_timestamp()
last_day = df['start_time_date'].max().to_timestamp()
timeseries_format =  pd.date_range(first_day, last_day, freq='H')

In [None]:
# format a timeseries dataframe so we can join it with the trips
timeseries_columns = ['start_time_date', 'start_time_hour']
timeseries_df = pd.DataFrame(index=timeseries_format, columns=timeseries_columns).reset_index()
timeseries_columns.insert(0, 'timestamp')
timeseries_df.columns = timeseries_columns
timeseries_df['start_time_date'] = pd.to_datetime(timeseries_df['timestamp']).dt.to_period('D')
timeseries_df['start_time_hour'] = pd.DatetimeIndex(timeseries_df['timestamp']).hour
# drop timestamp, we don't need it anymore
timeseries_df = timeseries_df.drop(columns = 'timestamp')


In [None]:
def make_timeseries(timeseries, column_names): 
# for end time
# format a timeseries dataframe so we can join it with the trips
    first_col = column_names[0]
    second_col = column_names[1]
    t_df = pd.DataFrame(index=timeseries_format, columns=column_names).reset_index()
    column_names.insert(0, 'timestamp')
    t_df.columns = column_names
    t_df[first_col] = pd.to_datetime(t_df['timestamp']).dt.to_period('D')
    t_df[second_col] = pd.DatetimeIndex(t_df['timestamp']).hour
    # drop timestamp, we don't need it anymore
    t_df = t_df.drop(columns = 'timestamp')
    return t_df           

In [None]:
end_columns = ['end_time_date', 'end_time_hour']
timeseries_end_df = make_timeseries(timeseries_format, end_columns)  

In [None]:
# Identify most valuable stations
station_df = df.groupby(['start_station'])['trip_id'].count() #identify unique trips and count them
how_many = 10
most_valuable_stations = station_df.sort_values(ascending = False).head(how_many).index.tolist()
print(how_many,"most valuable stations in rank-order are: ", most_valuable_stations)

station_end_df = df.groupby(['end_station'])['trip_id'].count() #identify unique trips and count them

In [None]:
def manipulatedf(this_df, timeseries, which_station, which_year): 
    temp_df = this_df[this_df['start_station_id'] == which_station].reset_index()
    temp_df = temp_df.drop(columns = 'index')
    df = pd.DataFrame({'count':temp_df.groupby(['start_time']).size()}).reset_index()
    df = df.set_index('start_time')
    df = df.reindex(timeseries, fill_value = 0)
    # add date and time separately as columns
    # make a column out of an index
    df['timestamp'] = df.index
    df['start_time_date'] = pd.to_datetime(df['timestamp']).dt.to_period('D')
    df['start_time_hour'] = pd.DatetimeIndex(df['timestamp']).hour
    df = df.reset_index()
    df = df.drop(columns = 'index')    
    # save into dataframe
    filename = 'Station'+ str(int(which_station)) + '-' + which_year + file_format
    df.to_csv(filename)

In [None]:
for station in most_valuable_stations: 
    manipulatedf(df, timeseries_format, station, which_year)

In [None]:
if which_year == 2018: 
    import pickle
    f = open('most_valuable_stations.pckl', 'wb')
    pickle.dump(most_valuable_stations, f)
    f.close()