In [1]:
# Import packages
import numpy as np
import pandas as pd
import os
import datetime as dt

In [2]:
# Set directories
main_dir = os.getcwd()
data_dir = os.path.join(main_dir, 'raw_data')

# Set files to import
which_year = '2017'

# Quarterly trip files listed for each supported year (2018 lists Q1-Q3 only).
quarters_by_year = {
    '2018': (1, 2, 3),
    '2017': (1, 2, 3, 4),
    '2016': (1, 2, 3, 4),
}

# Fail right here with a clear message: otherwise an unsupported year leaves
# `filenames` undefined on a fresh kernel, or silently stale when this cell is
# re-run after editing `which_year`.
if which_year not in quarters_by_year:
    raise ValueError(
        "Unsupported which_year %r; expected one of %s"
        % (which_year, sorted(quarters_by_year))
    )

# File names are given without extension; `file_format` is appended on load.
filenames = ['indego_trips_%s_q%d' % (which_year, quarter)
             for quarter in quarters_by_year[which_year]]

# Expected column layout of the trip files.
column_names = ['trip_id', 'duration', 'start_time', 'end_time','start_station','start_lat','start_lon', 
                'end_station','end_lat','end_lon', 'bike_id', 'plan_duration', 'trip_route_category', 'passholder_type']
file_format = '.csv'

In [3]:
# Concatenate all quarters for a given year into a single data frame.
# Each file is read once and everything is combined in a single pd.concat
# call: DataFrame.append was removed in pandas 2.0, and appending inside a
# loop re-copies the accumulated frame on every pass.
quarterly_frames = [
    pd.read_csv(os.path.join(data_dir, name + file_format))
    for name in filenames
]
# The row index is left as read from each file (it is not renumbered).
df = pd.concat(quarterly_frames, sort=False)

# Put the expected columns first (created empty if a file lacks one) and keep
# any additional columns found in the files after them.
extra_columns = [col for col in df.columns if col not in column_names]
df = df.reindex(columns=column_names + extra_columns)

In [4]:
# Duration-based cleaning

# 1) Keep only the trips that carry a duration value
df = df[pd.notnull(df['duration'])]

# 2) Flag potentially anomalous trips by length: at or above
#    mean + 2 standard deviations counts as too long, at or below
#    minDuration counts as too short
minDuration = 1  # one minute (assumes `duration` is recorded in minutes)
maxDuration = df['duration'].mean() + 2 * df['duration'].std()

total_trips = len(df['duration'])
too_short = df['duration'] <= minDuration
too_long = df['duration'] >= maxDuration
# Despite the names, these hold percentages of the non-null trips.
numDroppedBottom = 100 * int(too_short.sum()) / total_trips
numDroppedTop = 100 * int(too_long.sum()) / total_trips

# 3) Remove the outliers, then report the share caught by each rule
df = df[(df['duration'] > minDuration) & (df['duration'] < maxDuration)]
print("Percent of trips that are too short %0.1f." % numDroppedBottom)
print("Percent of trips that are too long %0.1f." % numDroppedTop)

Percent of trips that are too short 1.3.
Percent of trips that are too long 1.2.


In [5]:
# Clean up based on start station missing data

# 1) Drop all trips for which there is no start station information
df = df[pd.notnull(df['start_station'])]

# 2) Add an integer station id.
#    `assign` returns a new frame, so the column is never written into a
#    filtered slice of the previous frame (which is what triggers pandas'
#    SettingWithCopyWarning).
df = df.assign(start_station_id=df['start_station'].astype(int))

# 3) Drop trips with an unknown station: only ids strictly between 3000 and
#    4000 are kept.
df = df[(df['start_station_id'] > 3000) & (df['start_station_id'] < 4000)]

In [6]:
# Round every start time to the nearest hour, then split the rounded
# timestamp into a calendar day and an hour of day.
# `assign` builds a new frame instead of writing columns into a filtered
# slice, and the timestamps are parsed only once.
start_hours = pd.to_datetime(df['start_time']).dt.round("H")
df = df.assign(
    start_time=start_hours,
    start_time_date=start_hours.dt.to_period('D'),
    start_time_hour=start_hours.dt.hour,
)

# Midnight at the start of the first / last day that has a trip
first_day = df['start_time_date'].min().to_timestamp()
last_day = df['start_time_date'].max().to_timestamp()

# Hourly grid from midnight of the first day through the last observed
# (rounded) start hour. Ending the grid at `last_day` (midnight) would leave
# out hours 01-23 of the final day, and trips in those hours would be lost
# when the per-station counts are reindexed onto the grid.
last_hour = df['start_time'].max()
timeseries = pd.date_range(first_day, last_hour, freq='H')

In [7]:
# Build the hourly calendar frame (one row per entry of `timeseries`) so it
# can be joined with the trips. Only the date and hour-of-day columns are
# kept; the raw timestamp itself is not needed in the frame.
timeseries_columns = ['timestamp', 'start_time_date', 'start_time_hour']
hourly_stamps = pd.Series(timeseries, name='timestamp')
timeseries_df = pd.DataFrame({
    'start_time_date': hourly_stamps.dt.to_period('D'),
    'start_time_hour': hourly_stamps.dt.hour,
})

In [8]:
# Identify most valuable stations
# Trips are counted per integer `start_station_id` -- the same column that
# manipulatedf filters on -- rather than per raw `start_station`, whose dtype
# depends on the file contents. That keeps the ranked keys plain integers, so
# they always match the filter and produce clean output file names.
station_df = df.groupby('start_station_id')['trip_id'].count()  # non-null trip ids per station
how_many = 10
most_valuable_stations = (
    station_df.sort_values(ascending=False).head(how_many).index.tolist()
)
print(how_many,"most valuable stations in rank-order are: ", most_valuable_stations)

10 most valuable stations in rank-order are:  [3023, 3010, 3021, 3045, 3032, 3054, 3020, 3012, 3022, 3057]


In [9]:
def manipulatedf(this_df, timeseries, which_station, which_year,
                 file_format='.csv', output_dir=None):
    """Build the hourly trip-count table for one station, save it as CSV, and return it.

    Parameters
    ----------
    this_df : pandas.DataFrame
        Trip records. The columns ``start_station_id`` and ``start_time`` are
        read.
    timeseries : index-like of timestamps
        The hourly grid. The result has exactly one row per entry: hours with
        no trips get a count of 0, and trips whose ``start_time`` is not on
        the grid are left out.
    which_station
        Value compared against ``start_station_id`` to select the trips.
    which_year : str
        Label concatenated into the output file name.
    file_format : str, optional
        Suffix appended to the output file name (default ``'.csv'``).
    output_dir : str or None, optional
        Directory to write the file into. ``None`` (default) writes to the
        current working directory.

    Returns
    -------
    pandas.DataFrame
        Columns ``count``, ``timestamp``, ``start_time_date`` and
        ``start_time_hour`` on a fresh 0..n-1 index. The same frame is written
        to ``Station<which_station>-<which_year><file_format>``; ``to_csv`` is
        called with its defaults, so the row index is written as well.
    """
    station_trips = this_df[this_df['start_station_id'] == which_station]

    # Trips per (already hour-rounded) start time, laid out on the full grid.
    hourly_counts = (
        station_trips.groupby('start_time').size()
        .reindex(timeseries, fill_value=0)
    )

    result = hourly_counts.rename('count').to_frame()
    # Expose the grid timestamp as a column, plus its date and hour parts.
    result['timestamp'] = result.index
    stamps = pd.to_datetime(result['timestamp'])
    result['start_time_date'] = stamps.dt.to_period('D')
    result['start_time_hour'] = stamps.dt.hour
    result = result.reset_index(drop=True)

    # Save the table
    filename = 'Station' + str(which_station) + '-' + which_year + file_format
    if output_dir is not None:
        filename = os.path.join(output_dir, filename)
    result.to_csv(filename)
    return result

In [10]:
# Write one hourly-count CSV per top station. The call is made for its file
# output only: binding the result to a single name here would just be
# overwritten on every iteration.
for station in most_valuable_stations:
    manipulatedf(df, timeseries, station, which_year)