<h2>Step 1. Importing and cleaning the data </h2>

In [None]:
# Import packages
import numpy as np
import pandas as pd
import os
import datetime as dt
from datetime import timedelta
import pickle

# Import for plotting
import matplotlib.pyplot as plt

# Import custom functions
import bb_clean_data

In [None]:
# Run configuration. The analysis handles one year per run; change
# `which_year` and re-run to process another year.

# --- Locations and file naming ---
main_dir = os.getcwd()
data_dir = '/'.join((main_dir, 'data'))
filename_pre = 'indego_trips_'
file_format = '.csv'

# --- Input data description ---
year_range = ['2015', '2016', '2017', '2018']
column_names = [
    'trip_id',
    'duration',
    'start_time',
    'end_time',
    'start_station',
    'start_lat',
    'start_lon',
    'end_station',
    'end_lat',
    'end_lon',
    'bike_id',
    'plan_duration',
    'trip_route_category',
    'passholder_type',
]

# --- What to analyse ---
which_year = '2015'      # kept as a string: it is concatenated into output file names
which_ride_end = 'end'   # 'start' or 'end'; selects the per-station output file prefix

# --- Cleaning thresholds (passed to bb_clean_data.create_and_clean_df) ---
station_numbers = [3000, 4000]  # exclude dummy stations
min_duration = 1                # 1 minute
max_duration_sd = 2             # 2 standard deviations

# --- Parameters for identifying valuable stations ---
how_many = 11     # number of top-ranked start stations to keep
inactive = 3023   # station id removed from the ranked list

# --- Optional passholder-type breakdowns: overall and per station ---
passholders_per_station = False
passholders = False

######################

# Import and clean data.
# Both helpers come from the project module bb_clean_data; the cleaning
# thresholds and the station filter are handed over as configured above.
filenames = bb_clean_data.get_filenames_year(which_year, filename_pre, file_format)
df = bb_clean_data.create_and_clean_df(which_year, filenames, data_dir, column_names,
                                       min_duration, max_duration_sd, station_numbers)

# Tag each ride with the calendar month of its start time, then trim the
# years that are only partially used: 2018 keeps months 1-9, 2015 keeps 5-12.
df['month'] = pd.DatetimeIndex(df['start_time']).month
if which_year == '2018':
    df = df[df['month'] < 10]  # before october
elif which_year == '2015':
    df = df[df['month'] > 4]  # after april

# Rides aggregated by month, saved for the rides-over-time analysis.
rides_per_month = df.groupby('month')['trip_id'].count()
filename = 'rides_per_month' + str(which_year) + '.pckl'
# The context manager closes the file even if pickling fails.
with open(filename, 'wb') as f:
    pickle.dump(rides_per_month, f)

# Optional: fraction of rides per passholder type across the whole network
# for the selected year, printed and drawn as a pie chart.
if passholders:
    rides_by_pass = df.groupby('passholder_type')['trip_id'].count()
    passholder_type_agg = rides_by_pass / rides_by_pass.sum()
    passholder_labels = passholder_type_agg.index.tolist()
    print(passholder_type_agg)

    fig1, ax1 = plt.subplots()
    ax1.pie(
        passholder_type_agg,
        labels=passholder_labels,
        autopct='%1.1f%%',
        shadow=True,
        startangle=90,
    )
    # Equal aspect ratio so the pie is drawn as a circle.
    ax1.axis('equal')
    plt.show()

# Hourly timeline spanning the data, shared by the start_time and end_time
# analyses. It runs from the earliest start date to 23:00 on the latest one.
# NOTE(review): .to_timestamp() implies 'start_time_date' holds pandas Period
# values created in bb_clean_data -- confirm there.
ride_dates = df['start_time_date']
first_day = ride_dates.min().to_timestamp()
last_day = ride_dates.max().to_timestamp() + timedelta(hours=23)  # through the end of the last day
timeseries_format = pd.date_range(start=first_day, end=last_day, freq='H')

# Identify most valuable stations: rank start stations by their number of
# trips and keep the `how_many` busiest ones.
station_df = df.groupby(['start_station'])['trip_id'].count()
most_valuable_stations = station_df.sort_values(ascending=False).head(how_many).index.tolist()

# Drop the inactive station only when it actually made the cut; an
# unconditional list.remove() raises ValueError when it is absent.
if inactive in most_valuable_stations:
    most_valuable_stations.remove(inactive)

# Report the real length of the list, which is how_many - 1 after a removal.
print(len(most_valuable_stations), "most valuable stations in rank-order are: ", most_valuable_stations)

# Save the set of stations if it's for 2018 year.
# `which_year` is a string, so it must be compared with '2018'; comparing it
# with the integer 2018 is always False and the file would never be written.
if str(which_year) == '2018':
    with open('most_valuable_stations.pckl', 'wb') as f:
        pickle.dump(most_valuable_stations, f)



In [None]:
# Load repeating stations
# This list is created in a different script and contains info about stations active in 2016, 2017 AND 2018.
# NOTE: pickle can execute arbitrary code on load; only read files this project produced.
with open("repeating_stations.pckl", "rb") as f:
    station_list = list(pickle.load(f))

# Choose the output file prefix once. An unsupported value fails fast here
# instead of silently reusing a stale `filename` from an earlier cell.
if which_ride_end == 'start':
    station_file_prefix = 'intermediate_df/Station'
elif which_ride_end == 'end':
    station_file_prefix = 'intermediate_df/EndStation'
else:
    raise ValueError("which_ride_end must be 'start' or 'end', got %r" % (which_ride_end,))

# to_csv does not create missing directories.
os.makedirs('intermediate_df', exist_ok=True)

for station in station_list:
    # Join this station's rides with the hourly timeline and save the result
    # as a csv file for further analysis.
    this_station_df = bb_clean_data.join_df_with_timeseries(df, timeseries_format, station, which_ride_end)
    filename = station_file_prefix + str(int(station)) + '-' + which_year + file_format
    this_station_df.to_csv(filename)
    del this_station_df  # release the per-station frame before building the next one

    # Option to look at the breakdown of passholders per station
    if passholders_per_station:
        # The station column is 'start_station', as used when ranking stations
        # and in `column_names`.
        st = df[df['start_station'] == station]
        print(station)
        st_counts = st.groupby('passholder_type')['trip_id'].count()
        st_passholder_type_agg = st_counts / st_counts.sum()
        # Print explicitly: a bare expression inside a loop displays nothing.
        print(st_passholder_type_agg.round(decimals=2))

In [None]:
# Build a {station id: station name} lookup for the most valuable stations
# from the station table, and optionally pickle it.
stations_df = pd.read_csv('data/indego-stations-2019-01-04.csv').reset_index()
stations_dict = {}
for station in most_valuable_stations:
    station_names = stations_df.loc[stations_df['Station ID'] == station, 'Station Name']
    # Exactly one row must match; say which station is the problem instead of
    # surfacing the generic size error from .item().
    if len(station_names) != 1:
        raise ValueError("expected exactly one row for station %r in the station table, found %d"
                         % (station, len(station_names)))
    stations_dict[station] = station_names.item()

pickle_it = True
if pickle_it:
    # The context manager closes the file even if pickling fails.
    with open('most_valuable_stations_dict.pckl', 'wb') as f:
        pickle.dump(stations_dict, f)