## Import packages & dependencies

In [1]:
#!usr/bin/env python3
import os
import shutil
from zipfile import ZipFile 
import pandas as pd
import numpy as np
import dateutil #https://www.shanelynn.ie/summarising-aggregation-and-grouping-data-in-python-pandas/
from datetime import datetime
import glob
import time
import gc  #garbage collection to free up memory

# Hide warning messages in notebook
import warnings
warnings.filterwarnings('ignore')


In [2]:
#set some static parameters
debug_mode = 'y'
csv_header_ind = 'True'
cur_dir = os.getcwd()


## Load Data

In [3]:
for i in range(2013,2021):
    data_year = i
    print(data_year)

2013
2014
2015
2016
2017
2018
2019
2020


In [4]:
# for debugging using a single file
data_year = 9999

In [5]:
# print start timestamp 
execStartDateTime = datetime.now()
print(execStartDateTime)

2020-06-20 08:21:18.143756


In [6]:
raw_dir = os.path.join(cur_dir,'citibike_files','raw', str(data_year))

In [7]:
path = raw_dir

all_files = sorted(glob.glob(os.path.join(path, "*citibike*")))

all_df = []

#parse_dates=['starttime','stoptime'],

for f in all_files:
    try: 
        print('try:' + f)
        df = pd.read_csv(f, sep=',',header = 1,skiprows = 1,\
        names=['tripduration','starttime','stoptime','start station id','start station name'\
        ,'start station latitude','start station longitude','end station id','end station name'\
        ,'end station latitude','end station longitude','bikeid','usertype','birth year','gender'])
        print(len(df.index))
        df['a_file'] = f.split('/')[-1]    
        all_df.append(df)
        citibike_df = pd.concat(all_df, ignore_index=True, sort=True)
    except:
         print('except:' + f)

try:/Users/Werd/boot_camp/gitlib/tableau-citibike/citibike_files/raw/9999/201512-citibike-tripdata.zip
804123


## Pre-processing: Preview data & datatype inspection

In [8]:
list(citibike_df.columns) 

['a_file',
 'bikeid',
 'birth year',
 'end station id',
 'end station latitude',
 'end station longitude',
 'end station name',
 'gender',
 'start station id',
 'start station latitude',
 'start station longitude',
 'start station name',
 'starttime',
 'stoptime',
 'tripduration',
 'usertype']

In [9]:
citibike_df.dtypes

a_file                      object
bikeid                       int64
birth year                 float64
end station id               int64
end station latitude       float64
end station longitude      float64
end station name            object
gender                       int64
start station id             int64
start station latitude     float64
start station longitude    float64
start station name          object
starttime                   object
stoptime                    object
tripduration                 int64
usertype                    object
dtype: object

In [10]:
# set text columns as categories
for col in ['gender', 'usertype', 'start station name', 'end station name']:
    citibike_df[col] = citibike_df[col].astype('category')

In [11]:
# set datatypes for numeric columns
citibike_df['start station id'] = citibike_df['start station id'].astype(str).astype(float).astype(int)
citibike_df['start station latitude'] = citibike_df['start station latitude'].astype(float)
citibike_df['start station latitude'] = citibike_df['start station latitude'].round(decimals=3)
citibike_df['start station longitude'] = citibike_df['start station longitude'].astype(float)
citibike_df['start station longitude'] = citibike_df['start station longitude'].round(decimals=3)
citibike_df = citibike_df.dropna(subset=['end station id'])
citibike_df['end station id'] = citibike_df['end station id'].astype(str).astype(float).astype(int)
citibike_df['end station latitude'] = citibike_df['end station latitude'].astype(float)
citibike_df['end station latitude'] = citibike_df['end station latitude'].round(decimals=3)
citibike_df['end station longitude'] = citibike_df['end station longitude'].astype(float)
citibike_df['end station longitude'] = citibike_df['end station longitude'].round(decimals=3)

In [12]:
citibike_df.dtypes

a_file                       object
bikeid                        int64
birth year                  float64
end station id                int64
end station latitude        float64
end station longitude       float64
end station name           category
gender                     category
start station id              int64
start station latitude      float64
start station longitude     float64
start station name         category
starttime                    object
stoptime                     object
tripduration                  int64
usertype                   category
dtype: object

In [13]:
citibike_df.head()

Unnamed: 0,a_file,bikeid,birth year,end station id,end station latitude,end station longitude,end station name,gender,start station id,start station latitude,start station longitude,start station name,starttime,stoptime,tripduration,usertype
0,201512-citibike-tripdata.zip,18797,1966.0,358,40.733,-74.007,Christopher St & Greenwich St,1,72,40.767,-73.994,W 52 St & 11 Ave,12/1/2015 07:44:49,12/1/2015 08:02:33,1063,Subscriber
1,201512-citibike-tripdata.zip,14625,1985.0,505,40.749,-73.988,6 Ave & W 33 St,1,72,40.767,-73.994,W 52 St & 11 Ave,12/1/2015 08:02:29,12/1/2015 08:20:24,1075,Subscriber
2,201512-citibike-tripdata.zip,21238,1968.0,525,40.756,-74.002,W 34 St & 11 Ave,1,72,40.767,-73.994,W 52 St & 11 Ave,12/1/2015 08:06:37,12/1/2015 08:11:30,293,Subscriber
3,201512-citibike-tripdata.zip,19518,1960.0,484,40.755,-73.98,W 44 St & 5 Ave,2,72,40.767,-73.994,W 52 St & 11 Ave,12/1/2015 08:07:57,12/1/2015 08:21:30,812,Subscriber
4,201512-citibike-tripdata.zip,22307,1980.0,520,40.76,-73.976,W 52 St & 5 Ave,1,72,40.767,-73.994,W 52 St & 11 Ave,12/1/2015 08:08:53,12/1/2015 08:18:05,551,Subscriber


In [14]:
citibike_df['birth year'].value_counts()

1984.0    29441
1985.0    29271
1983.0    28169
1986.0    27495
1987.0    26811
          ...  
1926.0       13
1918.0        8
1934.0        5
1917.0        2
1907.0        2
Name: birth year, Length: 77, dtype: int64

In [15]:
# Using try block here since data files were not consistent over time
try:
    if pd.api.types.is_string_dtype:
        citibike_df['birth year'] = citibike_df['birth year'].replace({"\\N":2020})
except:
    print("skip")

skip


In [16]:
citibike_df['birth year'].fillna(2020,inplace=True)

In [17]:
# Now that all fields are prepped drop nans in dataframe.  This is slow.
citibike_df.dropna(inplace=True)

In [18]:
# Set birth year datatype the nans dropped
citibike_df['birth year'] = citibike_df['birth year'].astype(str).astype(float).astype(int)

In [19]:
citibike_df.isnull().sum(axis=0)

a_file                     0
bikeid                     0
birth year                 0
end station id             0
end station latitude       0
end station longitude      0
end station name           0
gender                     0
start station id           0
start station latitude     0
start station longitude    0
start station name         0
starttime                  0
stoptime                   0
tripduration               0
usertype                   0
dtype: int64

In [20]:
#stamp the output files yearmonth to track the source of the data
citibike_df['yearmonth'] =  citibike_df['a_file'].str[:6].astype(int)

## Analyze by date and starthour

In [21]:
citibike_df.dtypes

a_file                       object
bikeid                        int64
birth year                    int64
end station id                int64
end station latitude        float64
end station longitude       float64
end station name           category
gender                     category
start station id              int64
start station latitude      float64
start station longitude     float64
start station name         category
starttime                    object
stoptime                     object
tripduration                  int64
usertype                   category
yearmonth                     int64
dtype: object

In [22]:
citibike_df[['begindate','begintime']] = citibike_df.starttime.str.split(expand=True) 

In [23]:
# Possible optimzation:  https://stackoverflow.com/questions/50744369/how-to-speed-up-pandas-string-function
# %timeit [x.split('~', 1)[0] for x in df['facility']]
# def splittime(x):
#     test = [x.split(' ', 1)[0] for x in citibike_df['starttime']]
#     return x.map(test)
# citibike_df['test2'] = splittime(citibike_df['starttime'])
# TypeError: list indices must be integers or slices, not str   

In [24]:
# https://github.com/pandas-dev/pandas/issues/11665
def lookup(s):
    """
    This is an extremely fast approach to datetime parsing.
    For large data, the same dates are often repeated. Rather than
    re-parse these, we store all unique dates, parse them, and
    use a lookup to convert all dates.
    """
    dates = {date:pd.to_datetime(date) for date in s.unique()}
    return s.map(dates)

In [25]:
citibike_df['startdate'] = lookup(citibike_df['begindate'])

In [26]:
citibike_df.head()

Unnamed: 0,a_file,bikeid,birth year,end station id,end station latitude,end station longitude,end station name,gender,start station id,start station latitude,start station longitude,start station name,starttime,stoptime,tripduration,usertype,yearmonth,begindate,begintime,startdate
0,201512-citibike-tripdata.zip,18797,1966,358,40.733,-74.007,Christopher St & Greenwich St,1,72,40.767,-73.994,W 52 St & 11 Ave,12/1/2015 07:44:49,12/1/2015 08:02:33,1063,Subscriber,201512,12/1/2015,07:44:49,2015-12-01
1,201512-citibike-tripdata.zip,14625,1985,505,40.749,-73.988,6 Ave & W 33 St,1,72,40.767,-73.994,W 52 St & 11 Ave,12/1/2015 08:02:29,12/1/2015 08:20:24,1075,Subscriber,201512,12/1/2015,08:02:29,2015-12-01
2,201512-citibike-tripdata.zip,21238,1968,525,40.756,-74.002,W 34 St & 11 Ave,1,72,40.767,-73.994,W 52 St & 11 Ave,12/1/2015 08:06:37,12/1/2015 08:11:30,293,Subscriber,201512,12/1/2015,08:06:37,2015-12-01
3,201512-citibike-tripdata.zip,19518,1960,484,40.755,-73.98,W 44 St & 5 Ave,2,72,40.767,-73.994,W 52 St & 11 Ave,12/1/2015 08:07:57,12/1/2015 08:21:30,812,Subscriber,201512,12/1/2015,08:07:57,2015-12-01
4,201512-citibike-tripdata.zip,22307,1980,520,40.76,-73.976,W 52 St & 5 Ave,1,72,40.767,-73.994,W 52 St & 11 Ave,12/1/2015 08:08:53,12/1/2015 08:18:05,551,Subscriber,201512,12/1/2015,08:08:53,2015-12-01


In [27]:
citibike_df['starthour'] = citibike_df['begintime'].str.slice(0, 2)

In [28]:
daily_df = citibike_df.groupby(['startdate']).tripduration.agg(['count','sum']).reset_index().set_index(['startdate'])
daily_df.sort_index(axis = 0) 
daily_df

Unnamed: 0_level_0,count,sum
startdate,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-12-01,18397,14218256
2015-12-02,23782,19273600
2015-12-03,33917,26762988
2015-12-04,34737,30333230
2015-12-05,26358,33569409
2015-12-06,24943,29453240
2015-12-07,34139,28896030
2015-12-08,35614,29257650
2015-12-09,35306,30789577
2015-12-10,39099,34549244


In [29]:
citibike_daily_bike_csv = os.path.join(cur_dir,'citibike_files','cleansed','citibike_trips_daily.csv')

In [30]:
if debug_mode == 'n':
    if not os.path.isfile(citibike_daily_bike_csv):
       daily_df.to_csv(citibike_daily_bike_csv, header='column_names', index=False)
    else: # else it exists so append without writing the header
       daily_df.to_csv(citibike_daily_bike_csv, mode='a', header=False, index=False)

In [31]:
# Extend analysis tostart hour
hourly_df = citibike_df.groupby(['startdate','starthour']).tripduration.agg(['count','sum']).reset_index()
hourly_df.set_index('startdate', inplace = True) 
hourly_df.sort_index(axis = 0) 
hourly_df.head()

Unnamed: 0_level_0,starthour,count,sum
startdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-12-01,0,136,109974
2015-12-01,1,54,52541
2015-12-01,2,27,42981
2015-12-01,3,15,14830
2015-12-01,4,19,9406


In [32]:
citibike_hourly_csv = os.path.join(cur_dir,'citibike_files','cleansed','citibike_trips_hourly.csv')

In [33]:
if debug_mode == 'n':
    if not os.path.isfile(citibike_hourly_csv):
       hourly_df.to_csv(citibike_hourly_csv, header='column_names', index=False)
    else: # else it exists so append without writing the header
       hourly_df.to_csv(citibike_hourly_csv, mode='a', header=False, index=False)

## Analyze customer data

In [34]:
citibike_df['gender'].value_counts()

1    568229
2    166368
0     69526
Name: gender, dtype: int64

In [35]:
citibike_df['birth year'].value_counts()

2020    67877
1984    29441
1985    29271
1983    28169
1986    27495
        ...  
1926       13
1918        8
1934        5
1917        2
1907        2
Name: birth year, Length: 78, dtype: int64

In [36]:
currentYear = datetime.now().year

In [37]:
citibike_df['rider age'] = currentYear - citibike_df['birth year']
citibike_df

Unnamed: 0,a_file,bikeid,birth year,end station id,end station latitude,end station longitude,end station name,gender,start station id,start station latitude,...,starttime,stoptime,tripduration,usertype,yearmonth,begindate,begintime,startdate,starthour,rider age
0,201512-citibike-tripdata.zip,18797,1966,358,40.733,-74.007,Christopher St & Greenwich St,1,72,40.767,...,12/1/2015 07:44:49,12/1/2015 08:02:33,1063,Subscriber,201512,12/1/2015,07:44:49,2015-12-01,07,54
1,201512-citibike-tripdata.zip,14625,1985,505,40.749,-73.988,6 Ave & W 33 St,1,72,40.767,...,12/1/2015 08:02:29,12/1/2015 08:20:24,1075,Subscriber,201512,12/1/2015,08:02:29,2015-12-01,08,35
2,201512-citibike-tripdata.zip,21238,1968,525,40.756,-74.002,W 34 St & 11 Ave,1,72,40.767,...,12/1/2015 08:06:37,12/1/2015 08:11:30,293,Subscriber,201512,12/1/2015,08:06:37,2015-12-01,08,52
3,201512-citibike-tripdata.zip,19518,1960,484,40.755,-73.980,W 44 St & 5 Ave,2,72,40.767,...,12/1/2015 08:07:57,12/1/2015 08:21:30,812,Subscriber,201512,12/1/2015,08:07:57,2015-12-01,08,60
4,201512-citibike-tripdata.zip,22307,1980,520,40.760,-73.976,W 52 St & 5 Ave,1,72,40.767,...,12/1/2015 08:08:53,12/1/2015 08:18:05,551,Subscriber,201512,12/1/2015,08:08:53,2015-12-01,08,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
804118,201512-citibike-tripdata.zip,22282,1957,261,40.695,-73.984,Johnson St & Gold St,1,3242,40.691,...,12/31/2015 16:50:17,12/31/2015 16:58:46,509,Subscriber,201512,12/31/2015,16:50:17,2015-12-31,16,63
804119,201512-citibike-tripdata.zip,22366,1978,258,40.689,-73.969,DeKalb Ave & Vanderbilt Ave,1,3242,40.691,...,12/31/2015 17:17:52,12/31/2015 17:27:43,591,Subscriber,201512,12/31/2015,17:17:52,2015-12-31,17,42
804120,201512-citibike-tripdata.zip,20663,1959,157,40.691,-73.996,Henry St & Atlantic Ave,1,3242,40.691,...,12/31/2015 17:56:22,12/31/2015 17:59:39,197,Subscriber,201512,12/31/2015,17:56:22,2015-12-31,17,61
804121,201512-citibike-tripdata.zip,19782,1987,467,40.683,-73.979,Dean St & 4 Ave,1,3242,40.691,...,12/31/2015 21:20:38,12/31/2015 21:26:50,372,Subscriber,201512,12/31/2015,21:20:38,2015-12-31,21,33


In [38]:
bins = [-1,1,18,25,45,65,100,1000]
citibike_df['age bracket'] = pd.cut(citibike_df['rider age'],bins)
citibike_df

Unnamed: 0,a_file,bikeid,birth year,end station id,end station latitude,end station longitude,end station name,gender,start station id,start station latitude,...,stoptime,tripduration,usertype,yearmonth,begindate,begintime,startdate,starthour,rider age,age bracket
0,201512-citibike-tripdata.zip,18797,1966,358,40.733,-74.007,Christopher St & Greenwich St,1,72,40.767,...,12/1/2015 08:02:33,1063,Subscriber,201512,12/1/2015,07:44:49,2015-12-01,07,54,"(45, 65]"
1,201512-citibike-tripdata.zip,14625,1985,505,40.749,-73.988,6 Ave & W 33 St,1,72,40.767,...,12/1/2015 08:20:24,1075,Subscriber,201512,12/1/2015,08:02:29,2015-12-01,08,35,"(25, 45]"
2,201512-citibike-tripdata.zip,21238,1968,525,40.756,-74.002,W 34 St & 11 Ave,1,72,40.767,...,12/1/2015 08:11:30,293,Subscriber,201512,12/1/2015,08:06:37,2015-12-01,08,52,"(45, 65]"
3,201512-citibike-tripdata.zip,19518,1960,484,40.755,-73.980,W 44 St & 5 Ave,2,72,40.767,...,12/1/2015 08:21:30,812,Subscriber,201512,12/1/2015,08:07:57,2015-12-01,08,60,"(45, 65]"
4,201512-citibike-tripdata.zip,22307,1980,520,40.760,-73.976,W 52 St & 5 Ave,1,72,40.767,...,12/1/2015 08:18:05,551,Subscriber,201512,12/1/2015,08:08:53,2015-12-01,08,40,"(25, 45]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
804118,201512-citibike-tripdata.zip,22282,1957,261,40.695,-73.984,Johnson St & Gold St,1,3242,40.691,...,12/31/2015 16:58:46,509,Subscriber,201512,12/31/2015,16:50:17,2015-12-31,16,63,"(45, 65]"
804119,201512-citibike-tripdata.zip,22366,1978,258,40.689,-73.969,DeKalb Ave & Vanderbilt Ave,1,3242,40.691,...,12/31/2015 17:27:43,591,Subscriber,201512,12/31/2015,17:17:52,2015-12-31,17,42,"(25, 45]"
804120,201512-citibike-tripdata.zip,20663,1959,157,40.691,-73.996,Henry St & Atlantic Ave,1,3242,40.691,...,12/31/2015 17:59:39,197,Subscriber,201512,12/31/2015,17:56:22,2015-12-31,17,61,"(45, 65]"
804121,201512-citibike-tripdata.zip,19782,1987,467,40.683,-73.979,Dean St & 4 Ave,1,3242,40.691,...,12/31/2015 21:26:50,372,Subscriber,201512,12/31/2015,21:20:38,2015-12-31,21,33,"(25, 45]"


In [39]:
customer_df = citibike_df.groupby(['startdate','gender','age bracket','usertype']).tripduration.agg(['count']).reset_index()
customer_df

Unnamed: 0,startdate,gender,age bracket,usertype,count
0,2015-12-01,0,"(-1, 1]",Customer,441
1,2015-12-01,0,"(-1, 1]",Subscriber,0
2,2015-12-01,0,"(1, 18]",Customer,0
3,2015-12-01,0,"(1, 18]",Subscriber,0
4,2015-12-01,0,"(18, 25]",Customer,0
...,...,...,...,...,...
1297,2015-12-31,2,"(45, 65]",Subscriber,1313
1298,2015-12-31,2,"(65, 100]",Customer,0
1299,2015-12-31,2,"(65, 100]",Subscriber,185
1300,2015-12-31,2,"(100, 1000]",Customer,0


In [40]:
citibike_customer_csv = os.path.join(cur_dir,'citibike_files','cleansed','citibike_customer.csv')

In [41]:
if debug_mode == 'n':
    #https://stackoverflow.com/questions/30991541/pandas-write-csv-append-vs-write
    if not os.path.isfile(citibike_customer_csv):
       customer_df.to_csv(citibike_customer_csv, header='column_names', index=False)
    else: # else it exists so append without writing the header
       customer_df.to_csv(citibike_customer_csv, mode='a', header=False, index=False)


## Analyze bike stations

In [42]:
start_stations_df = citibike_df.drop_duplicates(subset=["start station id", "start station latitude","start station longitude","start station name"])
start_stations_df = start_stations_df[["start station id", "start station latitude","start station longitude","start station name"]]
start_stations_df = pd.DataFrame(start_stations_df)
start_stations_df.columns = ["station id", "station latitude","station longitude","station name"]
start_stations_df

Unnamed: 0,station id,station latitude,station longitude,station name
0,72,40.767,-73.994,W 52 St & 11 Ave
1646,79,40.719,-74.007,Franklin St & W Broadway
3225,82,40.711,-74.000,St James Pl & Pearl St
3938,83,40.684,-73.976,Atlantic Ave & Fort Greene Pl
4655,116,40.742,-74.001,W 17 St & 8 Ave
...,...,...,...,...
801403,3236,40.759,-73.994,PABT Valet
803500,3237,40.754,-73.943,21 St & 41 Ave
803622,3238,40.774,-73.954,E 80 St & 2 Ave
804095,3241,40.686,-73.945,Monroe St & Tompkins Ave


In [43]:
end_stations_df = citibike_df.drop_duplicates(subset=["end station id", "end station latitude","end station longitude","end station name"])
end_stations_df = end_stations_df[["end station id", "end station latitude","end station longitude","end station name"]]
end_stations_df = pd.DataFrame(end_stations_df)
end_stations_df.columns = ["station id", "station latitude","station longitude","station name"]
end_stations_df

Unnamed: 0,station id,station latitude,station longitude,station name
0,358,40.733,-74.007,Christopher St & Greenwich St
1,505,40.749,-73.988,6 Ave & W 33 St
2,525,40.756,-74.002,W 34 St & 11 Ave
3,484,40.755,-73.980,W 44 St & 5 Ave
4,520,40.760,-73.976,W 52 St & 5 Ave
...,...,...,...,...
127068,3017,40.751,-73.997,NYCBS Depot - FAR
151789,3128,40.751,-73.946,21 St & 43 Ave
152456,3059,40.693,-73.940,Pulaski St & Marcus Garvey Blvd
578241,3187,40.721,-74.038,Warren St


In [44]:
# distinct_stations_df = start_stations_df.append(end_stations_df)
# distinct_stations_df = distinct_stations_df.drop_duplicates(subset=["station id", "station latitude","station longitude","station name"])
# distinct_stations_df = distinct_stations_df.set_index('station id', inplace = True)
# #distinct_stations_df.sort_index(axis = 0) 
# distinct_stations_df

In [45]:
distinct_stations_df = pd.DataFrame(start_stations_df.append(end_stations_df))
distinct_stations_df

Unnamed: 0,station id,station latitude,station longitude,station name
0,72,40.767,-73.994,W 52 St & 11 Ave
1646,79,40.719,-74.007,Franklin St & W Broadway
3225,82,40.711,-74.000,St James Pl & Pearl St
3938,83,40.684,-73.976,Atlantic Ave & Fort Greene Pl
4655,116,40.742,-74.001,W 17 St & 8 Ave
...,...,...,...,...
127068,3017,40.751,-73.997,NYCBS Depot - FAR
151789,3128,40.751,-73.946,21 St & 43 Ave
152456,3059,40.693,-73.940,Pulaski St & Marcus Garvey Blvd
578241,3187,40.721,-74.038,Warren St


In [46]:
distinct_stations_df = distinct_stations_df.drop_duplicates(subset=["station id", "station latitude","station longitude","station name"])
distinct_stations_df

Unnamed: 0,station id,station latitude,station longitude,station name
0,72,40.767,-73.994,W 52 St & 11 Ave
1646,79,40.719,-74.007,Franklin St & W Broadway
3225,82,40.711,-74.000,St James Pl & Pearl St
3938,83,40.684,-73.976,Atlantic Ave & Fort Greene Pl
4655,116,40.742,-74.001,W 17 St & 8 Ave
...,...,...,...,...
58658,3219,40.729,-73.977,NYCBS Depot - STY
65355,255,40.647,-74.017,NYCBS Depot - SSP
127068,3017,40.751,-73.997,NYCBS Depot - FAR
578241,3187,40.721,-74.038,Warren St


In [48]:
distinct_stations_df.set_index('station id', inplace = True)
distinct_stations_df.head()

AttributeError: 'NoneType' object has no attribute 'set_index'

In [None]:
distinct_stations_df.sort_index(axis = 0) 
distinct_stations_df

In [None]:
citibike_distinct_station_csv = os.path.join(cur_dir,'citibike_files','cleansed','citibike_distinct_station.csv')

In [None]:
if debug_mode == 'n':
    if not os.path.isfile(citibike_distinct_station_csv):
       start_stations_df.to_csv(citibike_distinct_station_csv, header='column_names', index=False)
    else: # else it exists so append without writing the header
       start_stations_df.to_csv(citibike_distinct_station_csv, mode='a', header=False, index=False)

In [None]:
start_station_trips_df = citibike_df.groupby(['startdate','start station id']).tripduration.agg(['count']).reset_index()
start_station_trips_df = start_station_trips_df.set_index(['startdate'])

In [None]:
citibike_start_station_csv = os.path.join(cur_dir,'citibike_files','cleansed','citibike_start_station.csv')

In [None]:
if debug_mode == 'n':
    if not os.path.isfile(citibike_start_station_csv):
       start_stations_df.to_csv(citibike_start_station_csv, header='column_names', index=False)
    else: # else it exists so append without writing the header
       start_stations_df.to_csv(citibike_start_station_csv, mode='a', header=False, index=False)

## Analyze bike equipment

In [None]:
bike_equipment_df = citibike_df.groupby(['bikeid']).tripduration.agg(['count','sum']).reset_index()
bike_equipment_df = bike_equipment_df.set_index('bikeid')
bike_equipment_df = pd.DataFrame(bike_equipment_df)

In [None]:
bike_date_df = citibike_df.groupby(['bikeid']).startdate.agg(['min','max']).reset_index()
bike_date_df = bike_date_df.set_index(['bikeid'])
bike_date_df = pd.DataFrame(bike_date_df)

In [None]:
bike_merged_df = pd.merge(bike_date_df, bike_equipment_df, left_index=True, right_index=True)

In [None]:
citibike_bike_equipment_csv = os.path.join(cur_dir,'citibike_files','cleansed','citibike_bike_date.csv')

In [None]:
if debug_mode == 'n':
    if not os.path.isfile(citibike_bike_equipment_csv):
       bike_merged_df.to_csv(citibike_bike_equipment_csv, header='column_names', index=False)
    else: # else it exists so append without writing the header
       bike_merged_df.to_csv(citibike_bike_equipment_csv, mode='a', header=False, index=False)

## Cleanup memory for next run

In [None]:
del [[citibike_df,customer_df, distinct_stations_df,start_stations_df,end_stations_df]]
del [[bike_equipment_df, bike_date_df, bike_merged_df]]
gc.collect()
citibike_df = []
customer_df = []
distinct_stations_df = []
start_stations_df = []
end_stations_df = []
bike_equipment_df = []
bike_date_df = []
bike_merged_df = []

In [None]:
# print end 
print(data_year)
execEndDateTime = datetime.now()
print(execStartDateTime)
print(execEndDateTime)