Data in-processing and cleaning script

In [53]:
import numpy as np
import pandas as pd
from zipfile import ZipFile
import os
import shutil
import pyarrow as pa
import pyarrow.parquet as pq
import datetime

Source data from Capital Bikeshare data repos: https://s3.amazonaws.com/capitalbikeshare-data/index.html

In [2]:
# Names of the downloaded archives in the raw-data directory.
# Only ".zip" entries are kept: the extraction cell opens every entry with
# ZipFile, so a stray non-archive file (e.g. .DS_Store) or sub-directory
# would abort the loop. Entries stay as bare file names because the
# extraction cell prepends the directory itself; sorting makes runs repeatable.
zips = sorted(
    entry
    for entry in os.listdir(".././rawdata")
    if entry.lower().endswith(".zip")
)

In [3]:
# Unpack the trip-data members of every archive into the "temp" directory.
# A member qualifies when its name ends in ".csv" or in "tripdata"
# (the latter covers members stored without a file extension).
for archive_name in zips:
    archive_path = ".././rawdata/" + archive_name
    with ZipFile(archive_path, "r") as archive:
        wanted = [
            member
            for member in archive.namelist()
            if member.endswith(('.csv', 'tripdata'))
        ]
        for member in wanted:
            archive.extract(member, 'temp')

In [4]:
# Append ".csv" to any extracted file whose name ends in "tripdata",
# so every extract in ./temp carries a .csv suffix.
for entry in os.listdir("./temp"):
    if not entry.endswith('tripdata'):
        continue
    extract_path = "./temp/" + entry
    os.rename(extract_path, extract_path + ".csv")

In [5]:
# Delete ./temp/__MACOSX (presumably macOS archive metadata - confirm).
# Any OSError, including the folder not existing, is reported rather than
# raised so the notebook keeps running.
try:
    shutil.rmtree("./temp/__MACOSX")
except OSError as err:
    print("Error: %s - %s." % (err.filename, err.strerror))

In [6]:
temp_dir = "./temp"
# Full paths of every entry in temp_dir.
# os.listdir returns entries in arbitrary order, so the list is sorted to
# make the row order of the concatenated frame reproducible between runs.
# (The previous stand-alone os.path.join(...)[0] expression was removed: its
# result was discarded and it raised IndexError on an empty directory.)
temp_files = sorted(os.path.join(temp_dir, fn) for fn in os.listdir(temp_dir))

In [7]:
# Read each extracted CSV and stack the frames into one. Every file keeps
# its own row labels, so index values repeat across files.
df = pd.concat(map(pd.read_csv, temp_files))

In [8]:
# Summarise the combined frame: row count, column names and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8396576 entries, 0 to 251542
Data columns (total 23 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Duration              float64
 1   Start date            object 
 2   End date              object 
 3   Start station number  float64
 4   Start station         object 
 5   End station number    float64
 6   End station           object 
 7   Bike number           object 
 8   Member type           object 
 9   ride_id               object 
 10  rideable_type         object 
 11  started_at            object 
 12  ended_at              object 
 13  start_station_name    object 
 14  start_station_id      float64
 15  end_station_name      object 
 16  end_station_id        float64
 17  start_lat             float64
 18  start_lng             float64
 19  end_lat               float64
 20  end_lng               float64
 21  member_casual         object 
 22  is_equity             object 
dtypes: float

The column format changes partway through the data, so every row has null values in the columns that belong to the other format. We can split the frame by format, align the column names, and recombine the pieces into a single format for analysis.

In [23]:
# Columns used by the older file format; keep only the rows in which all
# six of them are populated.
early_cols = [
    'Start date', 'End date',
    'Start station number', 'Start station',
    'End station number', 'End station',
]
dfearly = df[early_cols].dropna(axis=0)

In [24]:
# (rows, columns) of the older-format subset after dropping incomplete rows.
dfearly.shape

(7478501, 6)

In [25]:
# Columns used by the newer file format; keep only the rows in which all
# six of them are populated.
late_cols = [
    'started_at', 'ended_at',
    'start_station_id', 'start_station_name',
    'end_station_id', 'end_station_name',
]
dflate = df[late_cols].dropna(axis=0)

In [26]:
# (rows, columns) of the newer-format subset after dropping incomplete rows.
dflate.shape

(884941, 6)

In [28]:
# Give the older-format frame the newer-format column names so the two
# frames stack column-for-column (both select their six fields in the same
# positional order: start time, end time, start id/name, end id/name).
dfearly.columns = dflate.columns
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent and likewise keeps each frame's
# original row labels.
# NOTE(review): drop_duplicates compares only these six columns, so distinct
# trips sharing identical times and stations would collapse into one row -
# confirm that is intended.
# NOTE(review): the head/tail previews show 5-digit station ids in early
# rows and 3-digit ids in late rows - verify the two id schemes are
# comparable before joining or grouping on the id columns.
df_fixed = (
    pd.concat([dfearly, dflate])
    .dropna(axis=0)
    .drop_duplicates(keep='last')
)
df_fixed.shape

(8343849, 6)

In [29]:
# Preview the first rows of the unified frame.
df_fixed.head()

Unnamed: 0,started_at,ended_at,start_station_id,start_station_name,end_station_id,end_station_name
0,2018-01-01 00:05:06,2018-01-01 00:14:18,31104.0,Adams Mill & Columbia Rd NW,31400.0,Georgia & New Hampshire Ave NW
1,2018-01-01 00:14:30,2018-01-01 00:35:53,31321.0,15th St & Constitution Ave NW,31321.0,15th St & Constitution Ave NW
2,2018-01-01 00:14:53,2018-01-01 00:35:58,31321.0,15th St & Constitution Ave NW,31321.0,15th St & Constitution Ave NW
3,2018-01-01 00:15:31,2018-01-01 00:25:09,31406.0,14th & Upshur St NW,31103.0,16th & Harvard St NW
4,2018-01-01 00:18:02,2018-01-01 00:24:15,31618.0,4th & East Capitol St NE,31619.0,Lincoln Park / 13th & East Capitol St NE


In [35]:
# Preview the last rows of the unified frame.
df_fixed.tail()

Unnamed: 0,started_at,ended_at,start_station_id,start_station_name,end_station_id,end_station_name
251537,2020-08-10 20:45:45,2020-08-10 20:51:40,199.0,10th & K St NW,630.0,7th & S St NW
251538,2020-08-26 22:16:20,2020-08-26 22:35:08,33.0,1st & N St SE,642.0,Edgewood Rec Center
251539,2020-08-07 18:43:03,2020-08-07 18:55:53,642.0,Edgewood Rec Center,137.0,3rd & H St NE
251541,2020-08-28 07:22:41,2020-08-28 07:25:28,21.0,4th & M St SW,548.0,Maine Ave & 9th St SW
251542,2020-08-01 22:36:48,2020-08-01 22:44:02,140.0,Lincoln Park / 13th & East Capitol St NE,591.0,Massachusetts Ave & 6th St NE


Write this file to parquet format for easier sharing / processing.

In [30]:
# Convert the unified pandas frame into a pyarrow Table.
tripdata_table = pa.Table.from_pandas(df_fixed)

In [31]:
# Write the trip table to a parquet file in the data directory.
pq.write_table(tripdata_table, ".././data/tripdata.parquet")

Pull out only the duration data by day

In [41]:
# Take an independent copy of just the two timestamp columns, so later
# column edits do not touch df_fixed.
duration = df_fixed.loc[:, ['started_at', 'ended_at']].copy()
duration.head()

Unnamed: 0,started_at,ended_at
0,2018-01-01 00:05:06,2018-01-01 00:14:18
1,2018-01-01 00:14:30,2018-01-01 00:35:53
2,2018-01-01 00:14:53,2018-01-01 00:35:58
3,2018-01-01 00:15:31,2018-01-01 00:25:09
4,2018-01-01 00:18:02,2018-01-01 00:24:15


In [42]:
# Preview the last rows of the timestamp-only frame.
duration.tail()

Unnamed: 0,started_at,ended_at
251537,2020-08-10 20:45:45,2020-08-10 20:51:40
251538,2020-08-26 22:16:20,2020-08-26 22:35:08
251539,2020-08-07 18:43:03,2020-08-07 18:55:53
251541,2020-08-28 07:22:41,2020-08-28 07:25:28
251542,2020-08-01 22:36:48,2020-08-01 22:44:02


In [None]:
# Parse the raw timestamp columns into datetime columns under new names.
for raw_col, parsed_col in (('started_at', 'trip_start'), ('ended_at', 'trip_end')):
    duration[parsed_col] = pd.to_datetime(duration[raw_col])

In [47]:
# Elapsed time of each trip: end timestamp minus start timestamp.
duration['duration'] = duration['trip_end'].sub(duration['trip_start'])

In [49]:
# Drop the raw string timestamps now that parsed versions exist.
# Rebinding (instead of inplace=True) with errors='ignore' keeps this cell
# idempotent: re-running it after the columns are gone no longer raises
# KeyError.
duration = duration.drop(columns=['started_at', 'ended_at'], errors='ignore')
duration.head()

Unnamed: 0,trip_start,trip_end,duration
0,2018-01-01 00:05:06,2018-01-01 00:14:18,00:09:12
1,2018-01-01 00:14:30,2018-01-01 00:35:53,00:21:23
2,2018-01-01 00:14:53,2018-01-01 00:35:58,00:21:05
3,2018-01-01 00:15:31,2018-01-01 00:25:09,00:09:38
4,2018-01-01 00:18:02,2018-01-01 00:24:15,00:06:13


Save the duration data

In [58]:
# Express each trip's timedelta as a float number of seconds.
duration['duration_seconds'] = duration['duration'].dt.total_seconds()

In [62]:
# Drop the timedelta column; only the numeric seconds column is kept.
# Rebinding (instead of inplace=True) with errors='ignore' keeps this cell
# idempotent: re-running it after the column is gone no longer raises
# KeyError.
duration = duration.drop(columns=['duration'], errors='ignore')
duration.head()

Unnamed: 0,trip_start,trip_end,duration_seconds
0,2018-01-01 00:05:06,2018-01-01 00:14:18,552.0
1,2018-01-01 00:14:30,2018-01-01 00:35:53,1283.0
2,2018-01-01 00:14:53,2018-01-01 00:35:58,1265.0
3,2018-01-01 00:15:31,2018-01-01 00:25:09,578.0
4,2018-01-01 00:18:02,2018-01-01 00:24:15,373.0


In [63]:
# Convert the duration frame into a pyarrow Table.
duration_table = pa.Table.from_pandas(duration)

In [64]:
# Write the duration table to a parquet file in the data directory.
pq.write_table(duration_table, ".././data/duration-data.parquet")

Clean up the temp files and directory

In [17]:
# Delete the ./temp working directory and everything inside it.
# Any OSError is reported rather than raised.
try:
    shutil.rmtree("./temp")
except OSError as err:
    print("Error: %s - %s." % (err.filename, err.strerror))