# Loading and Cleaning the Data

## Step 1: Importing Libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt 
import pickle

## Step 2: Reading in the Turnstile Data

In [32]:
# Define the weeks to retrieve from the MTA database
def list_weeks(list_of_dates):
    weeks = []
    for date in list_of_dates:
        week_string = date.strftime('%y%m%d')
        weeks.append(week_string)
    return weeks        

# We want to pull in data from a week each month January - June in 2018 and 2019
# dates_
# dates = list(map(dt.datetime,))
dates = [dt.datetime(2018,1,13),dt.datetime(2018,2,10),dt.datetime(2018,3,10),dt.datetime(2018,4,14),\
         dt.datetime(2018,5,12),dt.datetime(2018,6,9),\
         dt.datetime(2019,1,12),dt.datetime(2019,2,9),dt.datetime(2019,3,9),dt.datetime(2019,4,13),\
        dt.datetime(2019,5,11),dt.datetime(2019,6,8)]
weeks = list_weeks(dates)
weeks

['180113',
 '180210',
 '180310',
 '180414',
 '180512',
 '180609',
 '190112',
 '190209',
 '190309',
 '190413',
 '190511',
 '190608']

In [33]:
# Read in the data for the desired weeks
def readTurnstileData(week_string):
    """
    This function reads in data from an online MTA Turnstile dataset into a DataFrame
    ---
    input: link to dataset
    output: DataFrame
    """
    cols = ['control_area','unit','scp','station','line_name','division','date','time',
        'desc','entries','exits']
    
    link = 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_' + week_string + '.txt'
    turnstile_data = pd.read_csv(link, header = 0, names = cols)
    return turnstile_data

# Create a single DataFrame containing all weeks
def createTurnstileDataFrame(list_of_weeks):
    df = pd.DataFrame()
    df_chunk_list = []
    for date in list_of_weeks:
        df_chunk = readTurnstileData(date)
        df_chunk_list.append(df_chunk)
    df = pd.concat(df_chunk_list)
    return df

# Clean up date/time info
def formatDateTime(df):
    """
    This function converts the date and time into DateTime format in a single column
    and deletes the unformatted date and time columns
    
    Note: only run once per DataFrame, will result in error otherwise
    ---
    input: DataFrame
    output: DataFrame
    """
    
    #convert date and time to DateTime format in a single column
    df['concat_date_time'] = df['date'] + ' ' + df['time']
    df['date_time'] = pd.to_datetime(df.concat_date_time,format = '%m/%d/%Y %H:%M:%S')
    
    #delete unformatted date and time columns
    del df['concat_date_time']
    del df['date']
    del df['time']
    return df

def read_and_format_turnstile_data(list_of_weeks):
    """
    This function reads in Turnstile data from online and returns a DataFrame with 
    with the date and time information converted to a single DateTime column
    ---
    input: link
    output: DataFrame
    """
    df = createTurnstileDataFrame(list_of_weeks)
    df = formatDateTime(df)
    return df

In [84]:
# Read in the data from the specified weeks
orig_df = read_and_format_turnstile_data(weeks)

In [85]:
# Create a column for weekday
orig_df['weekday'] = orig_df[['date_time']].apply(lambda x: x['date_time'].dayofweek,axis=1)

In [86]:
# Create a column for the time of day
orig_df['hour'] = orig_df[['date_time']].apply(lambda x: x['date_time'].hour,axis=1)

In [87]:
# Create a column for year
orig_df['year'] = orig_df[['date_time']].apply(lambda x: x['date_time'].year,axis=1)

In [88]:
# Create a column with unique ID for each turnstile
orig_df['turnstile_id'] = orig_df.groupby(['control_area','unit','scp','station','year']).ngroup()

In [93]:
# Reset indexing on the DataFrame
orig_df.reset_index(inplace=True)

In [95]:
# Delete columns that won't be used
orig_df.drop(columns=['index','control_area','unit','scp','line_name','division','desc'],inplace=True)

In [104]:
# Save raw data to access easily later
with open('data/orig_df.pickle', 'wb') as to_write:
    pickle.dump(orig_df, to_write)

## Step 2ish: Read in Station Location Data

## Step 2.5: read in data from pickle

In [4]:
# Read in the data from the pickle file if you are starting here
# If you started from the beginning of the notebook, comment out the code below

with open('data/orig_df.pickle','rb') as read_file:
    orig_df = pickle.load(read_file)
df = orig_df

## Step 3: Organizing the Data

In [5]:
df.shape

(2409320, 8)

In [None]:
# Create columns for latitude/longitude and borough of each turnstile

## Step 3: Cleaning the Data

In [106]:
# Sort DataFrame by turnstile and date
df.sort_values(['turnstile_id','date_time'],inplace=True)
df.reset_index(drop=True,inplace=True)

# Find entries and exits differences per turnstile
df['exit_counts'] = abs(df.groupby('turnstile_id').exits.diff())
df['entry_counts'] = abs(df.groupby('turnstile_id').entries.diff())

In [107]:
df.describe()

Unnamed: 0,entries,exits,weekday,hour,year,turnstile_id,exit_counts,entry_counts
count,2409320.0,2409320.0,2409320.0,2409320.0,2409320.0,2409320.0,2399633.0,2399633.0
mean,40212080.0,33060110.0,2.99022,11.12374,2018.507,4856.233,25065.54,35223.21
std,208011500.0,192436600.0,1.99799,6.925758,0.4999505,2802.334,5185810.0,6675077.0
min,0.0,0.0,0.0,0.0,2018.0,0.0,0.0,0.0
25%,458752.0,203740.0,1.0,5.0,2018.0,2445.0,9.0,11.0
50%,2426018.0,1377258.0,3.0,11.0,2019.0,4840.0,53.0,77.0
75%,6862688.0,4774284.0,5.0,17.0,2019.0,7292.0,174.0,255.0
max,2130144000.0,2145850000.0,6.0,23.0,2019.0,9686.0,2145033000.0,2088495000.0


Max values on exit and entry counts are way too high

In [108]:
def findOutliers(df,column):
    """
    Returns outliers above the max limit for a column in a dataframe
    Adjust outlier cutoff to q75 + 4*iqr to include more data
    ---
    input: DataFrame, column
    output: DataFrame
    """
    q25,q50,q75 = df[column].quantile(q=[0.25,0.5,0.75])
    iqr = q75-q25
    #max limits to be considered an outlier
    max = q75 + 4*iqr
    #identify the points
    outlier_mask = [True if x > max else False for x in df[column]]
    print('{} outliers found out of {} data points, {}% of the data'.format(sum(outlier_mask),len(df[column]),100*(sum(outlier_mask)/len(df[column]))))
    return outlier_mask

In [109]:
#Get outliers for entries
print('Entry: ')
df['entry_outliers'] = findOutliers(df,'entry_counts')

#Get outliers for exits
print('\nExit:')
df['exit_outliers'] = findOutliers(df,'exit_counts')

#DataFrame with entry outliers removed
clean_df_entries = df.loc[~df['entry_outliers']]
print('\n{} points on entry side left after removing entry_counts outlier points'.format(clean_df_entries.shape[0]))

#DataFrame with exit outliers removed
clean_df_exits = df.loc[~df['exit_outliers']]
print('{} points on exit side left left after removing exit_counts outlier points'.format(clean_df_exits.shape[0]))


Entry: 
64486 outliers found out of 2409320 data points, 2.676522836318961% of the data

Exit:
92339 outliers found out of 2409320 data points, 3.832575166437003% of the data

2344834 points on entry side left after removing entry_counts outlier points
2316981 points on exit side left left after removing exit_counts outlier points


In [110]:
# Eliminate Null values
# Delete rows with null values for entry_counts
clean_df_entries = clean_df_entries[~clean_df_entries.entry_counts.isnull()]
print('{} points left after removing entry_counts Nan values'.format(clean_df_entries.shape[0]))

# Delete rows with null values for exit_counts
clean_df_entries = clean_df_entries[~clean_df_entries.exit_counts.isnull()]
print('{} points left after removing entry_counts Nan values'.format(clean_df_exits.shape[0]))


2335147 points left after removing entry_counts Nan values
2316981 points left after removing entry_counts Nan values


In [111]:
# Merge the two DataFrames
clean_df = clean_df_entries.merge(clean_df_exits,left_on=list(clean_df_entries.columns), \
                                  right_on=list(clean_df_exits.columns),how='inner')
#Add a column for total traffic at each turnstile
clean_df['total_traffic'] = clean_df['entry_counts'] + clean_df['exit_counts']

In [112]:
print('{} rows left after cleaning the data, {}% of the original'.format(clean_df.shape[0],100*(clean_df.shape[0]/df.shape[0])))

2284157 rows left after cleaning the data, 94.80504872744177% of the original


In [113]:
# Delete outlier ID columns
# Only run this cell once
del clean_df['entry_outliers']
del clean_df['exit_outliers']

In [114]:
# Save cleaned data to access easily later
with open('data/clean_df.pickle', 'wb') as to_write:
    pickle.dump(clean_df, to_write)


## Step 4.5: Read in the cleaned data

In [51]:
# Read in the data from the pickle file if you are starting here
# If you started from the beginning of the notebook, comment out the code below

import pickle

with open('data/clean_df.pickle','rb') as read_file:
    clean_df = pickle.load(read_file)