## Step 1: Importing Libraries

In [288]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt 

## Step 2: Reading in and Organizing the Data

In [None]:
# Define the weeks to retrieve from the MTA database
def weeks_list(list_of_dates):
    """
    Unfinished stub for defining the weeks to retrieve from the MTA database

    NOTE(review): the body is a bare call to `strptime()`, a name that the cells
    visible here never define or import (datetime is imported as `dt`), so calling
    this function as written raises a NameError. `list_of_dates` is not used yet.
    TODO: confirm the expected input format and the return value, then implement.
    ---
    input: list_of_dates (format not established here)
    output: nothing is returned yet
    """
    strptime()

# We want to pull in data from a week in mid-January and mid-May of 2018 and 2019


In [60]:
def readTurnstileData(link):
    """
    Loads an MTA Turnstile dataset into a DataFrame, replacing the header row
    of the file with the snake_case column names used throughout this notebook
    ---
    input: link to dataset (anything pd.read_csv accepts)
    output: DataFrame
    """
    column_names = [
        'control_area',
        'unit',
        'scp',
        'station',
        'line_name',
        'division',
        'date',
        'time',
        'desc',
        'entries',
        'exits',
    ]

    # header=0 consumes the first row of the file; names= overrides its labels
    return pd.read_csv(link, names=column_names, header=0)

def formatDateTime(df):
    """
    This function converts the date and time into DateTime format in a single
    'date_time' column and deletes the unformatted 'date' and 'time' columns
    
    Note: the DataFrame is modified in place and also returned. Running the
    function again on an already formatted DataFrame is a no-op, so the cell
    can safely be re-executed.
    ---
    input: DataFrame with string columns 'date' (mm/dd/YYYY) and 'time' (HH:MM:SS)
    output: DataFrame (the same object that was passed in)
    """
    
    #already formatted by an earlier call: nothing left to convert
    already_formatted = ('date_time' in df.columns
                         and 'date' not in df.columns
                         and 'time' not in df.columns)
    if already_formatted:
        return df
    
    #convert date and time to DateTime format in a single column
    df['date_time'] = pd.to_datetime(df['date'] + ' ' + df['time'],
                                     format = '%m/%d/%Y %H:%M:%S')
    
    #delete unformatted date and time columns
    df.drop(columns=['date', 'time'], inplace=True)
    return df

def read_and_format_turnstile_data(link):
    """
    Loads an online Turnstile dataset and returns it as a DataFrame in which the
    date and time information has been combined into a single DateTime column
    ---
    input: link
    output: DataFrame
    """
    # read first, then hand the raw frame straight to the formatter
    return formatDateTime(readTurnstileData(link))

In [115]:
# Read in one of the turnstile datasets
# NOTE(review): the file name suggests the week ending 2019-09-21 -- confirm against the MTA listing
turnstile_data_link_1 = 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_190921.txt'
# Downloads the file and combines its date/time columns into a single 'date_time' column
turnstile_data_1 = read_and_format_turnstile_data(turnstile_data_link_1)
# `df` is a second name for the same DataFrame object (no copy is made), so the
# in-place changes made to `df` in later cells also change `turnstile_data_1`
df = turnstile_data_1

In [116]:
# Create a column for weekday (Monday=0 ... Sunday=6)
# Vectorized .dt accessor instead of a row-by-row apply: same values, much faster
df['weekday'] = df['date_time'].dt.dayofweek

In [117]:
# Create a column for year
# Vectorized .dt accessor instead of a row-by-row apply: same values, much faster
df['year'] = df['date_time'].dt.year

In [120]:
# Give every distinct (control_area, unit, scp, station, year) combination
# its own integer ID; rows sharing all five values get the same turnstile_id
df['turnstile_id'] = df.groupby(
    by=['control_area', 'unit', 'scp', 'station', 'year']
).ngroup()

## Step 3: Cleaning the Data

In [163]:
# Order the rows by turnstile and then chronologically, and renumber the index
df.sort_values(by=['turnstile_id', 'date_time'], inplace=True)
df.reset_index(inplace=True, drop=True)

# Absolute change in the exits / entries readings between consecutive rows of the
# same turnstile; the first row of each turnstile has no predecessor and is NaN
df['exit_counts'] = df.groupby('turnstile_id')['exits'].diff().abs()
df['entry_counts'] = df.groupby('turnstile_id')['entries'].diff().abs()
    

In [194]:
# Summary statistics of the numeric columns, shown to sanity-check the computed counts
df.describe()

Unnamed: 0,entries,exits,weekday,year,turnstile_id,exit_counts,entry_counts
count,204928.0,204928.0,204928.0,204928.0,204928.0,200039.0,200039.0
mean,42105050.0,34874230.0,2.996438,2019.0,2443.410051,3718.126,989.2405
std,215575700.0,199548800.0,1.999035,0.0,1413.229378,1569443.0,237730.9
min,0.0,0.0,0.0,2019.0,0.0,0.0,0.0
25%,289809.2,129434.8,1.0,2019.0,1220.0,9.0,9.0
50%,2040950.0,1179450.0,3.0,2019.0,2435.0,56.0,78.0
75%,6618936.0,4535505.0,5.0,2019.0,3674.0,180.0,255.0
max,2129093000.0,2123772000.0,6.0,2019.0,4888.0,701915000.0,100260100.0


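The maximum values of `exit_counts` and `entry_counts` (hundreds of millions in a single interval) are implausibly high, so the outliers need to be removed before analysis.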

In [239]:
def findOutliers(df,column):
    """
    Flags the values of a column that lie above the upper outlier limit
    (third quartile + 1.5 * interquartile range) and prints how many were found
    
    Note: missing values (NaN) are never flagged as outliers
    ---
    input: DataFrame, column name
    output: list of booleans, one per row of the DataFrame (True = outlier)
    """
    q25,q75 = df[column].quantile(q=[0.25,0.75])
    iqr = q75-q25
    #upper limit above which a value is considered an outlier
    upper_limit = q75 + 1.5*iqr
    #identify the points (vectorized; NaN > limit evaluates to False)
    outlier_mask = (df[column] > upper_limit).tolist()
    print('{} outliers found out of {} data points'.format(sum(outlier_mask),len(df[column])))
    return outlier_mask

In [284]:
#Get outliers for entries
print('Entry: ')
df['entry_outliers'] = findOutliers(df,'entry_counts')
print('{} points left after removing entry_counts outlier points'.format((~df['entry_outliers']).sum()))

#Get outliers for exits
print('\nExit:')
df['exit_outliers'] = findOutliers(df,'exit_counts')
print('{} points left after removing exit_counts outlier points'.format((~df['exit_outliers']).sum()))

#Build the filtered DataFrames only after BOTH flag columns exist, so that the
#two frames have identical columns (on a fresh kernel the entries frame would
#otherwise lack 'exit_outliers', which breaks a merge on all columns)
#DataFrame with entry outliers removed
clean_df_entries = df.loc[~df['entry_outliers']]
#DataFrame with exit outliers removed
clean_df_exits = df.loc[~df['exit_outliers']]


Entry: 
13597 outliers found out of 204928 data points
191331 points left after removing entry_counts outlier points

Exit:
17196 outliers found out of 204928 data points
187732 points left after removing exit_counts outlier points


In [277]:
# Eliminate Null values
# (the first reading of every turnstile has no previous reading, so its count is NaN)
# Delete rows with null values for entry_counts
clean_df_entries = clean_df_entries[clean_df_entries['entry_counts'].notnull()]
print('{} points left after removing entry_counts NaN values'.format(clean_df_entries.shape[0]))

# Delete rows with null values for exit_counts
# (previously this repeated the entry_counts filter and left clean_df_exits untouched)
clean_df_exits = clean_df_exits[clean_df_exits['exit_counts'].notnull()]
print('{} points left after removing exit_counts NaN values'.format(clean_df_exits.shape[0]))


186442 points left after removing entry_counts Nan values


In [287]:
# Merge the two DataFrames
# Keep only the rows present in both cleaned frames by joining on every column
# they share. Using the shared columns (instead of each frame's full column list
# as left_on / right_on) gives the same result when the columns are identical and
# no longer raises a ValueError when one frame has an extra column.
shared_cols = [col for col in clean_df_entries.columns if col in clean_df_exits.columns]
clean_df = clean_df_entries.merge(clean_df_exits, on=shared_cols, how='inner')
#Add a column for total traffic (entries + exits) of each row
clean_df['total_traffic'] = clean_df['entry_counts'] + clean_df['exit_counts']

(176693, 20)

## Step 4: Analyzing the Data

- Which station has the most foot traffic:
    - On weekdays in particular?
    - On which days?
    - At what times?


- Which stations are near college campuses?
- Which stations are in tech-heavy areas?
- Which stations are in wealthy neighborhoods?

In [None]:
# FIXME: unfinished draft cell. The dangling assignment `station_groupby =` that
# stood here was a SyntaxError; the per-station grouping is built in the next cell.

In [51]:
# Group the turnstile readings by station
# `turnstile_data` only exists as a local variable inside readTurnstileData, so the
# module-level frame loaded above (`turnstile_data_1`) is used to survive a fresh run
station_groupby = turnstile_data_1.groupby('station')

In [54]:
# Sum the `entries` column per station and list the stations from smallest to largest
# NOTE(review): `entries` appears to hold cumulative counter readings (the cleaning
# step derives per-interval counts from their differences), so these sums are
# probably not ridership totals -- confirm whether `entry_counts` should be used
station_entries = station_groupby['entries'].sum().sort_values()
station_entries

station
NEWARK HM HE            1073019
PATH WTC 2              7402804
NEWARK HW BMEBE        16359580
9TH STREET             18546632
ORCHARD BEACH          20787564
                       ...     
DEKALB AV          226731273077
23 ST              237675376663
TIMES SQ-42 ST     244851205878
125 ST             282278333960
42 ST-PORT AUTH    315905669087
Name: entries, Length: 378, dtype: int64