Topic: Challenge Set 1
Subject: Explore MTA turnstile data
Date: 7/6/2020
Name: Anterra Kennedy
Worked with: Nick Horton, Sasha Prokhorova

### **Challenge 1**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Notebook-wide pandas display settings: never truncate columns, show up to
# 100 rows, and render floats through a one-decimal formatter.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
pd.options.display.precision = 3
pd.options.display.float_format = lambda x: '%.1f' % x

In [22]:
# Source: http://web.mta.info/developers/turnstile.html
def get_data(week_nums):
    """Download the weekly MTA turnstile files and stack them into one frame.

    Parameters
    ----------
    week_nums : iterable
        Identifiers substituted into the ``turnstile_{}.txt`` file name, one
        download per identifier, in the order given.

    Returns
    -------
    pandas.DataFrame
        Rows of every downloaded file, concatenated in input order. Each
        file keeps its own row labels, so index values repeat across weeks.

    Raises
    ------
    ValueError
        If ``week_nums`` is empty (there is nothing to download).
    """
    url = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"
    # Materialise first so generators work and emptiness can be checked up front.
    week_nums = list(week_nums)
    if not week_nums:
        raise ValueError("week_nums must contain at least one week identifier")
    return pd.concat([pd.read_csv(url.format(week_num)) for week_num in week_nums])
        
# Weekly files to load. The three identifiers below read as 2019-09-07,
# 2019-09-14 and 2019-09-21 (YYMMDD -- TODO confirm), so only September is
# covered; add the later identifiers to reach December 7th, 2019.
week_nums = [190907, 190914, 190921]
df = get_data(week_nums)

### **Challenge 2**

In [23]:
# Join the DATE and TIME text with a single space and parse the result into
# one timestamp column.
df["DATETIME"] = pd.to_datetime(df["DATE"].add(" ").add(df["TIME"]), format="%m/%d/%Y %H:%M:%S")

In [24]:
# Convert the raw text columns to datetime64 values.
df["DATE"] = pd.to_datetime(df["DATE"], format="%m/%d/%Y")
# Parse TIME with an explicit format: without one, pandas infers the layout
# and fills in the current day, so the values would change from run to run.
# With the format given, the date part is the fixed 1900-01-01 placeholder;
# only the time-of-day component is meaningful.
df["TIME"] = pd.to_datetime(df["TIME"], format="%H:%M:%S")

### **Challenge 3**

In [25]:
# The raw file pads the EXITS header with trailing spaces. Strip whitespace
# from every string label instead of matching one exact padded name, so the
# column becomes "EXITS" regardless of how much padding a given file uses.
df.rename(
    columns=lambda label: label.strip() if isinstance(label, str) else label,
    inplace=True,
)

In [26]:
# Give every row a unique label so the per-turnstile differences computed
# below can be aligned back onto df by index.
df.reset_index(inplace=True)

# A turnstile is identified by (C/A, UNIT, SCP). Difference each turnstile's
# ENTRIES/EXITS readings against its previous reading. `diff` is a transform,
# not a reduction, so it is called directly on the grouped columns. The rows
# are first put in timestamp order (stable, so ties keep file order) so the
# differences do not depend on the order in which weekly files were loaded.
turnstile_keys = ["C/A", "UNIT", "SCP"]
readings_in_time_order = df.sort_values("DATETIME", kind="stable")
real_entries = readings_in_time_order.groupby(turnstile_keys)[["ENTRIES"]].diff()
real_exits = readings_in_time_order.groupby(turnstile_keys)[["EXITS"]].diff()

In [27]:
# Attach the per-reading differences to the main frame (aligned on index).
df["REAL_ENTRIES"], df["REAL_EXITS"] = real_entries["ENTRIES"], real_exits["EXITS"]

In [28]:
# Keep only rows where both differences are non-negative. Rows whose
# difference is NaN fail the comparison and are dropped as well.
df = df.loc[df["REAL_ENTRIES"].ge(0) & df["REAL_EXITS"].ge(0)]

In [29]:
# Median per-reading entries and exits for every (STATION, LINENAME) pair,
# with the grouping keys returned as ordinary columns.
station_averages = (
    df.groupby(["STATION", "LINENAME"])[["REAL_ENTRIES", "REAL_EXITS"]]
    .median()
    .reset_index()
)

In [30]:
# Relabel the median columns so they do not collide with the per-row columns
# when merged back onto df. Note the values are medians, not means.
station_averages.rename(
    columns={"REAL_ENTRIES": "AVERAGE_ENTRIES", "REAL_EXITS": "AVERAGE_EXITS"},
    inplace=True,
)

In [31]:
# Bring each station's median entries/exits onto every one of its rows.
df = df.merge(station_averages, on=["STATION", "LINENAME"])

In [32]:
# Discard readings that reach or exceed 100x the station's median for either
# entries or exits.
# NOTE(review): when a station's median is 0 the strict "<" rejects all of
# its rows -- confirm that this is intended.
df = df.loc[
    df["REAL_ENTRIES"].lt(100 * df["AVERAGE_ENTRIES"])
    & df["REAL_EXITS"].lt(100 * df["AVERAGE_EXITS"])
]

### **Challenge 9**

In [33]:
# Total traffic per reading = entries + exits. df is the result of boolean
# filtering at this point, so adding a column in place can raise
# SettingWithCopyWarning; `assign` builds a new frame with the column instead.
df = df.assign(TRAFFIC=df["REAL_ENTRIES"] + df["REAL_EXITS"])

In [34]:
# Sum TRAFFIC over all readings of each (STATION, LINENAME) pair, returning
# the grouping keys as ordinary columns.
total_station_traffic = (
    df.groupby(["STATION", "LINENAME"])[["TRAFFIC"]]
    .sum()
    .reset_index()
)

In [35]:
# Show the 20 busiest station/line combinations, highest traffic first.
total_station_traffic.sort_values("TRAFFIC", ascending=False).iloc[:20]

Unnamed: 0,STATION,LINENAME,TRAFFIC
311,GRD CNTRL-42 ST,4567S,4674682.0
89,34 ST-HERALD SQ,BDFMNQRW,4135182.0
93,34 ST-PENN STA,ACE,2813152.0
102,42 ST-PORT AUTH,ACENQRS1237W,2601006.0
28,14 ST-UNION SQ,LNQR456W,2462741.0
441,TIMES SQ-42 ST,1237ACENQRSW,2301817.0
301,FULTON ST,2345ACJZ,2158137.0
288,FLUSHING-MAIN,7,2100806.0
106,47-50 STS ROCK,BDFM,2048829.0
125,59 ST COLUMBUS,ABCD1,2014126.0
