# Exploratory Data Analysis

## Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib import ticker

## Bus Daily Ridership

First, open the CSV file and format the dates.

In [None]:
# Read the daily bus ridership CSV with explicit column dtypes; 'date' is read
# as text so it can be parsed with a fixed MM/DD/YYYY format below.
column_types = {'route': str, 'date': str, 'daytype': str, 'rides': int}
df = pd.read_csv('data/cta_ridership_bus_daily.csv', dtype=column_types)

# Replace the text dates with timestamps and make them the index.
parsed_dates = pd.to_datetime(df['date'], format='%m/%d/%Y')
df = df.assign(date=parsed_dates).set_index('date')
df.head()

Let's look at summary statistics for rides.

In [None]:
# Descriptive statistics computed separately for each 'daytype' group.
by_daytype = df.groupby('daytype')
by_daytype.describe()

In [None]:
# Histogram of the 'rides' column.
df.hist(column='rides')

Next we'll look at a time-series plot.

In [None]:
# Bucket the daily rows by calendar month; relies on `df` being indexed by the
# parsed 'date' timestamps.
# NOTE(review): recent pandas releases spell the month-end alias 'ME'; 'M' is
# kept so the cell still runs on the pandas version the notebook was written on.
groups = df.groupby(pd.Grouper(freq='M'))

# Select the 'rides' column explicitly. GroupBy.sum's first positional
# parameter is `numeric_only`, so `groups.sum('rides')` never chose a column;
# it summed every numeric column and only looked right because 'rides' was the
# sole numeric one.
monthly_rides = groups['rides'].sum()

fig, ax = plt.subplots()
fig.set_figwidth(15)

# Called before the custom formatters below are installed: ticklabel_format
# only applies to the default ScalarFormatter.
ax.ticklabel_format(style='plain')
ax.set_ylim(0, 3.5e7)

# One x tick per year; y ticks as whole numbers with thousands separators.
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, p: format(int(x), ',')))

ax.set_title('CTA Bus Ridership by Month 2000-2023')
ax.set_xlabel('Date')
ax.set_ylabel('Rides')

# Plot against the aggregate's own index so x and y are aligned by construction.
ax.plot(monthly_rides.index, monthly_rides)

## Train Daily Ridership

In [None]:
# Read the daily train ridership CSV, letting pandas infer the column dtypes.
df = pd.read_csv('data/cta_ridership_train_daily.csv')

# Parse the MM/DD/YYYY date strings, then index the rows by (date, station_id).
parsed_dates = pd.to_datetime(df['date'], format='%m/%d/%Y')
df = df.assign(date=parsed_dates).set_index(['date', 'station_id'])
df.head()

In [None]:
# Move 'station_id' out of the index so only the 'date' timestamps remain and
# the monthly Grouper can operate on the index. Guarded so that re-running this
# cell does not fail once the level has already been reset.
if 'station_id' in df.index.names:
    df = df.reset_index(level='station_id')

# Bucket the daily rows by calendar month.
# NOTE(review): recent pandas releases spell the month-end alias 'ME'; 'M' is
# kept so the cell still runs on the pandas version the notebook was written on.
groups = df.groupby(pd.Grouper(freq='M'))

# Select the 'rides' column explicitly. GroupBy.sum's first positional
# parameter is `numeric_only`, so `groups.sum('rides')` summed every numeric
# column. Here that presumably included 'station_id' (dtypes are inferred),
# which would be drawn as a second, meaningless line.
monthly_rides = groups['rides'].sum()

fig, ax = plt.subplots()
fig.set_figwidth(15)

# Called before the custom formatters below are installed: ticklabel_format
# only applies to the default ScalarFormatter.
ax.ticklabel_format(style='plain')
ax.set_ylim(0, 2e7)

# One x tick per year; y ticks as whole numbers with thousands separators.
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, p: format(int(x), ',')))

ax.set_title('CTA Train Ridership by Month 2000-2023')
ax.set_xlabel('Date')
ax.set_ylabel('Rides')

# Plot against the aggregate's own index so x and y are aligned by construction.
ax.plot(monthly_rides.index, monthly_rides)