# NumPy and Pandas for 2D arrays.

## Analysing subway weather data.

### 1. Finding the mean and max.

In [1]:
import numpy as np

# Subway ridership for 5 stations on 10 different days.
# Layout: one row per day and one column per station, so the array has
# 10 rows and 5 columns. ridership[0] is therefore the first day across
# all stations, and ridership[:, k] is station k across all days.
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])

def mean_riders_for_max_station(ridership):
    '''
    Compare the busiest first-day station against ridership overall.

    `ridership` is a 2D NumPy array laid out with one row per day and one
    column per station. The column holding the largest value in the first
    row is selected (the first such column if there is a tie), and that
    column is averaged over all rows.

    Returns a tuple (overall_mean, mean_for_max):
      overall_mean -- mean of every entry in the array
      mean_for_max -- mean of the selected column
    '''
    busiest_station = ridership[0].argmax()
    # Slice out the full history (every row) of the chosen column.
    station_history = ridership[:, busiest_station]

    return (ridership.mean(), station_history.mean())
# Prints the tuple (overall mean, mean for the first-day busiest station).
print(mean_riders_for_max_station(ridership))

(2342.6, 3239.9)


### 2. Using axis.

In [2]:
def min_and_max_riders_per_day(ridership):
    '''
    Find the highest and lowest average daily ridership among stations.

    The mean is taken down each column (axis 0), which yields one average
    per column. With the one-row-per-day, one-column-per-station layout
    used for `ridership` in this notebook, that is the mean riders per day
    for each station.

    Returns a tuple (max_daily_ridership, min_daily_ridership): the
    largest and the smallest of those per-column averages.
    '''
    station_means = ridership.mean(axis=0)

    return (station_means.max(), station_means.min())
# Prints the tuple (max, min) of the per-station mean daily ridership.
print(min_and_max_riders_per_day(ridership))

(3239.9, 1071.2)


### 3. Pandas DataFrame.

In [4]:
import pandas as pd

# Subway ridership for 5 stations on 10 different days.
# Same numbers as the NumPy `ridership` array, but labelled: the index
# holds one date string per row (day) and the columns hold one station
# ID per column.
ridership_df = pd.DataFrame(
    data=[[   0,    0,    2,    5,    0],
          [1478, 3877, 3674, 2328, 2539],
          [1613, 4088, 3991, 6461, 2691],
          [1560, 3392, 3826, 4787, 2613],
          [1608, 4802, 3932, 4477, 2705],
          [1576, 3933, 3909, 4979, 2685],
          [  95,  229,  255,  496,  201],
          [   2,    0,    1,   27,    0],
          [1438, 3785, 3589, 4174, 2215],
          [1342, 4043, 4009, 4665, 3033]],
    index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
           '05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
    columns=['R003', 'R004', 'R005', 'R006', 'R007']
)
# Show the labelled table as plain text.
print(ridership_df)

          R003  R004  R005  R006  R007
05-01-11     0     0     2     5     0
05-02-11  1478  3877  3674  2328  2539
05-03-11  1613  4088  3991  6461  2691
05-04-11  1560  3392  3826  4787  2613
05-05-11  1608  4802  3932  4477  2705
05-06-11  1576  3933  3909  4979  2685
05-07-11    95   229   255   496   201
05-08-11     2     0     1    27     0
05-09-11  1438  3785  3589  4174  2215
05-10-11  1342  4043  4009  4665  3033


In [22]:
# idxmax() on the first row returns the column label (station ID) that
# holds that row's largest value -- a label, not the ridership count.
maxval = ridership_df.iloc[0].idxmax()
maxval

'R006'

In [23]:
def mean_riders_for_max_station(ridership):
    '''
    Compare the busiest first-day station against ridership overall.

    `ridership` is a Pandas DataFrame with one row per day and one column
    per station. The column whose first-row value is largest is selected
    by label, and that column is averaged over all rows.

    Returns a tuple (overall_mean, mean_for_max):
      overall_mean -- mean of every entry in the DataFrame
      mean_for_max -- mean of the selected column

    NOTE: this reuses the name of the NumPy-array version defined earlier
    in the notebook, so running this cell replaces that definition.
    '''
    busiest_station = ridership.iloc[0].idxmax()
    station_history = ridership[busiest_station]

    # .values flattens the frame to a NumPy array so .mean() averages every
    # cell rather than producing one mean per column.
    return (ridership.values.mean(), station_history.mean())
# Bare last expression: the notebook displays the returned tuple.
mean_riders_for_max_station(ridership_df)

(2342.6, 3239.9)