## Lesson 3: Numpy and Pandas for 2D Data

### Quiz: Subway Data

In [3]:
import numpy as np

# Subway ridership for 5 stations on 10 different days
# (each row is a day, each column is a station)
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])

# Change False to True for each block of code to see what it does.
# print is written in call form with a single argument so the demos run
# unchanged on both Python 2 and Python 3.

# Accessing elements
if False:
    print(ridership[1, 3])        # single element: row 1, column 3
    print(ridership[1:3, 3:5])    # sub-array: rows 1-2, columns 3-4
    print(ridership[1, :])        # every column of row 1

# Vectorized operations on rows or columns
if False:
    print(ridership[0, :] + ridership[1, :])   # element-wise sum of two rows
    print(ridership[:, 0] + ridership[:, 1])   # element-wise sum of two columns

# Vectorized operations on entire arrays
if False:
    a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    b = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
    print(a + b)

def mean_riders_for_max_station(ridership):
    '''
    Compare the station that was busiest on the first day with the whole
    system.

    ridership is a 2D NumPy array indexed as [day, station]. The station
    (column) with the largest value in the first row is located with
    argmax(), and its mean over all days is computed.

    Returns a tuple (overall_mean, mean_for_max): the mean of every entry
    in the array, followed by the mean of that station's column.

    Reference for argmax():
    http://docs.scipy.org/doc/numpy/reference/generated/numpy.argmax.html
    '''
    # Column index of the largest value in the first row (day 0).
    busiest_station = ridership[0].argmax()
    station_column = ridership[:, busiest_station]

    return (ridership.mean(), station_column.mean())
    
# Show (overall mean, mean for the busiest first-day station) for the sample data.
print(mean_riders_for_max_station(ridership))

(2342.5999999999999, 3239.9000000000001)


I had flipped my rows and columns (in `max_first_day` and `mean_for_max`)! Otherwise, my answer is nearly identical to the video.

### Quiz: Numpy Axis

In [1]:
import numpy as np

# Change False to True for this block of code to see what it does.
# print is written in call form with a single argument so the demo runs
# unchanged on both Python 2 and Python 3.

# NumPy axis argument
if False:
    a = np.array([
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]
    ])

    print(a.sum())          # sum of every element
    print(a.sum(axis=0))    # one sum per column
    print(a.sum(axis=1))    # one sum per row

# Subway ridership for 5 stations on 10 different days
# (each row is a day, each column is a station)
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])

def min_and_max_riders_per_day(ridership):
    '''
    Find the extremes of the per-station mean daily ridership.

    ridership is a 2D NumPy array indexed as [day, station]. Averaging
    along axis 0 collapses the days and leaves one mean per station.

    Returns a tuple (max_daily_ridership, min_daily_ridership): the
    largest and the smallest of those per-station means, in that order.
    '''
    per_station_mean = ridership.mean(axis=0)

    return (per_station_mean.max(), per_station_mean.min())
    
# Show (max, min) of the per-station mean daily ridership for the sample data.
print(min_and_max_riders_per_day(ridership))

(3239.9000000000001, 1071.2)


This worked, and it was identical to the video answer.

### Quiz: Accessing Elements of a DataFrame

In [3]:
import pandas as pd

# Subway ridership for 5 stations on 10 different days
# (rows are labelled by date, columns by station unit)
ridership_df = pd.DataFrame(
    data=[[   0,    0,    2,    5,    0],
          [1478, 3877, 3674, 2328, 2539],
          [1613, 4088, 3991, 6461, 2691],
          [1560, 3392, 3826, 4787, 2613],
          [1608, 4802, 3932, 4477, 2705],
          [1576, 3933, 3909, 4979, 2685],
          [  95,  229,  255,  496,  201],
          [   2,    0,    1,   27,    0],
          [1438, 3785, 3589, 4174, 2215],
          [1342, 4043, 4009, 4665, 3033]],
    index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
           '05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
    columns=['R003', 'R004', 'R005', 'R006', 'R007']
)

# Change False to True for each block of code to see what it does.
# print is written in call form with a single argument so the demos run
# unchanged on both Python 2 and Python 3.

# DataFrame creation
if False:
    # You can create a DataFrame out of a dictionary mapping column names to values
    df_1 = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
    print(df_1)

    # You can also use a list of lists or a 2D NumPy array
    df_2 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'B', 'C'])
    print(df_2)


# Accessing elements
if False:
    print(ridership_df.iloc[0])            # row by position
    print(ridership_df.loc['05-05-11'])    # row by index label
    print(ridership_df['R003'])            # column by name
    print(ridership_df.iloc[1, 3])         # single element by position

# Accessing multiple rows
if False:
    print(ridership_df.iloc[1:4])

# Accessing multiple columns
if False:
    print(ridership_df[['R003', 'R005']])

# Pandas axis
if False:
    df = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
    print(df.sum())           # one sum per column
    print(df.sum(axis=1))     # one sum per row
    print(df.values.sum())    # sum of every element
    
def mean_riders_for_max_station(ridership):
    '''
    Find the station with the maximum riders on the first day, then return
    the mean riders per day for that station. Also return the mean
    ridership overall for comparison.

    This is the same as a previous exercise, but this time the
    input is a Pandas DataFrame rather than a 2D NumPy array.

    ridership: DataFrame with one row per day and one column per station.

    Returns a tuple (overall_mean, mean_for_max).
    '''
    # idxmax() returns the column *label* of the largest first-day value.
    # Series.argmax() used to do the same, but in current pandas it returns
    # the integer position, which .loc would then reject (KeyError) or
    # misinterpret when the column labels are not 0..n-1.
    station_with_max_first_day = ridership.iloc[0, :].idxmax()
    overall_mean = ridership.values.mean()
    mean_for_max = ridership.loc[:, station_with_max_first_day].mean()

    return (overall_mean, mean_for_max)
    
# Show (overall mean, mean for the busiest first-day station) for the sample DataFrame.
print(mean_riders_for_max_station(ridership_df))

(2342.5999999999999, 3239.9)


This worked! Notice that the colons are not necessary in the video answer below. `.iloc[0]` with a single indexer selects a row, and plain `ridership[label]` selects a column, so pandas already knows which axis (row or column) you are referencing.

In [5]:
def mean_riders_for_max_station2(ridership):
    '''
    Video-answer variant: find the station with the most riders on the
    first day and compare its mean daily ridership with the overall mean.

    ridership: DataFrame with one row per day and one column per station.

    Returns a tuple (overall_mean, mean_for_max).
    '''
    # `:` is not necessary here: a single indexer to .iloc selects a row.
    # idxmax() yields the column label; argmax() returns an integer
    # position in current pandas, which is not a valid column key.
    max_station = ridership.iloc[0].idxmax()
    mean_for_max = ridership[max_station].mean()
    overall_mean = ridership.values.mean()

    return (overall_mean, mean_for_max)
    
# Show the video-answer result for the sample DataFrame.
print(mean_riders_for_max_station2(ridership_df))

(2342.5999999999999, 3239.9)


### Loading Data into a DataFrame

In [6]:
import pandas as pd

# Read the subway/weather CSV into a DataFrame. The path is relative, so the
# file is looked up in the current working directory.
subway_df = pd.read_csv('nyc_subway_weather.csv')

In [7]:
# Preview the first rows of the loaded data (call form works on Python 2 and 3).
print(subway_df.head())

   UNIT     DATEn     TIMEn  ENTRIESn   EXITSn  ENTRIESn_hourly  \
0  R003  05-01-11  00:00:00   4388333  2911002              0.0   
1  R003  05-01-11  04:00:00   4388333  2911002              0.0   
2  R003  05-01-11  12:00:00   4388333  2911002              0.0   
3  R003  05-01-11  16:00:00   4388333  2911002              0.0   
4  R003  05-01-11  20:00:00   4388333  2911002              0.0   

   EXITSn_hourly             datetime  hour  day_week     ...       pressurei  \
0            0.0  2011-05-01 00:00:00     0         6     ...           30.22   
1            0.0  2011-05-01 04:00:00     4         6     ...           30.25   
2            0.0  2011-05-01 12:00:00    12         6     ...           30.28   
3            0.0  2011-05-01 16:00:00    16         6     ...           30.26   
4            0.0  2011-05-01 20:00:00    20         6     ...           30.28   

  rain  tempi  wspdi meanprecipi  meanpressurei  meantempi  meanwspdi  \
0    0   55.9    3.5         0.0     

In [8]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
subway_df.describe()

Unnamed: 0,ENTRIESn,EXITSn,ENTRIESn_hourly,EXITSn_hourly,hour,day_week,weekday,latitude,longitude,fog,...,pressurei,rain,tempi,wspdi,meanprecipi,meanpressurei,meantempi,meanwspdi,weather_lat,weather_lon
count,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,...,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0
mean,28124860.0,19869930.0,1886.589955,1361.487866,10.046754,2.905719,0.714436,40.724647,-73.940364,0.009824,...,29.971096,0.224741,63.10378,6.927872,0.004618,29.971096,63.10378,6.927872,40.728555,-73.938693
std,30436070.0,20289860.0,2952.385585,2183.845409,6.938928,2.079231,0.451688,0.07165,0.059713,0.098631,...,0.137942,0.417417,8.455597,4.510178,0.016344,0.131158,6.939011,3.179832,0.06542,0.059582
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.576152,-74.073622,0.0,...,29.55,0.0,46.9,0.0,0.0,29.59,49.4,0.0,40.600204,-74.01487
25%,10397620.0,7613712.0,274.0,237.0,4.0,1.0,0.0,40.677107,-73.987342,0.0,...,29.89,0.0,57.0,4.6,0.0,29.913333,58.283333,4.816667,40.688591,-73.98513
50%,18183890.0,13316090.0,905.0,664.0,12.0,3.0,1.0,40.717241,-73.953459,0.0,...,29.96,0.0,61.0,6.9,0.0,29.958,60.95,6.166667,40.72057,-73.94915
75%,32630490.0,23937710.0,2255.0,1537.0,16.0,5.0,1.0,40.759123,-73.907733,0.0,...,30.06,0.0,69.1,9.2,0.0,30.06,67.466667,8.85,40.755226,-73.912033
max,235774600.0,149378200.0,32814.0,34828.0,20.0,6.0,1.0,40.889185,-73.755383,1.0,...,30.32,1.0,86.0,23.0,0.1575,30.293333,79.8,17.083333,40.862064,-73.694176


### Quiz: Calculating Correlation

In [9]:
import pandas as pd

# Load the subway/weather CSV (relative path, resolved against the working
# directory) into subway_df.
filename = 'nyc_subway_weather.csv'
subway_df = pd.read_csv(filename)

def correlation(x, y):
    '''
    Compute the correlation between two variables.

    x and y are each either a NumPy array or a Pandas Series.

    correlation = average of (x in standard units) times (y in standard units)

    std() is called with ddof=0 because the Pandas std() function would
    otherwise use a different default than NumPy's.
    '''
    def in_standard_units(values):
        # Distance from the mean, measured in standard deviations.
        return (values - values.mean()) / values.std(ddof=0)

    return (in_standard_units(x) * in_standard_units(y)).mean()

# Columns of subway_df to compare pairwise.
entries = subway_df['ENTRIESn_hourly']
cum_entries = subway_df['ENTRIESn']
rain = subway_df['meanprecipi']
temp = subway_df['meantempi']

# print is written in call form with a single argument so it runs unchanged
# on both Python 2 and Python 3.
print(correlation(entries, rain))
print(correlation(entries, temp))
print(correlation(rain, temp))

print(correlation(entries, cum_entries))

0.0356485157722
-0.0266933483216
-0.229034323408
0.585895470766


This worked! Video answer below:

In [10]:
import pandas as pd

# Load the subway/weather CSV (relative path, resolved against the working
# directory) into subway_df.
filename = 'nyc_subway_weather.csv'
subway_df = pd.read_csv(filename)

def correlation2(x, y):
    '''
    Video-answer version of the correlation calculation.

    x and y are NumPy arrays or Pandas Series. Each is converted to
    standard units (deviation from its mean divided by its ddof=0
    standard deviation); the result is the mean of the element-wise
    product of the two.
    '''
    x_centered = x - x.mean()
    y_centered = y - y.mean()
    x_units = x_centered / x.std(ddof=0)
    y_units = y_centered / y.std(ddof=0)
    product = x_units * y_units
    return product.mean()

# Show the entries/rain correlation computed by the video-answer version.
print(correlation2(entries, rain))

0.0356485157722


NumPy has a built-in function to calculate Pearson's r: `np.corrcoef()`. In the future, use this!