# Doing more with data: `numpy`
## Intro Python

18 July 2022

## Last Session...
* **Lists:** sorting with `.sort()` vs `sorted()`
* **Iteration:** `for` and `while` loops, `break` and `continue`
* **Dictionaries:** creating, updating, navigating
* **More collections:** tuples and sets
* **Modules:** a quick tour of built-in modules like `math`, `re`, `datetime`
* **Navigating folders** with `os`
* **Working with files:** `with open(file_path, 'r') as f:` ...
* **Concepts:** object-oriented programming

## `numpy`

In [1]:
import numpy as np

In [2]:
# a flat Python list becomes a one-dimensional array
a = np.array([1, 2, 3])
a

array([1, 2, 3])

In [3]:
# a list of lists becomes a two-dimensional array: each inner list is one row
b = np.array([[1, 2, 3],
             [3, 2, 1]])
b

array([[1, 2, 3],
       [3, 2, 1]])

In [4]:
a.ndim

1

In [5]:
b.ndim

2

In [6]:
b.shape

(2, 3)

In [7]:
b.size

6

In [8]:
np.zeros((4, 3, 3))

array([[[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]]])

In [9]:
# we can't do decimal steps with range(): it only accepts integers, so this raises
# a TypeError. Catch the error and print it so that "Restart & Run All" can carry
# on past this demonstration instead of stopping here.
try:
    range(0, 1, .1)
except TypeError as err:
    print(f'TypeError: {err}')

TypeError: 'float' object cannot be interpreted as an integer

In [10]:
# but we can with np.arange()
np.arange(0, 1, .1)

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

In [11]:
# seed numpy's global random state so the draw below is repeatable
np.random.seed(1)
# a 3x4 array of random integers, starting at 1 and stopping before 100
np.random.randint(1, 100, (3, 4))

array([[38, 13, 73, 10],
       [76,  6, 80, 65],
       [17,  2, 77, 72]])

In [12]:
# create a sample generator; passing a seed makes every run of the notebook
# draw the same numbers, so the outputs below can be reproduced
rng = np.random.default_rng(1)

# use the generator to draw a sample from a normal distribution
normal_sample = rng.normal(100,  # mean
                           10,  # standard deviation
                           (5, 4)  # array rows x cols
                          )
normal_sample

array([[111.42052375,  87.41586145,  91.80017919, 111.11284484],
       [ 94.23864947, 103.67748832, 100.33375225,  88.63908228],
       [ 93.20191937, 101.46173862,  87.57965832,  93.50709786],
       [ 80.60680639,  88.98625846, 114.4597452 , 103.690574  ],
       [ 98.54728546, 103.60491473, 106.32104309, 103.68025833]])

In [13]:
normal_sample.reshape(10, 2)

array([[111.42052375,  87.41586145],
       [ 91.80017919, 111.11284484],
       [ 94.23864947, 103.67748832],
       [100.33375225,  88.63908228],
       [ 93.20191937, 101.46173862],
       [ 87.57965832,  93.50709786],
       [ 80.60680639,  88.98625846],
       [114.4597452 , 103.690574  ],
       [ 98.54728546, 103.60491473],
       [106.32104309, 103.68025833]])

In [14]:
# transpose
normal_sample.T

array([[111.42052375,  94.23864947,  93.20191937,  80.60680639,
         98.54728546],
       [ 87.41586145, 103.67748832, 101.46173862,  88.98625846,
        103.60491473],
       [ 91.80017919, 100.33375225,  87.57965832, 114.4597452 ,
        106.32104309],
       [111.11284484,  88.63908228,  93.50709786, 103.690574  ,
        103.68025833]])

In [15]:
normal_sample.flatten()

array([111.42052375,  87.41586145,  91.80017919, 111.11284484,
        94.23864947, 103.67748832, 100.33375225,  88.63908228,
        93.20191937, 101.46173862,  87.57965832,  93.50709786,
        80.60680639,  88.98625846, 114.4597452 , 103.690574  ,
        98.54728546, 103.60491473, 106.32104309, 103.68025833])

In [16]:
# two arrays of four integers each: one from a list, one from a range of values
arr1 = np.array([5, 10, 15, 20])
arr2 = np.arange(5, 9)  # 5, 6, 7, 8 -- the stop value is not included

arr2

array([5, 6, 7, 8])

In [17]:
arr1 - arr2

array([ 0,  4,  8, 12])

In [18]:
# make a 5x4 matrix: the integers 1 through 20, filled in row by row
rect = np.arange(1, 21).reshape(5, 4)
rect

array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12],
       [13, 14, 15, 16],
       [17, 18, 19, 20]])

In [19]:
normal_sample - rect

array([[110.42052375,  85.41586145,  88.80017919, 107.11284484],
       [ 89.23864947,  97.67748832,  93.33375225,  80.63908228],
       [ 84.20191937,  91.46173862,  76.57965832,  81.50709786],
       [ 67.60680639,  74.98625846,  99.4597452 ,  87.690574  ],
       [ 81.54728546,  85.60491473,  87.32104309,  83.68025833]])

In [20]:
rect * 2

array([[ 2,  4,  6,  8],
       [10, 12, 14, 16],
       [18, 20, 22, 24],
       [26, 28, 30, 32],
       [34, 36, 38, 40]])

In [21]:
# a length-2 array can't be lined up with rows that hold 4 values each, so this
# addition fails. Catch the error and print it so that "Restart & Run All" can
# carry on past this demonstration instead of stopping here.
try:
    rect + np.array([1, 2])
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: operands could not be broadcast together with shapes (5,4) (2,) 

In [22]:
arr1.mean()

12.5

In [23]:
arr1.sum()

50

In [24]:
rect.mean()

10.5

In [25]:
rect.mean(axis=0)  # means by column

array([ 9., 10., 11., 12.])

In [26]:
rect.mean(axis=1)  # means by row

array([ 2.5,  6.5, 10.5, 14.5, 18.5])

In [27]:
rect

array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12],
       [13, 14, 15, 16],
       [17, 18, 19, 20]])

In [28]:
rect.shape

(5, 4)

In [29]:
# a 2x3 array to try adding to the 5x4 `rect`
small_rect = np.array([[2, 3, 4],
                     [9, 8, 7]])
small_rect.shape

(2, 3)

In [30]:
# shapes (5, 4) and (2, 3) don't match on either axis, so this addition fails.
# Catch the error and print it so that "Restart & Run All" can carry on past
# this demonstration instead of stopping here.
try:
    rect + small_rect
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: operands could not be broadcast together with shapes (5,4) (2,3) 

In [31]:
arr1.shape

(4,)

In [32]:
print(arr1)
print(rect)
rect + arr1

[ 5 10 15 20]
[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]
 [13 14 15 16]
 [17 18 19 20]]


array([[ 6, 12, 18, 24],
       [10, 16, 22, 28],
       [14, 20, 26, 32],
       [18, 24, 30, 36],
       [22, 28, 34, 40]])

In [33]:
# np.nan marks a missing value; including it makes the whole array floating point
nan_arr = np.array([5, 6, 7, np.nan])
nan_arr

array([ 5.,  6.,  7., nan])

In [34]:
arr1 + nan_arr

array([10., 16., 22., nan])

### Array slicing

In [35]:
arr1[1]

10

In [36]:
arr1[1:3]

array([10, 15])

In [37]:
# looping over a one-dimensional array visits its elements one at a time
for i in arr1:
    print(i)

5
10
15
20


In [38]:
rect

array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12],
       [13, 14, 15, 16],
       [17, 18, 19, 20]])

In [39]:
rect[1, 1]

6

In [40]:
# slicing by index
# row indices 2 and 3
# col indices 1 and 2
rect[2:4, 1:3]

array([[10, 11],
       [14, 15]])

In [41]:
rect[0]

array([1, 2, 3, 4])

In [42]:
rect[:, 0]

array([ 1,  5,  9, 13, 17])

In [43]:
# numpy will make all values the same dtype
np.array([1, 2, 3.14]).dtype

dtype('float64')

In [44]:
np.array([1, 2, '3'])

array(['1', '2', '3'], dtype='<U11')

### Mutability

In [45]:
# draw 12 random integers (starting at 1, stopping before 11) and arrange them
# as 3 rows x 4 columns
matrix = np.random.randint(1, 11, 12).reshape(3, 4)
matrix

array([[ 7, 10,  3,  5],
       [ 6,  3,  5,  3],
       [ 5,  8,  8, 10]])

In [46]:
# plain assignment does not copy: matrix2 is a second name for the same array
matrix2 = matrix
# .copy() makes an independent array that later edits to `matrix` won't touch
matrix3 = matrix.copy()

In [47]:
# assigning to an index or slice changes the existing array in place
matrix[2] = [0, 0, 0, 0]  # overwrite the last row (row index 2)
matrix[:, 3] = [1, 1, 1]  # overwrite the last column (column index 3)
matrix

array([[ 7, 10,  3,  1],
       [ 6,  3,  5,  1],
       [ 0,  0,  0,  1]])

In [48]:
matrix2

array([[ 7, 10,  3,  1],
       [ 6,  3,  5,  1],
       [ 0,  0,  0,  1]])

In [49]:
matrix3

array([[ 7, 10,  3,  5],
       [ 6,  3,  5,  3],
       [ 5,  8,  8, 10]])

### Logical operations

In [50]:
rect

array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12],
       [13, 14, 15, 16],
       [17, 18, 19, 20]])

In [51]:
rect % 3 == 0

array([[False, False,  True, False],
       [False,  True, False, False],
       [ True, False, False,  True],
       [False, False,  True, False],
       [False,  True, False, False]])

In [52]:
# use a boolean expression to filter an array
rect[rect % 3 == 0]

array([ 3,  6,  9, 12, 15, 18])

In [53]:
# also works: store the boolean array in a variable first, then index with it
mask = rect % 3 == 0
rect[mask]

array([ 3,  6,  9, 12, 15, 18])

In [54]:
# for loop version of mask
# builds the same True/False grid as `rect % 3 == 0`, as a list of lists

masked = []
for row in rect:
    # collect the test results for this row, one value at a time
    masked_row = []
    for value in row:
        masked_row.append(value % 3 == 0)
    masked.append(masked_row)
    
masked

[[False, False, True, False],
 [False, True, False, False],
 [True, False, False, True],
 [False, False, True, False],
 [False, True, False, False]]

In [55]:
np.where(rect % 3 == 0,  # condition
         rect,  # value to put if True
         0)  # value to put if false

array([[ 0,  0,  3,  0],
       [ 0,  6,  0,  0],
       [ 9,  0,  0, 12],
       [ 0,  0, 15,  0],
       [ 0, 18,  0,  0]])

In [56]:
# for loop version of np.where
# keeps each value that is divisible by 3 and puts 0 everywhere else,
# producing a list of lists instead of an array

result = []

for row in rect:
    result_row = []
    for value in row:
        if value % 3 == 0:
            # condition is True: keep the original value
            result_row.append(value)
        else:
            # condition is False: use the replacement value
            result_row.append(0)
    result.append(result_row)
    

result   

[[0, 0, 3, 0], [0, 6, 0, 0], [9, 0, 0, 12], [0, 0, 15, 0], [0, 18, 0, 0]]

In [57]:
# there are additional math functions and constants in numpy
np.sin(np.pi)

1.2246467991473532e-16

### Loading data to `numpy` arrays from file

In [58]:
# peek at the first five lines of the raw file before loading it
with open('../01-slides/sample_data/california_housing_test.csv', 'r') as f:
    for i in range(5):
        # readline() keeps the newline at the end of each line, so tell print()
        # not to add a second one (otherwise every line is followed by a blank line)
        print(f.readline(), end='')

longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value

-122.05,37.37,27,3885,661,1537,606,6.6085,344700

-118.3,34.26,43,1510,310,809,277,3.599,176500

-117.81,33.78,27,3589,507,1484,495,5.7934,270500

-118.36,33.82,28,67,15,49,11,6.1359,330000



In [59]:
# load the whole file into a single two-dimensional array of floats
hd = np.loadtxt('../01-slides/sample_data/california_housing_test.csv',
               delimiter=',',  # we have to specify the delimiter
               skiprows=1  # skip the header row
               )
hd

array([[-1.22050e+02,  3.73700e+01,  2.70000e+01, ...,  6.06000e+02,
         6.60850e+00,  3.44700e+05],
       [-1.18300e+02,  3.42600e+01,  4.30000e+01, ...,  2.77000e+02,
         3.59900e+00,  1.76500e+05],
       [-1.17810e+02,  3.37800e+01,  2.70000e+01, ...,  4.95000e+02,
         5.79340e+00,  2.70500e+05],
       ...,
       [-1.19700e+02,  3.63000e+01,  1.00000e+01, ...,  2.20000e+02,
         2.28950e+00,  6.20000e+04],
       [-1.17120e+02,  3.41000e+01,  4.00000e+01, ...,  1.40000e+01,
         3.27080e+00,  1.62500e+05],
       [-1.19630e+02,  3.44200e+01,  4.20000e+01, ...,  2.60000e+02,
         8.56080e+00,  5.00001e+05]])

In [60]:
hd.shape

(3000, 9)

In [61]:
# load the same file again, this time keeping the column names so that
# columns can be looked up by name
hd2 = np.genfromtxt('../01-slides/sample_data/california_housing_test.csv',
                   delimiter=',',
                   names=True  # use the first row as column names
                   )
hd2

array([(-122.05, 37.37, 27., 3885., 661., 1537., 606., 6.6085, 344700.),
       (-118.3 , 34.26, 43., 1510., 310.,  809., 277., 3.599 , 176500.),
       (-117.81, 33.78, 27., 3589., 507., 1484., 495., 5.7934, 270500.),
       ...,
       (-119.7 , 36.3 , 10.,  956., 201.,  693., 220., 2.2895,  62000.),
       (-117.12, 34.1 , 40.,   96.,  14.,   46.,  14., 3.2708, 162500.),
       (-119.63, 34.42, 42., 1765., 263.,  753., 260., 8.5608, 500001.)],
      dtype=[('longitude', '<f8'), ('latitude', '<f8'), ('housing_median_age', '<f8'), ('total_rooms', '<f8'), ('total_bedrooms', '<f8'), ('population', '<f8'), ('households', '<f8'), ('median_income', '<f8'), ('median_house_value', '<f8')])

In [62]:
hd2.shape  # we got a structured array this time

(3000,)

In [63]:
hd2['population']

array([1537.,  809., 1484., ...,  693.,   46.,  753.])

In [64]:
hd2['population'].mean()

1402.7986666666666

## `pandas`

In [65]:
import pandas as pd

# make all columns display, even for wide datasets
# (the option name is spelled out in full: shortened names are only matched by
# pattern and can stop working if pandas adds another option with a similar name)
pd.set_option('display.max_columns', None)

In [66]:
# build a data frame from a dictionary: each key is a column name and
# each list holds that column's values
trees = pd.DataFrame({
    'name': ['maple', 'oak', 'ash', 'fir'],
    'avg_lifespan': [300, 100, 260, 450],
    'quantity': [23, 43, 74, 100]
})
# show the frame with `name` as the row labels; the result is not assigned,
# so `trees` itself keeps its original numbered index
trees.set_index('name')

Unnamed: 0_level_0,avg_lifespan,quantity
name,Unnamed: 1_level_1,Unnamed: 2_level_1
maple,300,23
oak,100,43
ash,260,74
fir,450,100


In [67]:
# a Series is a single labelled column of values; `name` gives it a column name
tree_types = pd.Series(['deciduous', 'deciduous', 'deciduous', 'evergreen'], 
                       name='foliage')
tree_types

0    deciduous
1    deciduous
2    deciduous
3    evergreen
Name: foliage, dtype: object

In [68]:
# web address of the Excel workbook holding the 2021 TTC subway delay data
ttc = 'https://github.com/amfz/dsi-python-workshop/raw/main/data/ttc-subway-delay-data-2021.xlsx'

In [69]:
# read every sheet of the workbook straight from the URL
# we get a dictionary of sheet_name:data df pairs
delays = pd.read_excel(ttc, 
                       engine='openpyxl',  # optional
                       sheet_name=None  # load all sheets -- default is first sheet only
                      )

In [70]:
# loop through key:value pairs to report what each sheet contributes
for sheet_name, df in delays.items():
    print(f'Adding {df.shape} from {sheet_name}')

# stack all of the sheets with a single concat call; growing a data frame inside
# the loop would re-copy every row gathered so far on each pass
all_delays = pd.concat(list(delays.values()),  # list of data frames to concatenate
                       axis=0,  # append row-wise
                       ignore_index=True  # reset row labels to renumber rows
                      )

all_delays.shape

Adding (1216, 10) from January21
Adding (1245, 10) from Feb 21
Adding (1167, 10) from March '21
Adding (1170, 10) from April '21
Adding (1168, 10) from May '21
Adding (1265, 10) from June 21
Adding (1244, 10) from July 21
Adding (1273, 10) from August 21
Adding (1433, 10) from Sept 21
Adding (1560, 10) from Oct 21
Adding (1771, 10) from Nov 21
Adding (1858, 10) from December21


(16370, 10)

In [71]:
all_delays.head(3)

Unnamed: 0,Date,Time,Day,Station,Code,Min Delay,Min Gap,Bound,Line,Vehicle
0,2021-01-01,00:33,Friday,BLOOR STATION,MUPAA,0,0,N,YU,6046
1,2021-01-01,00:39,Friday,SHERBOURNE STATION,EUCO,5,9,E,BD,5250
2,2021-01-01,01:07,Friday,KENNEDY BD STATION,EUCD,5,9,E,BD,5249


In [72]:
all_delays.tail()

Unnamed: 0,Date,Time,Day,Station,Code,Min Delay,Min Gap,Bound,Line,Vehicle
16365,2021-12-31,01:10,Friday,MUSEUM STATION,SUUT,0,0,N,YU,5591
16366,2021-12-31,01:12,Friday,FINCH STATION,SUDP,5,10,S,YU,5983
16367,2021-12-31,01:21,Friday,EGLINTON WEST STATION,PUOPO,3,8,N,YU,6046
16368,2021-12-31,01:37,Friday,SHEPPARD WEST STATION,SUDP,0,0,S,YU,5536
16369,2021-12-31,07:00,Friday,DON MILLS STATION,TUSC,0,0,E,SHP,6146


In [73]:
all_delays.dtypes

Date         datetime64[ns]
Time                 object
Day                  object
Station              object
Code                 object
Min Delay             int64
Min Gap               int64
Bound                object
Line                 object
Vehicle               int64
dtype: object

In [74]:
# summary statistics for every column of the combined data
# NOTE(review): `datetime_is_numeric` appears to have been removed in pandas 2.0,
# where dates are always summarized numerically -- confirm against the installed
# version and drop the argument when upgrading
all_delays.describe(include='all',  # summarize non-numeric columns too
                    datetime_is_numeric=True  # silence a warning about treating dates as categorical data
                   )

Unnamed: 0,Date,Time,Day,Station,Code,Min Delay,Min Gap,Bound,Line,Vehicle
count,16370,16370,16370,16370,16370,16370.0,16370.0,12119,16318,16370.0
unique,,1383,7,284,173,,,5,17,
top,,22:00,Friday,VAUGHAN MC STATION,SUDP,,,S,YU,
freq,,437,2600,927,2015,,,4216,8880,
mean,2021-07-16 12:47:30.091631104,,,,,3.268418,5.057972,,,3642.871228
min,2021-01-01 00:00:00,,,,,0.0,0.0,,,0.0
25%,2021-04-13 00:00:00,,,,,0.0,0.0,,,0.0
50%,2021-07-24 12:00:00,,,,,0.0,0.0,,,5198.5
75%,2021-10-23 00:00:00,,,,,4.0,8.0,,,5706.0
max,2021-12-31 00:00:00,,,,,348.0,351.0,,,8778.0


In [75]:
all_delays.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16370 entries, 0 to 16369
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       16370 non-null  datetime64[ns]
 1   Time       16370 non-null  object        
 2   Day        16370 non-null  object        
 3   Station    16370 non-null  object        
 4   Code       16370 non-null  object        
 5   Min Delay  16370 non-null  int64         
 6   Min Gap    16370 non-null  int64         
 7   Bound      12119 non-null  object        
 8   Line       16318 non-null  object        
 9   Vehicle    16370 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(6)
memory usage: 1.2+ MB


In [76]:
def clean_names(colname):
    """Return the column name in lower case with spaces swapped for underscores."""
    lowered = colname.lower()
    return lowered.replace(' ', '_')

# rename() calls clean_names on each column label and returns a new data frame,
# so assign the result back to keep the cleaned names
all_delays = all_delays.rename(columns=clean_names)
all_delays.columns

Index(['date', 'time', 'day', 'station', 'code', 'min_delay', 'min_gap',
       'bound', 'line', 'vehicle'],
      dtype='object')

In [77]:
# mutate in place
# all_delays.rename(columns=clean_names, inplace=True)

In [78]:
all_delays['code'].unique()

array(['MUPAA', 'EUCO', 'EUCD', 'MUIS', 'MUO', 'MRO', 'TUO', 'MUATC',
       'TUMVS', 'TUKEY', 'MUPLB', 'PUSRA', 'MUIR', 'SUDP', 'TRST', 'EUSC',
       'MUSC', 'EUNT', 'TUST', 'MUI', 'MUNOA', 'PUSIS', 'TUOS', 'SUAP',
       'EUTR', 'TUSC', 'TUNIP', 'MUIRS', 'SUPOL', 'SUO', 'SRAP', 'TUNOA',
       'MUPR1', 'EUAC', 'PUTOE', 'EUDO', 'EUOE', 'MUNCA', 'PUMEL',
       'PUMST', 'SUUT', 'ERME', 'EUBO', 'PUSO', 'PUSWZ', 'ERTC', 'MUTO',
       'PUMO', 'MUIE', 'PUTR', 'MUDD', 'EUPI', 'TUATC', 'PUSTS', 'MUSAN',
       'TRO', 'PUTWZ', 'PUSSW', 'TUSUP', 'PUSTC', 'EUO', 'SUG', 'EUME',
       'MUCL', 'PRSO', 'TUDOE', 'EUTRD', 'SUROB', 'SUAE', 'MUD', 'EUTL',
       'TRNIP', 'MRUI', 'PREL', 'EUBK', 'SRDP', 'MUTD', 'SRO', 'SUSA',
       'MRCL', 'TUS', 'EUCA', 'PRSA', 'SUEAS', 'EUECD', 'ERDO', 'EUAL',
       'PUTIJ', 'PUSI', 'EUVE', 'PUTSM', 'EUATC', 'PUOPO', 'PUTIS',
       'ERCO', 'PUTDN', 'MRPAA', 'PUSAC', 'MRTO', 'TUCC', 'TRTC', 'ERCD',
       'TUOPO', 'PUSCR', 'PUCSC', 'EUYRD', 'PUSNT', 'MUESA', 'PUT

In [79]:
# web address of the workbook that explains what each delay code means
code_url = 'https://github.com/amfz/dsi-python-workshop/raw/main/data/ttc-subway-delay-codes.xlsx'
# sheet_name=None loads every sheet into a dictionary of sheet name: data frame
delay_codes = pd.read_excel(code_url, sheet_name=None)
delay_codes

{'SUB':     RMENU CODE                     CODE DESCRIPTION SUB OR SRT
 0         EUAC                     Air Conditioning        SUB
 1         EUAL                  Alternating Current        SUB
 2        EUATC                   ATC RC&S Equipment        SUB
 3         EUBK                               Brakes        SUB
 4         EUBO                                 Body        SUB
 ..         ...                                  ...        ...
 124        TUS     Crew Unable to Maintain Schedule        SUB
 125       TUSC                Operator Overspeeding        SUB
 126      TUSET  Train Controls Improperly Shut Down        SUB
 127       TUST                         Storm Trains        SUB
 128      TUSUP                    Supervisory Error        SUB
 
 [129 rows x 3 columns],
 'SRT':    RMENU CODE                            CODE DESCRIPTION SUB OR SRT
 0        ERAC                            Air Conditioning        SRT
 1        ERBO                                     

In [80]:
# stack every sheet of the code workbook into one data frame with a single
# concat call, rather than growing a data frame one sheet at a time in a loop
delay_codes_df = pd.concat(list(delay_codes.values()),
                           axis=0,  # append row-wise
                           ignore_index=True  # renumber the rows from 0
                          )

delay_codes_df.head()

Unnamed: 0,RMENU CODE,CODE DESCRIPTION,SUB OR SRT
0,EUAC,Air Conditioning,SUB
1,EUAL,Alternating Current,SUB
2,EUATC,ATC RC&S Equipment,SUB
3,EUBK,Brakes,SUB
4,EUBO,Body,SUB


In [81]:
# attach the code descriptions to the delay records by matching delay codes
delays_w_reasons = pd.merge(all_delays,
                            delay_codes_df,
                            how='left',  # keep every delay row, even if its code has no match
                            left_on='code',  # matching column in all_delays
                            right_on='RMENU CODE'  # matching column in delay_codes_df
                           )
delays_w_reasons.head()

Unnamed: 0,date,time,day,station,code,min_delay,min_gap,bound,line,vehicle,RMENU CODE,CODE DESCRIPTION,SUB OR SRT
0,2021-01-01,00:33,Friday,BLOOR STATION,MUPAA,0,0,N,YU,6046,MUPAA,Passenger Assistance Alarm Activated - No Trou...,SUB
1,2021-01-01,00:39,Friday,SHERBOURNE STATION,EUCO,5,9,E,BD,5250,EUCO,Couplers,SUB
2,2021-01-01,01:07,Friday,KENNEDY BD STATION,EUCD,5,9,E,BD,5249,EUCD,Consequential Delay (2nd Delay Same Fault),SUB
3,2021-01-01,01:41,Friday,ST CLAIR STATION,MUIS,0,0,,YU,0,MUIS,Injured or ill Customer (In Station) - Transpo...,SUB
4,2021-01-01,02:04,Friday,SHEPPARD WEST STATION,MUIS,0,0,,YU,0,MUIS,Injured or ill Customer (In Station) - Transpo...,SUB


In [86]:
# merge is also a DataFrame method
# this does the same thing as the code cell above:
# the frame the method is called on takes the place of the first argument
dwr2 = all_delays.merge(delay_codes_df,
                        how='left',
                        left_on='code',
                        right_on='RMENU CODE'
                       )
dwr2.head()

Unnamed: 0,date,time,day,station,code,min_delay,min_gap,bound,line,vehicle,RMENU CODE,CODE DESCRIPTION,SUB OR SRT
0,2021-01-01,00:33,Friday,BLOOR STATION,MUPAA,0,0,N,YU,6046,MUPAA,Passenger Assistance Alarm Activated - No Trou...,SUB
1,2021-01-01,00:39,Friday,SHERBOURNE STATION,EUCO,5,9,E,BD,5250,EUCO,Couplers,SUB
2,2021-01-01,01:07,Friday,KENNEDY BD STATION,EUCD,5,9,E,BD,5249,EUCD,Consequential Delay (2nd Delay Same Fault),SUB
3,2021-01-01,01:41,Friday,ST CLAIR STATION,MUIS,0,0,,YU,0,MUIS,Injured or ill Customer (In Station) - Transpo...,SUB
4,2021-01-01,02:04,Friday,SHEPPARD WEST STATION,MUIS,0,0,,YU,0,MUIS,Injured or ill Customer (In Station) - Transpo...,SUB


In [82]:
# the columns that came from the code workbook still have upper case names with
# spaces; run every column label through clean_names again
delays_w_reasons = delays_w_reasons.rename(columns=clean_names)
delays_w_reasons.columns

Index(['date', 'time', 'day', 'station', 'code', 'min_delay', 'min_gap',
       'bound', 'line', 'vehicle', 'rmenu_code', 'code_description',
       'sub_or_srt'],
      dtype='object')

In [83]:
delays_w_reasons[['code', 'code_description']].describe(include='all')

Unnamed: 0,code,code_description
count,16370,16048
unique,173,129
top,SUDP,Disorderly Patron
freq,2015,2051


In [84]:
# drop the two helper columns that came along with the merge.
# errors='ignore' keeps this cell safe to re-run: on a second run the columns
# are already gone, and the default (errors='raise') would stop with a KeyError
delays_w_reasons = delays_w_reasons.drop(columns=['rmenu_code', 'sub_or_srt'],
                                         errors='ignore')
delays_w_reasons.tail()

Unnamed: 0,date,time,day,station,code,min_delay,min_gap,bound,line,vehicle,code_description
16365,2021-12-31,01:10,Friday,MUSEUM STATION,SUUT,0,0,N,YU,5591,Unauthorized at Track Level
16366,2021-12-31,01:12,Friday,FINCH STATION,SUDP,5,10,S,YU,5983,Disorderly Patron
16367,2021-12-31,01:21,Friday,EGLINTON WEST STATION,PUOPO,3,8,N,YU,6046,OPTO (COMMS) Train Door Monitoring
16368,2021-12-31,01:37,Friday,SHEPPARD WEST STATION,SUDP,0,0,S,YU,5536,Disorderly Patron
16369,2021-12-31,07:00,Friday,DON MILLS STATION,TUSC,0,0,E,SHP,6146,Operator Overspeeding


In [85]:
# save the merged data as an Excel workbook and as a CSV file;
# both are written with relative paths and index=False
delays_w_reasons.to_excel('delays_w_reasons.xlsx', index=False)

delays_w_reasons.to_csv('delays_w_reasons.csv', index=False)