# Timing tests between loop and vectorized functions

In [1]:
import pandas as pd
import numpy as np

In [2]:
baseball_df = pd.read_csv('data/baseball.csv')

In [3]:
def calc_win_perc(wins, games_played):
    """Return wins divided by games played, rounded to two decimal places.

    Uses only `/` and `np.round`, so it accepts scalars as well as
    array-like inputs that support element-wise division.

    Args:
        wins: Number of wins (scalar or array-like).
        games_played: Number of games played (scalar or array-like).

    Returns:
        The ratio ``wins / games_played`` rounded to 2 decimals.
    """
    return np.round(wins / games_played, decimals=2)

## Here's the for loop with iloc
### (Usually returns ~ 50-100 ms ± 5 ms per loop)

In [4]:
%%timeit

# Row-by-row baseline: compute the win percentage one row at a time and
# collect the results in a plain Python list.
win_percs_list = []

for i in range(len(baseball_df)):
    # Positional lookup of a single row on every iteration.
    row = baseball_df.iloc[i]

    # Pull the two scalar inputs out of the row by column label.
    wins = row['W']
    games_played = row['G']

    # calc_win_perc is called once per row here, on scalar inputs.
    win_perc = calc_win_perc(wins, games_played)

    win_percs_list.append(win_perc)

# Write the collected values back as the 'WP' column (overwrites it if it
# already exists; the cell is executed repeatedly by the timing magic).
baseball_df['WP'] = win_percs_list

95.8 ms ± 1.52 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## And here's the numpy version
### (Usually returns ~50-150 µs ± 500 ns per loop  -- that's MICROseconds, NOT milliseconds)

In [5]:
%%timeit

# Vectorized version: hand calc_win_perc the whole 'W' and 'G' columns
# (via .values) so the division and rounding run once over all rows
# instead of once per row.
win_percs_np = calc_win_perc(baseball_df['W'].values, baseball_df['G'].values)
# Store the result in the same 'WP' column the loop version writes to.
baseball_df['WP'] = win_percs_np

128 µs ± 1.07 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


There are 1,000 microseconds (µs) in 1 millisecond (ms), so:

In [9]:
def ms_micro_comp(milli, micro):
    """Print how many times faster the NumPy timing is than the loop timing.

    Args:
        milli: Timing of the loop version, in milliseconds.
        micro: Timing of the NumPy version, in microseconds.

    Returns:
        None. The speed-up ratio is printed, not returned.
    """
    # Convert milliseconds to microseconds first (1 ms = 1,000 µs) so both
    # timings are in the same unit before dividing.
    comp = milli * 1000 / micro
    print(f'The Numpy function was {comp} times faster than the old fashioned pandas loop/loc version.')

ms_micro_comp(95.8, 128)

The Numpy function was 748.4375 times faster than the old fashioned pandas loop/loc version.


### I've been seeing speedups of between 700x and 1,500x with NumPy.