In [5]:
from os.path import basename, exists
def download(url):
    """Fetch `url` into the working directory unless it is already there.

    The local file name is the last path component of `url`. If a file with
    that name already exists, nothing is fetched and nothing is printed.
    """
    target = basename(url)
    if exists(target):
        return

    from urllib.request import urlretrieve

    saved_path, _headers = urlretrieve(url, target)
    print(f"Downloaded: {saved_path}")

# 4 Cumulative Distribution Functions
---

- A **CDF** is useful for comparing percentiles, and especially useful for comparing distributions.

In [1]:
import empiricaldist
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from thinkstats import decorate

## 4.1 Percentile and Percentile Ranks
- When results are reported as a **percentile rank**, the percentile rank is the **percentage of people who got the same score as you or lower.**
- ***90th percentile*** --> did as well as or better than 90% of the people who took the exam.

**EXAMPLE:**

In [4]:
# Fetch the helper module and the race-results page (skipped when already present)
for source_url in (
    "https://github.com/AllenDowney/ThinkStats/raw/v3/nb/relay.py",
    "https://github.com/AllenDowney/ThinkStats/raw/v3/data/Apr25_27thAn_set1.shtml",
):
    download(source_url)

Downloaded + relay.py
Downloaded + Apr25_27thAn_set1.shtml


In [13]:
# Load the race results with the helper from the downloaded relay module,
# then preview the first five rows.
from relay import read_results

results = read_results()
results.head(5)

Unnamed: 0,Place,Div/Tot,Division,Guntime,Nettime,Min/Mile,MPH
0,1,1/362,M2039,30:43,30:42,4:57,12.121212
1,2,2/362,M2039,31:36,31:36,5:06,11.764706
2,3,3/362,M2039,31:42,31:42,5:07,11.726384
3,4,4/362,M2039,32:28,32:27,5:14,11.464968
4,5,5/362,M2039,32:52,32:52,5:18,11.320755


In [15]:
# Count the entries in each column -- 1633 runners finished the race
results.count(axis="index")

Place       1633
Div/Tot     1633
Division    1633
Guntime     1633
Nettime     1633
Min/Mile    1633
MPH         1633
dtype: int64

In [17]:
# NumPy array holding the speed (MPH) of every runner in the race
speeds = results["MPH"].to_numpy()

In [22]:
# Locate my own row by its net finishing time
my_result = results.loc[results["Nettime"] == "42:44"]
my_result

Unnamed: 0,Place,Div/Tot,Division,Guntime,Nettime,Min/Mile,MPH
96,97,26/256,M4049,42:48,42:44,6:53,8.716707


In [24]:
# Read my speed from `my_result` instead of hard-coding its position (96),
# which would silently pick another runner if the results were filtered
# or re-sorted.
my_speed = my_result["MPH"].iloc[0]

In [25]:
# How many runners were at my speed or slower?
np.sum(speeds <= my_speed)

np.int64(1537)

- The formula for percentile rank is given by:
  <div align="center"><samp>
      ( Number of values less than or equal to x / total number of values ) * 100
  </samp></div>

In [26]:
# The mean of the boolean mask is the fraction of runners at my speed
# or slower; scale it to a percentage.
100 * np.mean(speeds <= my_speed)

np.float64(94.12124923453766)

- The result is my percentile rank in the field, which was about 94%

In [27]:
# compute the rank of a particular value in sequence of values.
def percentile_rank(x, seq):
    """Percentile rank of x within seq.

    x: value
    seq: sequence of values (list, tuple, NumPy array, or pandas Series)

    returns: percentage (0-100) of the values in seq that are <= x
    """
    # Convert first so plain lists and tuples compare elementwise;
    # `[1, 2, 3] <= x` would raise a TypeError.
    values = np.asarray(seq)
    return (values <= x).mean() * 100

- The **Division** Column:
    - The division each runner was in, identified by gender and age range
    - e.g., I was in M4049, which includes male runners from age 40 to 49

In [35]:
# Keep only the runners in my division (M4049) and pull out their speeds
my_division = results.loc[results["Division"] == "M4049"]
my_division_speed = my_division["MPH"].to_numpy()

In [37]:
# My percentile rank among the M4049 runners only
percentile_rank(x=my_speed, seq=my_division_speed)

np.float64(90.234375)

- Going the other way, the position of a given percentile in a list sorted in ascending order is approximately:
```
position = (percentile rank / 100) * total number of values in the list
```

- If we are given a percentile rank, the following function finds the corresponding value in a sequence:

In [38]:
def percentile(p, seq):
    """Value in seq at percentile rank p.

    p: percentile rank 0-100
    seq: sequence of values, in any order

    returns: the element of seq whose percentile rank is approximately p

    raises: ValueError if p is outside 0-100
    """
    if not 0 <= p <= 100:
        raise ValueError(f"percentile rank must be between 0 and 100, got {p}")

    # The index arithmetic below counts down from the largest value, so sort
    # in descending order instead of trusting the caller's ordering.
    ordered = np.sort(np.asarray(seq))[::-1]
    n = len(ordered)  # number of values in seq
    i = (1 - p / 100) * (n + 1)
    # A small p pushes the rounded index past the end; clamp it so the
    # smallest value is returned instead of raising IndexError.
    return ordered[min(round(i), n - 1)]

In [39]:
# Speed at the 90th percentile of the M4049 division
percentile(p=90, seq=my_division_speed)

np.float64(8.591885441527447)

- In my division, the 90th percentile was about 8.6 MPH.

- Now, some years after I ran that race, I am in the M5059 division.
- How fast would I have to run to have the same percentile rank in my new division?

- **ANSWER: convert my percentile rank in the M4049 division, which is about 90.2%, to a speed in the M5059 division.**

In [41]:
# Speeds of the runners in my new division, M5059
next_division = results.query("Division == 'M5059'")
next_division_speeds = next_division["MPH"].values

# Carry over my exact M4049 percentile rank rather than a hand-rounded 90.2,
# then look up the speed at that rank in the new division.
my_division_rank = percentile_rank(my_speed, my_division_speed)
percentile(my_division_rank, next_division_speeds)

np.float64(8.017817371937639)

In [42]:
# The M5059 runner with the same percentile rank as me
# ran just over 8 mph.

In [43]:
# Show the M5059 runner at my percentile rank (about 90.2). The target speed
# is recomputed from the data instead of hand-copying a rounded threshold
# such as 8.01 from an earlier output.
target_speed = percentile(90.2, next_division_speeds)
next_division[next_division["MPH"] == target_speed].tail(1)

Unnamed: 0,Place,Div/Tot,Division,Guntime,Nettime,Min/Mile,MPH
222,223,18/171,M5059,46:30,46:25,7:29,8.017817
