In [None]:
# Statistics is the mathematics and techniques with which we understand data

In [None]:
from typing import List
from collections import Counter
from scratch.linear_algebra import sum_of_squares

# Chapter 5: Statistics – *Data Science from Scratch*

Statistics is the backbone of data science. This chapter introduces essential concepts that help you **describe**, **summarize**, and **understand** your data.

---

## 📊 Key Topics

### 1. Measures of Central Tendency

- **Mean**: The average of the numbers.
- **Median**: The middle value when the data is sorted.
- **Mode**: The most frequent value(s).

### 2. Measures of Dispersion

- **Range**: Difference between max and min.
- **Variance**: Average squared deviation from the mean.
- **Standard Deviation**: Square root of the variance; easier to interpret.

### 3. Interquartile Range (IQR)

- Measures the spread of the **middle 50%** of your data.
- Less sensitive to outliers than the range.

### 4. Correlation

- Tells how **two variables move together**.
- Ranges from **-1 to 1**:
  - `+1`: Perfect positive relationship
  - `0`: No relationship
  - `-1`: Perfect negative relationship

### 5. Covariance

- Also measures how two variables move together.
- **Not normalized**, so it's harder to interpret than correlation.

### 6. Outliers

- Data points that are **far from the rest**.
- Can **distort** mean, variance, and correlation.
- Important to detect and handle properly.

---

## 🧠 Simple Takeaways

- Use **mean** for typical value when data is symmetric.
- Use **median** if the data is skewed or has outliers.
- **Standard deviation** shows how spread out your data is.
- **Correlation** is great for spotting relationships.
- Always be on the lookout for **outliers** — they can sneakily mess things up!

---

## ✅ Next Steps

- Try implementing each concept in Python.
- Use small lists of numbers to experiment and visualize.
- Think about how each statistic would help in a real data project.


In [4]:
# Central Tendencies

def mean(xs: List[float]) -> float:
    """Return the arithmetic average of the values in xs.

    Raises ZeroDivisionError when xs is empty, because the sum is
    divided by the number of elements.
    """
    total = sum(xs)
    count = len(xs)
    return total / count

"""
same as writing..

def mean(xs):
    return sum(xs) / len(xs)
""" 

'\nsame as writing..\n\ndef mean(xs):\n    return sum(xs) / len(xs)\n'

In [None]:
# as you add more points the mean shifts around
# mean is dependent on every point

# The median is the middle-most value if the number of data points is
# odd, or the average of the two middle-most values if it is even
#   --> 

In [5]:
def _median_odd(xs: List[float]) -> float:
    """If lens(xs) is odd, the median is the middle element"""
    return sorted(xs)[len(xs) // 2]

## 🔢 What is Integer Division?

**Integer division** divides one number by another and returns the **whole number part only** (rounds down).

In Python, we use the `//` operator:

```python
7 // 2   # Output: 3
9 // 4   # Output: 2
```

In [6]:
def _median_even(xs: List[float]) -> float:
    """If lens(xs) is even, the mediam is average of the middle two elements"""
    sorted_xs = sorted(xs)
    hi_midpoint = lens(xs) // 2
    return (sortedxs[hi_midpoint - 1] + sorted_xs[hi_midpoint]) / 2

In [7]:
def median(v: List[float]) -> float:
    """Return the median of v.

    Delegates to _median_even when len(v) is even and to _median_odd
    otherwise.
    """
    if len(v) % 2 == 0:
        return _median_even(v)
    return _median_odd(v)


### If we have n data points and one of them increases by some small amount e then necessarily the mean will increase by e/n

### Because the mean is sensitive to outliers, it often gives a misleading picture

In [8]:
# Generalization of the median is the quantile. The median is the 50% quantile

def quantile(xs: List[float], p: float) -> float:
    """Return the value at the p-th quantile of xs.

    The values are sorted and the element at index int(p * len(xs)) is
    returned, with the index clamped to the last element so that
    p == 1.0 yields the maximum instead of running off the end.

    Args:
        xs: the data points.
        p: the quantile as a fraction between 0 and 1 inclusive
           (0.5 is the median).

    Raises:
        ValueError: if p is outside [0, 1] (including NaN).
        IndexError: if xs is empty.
    """
    if not 0 <= p <= 1:
        raise ValueError(f"p must be between 0 and 1, got {p}")
    # int() truncates, so p * len(xs) maps onto a valid index for every
    # p < 1; min() covers the p == 1.0 edge, which would otherwise be len(xs).
    p_index = min(int(p * len(xs)), len(xs) - 1)
    return sorted(xs)[p_index]

## 🔍 Explaining: `p_index = int(p * len(xs))`

This line finds the **index** of the value at the p-th percentile.

### Step-by-step:

1. `len(xs)` — Count how many values are in the list.
2. `p * len(xs)` — Find the position that is a fraction `p` (e.g. `0.25` for the 25th percentile) of the way through the sorted list.
3. `int(...)` — Convert to a whole number (truncate the decimal) so we can use it as a list index.

### ✅ Example

```python
xs = [10, 20, 30, 40, 50]
p = 0.25

p_index = int(0.25 * 5)  # → int(1.25) → 1
sorted(xs)[p_index]      # → 20
```


In [9]:
def mode(x: List[float]) -> List[float]:
    """Return every value that occurs most often in x.

    A list is returned because several values can tie for the highest
    count; they appear in the order they were first seen in x.
    """
    tally = Counter(x)
    highest = max(tally.values())
    most_common_values = []
    for value, occurrences in tally.items():
        if occurrences == highest:
            most_common_values.append(value)
    return most_common_values


# Dispersion

Dispersion refers to measures of how spread out our data is. Two common measures are the range and the variance.

In [11]:
def data_range(xs: List[float]) -> float:
    """Return the spread of xs: its largest value minus its smallest."""
    largest = max(xs)
    smallest = min(xs)
    return largest - smallest


In [None]:
def de_mean(xs: List[float]) -> List[float]:
    """Shift every element of xs by the mean of xs.

    Each value has mean(xs) subtracted from it, so the returned list
    is centred on zero.
    """
    centre = mean(xs)
    shifted = []
    for value in xs:
        shifted.append(value - centre)
    return shifted

def variance(xs: List[float]) -> float:
    """Return the sum of squared deviations from the mean over (n - 1).

    This is "almost" the average squared deviation: the divisor is
    n - 1 rather than n. At least two elements are required, which is
    enforced with an assertion.
    """
    n = len(xs)
    assert n >= 2, "variance requires at least two elements"

    squared_total = sum_of_squares(de_mean(xs))
    return squared_total / (n - 1)
