# Python for Data Analysis
    
    AUTHOR: Dr. Wes McKinney 

### Chapter 7: Data Cleaning and Preparation
### **7.2 Data Transformation**

# Discretization and Binning

In [12]:
import warnings
# Suppress every warning for the rest of the session. Note that this blanket
# filter also hides deprecation notices raised by the libraries in use.
warnings.filterwarnings('ignore')

In [13]:
import numpy as np
import pandas as pd
# Fix NumPy's global random seed so the random samples drawn later are reproducible.
np.random.seed(12345)

In [14]:
# Sample ages that will be grouped into discrete age bins.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [15]:
# Bin edges. With the default right-closed intervals these define
# (18, 25], (25, 35], (35, 60] and (60, 100].
bins = [18, 25, 35, 60, 100]
# Assign every age to the interval it falls in; the result is a Categorical.
age_categories = pd.cut(ages, bins)
age_categories

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [16]:
# Integer code of the bin each age fell into (its position within `categories`).
print(age_categories.codes)
# The IntervalIndex holding the bins themselves.
print(age_categories.categories)
# A single bin: the first interval.
print(age_categories.categories[0])
# Count how many ages landed in each bin. The top-level `pd.value_counts`
# helper is deprecated since pandas 2.1 and scheduled for removal, so wrap the
# Categorical in a Series and call the `value_counts` method instead; the
# resulting counts (sorted by frequency, named "count") are the same.
pd.Series(age_categories).value_counts()

[0 0 0 1 0 0 2 1 3 2 2 1]
IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')
(18, 25]


(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
Name: count, dtype: int64

The object pandas returns is a special Categorical object. The output you see describes the bins computed by `pandas.cut`. Each bin is identified by a special (unique to pandas) interval value type containing the lower and upper limit of each bin.

In [17]:
# right=False closes each interval on the left instead of the right,
# e.g. [18, 25) rather than (18, 25].
pd.cut(ages, bins, right=False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

You can override the default interval-based bin labeling by passing a list or array to the `labels` option.

In [18]:
# One label per bin, listed in the same order as the bin intervals.
group_names = ["Youth", "YoungAdult", "MiddleAged", "Senior"]
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

If you pass an integer number of bins to `pandas.cut` instead of explicit bin edges, it will compute equal-length bins based on the minimum and maximum values in the data. The `precision=1` option used below limits the decimal precision of the bin labels to one digit.

In [19]:
# 20 draws from NumPy's uniform distribution (default range [0, 1)).
data = np.random.uniform(size=20)
# An integer second argument requests that many equal-width bins spanning the
# data's min and max; precision=1 sets the decimal precision of the bin labels.
pd.cut(data, 4, precision=1)

[(0.7, 1.0], (0.2, 0.5], (0.007, 0.2], (0.007, 0.2], (0.5, 0.7], ..., (0.5, 0.7], (0.7, 1.0], (0.7, 1.0], (0.7, 1.0], (0.5, 0.7]]
Length: 20
Categories (4, interval[float64, right]): [(0.007, 0.2] < (0.2, 0.5] < (0.5, 0.7] < (0.7, 1.0]]

A closely related function, `pandas.qcut`, bins the data based on sample quantiles. Depending on the distribution of the data, using `pandas.cut` will not usually result in each bin having the same number of data points. Since `pandas.qcut` uses sample quantiles instead, you will obtain roughly **equally sized bins**.

In [20]:
# 1000 draws from the standard normal distribution.
data = np.random.standard_normal(1000)
# Cut at the sample quartiles so each of the 4 bins receives the same number
# of points; precision=2 sets the decimal precision of the bin labels.
quartiles = pd.qcut(data, 4, precision=2)
print(quartiles)
# Count the observations per quartile. The top-level `pd.value_counts` helper
# is deprecated since pandas 2.1 and scheduled for removal, so wrap the
# Categorical in a Series and call the `value_counts` method instead.
pd.Series(quartiles).value_counts()

[(0.63, 3.93], (0.63, 3.93], (-2.96, -0.69], (-0.69, -0.017], (0.63, 3.93], ..., (-0.017, 0.63], (0.63, 3.93], (-0.69, -0.017], (-0.69, -0.017], (-0.017, 0.63]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.96, -0.69] < (-0.69, -0.017] < (-0.017, 0.63] < (0.63, 3.93]]


(-2.96, -0.69]     250
(-0.69, -0.017]    250
(-0.017, 0.63]     250
(0.63, 3.93]       250
Name: count, dtype: int64

Similar to `pandas.cut`, you can pass your own quantiles (numbers between 0 and 1, inclusive).

In [19]:
# Explicit quantile edges: the bottom 10%, the next 40%, the next 40% and the
# top 10% of the sample, followed by the number of points in each bin.
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]).value_counts()

(-2.9499999999999997, -1.191]    100
(-1.191, -0.0171]                400
(-0.0171, 1.297]                 400
(1.297, 3.928]                   100
dtype: int64