When dealing with continuous numeric data, it is often helpful to bin the data into discrete buckets for further analysis. Binning goes by several names, including bucketing, discrete binning, discretization, and quantization. Pandas supports it through the `cut` and `qcut` functions: `cut` bins values using explicit edges or a number of equal-width intervals, while `qcut` bins values by sample quantiles.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Sample ages and the bin edges that delimit the age groups.
ages = [20, 22, 25, 27, 21, 37, 31, 61, 45, 32, 32, 67, 82, 42, 35, 18]
bins = [18, 25, 35, 60, 100]

# Intervals are right-closed by default: (18, 25], (25, 35], ...
# A value equal to the lowest edge (18) therefore lands in no bin and becomes NaN.
cates = pd.cut(x=ages, bins=bins)
cates

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (60.0, 100.0], (60.0, 100.0], (35.0, 60.0], (25.0, 35.0], NaN]
Length: 16
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [3]:
# Attach a readable name to each of the four age intervals
# instead of showing the raw interval notation.
pd.cut(
    x=ages,
    bins=bins,
    labels=['Youth', 'YoungAdults', 'MiddleAged', 'Seniors'],
)

['Youth', 'Youth', 'Youth', 'YoungAdults', 'Youth', ..., 'Seniors', 'Seniors', 'MiddleAged', 'YoungAdults', NaN]
Length: 16
Categories (4, object): ['Youth' < 'YoungAdults' < 'MiddleAged' < 'Seniors']

In [4]:
# right=False closes each interval on the left instead: [18, 25), [25, 35), ...
pd.cut(x=ages, bins=bins, right=False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [60, 100), [60, 100), [35, 60), [35, 60), [18, 25)]
Length: 16
Categories (4, interval[int64]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

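Another example: passing an integer instead of explicit edges makes `cut` split the range of the data into that many equal-width bins.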

In [5]:
# Seed NumPy's global generator so the 20 uniform draws are reproducible.
np.random.seed(8)
data = np.random.rand(20)

# An integer `bins` asks for that many equal-width intervals over the data's range;
# precision=2 limits the bin edges to two decimal digits.
pd.cut(x=data, bins=4, precision=2)

[(0.73, 0.97], (0.73, 0.97], (0.73, 0.97], (0.49, 0.73], (0.01, 0.25], ..., (0.25, 0.49], (0.25, 0.49], (0.73, 0.97], (0.25, 0.49], (0.01, 0.25]]
Length: 20
Categories (4, interval[float64]): [(0.01, 0.25] < (0.25, 0.49] < (0.49, 0.73] < (0.73, 0.97]]

In [6]:
# Draw 1000 standard-normal samples from a seeded generator for reproducibility.
np.random.seed(0)
data = np.random.randn(1000)

# qcut with an integer cuts at sample quantiles, so the four bins (quartiles)
# each receive the same number of observations.
qcate = pd.qcut(data, 4)

# Series.value_counts() replaces the top-level pd.value_counts(), which is
# deprecated in recent pandas releases and emits a FutureWarning.
pd.Series(qcate).value_counts()

(0.607, 2.759]                   250
(-0.058, 0.607]                  250
(-0.698, -0.058]                 250
(-3.0469999999999997, -0.698]    250
dtype: int64

In [7]:
# Custom quantile edges (0-10%, 10-50%, 50-90%, 90-100%) give bins of unequal size.
# Series.value_counts() replaces the top-level pd.value_counts(), which is
# deprecated in recent pandas releases and emits a FutureWarning.
pd.Series(pd.qcut(data, [0, 0.1, 0.5, 0.9, 1])).value_counts()

(-0.058, 1.232]                  400
(-1.299, -0.058]                 400
(1.232, 2.759]                   100
(-3.0469999999999997, -1.299]    100
dtype: int64