In [2]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

`pd.cut()` 
- Separates the elements of an array into discrete bins (intervals)
- Mainly used to turn a continuous variable into a categorical one for statistical analysis
- Syntax: 
    - `pd.cut(x, bins, right = True, labels = None, retbins = False, precision = 3, include_lowest = False, duplicates = "raise")`
- Parameters: 
    - x: the input array. Must be 1-dimensional.
    - bins: defines the bin edges for the segmentation (an integer number of equal-width bins, or an explicit sequence of edges)
    - right: indicates whether each bin includes its rightmost edge or not. 
    - labels: specifies the labels for the returned bins. 
    - retbins: whether to return the bins or not. Useful when bins is provided as a scalar. 

In [None]:
# Sample data used by every pd.cut example below: the integers 1 through 9.
numbers = pd.Series(range(1, 10))
numbers

### Bins

In [None]:
# An integer asks for that many equal-width bins over the range of the data.
pd.cut(numbers, bins=4)

In [None]:
# A sequence gives the bin edges explicitly: (1, 5] and (5, 9].
# The value 1 sits on the open left edge, so it falls in no bin (NaN).
pd.cut(numbers, [1, 5, 9])

### right = True (default) vs. right = False

In [None]:
# right=True is the default: every bin is closed on the right, e.g. (1, 5].
pd.cut(numbers, bins=[1, 5, 9], right=True)

In [None]:
# right=False closes the bins on the left instead: [1, 5) and [5, 9).
# Now 9 sits on the open right edge and becomes NaN.
pd.cut(numbers, [1, 5, 9], right=False)

### Labels

In [None]:
# Without labels, each bin is named by its interval.
pd.cut(numbers, bins=2)

In [None]:
# Custom labels, one per bin. The two equal-width bins are (0.992, 5] and (5, 9],
# so 5 belongs to the first bin: the labels must read "<= 5" and "> 5".
pd.cut(numbers, bins=2, labels=['<= 5', '> 5'])

In [None]:
# labels=False returns the integer position of each value's bin instead of an interval.
pd.cut(numbers, bins=4, labels=False)

### retbins

In [None]:
# retbins=False is the default: only the binned values are returned.
pd.cut(numbers, bins=2, retbins=False)

In [None]:
# retbins=True returns a tuple: (binned values, array of the computed bin edges).
pd.cut(numbers, bins=2, retbins=True)

### Precision

In [None]:
# precision=3 is the default number of decimals used for the bin labels.
pd.cut(numbers, bins=5, precision=3)

In [None]:
# precision=0 rounds the bin labels to whole numbers.
pd.cut(numbers, bins=5, precision=0)

### Include_lowest

In [None]:
# include_lowest=False is the default: the first bin (1, 5] excludes 1, so 1 becomes NaN.
pd.cut(numbers, bins=[1, 5, 9], include_lowest=False)

In [None]:
# include_lowest=True makes the first bin include its left edge, so 1 is binned too.
pd.cut(numbers, bins=[1, 5, 9], include_lowest=True)

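### duplicates

`duplicates = "raise"` (the default) raises a `ValueError` when the bin edges are not unique; `duplicates = "drop"` drops the repeated edges instead.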

### `pd.factorize()` encodes objects as an enumerated type or categorical variable
* Parameters:
    1. values: 1D sequence
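    2. sort: sort the uniques and remap the codes so that each value still points to its own unique
    3. na_sentinel: the code used to mark missing values (default -1); deprecated in pandas 1.5 and removed in 2.0 in favour of `use_na_sentinel`
    4. size_hint: optional hint for the size of the internal hash table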
* Returns:
    1. codes
    2. uniques

In [4]:
# Uniques are listed in order of first appearance; each code is the position
# of the corresponding value in uniques.
codes, uniques = pd.factorize(values=['b', 'b', 'a', 'c', 'b'], sort=False)

(codes, uniques)

(array([0, 0, 1, 2, 0]), array(['b', 'a', 'c'], dtype=object))

In [5]:
# sort=True returns the uniques in sorted order.
# The codes are remapped so that each value still points to its own unique.

codes, uniques = pd.factorize(values=['b', 'b', 'a', 'c', 'b'], sort=True)

(uniques, codes)

(array(['a', 'b', 'c'], dtype=object), array([1, 1, 0, 2, 1]))

In [6]:
# A missing value is given the sentinel code -1 in codes.
# It never appears in uniques.

codes, uniques = pd.factorize(values=['b', None, 'a', 'c', 'b'])

(uniques, codes)

(array(['b', 'a', 'c'], dtype=object), array([ 0, -1,  1,  2,  0]))