In [1]:
import datetime

import numpy as np
import pandas as pd

Methods for manipulating a `Series` or `DataFrame` to alter the representation of the data for further data processing or data summarization:
- `pivot()` and `pivot_table()`
- `stack()` and `unstack()`
- `melt()` and `wide_to_long()`
- `get_dummies()` and `from_dummies()`
- `explode()`
- `crosstab()`
- `cut()`
- `factorize()`

# `pivot()` and `pivot_table()`

## `pivot()`

- **Record** or **wide** format -- One row per subject
- **Stacked** or **long** format -- Several rows per subject 

In [2]:
# Build a long-format DataFrame with one row per observation
#   date     -- date the observation was taken
#   variable -- which variable was observed
#   value    -- observed value of that variable
data = dict(
    value=range(12),
    variable=[label for label in 'ABCD' for _ in range(3)],
    date=pd.to_datetime(['2024-11-27', '2024-11-28', '2024-11-29'] * 4),
)

# Name the row index while constructing the frame
df = pd.DataFrame(data).rename_axis('observation')
df

Unnamed: 0_level_0,value,variable,date
observation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,A,2024-11-27
1,1,A,2024-11-28
2,2,A,2024-11-29
3,3,B,2024-11-27
4,4,B,2024-11-28
5,5,B,2024-11-29
6,6,C,2024-11-27
7,7,C,2024-11-28
8,8,C,2024-11-29
9,9,D,2024-11-27


In [3]:
# Reshape from long to wide format
# Rows are the observation dates, columns are the variables,
# and each cell holds the value observed for that variable on that date
pd.pivot(df, index='date', columns='variable', values='value')

variable,A,B,C,D
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-11-27,0,3,6,9
2024-11-28,1,4,7,10
2024-11-29,2,5,8,11


In [4]:
# Append a second value column holding twice the first one
df['value2'] = 2 * df['value']
df

Unnamed: 0_level_0,value,variable,date,value2
observation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,A,2024-11-27,0
1,1,A,2024-11-28,2
2,2,A,2024-11-29,4
3,3,B,2024-11-27,6
4,4,B,2024-11-28,8
5,5,B,2024-11-29,10
6,6,C,2024-11-27,12
7,7,C,2024-11-28,14
8,8,C,2024-11-29,16
9,9,D,2024-11-27,18


In [5]:
# Pivot without naming a `values` column
# Every remaining column is pivoted, so the columns of the result
# form a MultiIndex of (value column, variable)
pd.pivot(df, index='date', columns='variable')

Unnamed: 0_level_0,value,value,value,value,value2,value2,value2,value2
variable,A,B,C,D,A,B,C,D
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2024-11-27,0,3,6,9,0,6,12,18
2024-11-28,1,4,7,10,2,8,14,20
2024-11-29,2,5,8,11,4,10,16,22


In [6]:
# Pivot all value columns, then select the 'value2' block
# from the top level of the resulting column MultiIndex
df.pivot(index='date', columns='variable')['value2']

variable,A,B,C,D
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-11-27,0,6,12,18
2024-11-28,2,8,14,20
2024-11-29,4,10,16,22


## `pivot_table()`

In [7]:
# Create a DataFrame with
#   A, B, C -- categorical keys
#   D, E    -- random values
#   F       -- dates: the 1st of every month of 2024, then the 15th of every month
df = pd.DataFrame({
    'A': ['one', 'one', 'two', 'three'] * 6,
    'B': ['X', 'Y', 'Z'] * 8,
    'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 4,
    'D': np.random.randn(24),
    'E': np.random.randn(24),
    'F': [datetime.datetime(2024, month, day) for day in (1, 15) for month in range(1, 13)]
})

df

Unnamed: 0,A,B,C,D,E,F
0,one,X,foo,0.571855,0.372806,2024-01-01
1,one,Y,foo,-1.563345,-0.294035,2024-02-01
2,two,Z,foo,-0.627663,0.203952,2024-03-01
3,three,X,bar,-1.789609,-0.133167,2024-04-01
4,one,Y,bar,-0.134518,-0.103151,2024-05-01
5,one,Z,bar,-0.468642,-0.319142,2024-06-01
6,two,X,foo,0.172583,-0.595662,2024-07-01
7,three,Y,foo,0.700336,-1.5828,2024-08-01
8,one,Z,foo,0.705083,-0.787777,2024-09-01
9,one,X,bar,0.234179,-2.145262,2024-10-01


In [8]:
# Build a pivot table whose rows are MultiIndexed by ['A', 'B'],
# whose columns come from ['C'], and whose cells aggregate ['D']
df.pivot_table(values='D', index=['A', 'B'], columns=['C'])


Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,X,-0.097333,1.441725
one,Y,0.429542,-0.888762
one,Z,-0.102027,-0.418283
three,X,-0.96388,
three,Y,,0.658183
three,Z,-0.663646,
two,X,,0.443312
two,Y,0.396651,
two,Z,,-0.873909


In [9]:
# Build a pivot table with rows from ['B'] and columns from ['A', 'C'],
# aggregating the values of ['D', 'E'] by sum
df.pivot_table(
    index=['B'],
    columns=['A', 'C'],
    values=['D', 'E'],
    aggfunc='sum',
)

Unnamed: 0_level_0,D,D,D,D,D,D,E,E,E,E,E,E
A,one,one,three,three,two,two,one,one,three,three,two,two
C,bar,foo,bar,foo,bar,foo,bar,foo,bar,foo,bar,foo
B,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
X,-0.194667,2.88345,-1.927761,,,0.886624,-3.275444,0.63887,0.763517,,,-1.098814
Y,0.859083,-1.777525,,1.316366,0.793302,,-0.045427,-0.255086,,-3.013193,0.931108,
Z,-0.204054,-0.836566,-1.327291,,,-1.747818,-0.108219,-0.551637,-1.128856,,,0.946144


In [10]:
# Build a pivot table with rows from ['B', 'C'] and columns from ['A'],
# aggregating the values of ['E'] with both sum and mean
df.pivot_table(
    index=['B', 'C'],
    columns=['A'],
    values='E',
    aggfunc=['sum', 'mean'],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,sum,sum,mean,mean,mean
Unnamed: 0_level_1,A,one,three,two,one,three,two
B,C,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
X,bar,-3.275444,0.763517,,-1.637722,0.381758,
X,foo,0.63887,,-1.098814,0.319435,,-0.549407
Y,bar,-0.045427,,0.931108,-0.022714,,0.465554
Y,foo,-0.255086,-3.013193,,-0.127543,-1.506596,
Z,bar,-0.108219,-1.128856,,-0.05411,-0.564428,
Z,foo,-0.551637,,0.946144,-0.275819,,0.473072


In [11]:
# Build a pivot table with rows from ['A', 'B'] and columns from ['C']
# No `values` are named, so every remaining column (D, E, F) is aggregated
df.pivot_table(index=['A', 'B'], columns=['C'])

Unnamed: 0_level_0,Unnamed: 1_level_0,D,D,E,E,F,F
Unnamed: 0_level_1,C,bar,foo,bar,foo,bar,foo
A,B,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
one,X,-0.097333,1.441725,-1.637722,0.319435,2024-10-08,2024-01-08
one,Y,0.429542,-0.888762,-0.022714,-0.127543,2024-05-08,2024-02-08
one,Z,-0.102027,-0.418283,-0.05411,-0.275819,2024-06-08,2024-09-08
three,X,-0.96388,,0.381758,,2024-04-08,NaT
three,Y,,0.658183,,-1.506596,NaT,2024-08-08
three,Z,-0.663646,,-0.564428,,2024-12-08,NaT
two,X,,0.443312,,-0.549407,NaT,2024-07-08
two,Y,0.396651,,0.465554,,2024-11-08,NaT
two,Z,,-0.873909,,0.473072,NaT,2024-03-08


In [None]:
# Use the datetime column ['F'] as the row index,
# with a Grouper that buckets the dates by month (month-end labels)
# No aggfunc is given, so the default (mean) is applied
df.pivot_table(
    values='D',
    index=pd.Grouper(key='F', freq='ME'),
    columns='C',
)

C,bar,foo
F,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-01-31,,1.441725
2024-02-29,,-0.888762
2024-03-31,,-0.873909
2024-04-30,-0.96388,
2024-05-31,0.429542,
2024-06-30,-0.102027,
2024-07-31,,0.443312
2024-08-31,,0.658183
2024-09-30,,-0.418283
2024-10-31,-0.097333,


### Adding margins

In [19]:
# Add a row and a column labeled "All" that aggregate across groups
df.pivot_table(
    values=['D', 'E'],
    index=['A', 'B'],
    columns=['C'],
    margins=True,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,D,D,D,E,E,E
Unnamed: 0_level_1,C,bar,foo,All,bar,foo,All
A,B,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
one,X,-0.097333,1.441725,0.672196,-1.637722,0.319435,-0.659144
one,Y,0.429542,-0.888762,-0.22961,-0.022714,-0.127543,-0.075128
one,Z,-0.102027,-0.418283,-0.260155,-0.05411,-0.275819,-0.164964
three,X,-0.96388,,-0.96388,0.381758,,0.381758
three,Y,,0.658183,0.658183,,-1.506596,-1.506596
three,Z,-0.663646,,-0.663646,-0.564428,,-0.564428
two,X,,0.443312,0.443312,,-0.549407,-0.549407
two,Y,0.396651,,0.396651,0.465554,,0.465554
two,Z,,-0.873909,-0.873909,,0.473072,0.473072
All,,-0.166782,0.060378,-0.053202,-0.23861,-0.27781,-0.25821


In [20]:
# Stack the pivot table: the 'C' column level moves into the row index,
# so the result is indexed by (A, B, C) with one column per value (D, E)
df.pivot_table(
    values=['D', 'E'],
    index=['A', 'B'],
    columns=['C'],
).stack(future_stack=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,D,E
A,B,C,Unnamed: 3_level_1,Unnamed: 4_level_1
one,X,bar,-0.097333,-1.637722
one,X,foo,1.441725,0.319435
one,Y,bar,0.429542,-0.022714
one,Y,foo,-0.888762,-0.127543
one,Z,bar,-0.102027,-0.05411
one,Z,foo,-0.418283,-0.275819
three,X,bar,-0.96388,0.381758
three,X,foo,,
three,Y,bar,,
three,Y,foo,0.658183,-1.506596


# `stack()` and `unstack()`

In [23]:
# Create a DataFrame of random values whose rows carry a two-level
# MultiIndex (first x second) and whose columns are 'A' and 'B'
df = pd.DataFrame(
    data=np.random.randn(8, 2),
    columns=list('AB'),
    index=pd.MultiIndex.from_product(
        (('bar', 'baz', 'foo', 'qux'), ('one', 'two')),
        names=('first', 'second'),
    ),
)

df


Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.863187,0.032444
bar,two,-1.801852,-0.997141
baz,one,1.251786,0.262626
baz,two,-0.993096,-1.973522
foo,one,1.454179,0.604522
foo,two,0.409728,-0.584142
qux,one,1.316258,0.613475
qux,two,0.317185,-1.268475


In [24]:
# Stack the columns into the lowest level of the row MultiIndex
# The result is a Series indexed by (first, second, column label)
# future_stack=True selects the new stack implementation, matching the
# other stack() calls in this notebook and avoiding the FutureWarning
# raised by the legacy implementation
stacked = df.stack(future_stack=True)
stacked

first  second   
bar    one     A    0.863187
               B    0.032444
       two     A   -1.801852
               B   -0.997141
baz    one     A    1.251786
               B    0.262626
       two     A   -0.993096
               B   -1.973522
foo    one     A    1.454179
               B    0.604522
       two     A    0.409728
               B   -0.584142
qux    one     A    1.316258
               B    0.613475
       two     A    0.317185
               B   -1.268475
dtype: float64

In [25]:
# Unstack the Series: the innermost index level moves to the columns
# (level=-1 is the default)
stacked.unstack(level=-1)

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.863187,0.032444
bar,two,-1.801852,-0.997141
baz,one,1.251786,0.262626
baz,two,-0.993096,-1.973522
foo,one,1.454179,0.604522
foo,two,0.409728,-0.584142
qux,one,1.316258,0.613475
qux,two,0.317185,-1.268475


In [26]:
# Unstack the Series: the middle index level, selected by position, moves to the columns
stacked.unstack(level=1)

Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,0.863187,-1.801852
bar,B,0.032444,-0.997141
baz,A,1.251786,-0.993096
baz,B,0.262626,-1.973522
foo,A,1.454179,0.409728
foo,B,0.604522,-0.584142
qux,A,1.316258,0.317185
qux,B,0.613475,-1.268475


In [28]:
# Unstack the Series: the middle index level, selected by name, moves to the columns
stacked.unstack(level='second')

Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,0.863187,-1.801852
bar,B,0.032444,-0.997141
baz,A,1.251786,-0.993096
baz,B,0.262626,-1.973522
foo,A,1.454179,0.409728
foo,B,0.604522,-0.584142
qux,A,1.316258,0.317185
qux,B,0.613475,-1.268475


In [27]:
# Unstack the Series: the outermost index level moves to the columns
stacked.unstack(level=0)

Unnamed: 0_level_0,first,bar,baz,foo,qux
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,A,0.863187,1.251786,1.454179,1.316258
one,B,0.032444,0.262626,0.604522,0.613475
two,A,-1.801852,-0.993096,0.409728,0.317185
two,B,-0.997141,-1.973522,-0.584142,-1.268475


## Multiple levels

MultiIndex([('A', 'cat',  'long'),
            ('B', 'cat',  'long'),
            ('A', 'dog', 'short'),
            ('B', 'dog', 'short')],
           )

In [37]:
# Create a DataFrame of random values with three-level MultiIndex columns
# Each inner list below supplies one level: exp, animal, hair_length
df = pd.DataFrame(
    data=np.random.randn(4, 4),
    columns=pd.MultiIndex.from_arrays(
        [
            ['A', 'B', 'A', 'B'],
            ['cat', 'cat', 'dog', 'dog'],
            ['long', 'long', 'short', 'short'],
        ],
        names=['exp', 'animal', 'hair_length'],
    ),
)

df

exp,A,B,A,B
animal,cat,cat,dog,dog
hair_length,long,long,short,short
0,0.949264,0.518083,-0.661621,1.494453
1,-1.209869,1.375626,-2.254005,-1.22245
2,-0.315834,0.612376,0.770777,-1.079258
3,1.151227,1.162252,0.446016,0.595193


In [38]:
# Stack several column levels at once, selecting the levels by name
df.stack(['animal', 'hair_length'], future_stack=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,exp,A,B
Unnamed: 0_level_1,animal,hair_length,Unnamed: 3_level_1,Unnamed: 4_level_1
0,cat,long,0.949264,0.518083
0,dog,short,-0.661621,1.494453
1,cat,long,-1.209869,1.375626
1,dog,short,-2.254005,-1.22245
2,cat,long,-0.315834,0.612376
2,dog,short,0.770777,-1.079258
3,cat,long,1.151227,1.162252
3,dog,short,0.446016,0.595193


In [39]:
# Stack several column levels at once, selecting the levels by position
df.stack([1, 2], future_stack=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,exp,A,B
Unnamed: 0_level_1,animal,hair_length,Unnamed: 3_level_1,Unnamed: 4_level_1
0,cat,long,0.949264,0.518083
0,dog,short,-0.661621,1.494453
1,cat,long,-1.209869,1.375626
1,dog,short,-2.254005,-1.22245
2,cat,long,-0.315834,0.612376
2,dog,short,0.770777,-1.079258
3,cat,long,1.151227,1.162252
3,dog,short,0.446016,0.595193


## Missing data

In [40]:
# Create a DataFrame of random values with a MultiIndex on both axes:
# rows are (first, second), columns are (exp, animal)
df = pd.DataFrame(
    data=np.random.randn(8, 4),
    columns=pd.MultiIndex.from_product(
        (('A', 'B'), ('cat', 'dog')),
        names=('exp', 'animal'),
    ),
    index=pd.MultiIndex.from_product(
        (('bar', 'baz', 'foo', 'qux'), ('one', 'two')),
        names=('first', 'second'),
    ),
)

df

Unnamed: 0_level_0,exp,A,A,B,B
Unnamed: 0_level_1,animal,cat,dog,cat,dog
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
bar,one,0.439069,-0.952797,1.003088,-0.177124
bar,two,0.150521,0.420272,1.240817,0.542147
baz,one,1.535223,-0.734362,1.551807,-1.038692
baz,two,0.070568,-0.01064,-0.971924,-0.786863
foo,one,-1.766827,2.799618,0.212012,-1.384637
foo,two,1.355349,-2.092406,-0.925334,-0.567963
qux,one,-0.423737,-0.825685,2.562328,-1.224932
qux,two,0.965143,-0.283417,0.771359,0.34246


In [None]:
# Select a subset of the DataFrame by position: rows 0, 1, 4, 7 and columns 1, 2
# Some (first, second) index combinations are left out of the subset
df3 = df.iloc[[0, 1, 4, 7], [1, 2]]
df3

Unnamed: 0_level_0,exp,A,B
Unnamed: 0_level_1,animal,dog,cat
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2
bar,one,-0.952797,1.003088
bar,two,0.420272,1.240817
foo,one,2.799618,0.212012
qux,two,-0.283417,0.771359


In [43]:
# Unstack the subset (the innermost row level moves to the columns)
# Index combinations absent from the subset appear as missing data (NaN)
df3.unstack(level=-1)

exp,A,A,B,B
animal,dog,dog,cat,cat
second,one,two,one,two
first,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
bar,-0.952797,0.420272,1.003088,1.240817
foo,2.799618,,0.212012,
qux,,-0.283417,,0.771359


In [44]:
# Unstack the subset, replacing the missing entries with a specified value
df3.unstack(level=-1, fill_value=-1)

exp,A,A,B,B
animal,dog,dog,cat,cat
second,one,two,one,two
first,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
bar,-0.952797,0.420272,1.003088,1.240817
foo,2.799618,-1.0,0.212012,-1.0
qux,-1.0,-0.283417,-1.0,0.771359


# `melt()` and `wide_to_long()`

In [45]:
# Create a wide DataFrame: one row per person, one column per measurement
df = pd.DataFrame(dict(
    first=['John', 'Mary'],
    last=['Doe', 'Bo'],
    height=[5.5, 6.0],
    weight=[130, 150],
))

df

Unnamed: 0,first,last,height,weight
0,John,Doe,5.5,130
1,Mary,Bo,6.0,150


In [None]:
# Melt the DataFrame: 'first' and 'last' stay as identifier columns,
# the height and weight values are gathered into a single 'value' column,
# and a 'variable' column records which measurement each value is
pd.melt(df, id_vars=['first', 'last'])

Unnamed: 0,first,last,variable,value
0,John,Doe,height,5.5
1,Mary,Bo,height,6.0
2,John,Doe,weight,130.0
3,Mary,Bo,weight,150.0


In [49]:
# Melt the DataFrame, naming the column that stores the values
pd.melt(df, id_vars=['first', 'last'], value_name='measurement')

Unnamed: 0,first,last,variable,measurement
0,John,Doe,height,5.5
1,Mary,Bo,height,6.0
2,John,Doe,weight,130.0
3,Mary,Bo,weight,150.0


In [47]:
# Melt the DataFrame, naming the column that records which measurement each value is
pd.melt(df, id_vars=['first', 'last'], var_name='quantity')

Unnamed: 0,first,last,quantity,value
0,John,Doe,height,5.5
1,Mary,Bo,height,6.0
2,John,Doe,weight,130.0
3,Mary,Bo,weight,150.0


In [50]:
# Create the same wide DataFrame, this time with a two-level row MultiIndex
df = pd.DataFrame(
    dict(
        first=['John', 'Mary'],
        last=['Doe', 'Bo'],
        height=[5.5, 6.0],
        weight=[130, 150],
    ),
    index=pd.MultiIndex.from_tuples(
        [('person', 'A'), ('person', 'B')],
    ),
)

df

Unnamed: 0,Unnamed: 1,first,last,height,weight
person,A,John,Doe,5.5,130
person,B,Mary,Bo,6.0,150


In [52]:
# Melt the MultiIndexed DataFrame
# By default the original index is discarded and replaced by a fresh integer index
pd.melt(df, id_vars=['first', 'last'])

Unnamed: 0,first,last,variable,value
0,John,Doe,height,5.5
1,Mary,Bo,height,6.0
2,John,Doe,weight,130.0
3,Mary,Bo,weight,150.0


In [53]:
# Melt the MultiIndexed DataFrame while retaining the original index
pd.melt(df, id_vars=['first', 'last'], ignore_index=False)

Unnamed: 0,Unnamed: 1,first,last,variable,value
person,A,John,Doe,height,5.5
person,B,Mary,Bo,height,6.0
person,A,John,Doe,weight,130.0
person,B,Mary,Bo,weight,150.0


In [54]:
# Create a wide DataFrame whose column names combine a stub ('A', 'B')
# with a year suffix (1970, 1980), plus an unrelated random column 'X'
dft = pd.DataFrame({
    'A1970': dict(enumerate(['a', 'b', 'c'])),
    'A1980': dict(enumerate(['d', 'e', 'f'])),
    'B1970': dict(enumerate([2.5, 1.2, 0.7])),
    'B1980': dict(enumerate([3.2, 1.3, 0.1])),
    'X': dict(enumerate(np.random.randn(3))),
})

# Copy the row index into an explicit 'id' column
dft['id'] = dft.index

dft

Unnamed: 0,A1970,A1980,B1970,B1980,X,id
0,a,d,2.5,3.2,1.323772,0
1,b,e,1.2,1.3,1.802746,1
2,c,f,0.7,0.1,0.189574,2


In [55]:
# Transform the DataFrame from wide to long format
# Columns whose names start with the stubs 'A' and 'B' are gathered:
#   the suffix after the stub (1970, 1980) becomes the 'year' index level
#   the cell values go into columns 'A' and 'B'
# The 'id' column becomes the outer index level
pd.wide_to_long(df=dft, stubnames=['A', 'B'], i='id', j='year')

Unnamed: 0_level_0,Unnamed: 1_level_0,X,A,B
id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1970,1.323772,a,2.5
1,1970,1.802746,b,1.2
2,1970,0.189574,c,0.7
0,1980,1.323772,d,3.2
1,1980,1.802746,e,1.3
2,1980,0.189574,f,0.1


# `get_dummies()` and `from_dummies()`

In [56]:
# Create a DataFrame with a categorical 'key' column and an integer 'data1' column
df = pd.DataFrame(dict(
    key=['b', 'b', 'a', 'c', 'a', 'b'],
    data1=range(6),
))

df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [None]:
# Convert categorical variable 'key' into dummy/indicator variables
# Create DataFrame where columns are the unique values of 'key'
# Each value is True if the row's 'key' equals the column name, False otherwise
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,False,True,False
1,False,True,False
2,True,False,False
3,False,False,True
4,True,False,False
5,False,True,False


In [58]:
# Build the same indicator columns through the string accessor
# Here the indicators are integers: 1 where 'key' equals the column name, 0 otherwise
# ('|' is the default separator; no value of 'key' contains it)
df['key'].str.get_dummies(sep='|')

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [59]:
# Get indicator variables, prepending a prefix to each column label
pd.get_dummies(data=df['key'], prefix='key')

Unnamed: 0,key_a,key_b,key_c
0,False,True,False
1,False,True,False
2,True,False,False
3,False,False,True
4,True,False,False
5,False,True,False


In [60]:
# Attach the prefixed indicator columns to the original DataFrame
df.join(other=pd.get_dummies(data=df['key'], prefix='key'))

Unnamed: 0,key,data1,key_a,key_b,key_c
0,b,0,False,True,False
1,b,1,False,True,False
2,a,2,True,False,False
3,c,3,False,False,True
4,a,4,True,False,False
5,b,5,False,True,False


In [63]:
# Draw 10 random values from the half-open interval [0, 1)
values = np.random.random_sample(size=10)
values

array([0.58625145, 0.79177996, 0.64671248, 0.66183694, 0.22016525,
       0.43596694, 0.54685137, 0.27854129, 0.52169576, 0.50833446])

In [64]:
# Assign each value to the bin (interval between consecutive edges) that contains it
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.cut(values, bins=bins)

[(0.4, 0.6], (0.6, 0.8], (0.6, 0.8], (0.6, 0.8], (0.2, 0.4], (0.4, 0.6], (0.4, 0.6], (0.2, 0.4], (0.4, 0.6], (0.4, 0.6]]
Categories (5, interval[float64, right]): [(0.0, 0.2] < (0.2, 0.4] < (0.4, 0.6] < (0.6, 0.8] < (0.8, 1.0]]

In [65]:
# Create one indicator column per bin, marking which bin each value falls in
pd.get_dummies(data=pd.cut(values, bins=bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,False,False,True,False,False
1,False,False,False,True,False
2,False,False,False,True,False
3,False,False,False,True,False
4,False,True,False,False,False
5,False,False,True,False,False
6,False,False,True,False,False
7,False,True,False,False,False
8,False,False,True,False,False
9,False,False,True,False,False


In [66]:
# Create a DataFrame with two string columns (A, B) and one integer column (C)
df = pd.DataFrame(dict(
    A=['a', 'b', 'a'],
    B=['c', 'c', 'b'],
    C=[1, 2, 3],
))

df

Unnamed: 0,A,B,C
0,a,c,1
1,b,c,2
2,a,b,3


In [None]:
# Get dummy variables for a whole DataFrame
# The string columns A and B are encoded; the numeric column C is kept as-is
pd.get_dummies(data=df)

Unnamed: 0,C,A_a,A_b,B_b,B_c
0,1,True,False,False,True
1,2,False,True,False,True
2,3,True,False,True,False


In [68]:
# Get dummy variables only for the listed columns; the other columns are kept as-is
pd.get_dummies(data=df, columns=['A'])

Unnamed: 0,B,C,A_a,A_b
0,c,1,True,False
1,c,2,False,True
2,b,3,True,False


In [70]:
# Get dummy variables for the string columns, using one shared prefix for all of them
# Both A and B contain the value 'b', so two dummy columns end up with the label 'X_b'
pd.get_dummies(data=df, prefix='X')

Unnamed: 0,C,X_a,X_b,X_b.1,X_c
0,1,True,False,False,True
1,2,False,True,False,True
2,3,True,False,True,False


In [72]:
# Get dummy variables for the string columns,
# giving one prefix per encoded column as a list (in column order)
pd.get_dummies(data=df, prefix=['A', 'B'])

Unnamed: 0,C,A_a,A_b,B_b,B_c
0,1,True,False,False,True
1,2,False,True,False,True
2,3,True,False,True,False


In [73]:
# Get dummy variables for the string columns,
# giving the prefixes as a dict that maps column name -> prefix
pd.get_dummies(data=df, prefix={'A': 'A', 'B': 'B'})

Unnamed: 0,C,A_a,A_b,B_b,B_c
0,1,True,False,False,True
1,2,False,True,False,True
2,3,True,False,True,False


In [74]:
# Create a DataFrame of 0/1 dummy variables whose labels share the prefix 'prefix'
df = pd.DataFrame(dict(
    prefix_a=[0, 1, 0],
    prefix_b=[1, 0, 1],
))

df

Unnamed: 0,prefix_a,prefix_b
0,0,1
1,1,0
2,0,1


In [None]:
# Convert dummy variables back into a categorical variable
# `sep` is the separator between the prefix and the category value in the column labels
# The prefix becomes the label of the resulting categorical column
pd.from_dummies(data=df, sep='_')

Unnamed: 0,prefix
0,b
1,a
2,b


In [76]:
# Create a DataFrame holding a single 0/1 dummy variable
df = pd.DataFrame(dict(prefix_a=[0, 1, 0]))

df

Unnamed: 0,prefix_a
0,0
1,1
2,0


In [79]:
# Convert dummy variables back into a categorical variable with k categories
# when only (k - 1) dummy columns are present
# Rows where every dummy is 0 have no category encoded,
# so a default category must be supplied for them
pd.from_dummies(data=df, sep='_', default_category='b')

Unnamed: 0,prefix
0,b
1,a
2,b


# `explode()`

In [80]:
# Create a DataFrame in which every cell of the 'values' column holds a list
df = pd.DataFrame(dict(
    keys=['panda1', 'panda2', 'panda3'],
    values=[
        ['eats', 'shoots'],
        ['shoots', 'leaves'],
        ['eats', 'leaves'],
    ],
))

df

Unnamed: 0,keys,values
0,panda1,"[eats, shoots]"
1,panda2,"[shoots, leaves]"
2,panda3,"[eats, leaves]"


In [81]:
# Give every list element its own row; the other columns and the index label are repeated
df.explode(column='values')

Unnamed: 0,keys,values
0,panda1,eats
0,panda1,shoots
1,panda2,shoots
1,panda2,leaves
2,panda3,eats
2,panda3,leaves


In [82]:
# Create a Series that mixes lists, an empty list, and a non-list value
s = pd.Series(data=[[1, 2, 3], 'foo', [], ['a', 'b']])

s

0    [1, 2, 3]
1          foo
2           []
3       [a, b]
dtype: object

In [83]:
# Give every list element its own row, keeping the original index labels
# An empty list becomes NaN; a non-list value is kept unchanged
s.explode(ignore_index=False)

0      1
0      2
0      3
1    foo
2    NaN
3      a
3      b
dtype: object

In [84]:
# Create a DataFrame whose 'var1' column holds comma-separated strings
df = pd.DataFrame({
    'var1': ['a,b,c', 'd,e,f'],
    'var2': [1, 2],
})

df

Unnamed: 0,var1,var2
0,"a,b,c",1
1,"d,e,f",2


In [85]:
# Replace each comma-separated string with the list of its parts
# (assign returns a new DataFrame; df itself is not modified)
df.assign(var1=lambda frame: frame['var1'].str.split(','))

Unnamed: 0,var1,var2
0,"[a, b, c]",1
1,"[d, e, f]",2


In [86]:
# Split the strings into lists, then give every list element its own row
df.assign(var1=lambda frame: frame['var1'].str.split(',')).explode(column='var1')

Unnamed: 0,var1,var2
0,a,1
0,b,1
0,c,1
1,d,2
1,e,2
1,f,2


# `crosstab()`

In [88]:
# Create three object arrays of labels, all of the same length
a = np.array('foo foo bar bar foo foo'.split(), dtype=object)
b = np.array('one one two one two one'.split(), dtype=object)
c = np.array('dull dull bright bright dull bright'.split(), dtype=object)

# Cross tabulate the arrays: rows are the values of a,
# columns are the (b, c) combinations, cells count co-occurrences
pd.crosstab(index=a, columns=[b, c], rownames=['a'], colnames=['b', 'c'])

b,one,one,two,two
c,bright,dull,bright,dull
a,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,1,0,1,0
foo,1,2,0,1


In [89]:
# Create a frequency table by cross-tabulating just two arrays
pd.crosstab(index=a, columns=b)

col_0,one,two
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,1,1
foo,3,1


In [90]:
# Summarize categorical data by cross-tabulating
# Each Categorical declares one category ('c' / 'f') that never occurs in its values
foo = pd.Categorical(list('ab'), categories=list('abc'))
bar = pd.Categorical(list('de'), categories=list('def'))
pd.crosstab(index=foo, columns=bar)


col_0,d,e
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1,0
b,0,1


In [91]:
# Summarize categorical data by cross-tabulating
# dropna=False keeps the categories that never occur in the data
foo = pd.Categorical(list('ab'), categories=list('abc'))
bar = pd.Categorical(list('de'), categories=list('def'))
pd.crosstab(index=foo, columns=bar, dropna=False)

col_0,d,e,f
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,0
b,0,1,0
c,0,0,0


## Normalization

In [92]:
# Normalize the frequency table so each cell is its proportion of the grand total
pd.crosstab(index=a, columns=b, normalize=True)

col_0,one,two
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.166667,0.166667
foo,0.5,0.166667


In [None]:
# Normalize the frequency table within each column (every column sums to 1)
pd.crosstab(index=a, columns=b, normalize='columns')

col_0,one,two
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.25,0.5
foo,0.75,0.5


In [94]:
# Normalize the frequency table within each row (every row sums to 1)
pd.crosstab(index=a, columns=b, normalize='index')

col_0,one,two
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.5,0.5
foo,0.75,0.25


In [99]:
# Cross tabulate two arrays and aggregate a third array of values
# within each (row, column) cell using the specified function
pd.crosstab(index=a, columns=b, values=c, aggfunc="count")

col_0,one,two
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,1,1
foo,3,1


## Adding margins

In [100]:
# Cross tabulate two arrays, aggregating a third array with the specified function
# Normalize the result and add an "All" margin row and column
pd.crosstab(index=a, columns=b, values=c, aggfunc="count", normalize=True, margins=True)

col_0,one,two,All
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,0.166667,0.166667,0.333333
foo,0.5,0.166667,0.666667
All,0.666667,0.333333,1.0


# `cut()`

In [101]:
# Create an array of ages
ages = np.array([10, 15, 13, 12, 23, 25, 28, 59, 60])

# Bin the ages into 3 equal-width intervals spanning the range of the data
pd.cut(ages, bins=3)

[(9.95, 26.667], (9.95, 26.667], (9.95, 26.667], (9.95, 26.667], (9.95, 26.667], (9.95, 26.667], (26.667, 43.333], (43.333, 60.0], (43.333, 60.0]]
Categories (3, interval[float64, right]): [(9.95, 26.667] < (26.667, 43.333] < (43.333, 60.0]]

In [102]:
# Bin the ages into intervals defined by explicit bin edges
pd.cut(ages, bins=[0, 18, 35, 70])

[(0, 18], (0, 18], (0, 18], (0, 18], (18, 35], (18, 35], (18, 35], (35, 70], (35, 70]]
Categories (3, interval[int64, right]): [(0, 18] < (18, 35] < (35, 70]]

In [103]:
# Bin the ages according to the intervals of an IntervalIndex
pd.cut(x=ages, bins=pd.IntervalIndex.from_breaks([0, 40, 70]))

[(0, 40], (0, 40], (0, 40], (0, 40], (0, 40], (0, 40], (0, 40], (40, 70], (40, 70]]
Categories (2, interval[int64, right]): [(0, 40] < (40, 70]]

# `factorize()`

In [104]:
# Create a Series mixing strings, floats, infinity, and a missing value (NaN)
x = pd.Series(data=['A', 'A', np.nan, 'B', 3.14, np.inf])
x

0       A
1       A
2     NaN
3       B
4    3.14
5     inf
dtype: object

In [None]:
# Encode the values as integer codes
# Returns (codes, uniques): each code is the position of the value in `uniques`,
# which lists the distinct values in order of first appearance
# NaN is not included in `uniques` and is encoded as -1
pd.factorize(values=x)

(array([ 0,  0, -1,  1,  2,  3]), Index(['A', 'B', 3.14, inf], dtype='object'))

In [106]:
# Encode the values as categorical data
# The distinct non-missing values become the categories
pd.Categorical(values=x)

['A', 'A', NaN, 'B', 3.14, inf]
Categories (4, object): [3.14, inf, 'A', 'B']