In [1]:
import numpy as np
import pandas as pd

# Grouping

In [2]:
# Fixed sample data: six rows labelled a/b/c twice, three columns, values in 1..4
# (the range np.random.randint(1, 5, 18) draws from).
# An unseeded random draw produces a different frame on every run, so none of
# the grouping results below could be reproduced. These literals are the values
# behind the outputs recorded in this notebook.
sample_values = np.array([[1, 1, 4],
                          [3, 3, 3],
                          [3, 4, 2],
                          [4, 4, 4],
                          [4, 1, 1],
                          [2, 3, 4]])
df = pd.DataFrame(sample_values,
                  index=['a', 'b', 'c', 'a', 'b', 'c'],
                  columns=['A', 'B', 'C'])
# Show the rows ordered by the column used as the grouping key below.
print(df.sort_values(by=['A']))

   A  B  C
a  1  1  4
c  2  3  4
b  3  3  3
c  3  4  2
a  4  4  4
b  4  1  1


In [3]:
# Display the GroupBy object itself; no aggregation is applied in this cell.
df.groupby('A')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001E6DF2697C0>

In [4]:
# Walk the groups one by one: each step yields the key (the value of column A)
# together with the sub-frame of rows that share it.
groups_by_a = df.groupby('A')
for key, rows in groups_by_a:
    print(key)
    print(rows)

1
   A  B  C
a  1  1  4
2
   A  B  C
c  2  3  4
3
   A  B  C
b  3  3  3
c  3  4  2
4
   A  B  C
a  4  4  4
b  4  1  1


In [5]:
# Largest value of every remaining column within each group of A.
grouped_by_a = df.groupby('A')
dfg = grouped_by_a.max()
print(dfg)


   B  C
A      
1  1  4
2  3  4
3  4  3
4  4  4


In [6]:
# Column totals within each group of A.
grouped_by_a = df.groupby('A')
df2 = grouped_by_a.sum()
print(df2)

   B  C
A      
1  1  4
2  3  4
3  7  5
4  5  5


In [7]:
# Number of entries per column within each group of A.
grouped_by_a = df.groupby('A')
df2 = grouped_by_a.count()
print(df2)

   B  C
A      
1  1  1
2  1  1
3  2  2
4  2  2


In [8]:
# Row labels: two levels ('first', 'second') pairing x/y with a/b.
tuples = [(outer, inner) for outer in ('x', 'y') for inner in ('a', 'b')]
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])

# Column labels: two levels ('high', 'low') pairing X/Y with A/B.
tuples = [(outer, inner) for outer in ('X', 'Y') for inner in ('A', 'B')]
columns = pd.MultiIndex.from_tuples(tuples, names=['high', 'low'])

# 4x4 frame filled with 1..16 row by row, labelled on both axes.
cell_values = np.arange(1, 17).reshape(4, 4)
dfm = pd.DataFrame(cell_values, index=index, columns=columns)
print(dfm)

high           X       Y    
low            A   B   A   B
first second                
x     a        1   2   3   4
      b        5   6   7   8
y     a        9  10  11  12
      b       13  14  15  16


In [9]:
# Collapse rows sharing the outer row level ('first', i.e. level 0) and keep
# the column-wise maximum of each group.
by_first = dfm.groupby(level='first')
df2 = by_first.max()
print(df2)

high    X       Y    
low     A   B   A   B
first                
x       5   6   7   8
y      13  14  15  16


In [10]:
# Collapse rows sharing the inner row level ('second', i.e. level 1) and keep
# the column-wise maximum of each group.
by_second = dfm.groupby(level='second')
df2 = by_second.max()
print(df2)

high     X       Y    
low      A   B   A   B
second                
a        9  10  11  12
b       13  14  15  16


In [11]:
# Group the COLUMNS by their outer level ('high') and keep the row-wise maximum
# of each column group.
# groupby(..., axis=1) is deprecated in recent pandas, so the frame is
# transposed, grouped along its (now row) labels, and transposed back.
df2 = dfm.T.groupby(level=0).max().T
print(df2)

high           X   Y
first second        
x     a        2   4
      b        6   8
y     a       10  12
      b       14  16


In [12]:
# Group the COLUMNS by their inner level ('low') and keep the row-wise maximum
# of each column group.
# groupby(..., axis=1) is deprecated in recent pandas, so the frame is
# transposed, grouped along its (now row) labels, and transposed back.
df2 = dfm.T.groupby(level=1).max().T
print(df2)

low            A   B
first second        
x     a        3   4
      b        7   8
y     a       11  12
      b       15  16


In [13]:
# transform keeps the original row labels and order: every row receives the
# sum of its own group, so the result has as many rows as df.
# The aggregation is named by string; handing a NumPy callable such as np.sum
# to transform is deprecated in recent pandas.
gdf = df.groupby('A').transform('sum')
print(gdf)

   B  C
a  1  4
b  7  5
c  7  5
a  5  5
b  5  5
c  3  4


In [14]:
# agg reduces each group to a single row, indexed by the group key A.
# The aggregation is named by string; handing a NumPy callable such as np.sum
# to agg is deprecated in recent pandas.
gdf = df.groupby('A').agg('sum')
print(gdf)

   B  C
A      
1  1  4
2  3  4
3  7  5
4  5  5


In [15]:
# apply hands each group's sub-frame to the function and stacks the results.
# The column-wise sum is requested explicitly with axis=0: np.sum(frame) falls
# back on DataFrame.sum(axis=None), whose column-wise behaviour is deprecated.
# NOTE(review): the recorded output includes the grouping column A because the
# sub-frames passed to apply contained it; newer pandas releases deprecate
# that — confirm against the installed version.
gdf = df.groupby('A').apply(lambda sub_frame: sub_frame.sum(axis=0))
print(gdf)

   A  B  C
A         
1  1  1  4
2  2  3  4
3  6  7  5
4  8  5  5


# Categorization

In [16]:
# Fresh 3x3 frame holding 1..9 row by row, rows a-c and columns A-C.
grid = np.arange(1, 10).reshape(3, 3)
df = pd.DataFrame(grid, index=list('abc'), columns=list('ABC'))
print(df)

   A  B  C
a  1  2  3
b  4  5  6
c  7  8  9


In [17]:
# Ordered categorical over the roman numerals I..V; only 'I' and 'IV' occur
# in the data, the remaining levels are declared but unused.
row_labels = ['IV', 'I', 'IV']
levels = ['I', 'II', 'III', 'IV', 'V']
cat = pd.Categorical(row_labels, categories=levels, ordered=True)

# Attach it to the frame as a new column named 'cat'.
df['cat'] = cat
print(df)

   A  B  C cat
a  1  2  3  IV
b  4  5  6   I
c  7  8  9  IV


In [18]:
# View the 'cat' column as an ordered categorical and show its level ordering.
ordered_view = df['cat'].cat.as_ordered()
print(ordered_view)

a    IV
b     I
c    IV
Name: cat, dtype: category
Categories (5, object): ['I' < 'II' < 'III' < 'IV' < 'V']


In [19]:
# Sorting on the ordered categorical follows the declared level order
# (I < II < III < IV < V), not alphabetical order.
sorted_by_cat = df.sort_values('cat')
print(sorted_by_cat)

   A  B  C cat
b  4  5  6   I
a  1  2  3  IV
c  7  8  9  IV


In [20]:
# Count rows per category. observed=False is passed explicitly so that levels
# with no rows (II, III, V) still appear with a count of 0; relying on the
# implicit default for categorical groupers is deprecated in recent pandas.
print(df.groupby('cat', observed=False).size())

cat
I      1
II     0
III    0
IV     2
V      0
dtype: int64


In [21]:
# Overwrite the 'cat' column with a plain list of strings (no category
# information is attached at this point).
plain_labels = ['IV', 'I', 'III']
df['cat'] = plain_labels
print(df)

   A  B  C  cat
a  1  2  3   IV
b  4  5  6    I
c  7  8  9  III


In [22]:
# Convert the 'cat' column to a categorical, then declare the full ordered
# level set I < II < III < IV < V for it.
roman_levels = ['I', 'II', 'III', 'IV', 'V']
df['cat'] = (
    df['cat']
    .astype('category')
    .cat.set_categories(roman_levels, ordered=True)
)
print(df)

   A  B  C  cat
a  1  2  3   IV
b  4  5  6    I
c  7  8  9  III


In [23]:
# With the ordered levels in place, sorting follows I < III < IV rather than
# plain string order.
sorted_by_cat = df.sort_values('cat')
print(sorted_by_cat)

   A  B  C  cat
b  4  5  6    I
c  7  8  9  III
a  1  2  3   IV
