In [2]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# Sorting

In [3]:
# Build a Series whose string labels are deliberately out of order,
# so that sorting by index (lexicographic for strings) has a visible effect.
obj = pd.Series(range(4), index=list('dabc'))
obj

d    0
a    1
b    2
c    3
dtype: int64

In [4]:
# Display the Series reordered by its index labels (here: a, b, c, d)
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [5]:
# Build a 2x4 DataFrame whose row labels and column labels are both out of order
frame = pd.DataFrame(
    np.arange(8).reshape(2, 4),
    index=['three', 'one'],
    columns=list('dabc'),
)
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [6]:
# Reorder the rows by their index labels ('one' comes before 'three')
frame.sort_index(axis='index')

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [7]:
# Reorder the columns by their labels (a, b, c, d); the rows keep their order
frame.sort_index(axis='columns')

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [8]:
# Reorder the columns by label in descending order (d, c, b, a)
frame.sort_index(axis='columns', ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [9]:
# A Series with the default integer index; the next cell sorts it by value
obj = pd.Series(data=[4, 7, -3, 2])
obj

0    4
1    7
2   -3
3    2
dtype: int64

In [10]:
# Order the entries by value, smallest first; each value keeps its original index label
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [11]:
# A Series containing missing values (NaN); when it is sorted by value
# in the next cell, the NaN entries end up at the end.
obj = pd.Series(data=[4, np.nan, 7, np.nan, -3, 2])
obj

0    4.0
1    NaN
2    7.0
3    NaN
4   -3.0
5    2.0
dtype: float64

In [12]:
# The non-missing values are sorted in ascending order; both NaN entries are placed last
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [13]:
# A small frame used to demonstrate sorting rows by one or several columns
frame = pd.DataFrame(dict(b=[4, 7, -3, 2], a=[0, 1, 0, 1]))
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [14]:
# Order the rows by the values in column 'b' (ascending)
frame.sort_values('b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [15]:
# Order the rows by column 'a' first; rows with equal 'a' are then ordered by 'b'
frame.sort_values(['a', 'b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


# Rank

In [16]:
# Sample data with repeated values (two 7s and two 4s) to show how rank() treats ties
obj = pd.Series(data=[7, -5, 7, 4, 2, 0, 4])
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [26]:
# rank() assigns each entry the position it would occupy after an ascending sort (starting at 1).
# Tied values share the average of the positions they span, which is why the ranks are
# floats rather than integers.
# In this example, obj[1] (-5) is the smallest, so its rank is 1.0.
# obj[0] and obj[2] (both 7) are the largest and would occupy the 6th and 7th positions,
# so each gets the average 6.5 = (6 + 7) / 2.
# Likewise, obj[3] and obj[6] (both 4) share the 4th and 5th positions and get 4.5.
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [22]:
# Collect the values in a DataFrame so that the different rankings can be added as
# columns and compared side by side. The frame is built directly from the Series
# (rather than creating an empty DataFrame and assigning a column afterwards), which
# keeps obj's index and produces the single 'Value' column in one step.
df = pd.DataFrame({'Value': obj})
df

Unnamed: 0,Value
0,7
1,-5
2,7
3,4
4,2
5,0
6,4


In [24]:
# Add the default ranking (tied values receive the average of their positions) next to the values
df['rank'] = obj.rank()
df

Unnamed: 0,Value,rank
0,7,6.5
1,-5,1.0
2,7,6.5
3,4,4.5
4,2,3.0
5,0,2.0
6,4,4.5


In [27]:
# If we don't want the average, we can use method='first'.
# Ties are then broken by order of appearance: for the value 7, obj[0] appears before obj[2],
# so obj[0] gets the lower rank (6.0) and obj[2] gets 7.0.
df['rank_first'] = obj.rank(method='first')
df

Unnamed: 0,Value,rank,rank_first
0,7,6.5,6.0
1,-5,1.0,1.0
2,7,6.5,7.0
3,4,4.5,4.0
4,2,3.0,3.0
5,0,2.0,2.0
6,4,4.5,5.0


In [29]:
# Instead of the average, every member of a tie can be given the highest position in its group (method='max').
# Here the two 7s occupy the 6th and 7th positions, so both get 7.0; the two 4s both get 5.0.
df['rank_max'] = obj.rank(method='max')
df

Unnamed: 0,Value,rank,rank_first,rank_max
0,7,6.5,6.0,7.0
1,-5,1.0,1.0,1.0
2,7,6.5,7.0,7.0
3,4,4.5,4.0,5.0
4,2,3.0,3.0,3.0
5,0,2.0,2.0,2.0
6,4,4.5,5.0,5.0


In [31]:
# Rank in descending order: the largest value gets the lowest rank.
# Ties are still averaged, so the two 7s share 1.5 = (1 + 2) / 2, and -5 (the smallest) gets 7.0.
df['rank_descending'] = obj.rank(ascending=False)
df

Unnamed: 0,Value,rank,rank_first,rank_max,rank_descending
0,7,6.5,6.0,7.0,1.5
1,-5,1.0,1.0,1.0,7.0
2,7,6.5,7.0,7.0,1.5
3,4,4.5,4.0,5.0,3.5
4,2,3.0,3.0,3.0,5.0
5,0,2.0,2.0,2.0,6.0
6,4,4.5,5.0,5.0,3.5


In [34]:
# A frame with three numeric columns, used to demonstrate rank() on a DataFrame
frame = pd.DataFrame(
    dict(
        b=[4.3, 7, -3, 2],
        a=[0, 1, 0, 1],
        c=[-2, 5, 8, -2.5],
    )
)
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [38]:
# Put the columns in alphabetical order (a, b, c) before ranking
frame = frame.sort_index(axis='columns')
frame

Unnamed: 0,a,b,c
0,0,4.3,-2.0
1,1,7.0,5.0
2,0,-3.0,8.0
3,1,2.0,-2.5


In [39]:
# Rank the values within each column (comparing down the rows); ties are averaged,
# e.g. the two 0s in column 'a' both get 1.5
frame.rank(axis='index')

Unnamed: 0,a,b,c
0,1.5,3.0,2.0
1,3.5,4.0,3.0
2,1.5,1.0,4.0
3,3.5,2.0,1.0


In [40]:
# Rank the values within each row (comparing across the columns a, b, c)
frame.rank(axis='columns')

Unnamed: 0,a,b,c
0,2.0,3.0,1.0
1,1.0,3.0,2.0
2,2.0,1.0,3.0
3,2.0,3.0,1.0
