# Sorting and Ranking

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Five integers labelled by colour name; the labels are deliberately not in alphabetical order.
ser = pd.Series(data=[5, 0, 3, 8, 4],
                index='red blue yellow white green'.split())

In [3]:
ser

red       5
blue      0
yellow    3
white     8
green     4
dtype: int64

In [4]:
ser.sort_index()  # sort by index label, ascending (the default)

blue      0
green     4
red       5
white     8
yellow    3
dtype: int64

In [6]:
ser.sort_index(ascending=False)  # sort by index label, descending

yellow    3
white     8
red       5
green     4
blue      0
dtype: int64

In [7]:
# 4x4 table holding the integers 0..15, filled row by row.
frame = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index='red blue yellow white'.split(),
    columns='ball pen pencil paper'.split(),
)

In [8]:
frame

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [9]:
# With no arguments, the rows are reordered by their index labels.
frame.sort_index()

Unnamed: 0,ball,pen,pencil,paper
blue,4,5,6,7
red,0,1,2,3
white,12,13,14,15
yellow,8,9,10,11


In [10]:
# axis=1 reorders the columns by their labels; the row order is left as it was.
frame.sort_index(axis=1)

Unnamed: 0,ball,paper,pen,pencil
red,0,3,1,2
blue,4,7,5,6
yellow,8,11,9,10
white,12,15,13,14


In [11]:
ser

red       5
blue      0
yellow    3
white     8
green     4
dtype: int64

In [14]:
# Order the elements by value rather than by label (ascending by default).
ser.sort_values()

blue      0
yellow    3
green     4
red       5
white     8
dtype: int64

In [15]:
frame

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [16]:
# DataFrame.sort_index() no longer accepts `by=` (the keyword was deprecated
# and then removed in pandas 1.0, where it raises TypeError). Sorting rows by
# the values of a column is done with sort_values().
frame.sort_values(by='pen')

  """Entry point for launching an IPython kernel.


Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [17]:
# Order the rows by the values found in the 'pen' column.
frame.sort_values(by='pen')

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


Ranking is an operation closely related to sorting. It consists of assigning a rank (that is, a value that starts at 1 and then increases gradually) to each element of the series. By default, ranks are assigned starting from the lowest value and moving up to the highest value.

In [18]:
ser

red       5
blue      0
yellow    3
white     8
green     4
dtype: int64

In [19]:
# Default ranking: the smallest value gets rank 1.0; ranks are returned as floats.
ser.rank()

red       4.0
blue      1.0
yellow    2.0
white     5.0
green     3.0
dtype: float64

In [20]:
# method='first' ranks tied values in the order in which they appear.
# `ser` has no repeated values, so the result equals the default ranking.
ser.rank(method='first')

red       4.0
blue      1.0
yellow    2.0
white     5.0
green     3.0
dtype: float64

In [21]:
# ascending=False reverses the ranking: the largest value gets rank 1.0.
ser.rank(ascending=False)

red       2.0
blue      5.0
yellow    4.0
white     1.0
green     3.0
dtype: float64

---

# Correlation and Covariance

In [22]:
# Two integer series sharing the same year labels ('2006'..'2013', stored as strings).
seq = pd.Series(data=[1, 2, 3, 4, 4, 3, 2, 1],
                index=[str(year) for year in range(2006, 2014)])
seq2 = pd.Series(data=[3, 4, 3, 4, 5, 4, 3, 2],
                 index=[str(year) for year in range(2006, 2014)])

In [23]:
seq

2006    1
2007    2
2008    3
2009    4
2010    4
2011    3
2012    2
2013    1
dtype: int64

In [24]:
seq2

2006    3
2007    4
2008    3
2009    4
2010    5
2011    4
2012    3
2013    2
dtype: int64

In [25]:
seq.corr(seq2)  # correlation between two Series: a single scalar

0.7745966692414835

In [26]:
seq.cov(seq2)  # covariance between two Series: a single scalar

0.8571428571428571

In [27]:
frame

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [29]:
# Small hand-written table with the same row and column labels as `frame`.
frame2 = pd.DataFrame(
    data=[
        [1, 4, 3, 6],
        [4, 5, 6, 1],
        [3, 3, 1, 5],
        [4, 1, 6, 4],
    ],
    index='red blue yellow white'.split(),
    columns='ball pen pencil paper'.split(),
)

In [30]:
# Pairwise correlation between the columns of frame2, returned as a square table.
frame2.corr()

Unnamed: 0,ball,pen,pencil,paper
ball,1.0,-0.276026,0.57735,-0.763763
pen,-0.276026,1.0,-0.079682,-0.361403
pencil,0.57735,-0.079682,1.0,-0.692935
paper,-0.763763,-0.361403,-0.692935,1.0


In [31]:
# Pairwise covariance between the columns of frame2, returned as a square table.
frame2.cov()

Unnamed: 0,ball,pen,pencil,paper
ball,2.0,-0.666667,2.0,-2.333333
pen,-0.666667,2.916667,-0.333333,-1.333333
pencil,2.0,-0.333333,6.0,-3.666667
paper,-2.333333,-1.333333,-3.666667,4.666667


In [32]:
frame2

Unnamed: 0,ball,pen,pencil,paper
red,1,4,3,6
blue,4,5,6,1
yellow,3,3,1,5
white,4,1,6,4


In [33]:
frame

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


Using the corrwith() method, you can calculate the pairwise correlations between the columns or rows of a DataFrame and a Series or another DataFrame.

In [34]:
# One correlation per column: each column of frame2 against the matching column of frame.
frame2.corrwith(frame)

ball      0.730297
pen      -0.831522
pencil    0.210819
paper    -0.119523
dtype: float64

In [35]:
# One correlation per column: each column of frame2 against the Series `ser`.
# NOTE(review): `ser` has a 'green' label that frame2 lacks; presumably it is
# dropped by index alignment - confirm against the corrwith documentation.
frame2.corrwith(ser)

ball     -0.140028
pen      -0.869657
pencil    0.080845
paper     0.595854
dtype: float64

In [36]:
ser

red       5
blue      0
yellow    3
white     8
green     4
dtype: int64

---

# "Not a Number" Data

## Assigning a NaN Value

In [37]:
# `np.nan` is the canonical spelling of the missing-value marker; the `np.NaN`
# alias was removed in NumPy 2.0 and raises AttributeError there.
# The presence of NaN makes the whole Series float64.
ser = pd.Series([0, 1, 2, np.nan, 9], index=['red', 'blue', 'yellow', 'white', 'green'])

In [38]:
ser

red       0.0
blue      1.0
yellow    2.0
white     NaN
green     9.0
dtype: float64

In [39]:
# Label-based lookup; the missing entry comes back as nan.
ser['white']

nan

In [40]:
# A missing value can also be assigned explicitly to an existing label.
ser['white'] = np.nan

In [41]:
ser

red       0.0
blue      1.0
yellow    2.0
white     NaN
green     9.0
dtype: float64

In [43]:
# Assigning None to an element of this float Series is stored as NaN,
# as the display of `ser` right after this cell shows.
ser['white'] = None

In [44]:
ser

red       0.0
blue      1.0
yellow    2.0
white     NaN
green     9.0
dtype: float64

## Filtering Out NaN Values

In [45]:
# Drop the entries whose value is NaN.
ser.dropna()

red       0.0
blue      1.0
yellow    2.0
green     9.0
dtype: float64

In [46]:
# Same filtering done by hand: keep the entries where the notnull() mask is True.
ser[ser.notnull()]

red       0.0
blue      1.0
yellow    2.0
green     9.0
dtype: float64

In [47]:
# 3x3 table with missing values: the 'mug' column and the 'green' row are entirely NaN.
frame3 = pd.DataFrame(
    data=[
        [6, np.nan, 6],
        [np.nan, np.nan, np.nan],
        [2, np.nan, 5],
    ],
    index='blue green red'.split(),
    columns='ball mug pen'.split(),
)

In [48]:
frame3

Unnamed: 0,ball,mug,pen
blue,6.0,,6.0
green,,,
red,2.0,,5.0


In [50]:
# True when at least one value in the 'mug' column is missing.
frame3['mug'].isnull().any()

True

In [51]:
# True only when no value in the 'mug' column is missing.
frame3['mug'].notnull().all()

False

In [52]:
# True only when every value in the 'mug' column is missing.
frame3['mug'].isnull().all()

True

In [53]:
# how='all' drops only the rows in which every value is NaN (here the 'green' row).
frame3.dropna(how='all')

Unnamed: 0,ball,mug,pen
blue,6.0,,6.0
red,2.0,,5.0


In [57]:
# With axis=1 the rule is applied to columns: the all-NaN 'mug' column is dropped.
frame3.dropna(how='all', axis=1)

Unnamed: 0,ball,pen
blue,6.0,6.0
green,,
red,2.0,5.0


## Filling in NaN Occurrences

In [58]:
frame3

Unnamed: 0,ball,mug,pen
blue,6.0,,6.0
green,,,
red,2.0,,5.0


In [60]:
# Replace every NaN with the same scalar value.
frame3.fillna(0)

Unnamed: 0,ball,mug,pen
blue,6.0,0.0,6.0
green,0.0,0.0,0.0
red,2.0,0.0,5.0


In [61]:
# A dict keyed by column label gives a separate fill value for each column.
frame3.fillna({'ball' : 1, 'mug' : 0, 'pen' : 11})

Unnamed: 0,ball,mug,pen
blue,6.0,0.0,6.0
green,1.0,0.0,11.0
red,2.0,0.0,5.0


In [63]:
# The per-column fill value may itself be a dict keyed by row label, which
# gives a different replacement per cell ('ball' and 'mug' here); 'pen' still
# uses a single scalar for the whole column.
frame3.fillna({'ball' : {'green':1}, 
               'mug' : {'blue':1, 'green':2, 'red':3},
               'pen' : 11})

Unnamed: 0,ball,mug,pen
blue,6.0,1.0,6.0
green,1.0,2.0,11.0
red,2.0,3.0,5.0


---

# Important Points

- sort_index():
    - on Series, will return Series
    - on DataFrame, can specify axis: 0 for index label, 1 for column label
- sort_values():
    - on Series, works normally
    - on DataFrame, specify which column to sort by (by='column_label')
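- rank(): assigns a rank to each element, starting at 1 for the lowest value
    - parameters: method (e.g. 'first'), ascending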
- corr(): Correlation, works for Series or DataFrame
    - Series syntax: object1.corr(object2)
    - DataFrame syntax: object.corr()
- cov(): Covariance, works for Series or DataFrame.
    - Series syntax: object1.cov(object2)
    - DataFrame syntax: object.cov()
- corrwith(): DataFrame to DataFrame, or DataFrame to Series
- dropna(): 
    - parameters: how ('all' or 'any'), thresh (minimum number of non-null values), axis
    - return DataFrame or Series without nulls (depending on the specification)
- fillna(): fill nan by values
    - parameters: value(applied to all nan value)
    - return DataFrame or Series filled by specified value
    - can be specified with dictionary