In [88]:
import pandas as pd
import numpy as np
%matplotlib inline

## Function application and mapping

In [2]:
# Build a 4x3 frame of standard-normal draws to experiment on.
# A dedicated, seeded RandomState makes the values identical on every
# Restart & Run All and leaves NumPy's global random state untouched.
# (The outputs stored in this notebook came from an unseeded run, so the
# numbers shown will differ until the notebook is re-executed.)
a = pd.DataFrame(np.random.RandomState(0).randn(4, 3))
a

Unnamed: 0,0,1,2
0,-0.719675,-0.662708,1.343557
1,-0.729003,-1.265131,-0.831226
2,-0.183765,-1.398486,1.158494
3,-1.074673,-1.717486,-0.536596


In [3]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` only needs ``max()`` and ``min()`` methods. When used with
    ``DataFrame.apply`` it receives one column at a time (or one row
    with ``axis=1``).
    """
    return x.max() - x.min()
a.apply(f)

0    0.890908
1    1.054778
2    2.174783
dtype: float64

In [4]:
a.apply(f, axis=1)

0    2.063232
1    0.536128
2    2.556981
3    1.180890
dtype: float64

In [5]:
def f(x):
    """Summarise ``x`` as a two-element Series labelled 'min' and 'max'."""
    lowest, highest = x.min(), x.max()
    labels = ['min', 'max']
    return pd.Series([lowest, highest], index=labels)
a.apply(f)

Unnamed: 0,0,1,2
min,-1.074673,-1.717486,-0.831226
max,-0.183765,-0.662708,1.343557


In [6]:
# Format every cell as text. Series.map is applied column by column instead of
# DataFrame.applymap, which newer pandas releases deprecate in favour of
# DataFrame.map; the per-column form runs unchanged on old and new versions.
a.apply(lambda column: column.map(lambda x: 'oi %f' % x))

Unnamed: 0,0,1,2
0,oi -0.719675,oi -0.662708,oi 1.343557
1,oi -0.729003,oi -1.265131,oi -0.831226
2,oi -0.183765,oi -1.398486,oi 1.158494
3,oi -1.074673,oi -1.717486,oi -0.536596


## Sorting and Ranking

In [7]:
a = pd.Series([1,2,3], index=['b', 'a', 'c'])
a.sort_index()

a    2
b    1
c    3
dtype: int64

In [9]:
a = pd.Series([1,7,3], index=['b', 'a', 'c'])
a.sort_values()

b    1
c    3
a    7
dtype: int64

In [10]:
a.sort_values(ascending=False)

a    7
c    3
b    1
dtype: int64

In [15]:
# A series containing a missing value. np.nan is the canonical spelling;
# the np.NaN alias no longer exists in NumPy 2.x.
a = pd.Series([1, 2, 3, np.nan], index=['b', 'a', 'c', 'd'])
# By default sort_values() places the NaN entry last.
a.sort_values()

b    1.0
a    2.0
c    3.0
d    NaN
dtype: float64

In [16]:
a.sort_values(na_position='first')

d    NaN
b    1.0
a    2.0
c    3.0
dtype: float64

In [18]:
frame = pd.DataFrame({'b':[4,7,-3,2], 'a':[0,1,0,1]})
frame

Unnamed: 0,a,b
0,0,4
1,1,7
2,0,-3
3,1,2


In [20]:
frame.sort_values(by='b')

Unnamed: 0,a,b
2,0,-3
3,1,2
0,0,4
1,1,7


In [22]:
frame.sort_values(by=['a', 'b'])

Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


### Ranking

In [34]:
a = pd.Series([7, -5, 7, 4, 2, 0, 4, 4])
# Column 0 holds the raw values and column 1 their rank; ordering by rank
# shows that, by default, tied values share the mean of the ranks they span.
ranked = pd.concat([a, a.rank()], axis=1)
ranked.sort_values(by=1)

Unnamed: 0,0,1
1,-5,1.0
5,0,2.0
4,2,3.0
3,4,5.0
6,4,5.0
7,4,5.0
0,7,7.5
2,7,7.5


In [35]:
a = pd.Series([1,2])
a.rank()

0    1.0
1    2.0
dtype: float64

In [47]:
a = pd.Series([0,1,1,2,3,3])

In [48]:
a.rank(method='first')

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    6.0
dtype: float64

In [49]:
a.rank(method='max')

0    1.0
1    3.0
2    3.0
3    4.0
4    6.0
5    6.0
dtype: float64

In [50]:
a.rank(method='min')

0    1.0
1    2.0
2    2.0
3    4.0
4    5.0
5    5.0
dtype: float64

In [52]:
a.rank(method='dense')
# dense: like 'min', but rank always increases by 1 between groups

0    1.0
1    2.0
2    2.0
3    3.0
4    4.0
5    4.0
dtype: float64

In [53]:
a.rank(method='average') # ties share the mean of the positions they would hold once sorted:
# the two 1s would sit at positions 2 and 3, so each is ranked (2 + 3) / 2 = 2.5
# Open question (author's note): understood, but what is this useful for?

0    1.0
1    2.5
2    2.5
3    4.0
4    5.5
5    5.5
dtype: float64

## Index with duplicate values

In [54]:
a = pd.Series(range(3), index=['a', 'b', 'a'])
a.index.is_unique

False

In [55]:
a['a']

a    0
a    2
dtype: int64

In [60]:
a.loc[a.index.drop_duplicates()]

a    0
a    2
b    1
dtype: int64

In [61]:
a.loc[a.index.drop_duplicates(keep='last')]

b    1
a    0
a    2
dtype: int64

## Summarizing and computing descriptive statistics

In [62]:
df = pd.DataFrame(np.arange(12).reshape((3,4)))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [63]:
df.sum()

0    12
1    15
2    18
3    21
dtype: int64

In [64]:
df.sum(axis=1)

0     6
1    22
2    38
dtype: int64

In [66]:
# Introduce a missing value at row 0, column 0.
# Column 0 is cast to float first because an integer column cannot hold NaN.
# A single .loc assignment replaces the chained form df[0][0] = ..., which may
# write to a temporary copy and leave df unchanged. np.nan is used because the
# np.NaN alias no longer exists in NumPy 2.x.
df[0] = df[0].astype(float)
df.loc[0, 0] = np.nan
df

Unnamed: 0,0,1,2,3
0,,1,2,3
1,4.0,5,6,7
2,8.0,9,10,11


In [67]:
df.sum() # NaN entries are skipped by default (compare with skipna=False)

0    12.0
1    15.0
2    18.0
3    21.0
dtype: float64

In [68]:
df.sum(skipna=False)

0     NaN
1    15.0
2    18.0
3    21.0
dtype: float64

In [69]:
df.idxmax()

0    2
1    2
2    2
3    2
dtype: int64

In [70]:
df.idxmin()

0    1
1    0
2    0
3    0
dtype: int64

In [71]:
df.cumsum()

Unnamed: 0,0,1,2,3
0,,1.0,2.0,3.0
1,4.0,6.0,8.0,10.0
2,12.0,15.0,18.0,21.0


In [72]:
df.cumsum(skipna=False)

Unnamed: 0,0,1,2,3
0,,1.0,2.0,3.0
1,,6.0,8.0,10.0
2,,15.0,18.0,21.0


In [74]:
df.fillna(0).describe()

Unnamed: 0,0,1,2,3
count,3.0,3.0,3.0,3.0
mean,4.0,5.0,6.0,7.0
std,4.0,4.0,4.0,4.0
min,0.0,1.0,2.0,3.0
25%,2.0,3.0,4.0,5.0
50%,4.0,5.0,6.0,7.0
75%,6.0,7.0,8.0,9.0
max,8.0,9.0,10.0,11.0


In [75]:
a = pd.Series(['a', 'b', 'a']) # describe non numeric
a.describe()

count     3
unique    2
top       a
freq      2
dtype: object

In [76]:
df.count()

0    2
1    3
2    3
3    3
dtype: int64

In [77]:
df.min()

0    4.0
1    1.0
2    2.0
3    3.0
dtype: float64

In [78]:
df.max()

0     8.0
1     9.0
2    10.0
3    11.0
dtype: float64

In [80]:
df.fillna(0).quantile()

0    4.0
1    5.0
2    6.0
3    7.0
dtype: float64

In [81]:
df.mean()

0    6.0
1    5.0
2    6.0
3    7.0
dtype: float64

In [82]:
df.median()

0    6.0
1    5.0
2    6.0
3    7.0
dtype: float64

In [84]:
# Mean absolute deviation of each column: the average distance of the values
# from their column mean (NaN entries are skipped). Written out explicitly
# because DataFrame.mad() was removed in pandas 2.0.
(df - df.mean()).abs().mean()

0    2.000000
1    2.666667
2    2.666667
3    2.666667
dtype: float64

In [86]:
df.var() # Return unbiased variance over requested axis.

0     8.0
1    16.0
2    16.0
3    16.0
dtype: float64

In [101]:
df.std() # Return sample standard deviation over requested axis.

0    2.828427
1    4.000000
2    4.000000
3    4.000000
dtype: float64

In [104]:
df.skew()
# Return unbiased skew over requested axis
# Normalized by N-1

0    NaN
1    0.0
2    0.0
3    0.0
dtype: float64

In [105]:
df.kurt() 
# Return unbiased kurtosis over requested axis using Fisher's definition of
# kurtosis (kurtosis of normal == 0.0). Normalized by N-1

0   NaN
1   NaN
2   NaN
3   NaN
dtype: float64

In [106]:
df.cumsum()

Unnamed: 0,0,1,2,3
0,,1.0,2.0,3.0
1,4.0,6.0,8.0,10.0
2,12.0,15.0,18.0,21.0


In [107]:
df.cummin()

Unnamed: 0,0,1,2,3
0,,1.0,2.0,3.0
1,4.0,1.0,2.0,3.0
2,4.0,1.0,2.0,3.0


In [108]:
df.cummax()

Unnamed: 0,0,1,2,3
0,,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [109]:
df.cumprod()

Unnamed: 0,0,1,2,3
0,,1.0,2.0,3.0
1,4.0,5.0,12.0,21.0
2,32.0,45.0,120.0,231.0


In [111]:
df

Unnamed: 0,0,1,2,3
0,,1,2,3
1,4.0,5,6,7
2,8.0,9,10,11


In [114]:
df.diff(periods=1) # 1st discrete difference of object # cool :)

Unnamed: 0,0,1,2,3
0,,,,
1,,4.0,4.0,4.0
2,4.0,4.0,4.0,4.0


In [115]:
df.pct_change()

Unnamed: 0,0,1,2,3
0,,,,
1,,4.0,2.0,1.333333
2,1.0,0.8,0.666667,0.571429


In [129]:
# Recompute pct_change() by hand: (current row - previous row) / previous row.
rr = df.iloc[:-1]        # every row except the last, i.e. the "previous" rows
rr.index = df.index[1:]  # relabel so each previous row aligns with its successor,
                         # whatever the length of df (was hard-coded to [1, 2])
df.diff().div(rr)

Unnamed: 0,0,1,2,3
0,,,,
1,,4.0,2.0,1.333333
2,1.0,0.8,0.666667,0.571429


In [135]:
# Two frames with the same columns but a different number of rows.
a = pd.DataFrame(np.arange(12).reshape(4, 3))
b = pd.DataFrame(np.arange(10, 19).reshape(3, 3))

# Division aligns on row and column labels; row 3 exists only in `a`,
# so it has no divisor.
a / b

Unnamed: 0,0,1,2
0,0.0,0.090909,0.166667
1,0.230769,0.285714,0.333333
2,0.375,0.411765,0.444444
3,,,


## Correlation and Covariance

In [136]:
a = pd.DataFrame({'idade':[10, 20, 40], 'salario':[1, 3000, 8000]})

In [137]:
a.cov()

Unnamed: 0,idade,salario
idade,233.333333,61660.0
salario,61660.0,16329667.0


In [138]:
a.corr()

Unnamed: 0,idade,salario
idade,1.0,0.99891
salario,0.99891,1.0


In [139]:
a.idade.corr(a.salario)

0.99891023606595264

In [140]:
a.idade.cov(a.salario)

61660.0

## Unique values, value counts, and membership

In [141]:
a.idade.unique() # unique values

array([10, 20, 40])

In [142]:
a = pd.Series([1,2,2,3,1,5])
a.value_counts() # ordered by count, descending -> pass sort=False to skip the sorting

2    2
1    2
5    1
3    1
dtype: int64

In [143]:
a.isin([1,2])

0     True
1     True
2     True
3    False
4     True
5    False
dtype: bool

In [144]:
a[a.isin([1,2])]

0    1
1    2
2    2
4    1
dtype: int64

In [145]:
data = pd.DataFrame({'a': [1,2,3], 'b': [1,1,2], 'c': [1,3,3]})
# For each column, count how many times each value occurs; a value that never
# appears in a column comes out as NaN after alignment and is filled with 0.
# pd.Series.value_counts is used because the top-level pd.value_counts
# function is deprecated in newer pandas releases.
counts = data.apply(pd.Series.value_counts).fillna(0)
counts

Unnamed: 0,a,b,c
1,1,2.0,1.0
2,1,1.0,0.0
3,1,0.0,2.0
