In [1]:
import pandas as pd
import numpy as np

## Function application and mapping

In [4]:
# Seed NumPy's global RNG so this random frame (and every result derived
# from it in the following cells) is the same on each Restart & Run All.
np.random.seed(42)
a = pd.DataFrame(np.random.randn(4, 3))
a

Unnamed: 0,0,1,2
0,-0.799335,-1.901701,0.983335
1,-0.473008,-1.423478,0.713562
2,1.287171,2.194731,0.391347
3,0.801501,0.844654,-0.777218


In [5]:
def f(x):
    """Return the spread of ``x``: its largest value minus its smallest."""
    return x.max() - x.min()


# With the default axis, apply() passes each column to f (one result per column).
a.apply(f)

0    2.086506
1    4.096431
2    1.760554
dtype: float64

In [6]:
# axis=1 evaluates f across each row instead (one result per row).
a.apply(f, axis=1)

0    2.885036
1    2.137040
2    1.803384
3    1.621873
dtype: float64

In [7]:
def f(x):
    """Summarise ``x`` as a two-entry Series labelled 'min' and 'max'."""
    lowest, highest = x.min(), x.max()
    return pd.Series([lowest, highest], index=['min', 'max'])


# f returns a Series, so apply() assembles the per-column results into a DataFrame.
a.apply(f)

Unnamed: 0,0,1,2
min,-0.799335,-1.901701,-0.777218
max,1.287171,2.194731,0.983335


\# Building a custom `describe`: see the cell above (↑)

In [8]:
def f(x):
    """Format one number as the string 'oi <value>' using %f formatting."""
    return 'oi %f' % x


# Element-wise formatting of every cell. DataFrame.applymap is deprecated
# (pandas 2.1 renamed it to DataFrame.map); mapping each column with
# Series.map produces the same frame and works across pandas versions.
a.apply(lambda column: column.map(f))

Unnamed: 0,0,1,2
0,oi -0.799335,oi -1.901701,oi 0.983335
1,oi -0.473008,oi -1.423478,oi 0.713562
2,oi 1.287171,oi 2.194731,oi 0.391347
3,oi 0.801501,oi 0.844654,oi -0.777218


## Sorting and Ranking

In [9]:
# The labels are deliberately out of alphabetical order.
a = pd.Series([1, 2, 3], index=list('bac'))
# Sort by index label; every value travels with its label.
a.sort_index()

a    2
b    1
c    3
dtype: int64

In [10]:
a = pd.Series([1, 7, 3], index=['b', 'a', 'c'])
# Sort by value (ascending). Series.order() was removed from pandas (0.20);
# sort_values() is its replacement.
a.sort_values()

b    1
c    3
a    7
dtype: int64

In [11]:
# Descending sort by value; sort_values() replaces the removed Series.order().
a.sort_values(ascending=False)

a    7
c    3
b    1
dtype: int64

In [13]:
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
a = pd.Series([1, 2, 3, np.nan], index=['b', 'a', 'c', 'd'])  # with NaN
# sort_values() (successor of the removed Series.order) puts NaN last by default.
a.sort_values()

b     1
a     2
c     3
d   NaN
dtype: float64

## Ranking

In [30]:
a = pd.Series([7, -5, 7, 4, 2, 0, 4, 4])
# Column 0 holds the values, column 1 their ranks (tied values share the
# average rank). DataFrame.sort() was removed from pandas (0.20);
# sort_values(by=...) is its replacement.
pd.concat([a, a.rank()], axis=1).sort_values(by=1)

Unnamed: 0,0,1
1,-5,1.0
5,0,2.0
4,2,3.0
3,4,5.0
6,4,5.0
7,4,5.0
0,7,7.5
2,7,7.5


In [31]:
a = pd.Series(data=[1, 2])
# With no ties, each value's rank is its 1-based position in sorted order.
a.rank()

0    1
1    2
dtype: float64

In [33]:
a = pd.Series(data=[1, 2, 2])
# The two tied 2s share the average of the positions they occupy: (2 + 3) / 2 = 2.5.
a.rank()

0    1.0
1    2.5
2    2.5
dtype: float64

In [34]:
# method='first' breaks ties by order of appearance: the tied 2s get ranks 2 and 3.
a.rank(method='first')

0    1
1    2
2    3
dtype: float64

In [37]:
a.rank(method='average')  # sorted positions would be 1, 2, 3, but the values are 1, 2, 2,
# so the tied entries share the mean of positions 2 and 3: (2 + 3) / 2 = 2.5.
# Understood -- but what is this useful for?

0    1.0
1    2.5
2    2.5
dtype: float64

## Index with duplicate values

In [39]:
# The label 'a' is used twice on purpose.
a = pd.Series(range(3), index=list('aba'))
# is_unique reports whether every index label occurs exactly once.
a.index.is_unique

False

In [40]:
# Selecting a duplicated label returns a Series with every matching entry.
a['a']

a    0
a    2
dtype: int64

In [41]:
# Index labels with the repeated 'a' collapsed to a single entry.
a.index.drop_duplicates()

Index([u'a', u'b'], dtype='object')

## Summarizing and computing descriptive statistics

In [44]:
# 3x4 frame holding the integers 0..11, filled row by row.
df = pd.DataFrame(np.arange(12).reshape(3, 4))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [45]:
# Column totals (one value per column).
df.sum()

0    12
1    15
2    18
3    21
dtype: int64

In [46]:
# Row totals (one value per row).
df.sum(axis=1)

0     6
1    22
2    38
dtype: int64

In [51]:
# Put a missing value in the first row of column 0.
# Chained indexing (df[0][0] = ...) may write to a temporary copy and never
# reach df (it never does under Copy-on-Write), so assign through .loc.
# The column is cast to float first because an integer column cannot hold NaN.
# np.nan replaces the np.NaN alias that NumPy 2.0 removed.
df[0] = df[0].astype(float)
df.loc[0, 0] = np.nan

In [52]:
# Column 0 now shows float values with NaN in its first row.
df

Unnamed: 0,0,1,2,3
0,,1,2,3
1,4.0,5,6,7
2,8.0,9,10,11


In [53]:
df.sum()  # DataFrame reductions skip NaN by default

0    12
1    15
2    18
3    21
dtype: float64

In [54]:
# skipna=False lets the NaN propagate into the column-0 total.
df.sum(skipna=False)

0   NaN
1    15
2    18
3    21
dtype: float64

\# TODO: practice using the `level` option

In [56]:
# Index label at which each column reaches its maximum.
df.idxmax()

0    2
1    2
2    2
3    2
dtype: int64

In [57]:
# Index label at which each column reaches its minimum; the NaN in column 0
# is skipped, so that column reports row 1.
df.idxmin()

0    1
1    0
2    0
3    0
dtype: int64

In [60]:
# Running column totals; the NaN position stays NaN but later rows still accumulate.
df.cumsum()

Unnamed: 0,0,1,2,3
0,,1,2,3
1,4.0,6,8,10
2,12.0,15,18,21


In [61]:
# With skipna=False every entry from the NaN onwards is NaN.
df.cumsum(skipna=False)

Unnamed: 0,0,1,2,3
0,,1,2,3
1,,6,8,10
2,,15,18,21


In [62]:
# Summary statistics per column; count excludes the NaN in column 0.
df.describe()

Unnamed: 0,0,1,2,3
count,2.0,3,3,3
mean,6.0,5,6,7
std,2.828427,4,4,4
min,4.0,1,2,3
25%,5.0,3,4,5
50%,6.0,5,6,7
75%,7.0,7,8,9
max,8.0,9,10,11


In [64]:
a = pd.Series(data=['a', 'b', 'a'])
# On non-numeric data describe() reports count / unique / top / freq instead.
a.describe()

count     3
unique    2
top       a
freq      2
dtype: object

In [65]:
# Number of non-NaN entries per column.
df.count()

0    2
1    3
2    3
3    3
dtype: int64

In [66]:
# Column minima (NaN skipped).
df.min()

0    4
1    1
2    2
3    3
dtype: float64

In [67]:
# Column maxima.
df.max()

0     8
1     9
2    10
3    11
dtype: float64

In [72]:
# Called without arguments, quantile() matches the 50% row of describe().
df.quantile()

0    6
1    5
2    6
3    7
dtype: float64

In [73]:
# Column means (NaN skipped).
df.mean()

0    6
1    5
2    6
3    7
dtype: float64

In [74]:
# Column medians (NaN skipped).
df.median()

0    6
1    5
2    6
3    7
dtype: float64

In [75]:
# Mean absolute deviation around each column's mean. DataFrame.mad() was
# removed in pandas 2.0, so compute it explicitly; NaN is skipped by both
# mean() calls, matching mad()'s default behaviour.
(df - df.mean()).abs().mean()

0    2.000000
1    2.666667
2    2.666667
3    2.666667
dtype: float64

In [76]:
# Sample variance per column (n - 1 denominator).
df.var()

0     8
1    16
2    16
3    16
dtype: float64

In [77]:
# Sample standard deviation per column (square root of the variance).
df.std()

0    2.828427
1    4.000000
2    4.000000
3    4.000000
dtype: float64

In [78]:
# Skewness per column. Column 0 (only two non-NaN values) yields NaN; the
# ~1e-15 results for the other columns are floating-point noise around 0.
df.skew()

0             NaN
1    1.998401e-15
2    3.996803e-15
3    3.996803e-15
dtype: float64

In [79]:
# Kurtosis per column; every column comes out NaN for this 3-row frame.
df.kurt()

0   NaN
1   NaN
2   NaN
3   NaN
dtype: float64

In [80]:
# Cumulative sum down each column.
df.cumsum()

Unnamed: 0,0,1,2,3
0,,1,2,3
1,4.0,6,8,10
2,12.0,15,18,21


In [81]:
# Running minimum down each column.
df.cummin()

Unnamed: 0,0,1,2,3
0,,1,2,3
1,4.0,1,2,3
2,4.0,1,2,3


In [82]:
# Running maximum down each column.
df.cummax()

Unnamed: 0,0,1,2,3
0,,1,2,3
1,4.0,5,6,7
2,8.0,9,10,11


In [83]:
# Running product down each column.
df.cumprod()

Unnamed: 0,0,1,2,3
0,,1,2,3
1,4.0,5,12,21
2,32.0,45,120,231


In [84]:
# Difference from the previous row; the first row has no predecessor, so it is NaN.
df.diff()

Unnamed: 0,0,1,2,3
0,,,,
1,,4.0,4.0,4.0
2,4.0,4.0,4.0,4.0


In [85]:
# Fractional change from the previous row: (current - previous) / previous.
df.pct_change()

Unnamed: 0,0,1,2,3
0,,,,
1,,4.0,2.0,1.333333
2,1.0,0.8,0.666667,0.571429


## Correlation and Covariance

In [86]:
# Toy data set with two numeric columns, 'idade' and 'salario'.
a = pd.DataFrame({
    'idade': [10, 20, 40],
    'salario': [1, 3000, 8000],
})

In [87]:
# Pairwise covariance matrix of the columns.
a.cov()

Unnamed: 0,idade,salario
idade,233.333333,61660
salario,61660.0,16329667


In [88]:
# Pairwise correlation matrix of the columns.
a.corr()

Unnamed: 0,idade,salario
idade,1.0,0.99891
salario,0.99891,1.0


In [89]:
# Correlation between two individual columns; equals the off-diagonal entry of a.corr().
a.idade.corr(a.salario)

0.99891023606595264

In [90]:
# Covariance between two individual columns; equals the off-diagonal entry of a.cov().
a.idade.cov(a.salario)

61660.0

## Unique values, value counts, and membership

In [93]:
a.idade.unique()  # distinct values of the column, returned as an array

array([10, 20, 40], dtype=int64)

In [94]:
a = pd.Series(data=[1, 2, 2, 3, 1, 5])
# Result is ordered by count, descending; pass sort=False to skip the sorting.
a.value_counts()

2    2
1    2
5    1
3    1
dtype: int64

In [96]:
# Boolean mask: True where the value is one of the listed candidates.
a.isin([1,2])

0     True
1     True
2     True
3    False
4     True
5    False
dtype: bool

In [97]:
# Use the membership mask to keep only the matching entries.
a[a.isin([1,2])]

0    1
1    2
2    2
4    1
dtype: int64

In [99]:
data = pd.DataFrame({'a': [1, 2, 3], 'b': [1, 1, 2], 'c': [1, 3, 3]})
# For every column, count how many times each value occurs. Values a column
# never contains come back as NaN, hence fillna(0).
# The top-level pd.value_counts is deprecated (pandas 2.1); the Series method
# produces the same counts.
data.apply(pd.Series.value_counts).fillna(0)

Unnamed: 0,a,b,c
1,1,2,1
2,1,1,0
3,1,0,2
