# Pandas Basics 2

In [8]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last one,
# so cells that contain several bare expressions render one output per expression.
InteractiveShell.ast_node_interactivity = "all"

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Seeded generator so the random frame is identical on every
# Restart Kernel -> Run All instead of changing with each execution.
SEED = 42
rng = np.random.default_rng(SEED)

# Three columns built from Series with different indexes: pandas aligns them on
# the union of the labels ('a'..'d') and fills the positions a column lacks with
# NaN ('one' has no 'd', 'three' has no 'a').
df = pd.DataFrame({'one': pd.Series(rng.standard_normal(3), index=['a', 'b', 'c']),
                   'two': pd.Series(rng.standard_normal(4), index=['a', 'b', 'c', 'd']),
                   'three': pd.Series(rng.standard_normal(3), index=['b', 'c', 'd'])})
df

Unnamed: 0,one,two,three
a,-0.315449,0.470975,
b,-0.993613,-0.78916,0.554457
c,-2.191778,-0.583505,1.533424
d,,0.33179,-0.473349


Series and DataFrame have the binary comparison methods `eq`, `ne`, `lt`, `gt`, `le`, and `ge`, whose behavior is vectorized: each compares element by element and returns a boolean object of the same shape.

In [4]:
# Element-wise "greater than 0" test; cells holding NaN compare as False.
df.gt(0)

Unnamed: 0,one,two,three
a,False,True,False
b,False,False,True
c,False,False,True
d,False,True,False


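> Note: `NaN != NaN`, and any ordering comparison involving NaN evaluates to `False`, which is why the missing cells (`a`/`three` and `d`/`one`) show `False` above.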

You can apply the reductions `empty`, `any()`, `all()`, and `bool()` to summarize a boolean result. (`empty` is a property rather than a method, and `bool()` is deprecated as of pandas 2.1.)

In [12]:
# Reduce the boolean mask column by column: any() is True when at least one
# entry of the column is positive, all() only when every entry is.
is_positive = df.gt(0)
is_positive.any()
is_positive.all()

one      False
two       True
three     True
dtype: bool

one      False
two      False
three    False
dtype: bool

### Comparisons

In [24]:
# Comparing against a scalar checks every element against that one value.
words = pd.Series(['foo', 'bar', 'baz'])
words == 'foo'

0     True
1    False
2    False
dtype: bool

In [25]:
# Comparing against an array-like of the same length is done position by position.
left_values = pd.Series(['foo', 'bar', 'baz'])
right_values = pd.Index(['foo', 'bar', 'qux'])
left_values == right_values

0     True
1     True
2    False
dtype: bool

**A note about NaN**

In [27]:
# Re-display the frame: note the NaN at a/three and d/one used in the examples below.
df

Unnamed: 0,one,two,three
a,-0.315449,0.470975,
b,-0.993613,-0.78916,0.554457
c,-2.191778,-0.583505,1.533424
d,,0.33179,-0.473349


In [28]:
# Two ways of doubling every value; NaN cells stay NaN in both results.
df + df
df * 2

Unnamed: 0,one,two,three
a,-0.630899,0.941951,
b,-1.987226,-1.578319,1.108915
c,-4.383556,-1.16701,3.066848
d,,0.663579,-0.946697


Unnamed: 0,one,two,three
a,-0.630899,0.941951,
b,-1.987226,-1.578319,1.108915
c,-4.383556,-1.16701,3.066848
d,,0.663579,-0.946697


In [29]:
# NaN is never equal to NaN, so the cells that are NaN in both frames compare as False
# even though the two frames hold the same data.
df + df == df * 2

Unnamed: 0,one,two,three
a,True,True,False
b,True,True,True
c,True,True,True
d,False,True,True


In [30]:
# The .equals() method considers NaNs in the same positions to be equal,
# so it returns a single True for the whole frame.
(df + df).equals(df * 2)

True

# Statistical Methods

In [34]:
# Per-column summary: count of non-NaN values, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,one,two,three
count,3.0,4.0,3.0
mean,-1.166947,-0.142475,0.538178
std,0.950098,0.636123,1.003485
min,-2.191778,-0.78916,-0.473349
25%,-1.592696,-0.634919,0.040554
50%,-0.993613,-0.125858,0.554457
75%,-0.654531,0.366586,1.043941
max,-0.315449,0.470975,1.533424


In [32]:
# Number of non-NaN values in each column.
df.count()

one      3
two      4
three    3
dtype: int64

In [31]:
# Arithmetic mean of each column; NaN entries are left out of the calculation.
df.mean()

one     -1.166947
two     -0.142475
three    0.538178
dtype: float64

In [35]:
# Sample standard deviation (n - 1 denominator) of each column, ignoring NaN.
df.std()

one      0.950098
two      0.636123
three    1.003485
dtype: float64

In [36]:
# Smallest non-NaN value in each column.
df.min()

one     -2.191778
two     -0.789160
three   -0.473349
dtype: float64

In [39]:
# 90th percentile of each column; the resulting Series is named after the quantile (0.9).
df.quantile(.9)

one     -0.451082
two      0.429220
three    1.337631
Name: 0.9, dtype: float64

In [37]:
# Largest non-NaN value in each column.
df.max()

one     -0.315449
two      0.470975
three    1.533424
dtype: float64

In [40]:
# Sum of each column; NaN entries are skipped.
df.sum()

one     -3.500841
two     -0.569900
three    1.614533
dtype: float64

In [48]:
# Running sum down each column; NaN cells stay NaN and do not reset the running total.
df.cumsum()

Unnamed: 0,one,two,three
a,-0.315449,0.470975,
b,-1.309063,-0.318184,0.554457
c,-3.500841,-0.901689,2.087881
d,,-0.5699,1.614533


In [49]:
# Running product down each column; NaN cells stay NaN in the result.
df.cumprod()

Unnamed: 0,one,two,three
a,-0.315449,0.470975,
b,0.313435,-0.371675,0.554457
c,-0.686979,0.216874,0.850218
d,,0.071957,-0.40245


In [50]:
# Running maximum down each column; NaN cells stay NaN in the result.
df.cummax()

Unnamed: 0,one,two,three
a,-0.315449,0.470975,
b,-0.315449,0.470975,0.554457
c,-0.315449,0.470975,1.533424
d,,0.470975,1.533424


In [51]:
# Running minimum down each column; NaN cells stay NaN in the result.
df.cummin()

Unnamed: 0,one,two,three
a,-0.315449,0.470975,
b,-0.993613,-0.78916,0.554457
c,-2.191778,-0.78916,0.554457
d,,-0.78916,-0.473349


In [41]:
# Median of each column, ignoring NaN; with an even number of values
# (column 'two') it is the average of the two middle values.
df.median()

one     -0.993613
two     -0.125858
three    0.554457
dtype: float64

In [42]:
# Most frequent value(s) of each column. Every value in this frame is unique, so
# each column lists all of its values in ascending order, padded with NaN.
df.mode()

Unnamed: 0,one,two,three
0,-2.191778,-0.78916,-0.473349
1,-0.993613,-0.583505,0.554457
2,-0.315449,0.33179,1.533424
3,,0.470975,


In [44]:
# Element-wise absolute value; NaN cells stay NaN.
df.abs()

Unnamed: 0,one,two,three
a,0.315449,0.470975,
b,0.993613,0.78916,0.554457
c,2.191778,0.583505,1.533424
d,,0.33179,0.473349


In [45]:
# Product of the values in each column; NaN entries are skipped.
df.prod()

one     -0.686979
two      0.071957
three   -0.402450
dtype: float64

In [46]:
# Sample variance (n - 1 denominator) of each column, i.e. the square of df.std().
df.var()

one      0.902686
two      0.404652
three    1.006983
dtype: float64

In [47]:
# Standard error of the mean per column: the sample std divided by sqrt(non-NaN count).
df.sem()

one      0.548539
two      0.318061
three    0.579363
dtype: float64

The `idxmin()` and `idxmax()` functions on Series and DataFrame compute the index labels with the minimum and maximum corresponding values:

In [53]:
# Index label of the row that holds the minimum of each column.
df.idxmin()

one      c
two      b
three    d
dtype: object

In [54]:
# Index label of the row that holds the maximum of each column.
df.idxmax()

one      a
two      a
three    c
dtype: object

# Iterating in pandas

The behavior of basic iteration over pandas objects depends on the type. When iterating over a Series, it is regarded as array-like, and basic iteration produces the values. DataFrames follow the dict-like convention of iterating over the keys of the object, i.e. the column labels.

In short, basic iteration (`for i in obj`) produces:

Series: values  
DataFrame: column labels

In [55]:
# Show the frame that the iteration examples below walk over.
df

Unnamed: 0,one,two,three
a,-0.315449,0.470975,
b,-0.993613,-0.78916,0.554457
c,-2.191778,-0.583505,1.533424
d,,0.33179,-0.473349


In [57]:
# Iterating over a DataFrame directly yields its column labels.
for col in df:
    print(col)

one
two
three


In [58]:
# Iterating over a Series yields its values, NaN included.
for val in df["one"]:
    print(val)

-0.31544938644984516
-0.9936131515664792
-2.1917781265470784
nan


In [63]:
# Frame shown again for comparison with the column-wise iteration below.
df

Unnamed: 0,one,two,three
a,-0.315449,0.470975,
b,-0.993613,-0.78916,0.554457
c,-2.191778,-0.583505,1.533424
d,,0.33179,-0.473349


In [62]:
# items() yields (column label, column Series) pairs, one per column.
# The empty trailing argument produces the blank separator line after each column.
for column_name, series in df.items():
    print(column_name, series, "", sep="\n")

one
a   -0.315449
b   -0.993613
c   -2.191778
d         NaN
Name: one, dtype: float64

two
a    0.470975
b   -0.789160
c   -0.583505
d    0.331790
Name: two, dtype: float64

three
a         NaN
b    0.554457
c    1.533424
d   -0.473349
Name: three, dtype: float64



In [66]:
# Frame shown again for comparison with the row-wise iteration below.
df

Unnamed: 0,one,two,three
a,-0.315449,0.470975,
b,-0.993613,-0.78916,0.554457
c,-2.191778,-0.583505,1.533424
d,,0.33179,-0.473349


In [65]:
# iterrows() yields (index label, row Series) pairs; each row Series is indexed
# by the column labels. The empty trailing argument prints the blank separator line.
for row_index, row in df.iterrows():
    print(row_index, row, "", sep="\n")

a
one     -0.315449
two      0.470975
three         NaN
Name: a, dtype: float64

b
one     -0.993613
two     -0.789160
three    0.554457
Name: b, dtype: float64

c
one     -2.191778
two     -0.583505
three    1.533424
Name: c, dtype: float64

d
one           NaN
two      0.331790
three   -0.473349
Name: d, dtype: float64



In [67]:
# Frame shown again for comparison with the namedtuple iteration below.
df

Unnamed: 0,one,two,three
a,-0.315449,0.470975,
b,-0.993613,-0.78916,0.554457
c,-2.191778,-0.583505,1.533424
d,,0.33179,-0.473349


In [68]:
# itertuples() yields one namedtuple per row, with the index label as the
# first field (Index) followed by one field per column.
for row in df.itertuples():
    print(row)

Pandas(Index='a', one=-0.31544938644984516, two=0.47097539564777774, three=nan)
Pandas(Index='b', one=-0.9936131515664792, two=-0.7891597410446118, three=0.5544573685891188)
Pandas(Index='c', one=-2.1917781265470784, two=-0.5835047741330583, three=1.5334240581411989)
Pandas(Index='d', one=nan, two=0.33178960042176453, three=-0.47334857587286827)
