# DataFrames - Part 1 (S6L27)

In [25]:
import numpy as np
import pandas as pd

In [26]:
from numpy.random import randn

In [27]:
# Seed NumPy's legacy global RNG so the randn() draws in later cells are repeatable.
np.random.seed(101)

In [28]:
# Build a 5x4 frame of randn() draws: rows labelled a-e, columns labelled w-z.
df = pd.DataFrame(
    data=randn(5, 4),
    index=['a', 'b', 'c', 'd', 'e'],
    columns=['w', 'x', 'y', 'z'],
)

In [29]:
df

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [30]:
df['w']

a    2.706850
b    0.651118
c   -2.018168
d    0.188695
e    0.190794
Name: w, dtype: float64

In [31]:
type(df['w'])

pandas.core.series.Series

In [32]:
type(df)

pandas.core.frame.DataFrame

In [33]:
df.w

a    2.706850
b    0.651118
c   -2.018168
d    0.188695
e    0.190794
Name: w, dtype: float64

In [34]:
df[['w','z']]

Unnamed: 0,w,z
a,2.70685,0.503826
b,0.651118,0.605965
c,-2.018168,-0.589001
d,0.188695,0.955057
e,0.190794,0.683509


In [35]:
# The 'new' column has not been created yet, so this lookup is expected to
# fail with a KeyError. Catch and show the error so that a Restart & Run All
# continues past this cell instead of stopping here.
try:
    df['new']
except KeyError as err:
    print(f"{type(err).__name__}: {err}")

KeyError: 'new'

In [36]:
# Create the 'new' column as the element-wise sum of columns 'w' and 'y'.
df['new'] = df['w'].add(df['y'])

In [37]:
df

Unnamed: 0,w,x,y,z,new
a,2.70685,0.628133,0.907969,0.503826,3.614819
b,0.651118,-0.319318,-0.848077,0.605965,-0.196959
c,-2.018168,0.740122,0.528813,-0.589001,-1.489355
d,0.188695,-0.758872,-0.933237,0.955057,-0.744542
e,0.190794,1.978757,2.605967,0.683509,2.796762


In [38]:
# Without an axis, drop() looks for 'new' among the row labels, and 'new' is a
# column, so this call is expected to raise KeyError. Catch and show the error
# so that a Restart & Run All continues past this cell.
try:
    df.drop('new')
except KeyError as err:
    print(f"{type(err).__name__}: {err}")

KeyError: "['new'] not found in axis"

In [39]:
# Returns a frame without the 'new' column; df itself is not modified.
df.drop(columns='new')

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [40]:
df

Unnamed: 0,w,x,y,z,new
a,2.70685,0.628133,0.907969,0.503826,3.614819
b,0.651118,-0.319318,-0.848077,0.605965,-0.196959
c,-2.018168,0.740122,0.528813,-0.589001,-1.489355
d,0.188695,-0.758872,-0.933237,0.955057,-0.744542
e,0.190794,1.978757,2.605967,0.683509,2.796762


In [41]:
# Make the removal permanent by rebinding df to the frame that drop() returns,
# rather than mutating the existing object with inplace=True.
df = df.drop(columns='new')

In [42]:
df

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [43]:
# Drop the row labelled 'e'; the result is not assigned, so df keeps that row.
df.drop(index='e')

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057


In [44]:
df

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [45]:
df.shape

(5, 4)

In [46]:
df['y']

a    0.907969
b   -0.848077
c    0.528813
d   -0.933237
e    2.605967
Name: y, dtype: float64

In [47]:
df[['z','x']]

Unnamed: 0,z,x
a,0.503826,0.628133
b,0.605965,-0.319318
c,-0.589001,0.740122
d,0.955057,-0.758872
e,0.683509,1.978757


## Selecting Rows

In [48]:
df

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [49]:
df.loc['a']

w    2.706850
x    0.628133
y    0.907969
z    0.503826
Name: a, dtype: float64

In [50]:
df.iloc[2]

w   -2.018168
x    0.740122
y    0.528813
z   -0.589001
Name: c, dtype: float64

In [51]:
df.loc['b','y']

-0.8480769834036315

In [52]:
df.loc[['a','e'],['x','y']]

Unnamed: 0,x,y
a,0.628133,0.907969
e,1.978757,2.605967


# DataFrames - Part 2 (S6L28)

In [53]:
df

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [54]:
df > 0

Unnamed: 0,w,x,y,z
a,True,True,True,True
b,True,False,False,True
c,False,True,True,False
d,True,False,False,True
e,True,True,True,True


In [55]:
# Boolean frame with the same labels as df: True wherever the value is positive.
booldf = df.gt(0)

In [57]:
df[booldf]

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,,,0.605965
c,,0.740122,0.528813,
d,0.188695,,,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [58]:
df[df>0]

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,,,0.605965
c,,0.740122,0.528813,
d,0.188695,,,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [59]:
df

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [60]:
df['w']>0

a     True
b     True
c    False
d     True
e     True
Name: w, dtype: bool

In [61]:
df['w']

a    2.706850
b    0.651118
c   -2.018168
d    0.188695
e    0.190794
Name: w, dtype: float64

In [68]:
df[df['z']<0]

Unnamed: 0,w,x,y,z
c,-2.018168,0.740122,0.528813,-0.589001


In [71]:
# Keep only the rows whose 'w' value is positive.
resultdf = df.loc[df['w'].gt(0)]

In [73]:
resultdf['x']

a    0.628133
b   -0.319318
d   -0.758872
e    1.978757
Name: x, dtype: float64

In [75]:
# One-step version of the two cells above: a single .loc call takes the boolean
# row mask and the column label together, avoiding chained [] lookups.
df.loc[df['w'] > 0, 'x']

a    0.628133
b   -0.319318
d   -0.758872
e    1.978757
Name: x, dtype: float64

In [83]:
# Filter rows, then pick columns, spelled out one step at a time.
boolseries = df['w'].gt(0)    # row mask: True where column 'w' is positive
result = df.loc[boolseries]   # rows selected by the mask
mycols = ['y', 'x']           # columns to show, in this order
result.loc[:, mycols]

Unnamed: 0,y,x
a,0.907969,0.628133
b,-0.848077,-0.319318
d,-0.933237,-0.758872
e,2.605967,1.978757


In [84]:
result

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [88]:
# Python's `and` needs one truth value per operand, which a Series does not
# provide, so this is expected to raise ValueError (the element-wise form uses
# `&`). Catch and show the error so that a Restart & Run All continues.
try:
    df[(df['w']>0) and (df['y']>1)]
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [90]:
# Element-wise AND of the two masks: rows where w > 0 and y > 1.
df[(df['w']>0) & (df['y']>1)]

Unnamed: 0,w,x,y,z
e,0.190794,1.978757,2.605967,0.683509


In [91]:
# Python's `or` needs one truth value per operand, which a Series does not
# provide, so this is expected to raise ValueError (the element-wise form uses
# `|`). Catch and show the error so that a Restart & Run All continues.
try:
    df[(df['w']>0) or (df['y']>1)]
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [92]:
# Element-wise OR of the two masks: rows where w > 0 or y > 1.
df[(df['w']>0) | (df['y']>1)]

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [93]:
 df

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [94]:
df.reset_index()

Unnamed: 0,index,w,x,y,z
0,a,2.70685,0.628133,0.907969,0.503826
1,b,0.651118,-0.319318,-0.848077,0.605965
2,c,-2.018168,0.740122,0.528813,-0.589001
3,d,0.188695,-0.758872,-0.933237,0.955057
4,e,0.190794,1.978757,2.605967,0.683509


In [95]:
df

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [96]:
# One label per row of df, written as a whitespace-separated string and split
# into a list of five strings.
state_labels_text = 'ca ny we or co'
newind = state_labels_text.split()

In [97]:
newind

['ca', 'ny', 'we', 'or', 'co']

In [98]:
df['States'] = newind

In [99]:
df

Unnamed: 0,w,x,y,z,States
a,2.70685,0.628133,0.907969,0.503826,ca
b,0.651118,-0.319318,-0.848077,0.605965,ny
c,-2.018168,0.740122,0.528813,-0.589001,we
d,0.188695,-0.758872,-0.933237,0.955057,or
e,0.190794,1.978757,2.605967,0.683509,co


In [101]:
df.set_index('States')

Unnamed: 0_level_0,w,x,y,z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ca,2.70685,0.628133,0.907969,0.503826
ny,0.651118,-0.319318,-0.848077,0.605965
we,-2.018168,0.740122,0.528813,-0.589001
or,0.188695,-0.758872,-0.933237,0.955057
co,0.190794,1.978757,2.605967,0.683509


In [102]:
df

Unnamed: 0,w,x,y,z,States
a,2.70685,0.628133,0.907969,0.503826,ca
b,0.651118,-0.319318,-0.848077,0.605965,ny
c,-2.018168,0.740122,0.528813,-0.589001,we
d,0.188695,-0.758872,-0.933237,0.955057,or
e,0.190794,1.978757,2.605967,0.683509,co


# DataFrames - Part 3 (S6L29)

In [108]:
# Index levels: outer group label and inner number for each of the six rows.
outside = ['G1', 'G1', 'G1', 'G2', 'G2', 'G2']
inside = [1, 2, 3, 1, 2, 3]
# Pair the two lists position by position, then turn the pairs into a MultiIndex.
index_pairs = [(group, num) for group, num in zip(outside, inside)]
hier_index = pd.MultiIndex.from_tuples(index_pairs)

In [109]:
hier_index

MultiIndex(levels=[['G1', 'G2'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])

In [113]:
# Six rows of randn() draws under the two-level index, in columns 'a' and 'b'.
# This rebinds df, replacing the single-index frame used in the cells above.
df = pd.DataFrame(
    data=randn(6, 2),
    index=hier_index,
    columns=['a', 'b'],
)
df

Unnamed: 0,Unnamed: 1,a,b
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [117]:
df.loc['G1']

Unnamed: 0,a,b
1,0.302665,1.693723
2,-1.706086,-1.159119
3,-0.134841,0.390528


In [118]:
df.loc['G1'].loc[1]

a    0.302665
b    1.693723
Name: 1, dtype: float64

In [121]:
# Name the two index levels so they can be referred to by name
# (e.g. level='Num' in the xs() call further down).
df.index.names = ['Groups', 'Num']

In [122]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [123]:
df.loc['G2']

Unnamed: 0_level_0,a,b
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.166905,0.184502
2,0.807706,0.07296
3,0.638787,0.329646


In [125]:
df.loc['G2'].loc[2]

a    0.807706
b    0.072960
Name: 2, dtype: float64

In [126]:
# Fetch one value in a single lookup: the tuple addresses the row by
# (Groups, Num) and 'b' picks the column, replacing three chained selections.
df.loc[('G2', 2), 'b']

0.07295967531703869

In [127]:
# NOTE: no call parentheses here, so this only displays the bound method's
# repr; the actual cross-section calls are in the cells that follow.
df.xs

<bound method NDFrame.xs of                    a         b
Groups Num                    
G1     1    0.302665  1.693723
       2   -1.706086 -1.159119
       3   -0.134841  0.390528
G2     1    0.166905  0.184502
       2    0.807706  0.072960
       3    0.638787  0.329646>

In [128]:
df.xs('G1')

Unnamed: 0_level_0,a,b
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.302665,1.693723
2,-1.706086,-1.159119
3,-0.134841,0.390528


In [132]:
df.xs(1, level='Num')

Unnamed: 0_level_0,a,b
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,0.302665,1.693723
G2,0.166905,0.184502
