In [9]:
import pandas as pd
import numpy as np

# Handling Missing Data

**dropna()**

**fillna()**

**isnull()**

In [10]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])

In [11]:
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [12]:
string_data.isnull() #or notnull

0    False
1    False
2     True
3    False
dtype: bool

In [13]:
string_data.isnull().sum()

1

In [14]:
string_data.dropna()

0     aardvark
1    artichoke
3      avocado
dtype: object

In [15]:
string_data.fillna(0)

0     aardvark
1    artichoke
2            0
3      avocado
dtype: object

# Filtering Out Missing Data

In [16]:
from numpy import nan as NA

In [17]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],[NA, NA, NA], [NA, 6.5, 3.]])

In [18]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [19]:
data.dropna() #or  data[data.notnull()]

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [20]:
data.dropna(how='all') #Passing how='all' will only drop rows that are all NA

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


# Filling in Missing Data


**DataFrame.fillna(value=scalar(0)/dict(main), method=(‘backfill’/ ‘bfill’/ ‘pad’/ ‘ffill’), axis=(0 or ‘index’, 1 or ‘columns) inplace=False/True, limit=int, downcast=dict)**

In [21]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [22]:
data.fillna(0,axis=0)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,6.5,3.0


In [23]:
 data.fillna({1: 0.5, 2: -1}) #filling with dictionary

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.5,-1.0
2,,0.5,-1.0
3,,6.5,3.0


In [24]:
data.fillna(0, inplace=True)

In [25]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,6.5,3.0


In [26]:
df = pd.DataFrame(np.random.randn(6, 3))

**DataFrame.iloc** Purely integer-location based indexing for selection by position.

In [27]:
df.iloc[2:, 0] = NA; df.iloc[4:, 2] = NA

In [28]:
df

Unnamed: 0,0,1,2
0,1.349206,-1.18916,0.504678
1,0.801189,1.30698,1.46769
2,,-0.839623,-0.58168
3,,1.393421,-1.179728
4,,0.956725,
5,,0.625353,


In [29]:
 df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,1.349206,-1.18916,0.504678
1,0.801189,1.30698,1.46769
2,0.801189,-0.839623,-0.58168
3,0.801189,1.393421,-1.179728
4,0.801189,0.956725,-1.179728
5,0.801189,0.625353,-1.179728


**limit:** For forward and backward filling, maximum number of consecutive periods to fill

In [30]:
df.fillna(method='ffill', limit=2)

Unnamed: 0,0,1,2
0,1.349206,-1.18916,0.504678
1,0.801189,1.30698,1.46769
2,0.801189,-0.839623,-0.58168
3,0.801189,1.393421,-1.179728
4,,0.956725,-1.179728
5,,0.625353,-1.179728


In [31]:
 data = pd.Series([1., NA, 3.5, NA, 7])

In [32]:
data.fillna(data.mean(),inplace= True) #filling with mean()

# Hierarchical Indexing

**.stack(): to arrange things in an ordered**

**.unstack()**



In [33]:
data = pd.DataFrame(np.random.randn(10),index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],[1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])

In [34]:
data

Unnamed: 0,Unnamed: 1,0
a,1,-0.675823
a,2,-0.563331
a,3,0.070423
b,1,0.032014
b,2,-1.091147
b,3,0.175827
c,1,-0.935691
c,2,-0.782388
d,2,-1.300223
d,3,0.144255


In [35]:
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 2),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )

In [36]:
 data['b':'c']

Unnamed: 0,Unnamed: 1,0
b,1,0.032014
b,2,-1.091147
b,3,0.175827
c,1,-0.935691
c,2,-0.782388


In [37]:
data.stack()#Stack the prescribed level(s) from columns to index 

a  1  0   -0.675823
   2  0   -0.563331
   3  0    0.070423
b  1  0    0.032014
   2  0   -1.091147
   3  0    0.175827
c  1  0   -0.935691
   2  0   -0.782388
d  2  0   -1.300223
   3  0    0.144255
dtype: float64

In [38]:
data.unstack() #dataframe or series.

Unnamed: 0_level_0,0,0,0
Unnamed: 0_level_1,1,2,3
a,-0.675823,-0.563331,0.070423
b,0.032014,-1.091147,0.175827
c,-0.935691,-0.782388,
d,,-1.300223,0.144255


In [39]:
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],columns=[['Ohio', 'Ohio', 'Colorado'],['Green', 'Red', 'Green']])

In [40]:
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [41]:
 frame.index.names = ['key1', 'key2']

In [42]:
frame.columns.names = ['state', 'color']

In [43]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [44]:
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


# Reordering and Sorting Levels

**.swaplevel()**



**DataFrame.sort_index(axis=0, level=None, ascending=True, inplace=False, kind='quicksort', na_position='last', sort_remaining=True, ignore_index=False, key=None)**

In [45]:
frame.swaplevel('key2', 'key1')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [46]:
frame.sort_index(1) #The value 0 identifies the rows, and 1 identifies the columns.

Unnamed: 0_level_0,state,Colorado,Ohio,Ohio
Unnamed: 0_level_1,color,Green,Green,Red
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,2,0,1
a,2,5,3,4
b,1,8,6,7
b,2,11,9,10


In [47]:
df=pd.DataFrame(np.random.randint(100,size=(3,3)),index=['a','b','c'],columns=['1','2','3'])

In [48]:
df

Unnamed: 0,1,2,3
a,16,10,60
b,48,5,0
c,46,29,91


In [49]:
df.sort_index(1)

Unnamed: 0,1,2,3
a,16,10,60
b,48,5,0
c,46,29,91


# Summary Statistics by Level

**.sum(level='  ')**

In [50]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [51]:
frame.sum(level='key2')

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [52]:
frame.sum(level='color', axis=1)

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


# Using a DataFrame’s Columns


**DataFrame.set_index(keys: label or array-like or list of labels/arrays, drop=bool(default True), append=bool, default False, inplace=bool, default False, verify_integrity=bool, default False)**

In [53]:
frame = pd.DataFrame({'a': range(7), 'b': range(7, 0, -1),'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],'d': [0, 1, 2, 0, 1, 2, 3]})

In [54]:
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [55]:
frame2 = frame.set_index(['c', 'd'])

In [56]:
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [57]:
frame.set_index(['c', 'd'], drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [58]:
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1
