data preparation: loading, cleaning, transforming, and rearranging. Such tasks are often reported to take up 80% or more of an analyst’s time.

discussing tools for missing data, duplicate data, string manipulation, and some other analytical data transformations. <br>

NaN: not a number
NA: not available

# 1.Hierarchical Indexing

### Example

In [1]:
import numpy as np
import pandas as pd
from pandas import Series 
from pandas import DataFrame as DF

In [2]:
data = DF({'food': ['bacon', 'pulled pork', 'bacon',
        'Pastrami', 'corned beef', 'Bacon',
        'pastrami', 'honey ham', 'nova lox'],
        'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [3]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [4]:
meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
}

In [5]:
##small problem:some of the meats are capitalized and others are not.
lowercased = data['food'].str.lower()
data['animal'] = lowercased.map(meat_to_animal)
data


Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [6]:
df = data.copy()
df.sort_values(by='animal',inplace=True)
df

Unnamed: 0,food,ounces,animal
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
6,pastrami,3.0,cow
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
5,Bacon,8.0,pig
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [7]:
df = df.set_index(['animal', 'food'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,ounces
animal,food,Unnamed: 2_level_1
cow,Pastrami,6.0
cow,corned beef,7.5
cow,pastrami,3.0
pig,bacon,4.0
pig,pulled pork,3.0
pig,bacon,12.0
pig,Bacon,8.0
pig,honey ham,5.0
salmon,nova lox,6.0


In [8]:
df.loc['pig']

Unnamed: 0_level_0,ounces
food,Unnamed: 1_level_1
bacon,4.0
pulled pork,3.0
bacon,12.0
Bacon,8.0
honey ham,5.0


In [9]:
df.loc['pig','bacon']

Unnamed: 0_level_0,Unnamed: 1_level_0,ounces
animal,food,Unnamed: 2_level_1
pig,bacon,4.0
pig,bacon,12.0


In [10]:
df.reset_index()

Unnamed: 0,animal,food,ounces
0,cow,Pastrami,6.0
1,cow,corned beef,7.5
2,cow,pastrami,3.0
3,pig,bacon,4.0
4,pig,pulled pork,3.0
5,pig,bacon,12.0
6,pig,Bacon,8.0
7,pig,honey ham,5.0
8,salmon,nova lox,6.0


Hierarchical indexing is an important feature of pandas that enables you to have multiple (two or more) index levels on an axis.

In [11]:
data = pd.Series(np.random.randn(9),
index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
        [1, 2, 3, 1, 3, 1, 2, 2, 3]])

data

a  1    0.638502
   2   -0.033443
   3   -0.744983
b  1   -1.170850
   3   -0.284441
c  1   -0.699366
   2    0.133859
d  2    1.234286
   3   -1.278161
dtype: float64

In [12]:
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )

In [13]:
data['b']

1   -1.170850
3   -0.284441
dtype: float64

In [14]:
data['b':'d']

b  1   -1.170850
   3   -0.284441
c  1   -0.699366
   2    0.133859
d  2    1.234286
   3   -1.278161
dtype: float64

In [15]:
data.loc[['b', 'd']]

b  1   -1.170850
   3   -0.284441
d  2    1.234286
   3   -1.278161
dtype: float64

Selection is even possible from an “inner” level:

In [16]:
data.loc[:, 2]

a   -0.033443
c    0.133859
d    1.234286
dtype: float64

In [17]:
data.loc[:,[1,3]]

a  1    0.638502
   3   -0.744983
b  1   -1.170850
   3   -0.284441
c  1   -0.699366
d  3   -1.278161
dtype: float64

In [18]:
data.unstack()

Unnamed: 0,1,2,3
a,0.638502,-0.033443,-0.744983
b,-1.17085,,-0.284441
c,-0.699366,0.133859,
d,,1.234286,-1.278161


In [19]:
data.unstack().stack()

a  1    0.638502
   2   -0.033443
   3   -0.744983
b  1   -1.170850
   3   -0.284441
c  1   -0.699366
   2    0.133859
d  2    1.234286
   3   -1.278161
dtype: float64

In [20]:
#another example
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
        index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
        columns=[['Ohio', 'Ohio', 'Colorado'],
        ['Green', 'Red', 'Green']])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [21]:
frame.index

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [22]:
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [23]:
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


## Reordering and Sorting Levels

In [24]:
frame.swaplevel('key1', 'key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [25]:
frame.sort_index(level=0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [26]:
frame.sort_index(level=1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [27]:
frame.swaplevel(0, 1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [28]:
frame.swaplevel(0, 1).sort_index(level=0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


## Summary Statistics by Level

In [29]:
frame.sum(level='key2')

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [30]:
frame.sum(level='color', axis=1)

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


In [31]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [32]:
frame.sum(level='state', axis=1)

Unnamed: 0_level_0,state,Ohio,Colorado
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,1,2
a,2,7,5
b,1,13,8
b,2,19,11


## Indexing with a DataFrame’s columns

In [33]:
frame = pd.DataFrame({'a': range(7), 'b': range(7, 0, -1),
                    'c': ['one', 'one', 'one', 'two', 'two',
                    'two', 'two'],
                    'd': [0, 1, 2, 0, 1, 2, 3]})
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [34]:
frame2 = frame.set_index(['c', 'd'], drop=False)
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


# 2.Combining and Merging Datasets

- pandas.merge connects rows in DataFrames based on one or more keys. This will be familiar to users of SQL or other relational databases, as it implements database join operations. 

- pandas.concat concatenates or “stacks” together objects along an axis.

- The combine_first instance method enables splicing together overlapping data to fill in missing values in one object with values from another.

## I.Database-Style DataFrame Joins

### many-to-one join
the data in df1 has multiple rows labeled a
and b , whereas df2 has only one row for each value in the key column.

In [35]:
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                    'data2': range(3)})
print(df1,'\n')
print(df2)

  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   a      5
6   b      6 

  key  data2
0   a      0
1   b      1
2   d      2


In [36]:
pd.merge(df1,df2,on='key')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [37]:
pd.merge(df1, df2, on='key', how='left')

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,1,1.0
2,a,2,0.0
3,c,3,
4,a,4,0.0
5,a,5,0.0
6,b,6,1.0


In [38]:
df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})

df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'],
                    'data2': range(3)})

print(df1,'\n')
print(df2)

  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   a      5
6   b      6 

  key  data2
0   a      0
1   b      1
2   d      2


If the column names are different in each object, you can specify them separately:

In [39]:
pd.merge(df3, df4, left_on='lkey', right_on='rkey')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


The outer join takes the union of the keys, combining the effect of applying both left and right joins:

In [40]:
pd.merge(df1, df2, how='outer')

Unnamed: 0,key,data1,data2
0,b,0.0,1.0
1,b,1.0,1.0
2,b,6.0,1.0
3,a,2.0,0.0
4,a,4.0,0.0
5,a,5.0,0.0
6,c,3.0,
7,d,,2.0


### many-to-many join

Many-to-many joins form the Cartesian product of the rows. Since there were three
'b' rows in the left DataFrame and two in the right one, there are six 'b' rows in the
result. The join method only affects the distinct key values appearing in the result.

In [41]:
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'data1': range(6)})
df2 = pd.DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],
                    'data2': range(5)})

print(df1,'\n')
print(df2)

  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   b      5 

  key  data2
0   a      0
1   b      1
2   a      2
3   b      3
4   d      4


In [42]:
pd.merge(df1, df2, on='key', how='left')

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,0,3.0
2,b,1,1.0
3,b,1,3.0
4,a,2,0.0
5,a,2,2.0
6,c,3,
7,a,4,0.0
8,a,4,2.0
9,b,5,1.0


In [43]:
pd.merge(df1, df2, how='inner')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,0,3
2,b,1,1
3,b,1,3
4,b,5,1
5,b,5,3
6,a,2,0
7,a,2,2
8,a,4,0
9,a,4,2


To merge with multiple keys, pass a list of column names:

In [44]:
left = pd.DataFrame({'key1': ['foo', 'foo', 'bar','sh'],
                     'key2': ['one', 'two', 'one','fu'],
                     'lval': [1, 2, 3,4]})

right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                      'key2': ['one', 'one', 'one', 'two'],
                      'rval': [4, 5, 6, 7]})
print(left,'\n')
print(right)

  key1 key2  lval
0  foo  one     1
1  foo  two     2
2  bar  one     3
3   sh   fu     4 

  key1 key2  rval
0  foo  one     4
1  foo  one     5
2  bar  one     6
3  bar  two     7


In [45]:
pd.merge(left ,right, on = ['key1','key2'], how='outer')

Unnamed: 0,key1,key2,lval,rval
0,foo,one,1.0,4.0
1,foo,one,1.0,5.0
2,foo,two,2.0,
3,bar,one,3.0,6.0
4,sh,fu,4.0,
5,bar,two,,7.0


#### Note
To determine which key combinations will appear in the result depending on the
choice of merge method, think of the multiple keys as forming an array of tuples to
be used as a single join key (even though it’s not actually implemented that way).
<br> <br>
When you’re joining columns-on-columns, the indexes on the
passed DataFrame objects are discarded.

### overlapping column names

In [46]:
pd.merge(left, right, on='key1')

Unnamed: 0,key1,key2_x,lval,key2_y,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


In [47]:
pd.merge(left, right, on='key1', suffixes=('_left', '_right'))

Unnamed: 0,key1,key2_left,lval,key2_right,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


In [48]:
pd.merge(df1, df2, how='outer')

Unnamed: 0,key,data1,data2
0,b,0.0,1.0
1,b,0.0,3.0
2,b,1.0,1.0
3,b,1.0,3.0
4,b,5.0,1.0
5,b,5.0,3.0
6,a,2.0,0.0
7,a,2.0,2.0
8,a,4.0,0.0
9,a,4.0,2.0


### Merging on Index

You can pass left_index=True or right_index=True (or both) to indicate that
the index should be used as the merge key:

In [49]:
left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],
                      'value': range(6)})

right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])

print(left1,'\n')
print(right1)

  key  value
0   a      0
1   b      1
2   a      2
3   a      3
4   b      4
5   c      5 

   group_val
a        3.5
b        7.0


In [50]:
pd.merge(left1, right1, left_on='key', right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


With hierarchically indexed data, things are more complicated, as joining on index is
implicitly a multiple-key merge:

In [51]:
lefth = pd.DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio',
                               'Nevada', 'Nevada'],
                      'key2': [2000, 2001, 2002, 2001, 2002],
                      'data': np.arange(5.)})

righth = pd.DataFrame(np.arange(12).reshape((6, 2)),
                      index=[['Nevada', 'Nevada', 'Ohio', 'Ohio',
                              'Ohio', 'Ohio'],
                             [2001, 2000, 2000, 2000, 2001, 2002]],
                      columns=['event1', 'event2'])

print(lefth,'\n')
print(righth)

     key1  key2  data
0    Ohio  2000   0.0
1    Ohio  2001   1.0
2    Ohio  2002   2.0
3  Nevada  2001   3.0
4  Nevada  2002   4.0 

             event1  event2
Nevada 2001       0       1
       2000       2       3
Ohio   2000       4       5
       2000       6       7
       2001       8       9
       2002      10      11


In [52]:
pd.merge(lefth, righth, left_on=['key1', 'key2'], how='outer')

TypeError: object of type 'NoneType' has no len()

In [None]:
pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True, how='outer')

Using the indexes of both sides of the merge is also possible:

In [None]:
left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],
                     index=['a', 'c', 'e'],
                     columns=['Ohio', 'Nevada'])

right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                      index=['b', 'c', 'd', 'e'],
                      columns=['Missouri', 'Alabama'])

print(left2,'\n')
print(right2)

In [53]:
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)

NameError: name 'left2' is not defined

In [None]:
pd.merge(left2, right2, how='right', left_index=True, right_index=True)

## II.Concatenating Along an Axis

In [54]:
arr = np.arange(12).reshape((3, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [55]:
np.concatenate([arr, arr], axis=1)

array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

Additional things to consider whith Concatenate:
- combine distinct elements or intersect?
- concatenated chunks be identifiable?
- "concatenation axis" contain data that need to be preserved? (integer labels usually discarded)

In [56]:
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])

pd.concat([s1, s2, s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

By default concat works along axis=0 , producing another Series. If you pass axis=1, the result will instead be a DataFrame:

In [57]:
pd.concat([s1, s2, s3], axis=1)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [58]:
s4 = pd.concat([s1, s3])

In [59]:
pd.concat([s1, s4], axis=1, join='inner')

Unnamed: 0,0,1
a,0,0
b,1,1


In [60]:
pd.concat([s1, s4], axis=1, join_axes=[['a', 'c', 'b', 'e']])

Unnamed: 0,0,1
a,0.0,0.0
c,,
b,1.0,1.0
e,,


In [61]:
s1

a    0
b    1
dtype: int64

In [62]:
s3

f    5
g    6
dtype: int64

### create a hierarchical index on the concatenation axis.

In [63]:
result = pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])
result

one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64

In [64]:
result.unstack()

Unnamed: 0,a,b,f,g
one,0.0,1.0,,
two,0.0,1.0,,
three,,,5.0,6.0


In [65]:
pd.concat([s1, s4], keys=['one', 'two'])

one  a    0
     b    1
two  a    0
     b    1
     f    5
     g    6
dtype: int64

In [66]:
pd.concat([s1, s2, s3, s4], axis=1, keys=['one', 'two', 'three', 'four'])

Unnamed: 0,one,two,three,four
a,0.0,,,0.0
b,1.0,,,1.0
c,,2.0,,
d,,3.0,,
e,,4.0,,
f,,,5.0,5.0
g,,,6.0,6.0


The same logic extends to DataFrame objects:

In [67]:
df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],
                   columns=['one', 'two'])

df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],
                   columns=['three', 'four'])

print(df1,'\n')
print(df2)

   one  two
a    0    1
b    2    3
c    4    5 

   three  four
a      5     6
c      7     8


In [68]:
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'])

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


If you pass a dict of objects instead of a list, the dict’s keys will be used for the keys
option:

In [69]:
pd.concat({'level1': df1, 'level2': df2}, axis=1)

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


we can name the created axis levels with the names
argument:

In [70]:
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],
          names=['upper', 'lower'])

upper,level1,level1,level2,level2
lower,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


A last consideration concerns DataFrames in which the row index does not contain
any relevant data:

In [71]:
df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
print(df1,'\n')
print(df2)

          a         b         c         d
0  0.397356  0.289199  0.660223 -0.204220
1 -0.362578  0.363093 -1.116118  0.407732
2 -0.406142  1.830918 -0.351826 -2.913360 

          b         d         a
0  1.068627  1.041555 -0.732985
1 -0.270621  1.611602 -1.061160


In [72]:
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,a,b,c,d
0,0.397356,0.289199,0.660223,-0.20422
1,-0.362578,0.363093,-1.116118,0.407732
2,-0.406142,1.830918,-0.351826,-2.91336
3,-0.732985,1.068627,,1.041555
4,-1.06116,-0.270621,,1.611602


## III.Combining data with overlap

In [79]:
a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
              index=['f', 'e', 'd', 'c', 'b', 'a'])
b = pd.Series(np.arange(len(a), dtype=np.float64),
              index=['f', 'e', 'd', 'c', 'b', 'a'])
b[-1] = np.nan

print('Dataframe 1\n',a,'\n\n','Dataframe2\n',b)

Dataframe 1
 f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64 

 Dataframe2
 f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    NaN
dtype: float64


In [74]:
pd.isnull(a)

f     True
e    False
d     True
c    False
b    False
a     True
dtype: bool

In [75]:
#isnull output: 0,2,5
#replaces values of a (with condition 0,2,5 entries only) with b values
np.where(pd.isnull(a), b, a)

array([0. , 2.5, 2. , 3.5, 4.5, nan])

combine_first does the same thing. It “patch”es missing data in the calling object with data from the
object you pass:

In [82]:
df1 = pd.DataFrame({'a': [1., np.nan, 5., np.nan],
                    'b': [np.nan, 2., np.nan, 6.],
                    'c': range(2, 18, 4)})
df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])

print(df1,'\n\n',df2)

     a    b   c
0  1.0  NaN   2
1  NaN  2.0   6
2  5.0  NaN  10
3  NaN  6.0  14 

           b         d         a
0  1.291651 -1.868930  0.179375
1  1.187018  0.369183  0.130367


In [83]:
df1.combine_first(df2)

Unnamed: 0,a,b,c,d
0,1.0,1.291651,2,-1.86893
1,0.130367,2.0,6,0.369183
2,5.0,,10,
3,,6.0,14,


In [85]:
df1.merge(df2, how='outer').fillna(df2)

Unnamed: 0,a,b,c,d
0,1.0,1.291651,2.0,-1.86893
1,0.130367,2.0,6.0,0.369183
2,5.0,,10.0,
3,,6.0,14.0,
4,0.179375,1.291651,,-1.86893
5,0.130367,1.187018,,0.369183


In [86]:
df1.merge(df2, how='left').fillna(df2)

Unnamed: 0,a,b,c,d
0,1.0,1.291651,2,-1.86893
1,0.130367,2.0,6,0.369183
2,5.0,,10,
3,,6.0,14,


In [99]:
df

Unnamed: 0,a,b,c,d


In [98]:
for i in range(df1.shape[0]):
    print(pd.isnull(df1.iloc[i]))

a    False
b     True
c    False
Name: 0, dtype: bool
a     True
b    False
c    False
Name: 1, dtype: bool
a    False
b     True
c    False
Name: 2, dtype: bool
a     True
b    False
c    False
Name: 3, dtype: bool


# 3.Reshaping and Pivoting

In [81]:
data = pd.DataFrame(np.arange(6).reshape((2, 3)),
                    index=pd.Index(['Ohio', 'Colorado'], name='state'),
                    columns=pd.Index(['one', 'two', 'three'],
                    name='number'))

In [82]:
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [83]:
result = data.stack()
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int64

In [84]:
result.unstack()

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


By default the innermost level is unstacked (same with stack ). You can unstack a dif‐
ferent level by passing a level number or name:

In [85]:
result.unstack(0)

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [86]:
result.unstack('state')

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [87]:
s1 = pd.Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([4, 5, 6], index=['c', 'd', 'e'])

In [88]:
data2 = pd.concat([s1, s2], keys=['one', 'two'])
data2

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64

In [89]:
data2.unstack()

Unnamed: 0,a,b,c,d,e
one,0.0,1.0,2.0,3.0,
two,,,4.0,5.0,6.0


In [90]:
#Stacking filters out missing data by default
data2.unstack()

Unnamed: 0,a,b,c,d,e
one,0.0,1.0,2.0,3.0,
two,,,4.0,5.0,6.0


In [91]:
data2.unstack().stack()

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64

In [92]:
data2.unstack().stack(dropna=False)

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64

In [93]:
df = pd.DataFrame({'left': result, 'right': result + 5},
                  columns=pd.Index(['left', 'right'], name='side'))

In [94]:
df

Unnamed: 0_level_0,side,left,right
state,number,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,one,0,5
Ohio,two,1,6
Ohio,three,2,7
Colorado,one,3,8
Colorado,two,4,9
Colorado,three,5,10


In [95]:
df.unstack('state')

side,left,left,right,right
state,Ohio,Colorado,Ohio,Colorado
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


In [96]:
df.unstack('state').stack('side')

Unnamed: 0_level_0,state,Colorado,Ohio
number,side,Unnamed: 2_level_1,Unnamed: 3_level_1
one,left,3,0
one,right,8,5
two,left,4,1
two,right,9,6
three,left,5,2
three,right,10,7


### Pivoting “Long” to “Wide” Format

In [None]:
pivoted = ldata.pivot('date', 'item')