In [1]:
%matplotlib inline
from IPython.core.display import HTML
from IPython.display import YouTubeVideo
from pandas_datareader import data, wb

import os
import pandas as pd
import numpy as np
import datetime

# Locate the two stylesheets in the current working directory.
path1 = os.path.join(os.getcwd(),'style-table.css')
path2 = os.path.join(os.getcwd(),'style-notebook.css')

# Read both files inside a context manager so the handles are closed as soon
# as the text has been read (bare open(...).read() leaves them open).
with open(path1) as table_css, open(path2) as notebook_css:
    css = table_css.read() + notebook_css.read()

# Wrap the combined CSS in a <style> tag; this must stay the last expression
# of the cell so the notebook renders it.
HTML('<style>{}</style>'.format(css))

In [2]:
# pd.merge(df1, df2) = database-style join of two dataframes on one or more 'key' columns (or on an index)


# more info on merging
# https://pandas.pydata.org/docs/user_guide/merging.html

### merging dataframes by values on a single key using pd.merge()

In [3]:
# Two small frames that share a 'key' column.

# left frame: 6 rows x 2 columns; the keys X, Y, Z each appear twice
dict1 = dict(key=['X','Y','Z','X','Y','Z'],
             data_set_1=np.arange(6))

# right frame: 3 rows x 2 columns; key Q does not occur in the left frame
dict2 = dict(key=['Q','Y','Z'],
             data_set_2=np.arange(3))

dframe1, dframe2 = (pd.DataFrame(d) for d in (dict1, dict2))

dframe1

Unnamed: 0,data_set_1,key
0,0,X
1,1,Y
2,2,Z
3,3,X
4,4,Y
5,5,Z


In [4]:
# display the right-hand frame (keys Q, Y, Z)
dframe2

Unnamed: 0,data_set_2,key
0,0,Q
1,1,Y
2,2,Z


In [5]:
# with no on= argument, merge joins on the column name(s) the two frames share
# (here only 'key'); the default inner join keeps only keys found in both (Y, Z)

pd.merge(dframe1,dframe2)

Unnamed: 0,data_set_1,key,data_set_2
0,1,Y,1
1,4,Y,1
2,2,Z,2
3,5,Z,2


In [8]:
# name the join column explicitly with on='key'; the result is the same as the default

pd.merge(dframe1,dframe2,on='key')

Unnamed: 0,data_set_1,key,data_set_2
0,1,Y,1
1,4,Y,1
2,2,Z,2
3,5,Z,2


In [9]:
# how='left': merge on 'key' and keep every row of the left frame (dframe1)
# dframe2 has no X key, so data_set_2 is NaN on the X rows
# (data_set_2 is shown as float because NaN cannot be stored in an integer column)

pd.merge(dframe1,dframe2,on='key',how='left')

Unnamed: 0,data_set_1,key,data_set_2
0,0,X,
1,1,Y,1.0
2,2,Z,2.0
3,3,X,
4,4,Y,1.0
5,5,Z,2.0


In [10]:
# how='right': merge on 'key' and keep every row of the right frame (dframe2)
# dframe1 has no Q key, so data_set_1 is NaN on the Q row
# the X rows of dframe1 are dropped because dframe2 has no X

pd.merge(dframe1,dframe2,on='key',how='right')

Unnamed: 0,data_set_1,key,data_set_2
0,1.0,Y,1
1,4.0,Y,1
2,2.0,Z,2
3,5.0,Z,2
4,,Q,0


In [11]:
# how='outer': keep every row from both frames (the union of the keys)
# keys present on one side only (X, Q) get NaN in the other side's column

pd.merge(dframe1,dframe2,on='key',how='outer')

Unnamed: 0,data_set_1,key,data_set_2
0,0.0,X,
1,3.0,X,
2,1.0,Y,1.0
3,4.0,Y,1.0
4,2.0,Z,2.0
5,5.0,Z,2.0
6,,Q,0.0


In [16]:
# make 2 dataframes from dictionaries of lists;
# the 'key' values repeat on BOTH sides (a many-to-many merge)

dict3 = dict(key=['X','X','X','Y','Z','Z'],
             data_set_3=np.arange(6))

dict4 = dict(key=['Y','Y','X','X','Z'],
             data_set_4=np.arange(5))

dframe3 = pd.DataFrame(data=dict3)
dframe4 = pd.DataFrame(data=dict4)

# every left row pairs with every right row that has the same key:
# X: 3 x 2 = 6 rows, Y: 1 x 2 = 2 rows, Z: 2 x 1 = 2 rows -> 10 rows
pd.merge(left=dframe3, right=dframe4)

Unnamed: 0,data_set_3,key,data_set_4
0,0,X,2
1,0,X,3
2,1,X,2
3,1,X,3
4,2,X,2
5,2,X,3
6,3,Y,0
7,3,Y,1
8,4,Z,4
9,5,Z,4


### merging dataframes by values on multiple keys using pd.merge()

In [25]:
# left frame: three (key1, key2) combinations, one 'left_data' value each
df_left = pd.DataFrame(data=dict(key1=['SF','SF','LA'],
                                 key2=['one','two','one'],
                                 left_data=[10,20,30]))

# right frame: four (key1, key2) combinations, one 'right_data' value each;
# (LA, two) has no counterpart in the left frame
df_right = pd.DataFrame(data=dict(key1=['SF','SF','LA','LA'],
                                  key2=['one','two','one','two'],
                                  right_data=[40,50,60,70]))



In [26]:
# display the left frame (3 rows)
df_left

Unnamed: 0,key1,key2,left_data
0,SF,one,10
1,SF,two,20
2,LA,one,30


In [27]:
# display the right frame (4 rows)
df_right

Unnamed: 0,key1,key2,right_data
0,SF,one,40
1,SF,two,50
2,LA,one,60
3,LA,two,70


In [20]:
# outer ("union") merge on the pair (key1, key2): rows match only when BOTH keys agree
# a key pair found on one side only is kept, with NaN for the other side's data
# this way we can check what values both datasets had for each key pair

pd.merge(df_left,df_right,on=['key1','key2'],how='outer')

Unnamed: 0,key1,key2,left_data,right_data
0,SF,one,10.0,40
1,SF,two,20.0,50
2,LA,one,30.0,60
3,LA,two,,70


In [21]:
# merging on key1 only leaves 'key2' as an ordinary column that exists in both frames
# pandas keeps both copies and tells them apart with the default suffixes
# _x (from the left frame) and _y (from the right frame)

pd.merge(df_left,df_right,on='key1')

Unnamed: 0,key1,key2_x,left_data,key2_y,right_data
0,SF,one,10,one,40
1,SF,one,10,two,50
2,SF,two,20,one,40
3,SF,two,20,two,50
4,LA,one,30,one,60
5,LA,one,30,two,70


In [24]:
# suffixes=(left, right) replaces the default _x / _y on the overlapping column names

pd.merge(df_left,df_right,on='key1',suffixes=('_(left)','_(right)'))

Unnamed: 0,key1,key2_(left),left_data,key2_(right),right_data
0,SF,one,10,one,40
1,SF,one,10,two,50
2,SF,two,20,one,40
3,SF,two,20,two,50
4,LA,one,30,one,60
5,LA,one,30,two,70


In [None]:
# further documentation about merge()

# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html

### merging dataframes by index using pd.merge()

In [2]:
# left frame for the index-merge example: a 'key' column plus a 'data' counter
# NOTE: this rebinds the names dict1 / dframe1
dict1 = dict(key=['X','Y','Z','X','Y'],
             data=range(5))

dframe1 = pd.DataFrame(data=dict1)
dframe1

Unnamed: 0,data,key
0,0,X
1,1,Y
2,2,Z
3,3,X
4,4,Y


In [3]:
# right frame: one value per group, with the group labels X / Y held in the
# INDEX rather than in a column
# NOTE: this rebinds the names dict2 / dframe2
dict2 = dict(grp_data=[10,20])

dframe2 = pd.DataFrame(data=dict2, index=['X','Y'])
dframe2

Unnamed: 0,grp_data
X,10
Y,20


In [8]:
# left_on='key'     -> match on the 'key' column of the left frame
# right_index=True  -> match against the index labels of the right frame
# inner join: the Z row is dropped because dframe2 has no Z in its index

pd.merge(dframe1,dframe2,left_on='key',right_index=True)

Unnamed: 0,data,key,grp_data
0,0,X,10
3,3,X,10
1,1,Y,20
4,4,Y,20


### merging dataframes with hierarchy index using pd.merge()

In [7]:
# left frame for the hierarchical-index example: two key columns plus a float
# 'data_set' column (np.arange(5.) yields 0.0 .. 4.0)
# NOTE: this rebinds the name dict1
dict1 = dict(key1=['SF','SF','SF','LA','LA'],
             key2=[10,20,30,20,30],
             data_set=np.arange(5.))

df_left_hr = pd.DataFrame(data=dict1)
df_left_hr

Unnamed: 0,data_set,key1,key2
0,0,SF,10
1,1,SF,20
2,2,SF,30
3,3,LA,20
4,4,LA,30


In [10]:
# right frame with a two-level (hierarchical) row index:
# the outer level holds LA / SF labels, the inner level holds 10 / 20 values
outer_level = ['LA','LA','SF','SF','SF']
inner_level = [20,10,10,10,20]

df_right_hr = pd.DataFrame(data=np.arange(10).reshape(5,2),
                           index=[outer_level, inner_level],
                           columns=['col_1','col_2'])
df_right_hr

Unnamed: 0,Unnamed: 1,col_1,col_2
LA,20,0,1
LA,10,2,3
SF,10,4,5
SF,10,6,7
SF,20,8,9


In [11]:
# match the (key1, key2) column pair of the left frame against the two index levels of the right frame
# (SF, 10) occurs twice in the right index, so left row 0 appears twice in the result
# left rows with no matching index entry, (SF, 30) and (LA, 30), are dropped (inner join)
pd.merge(df_left_hr,df_right_hr,left_on=['key1','key2'],right_index=True)

Unnamed: 0,data_set,key1,key2,col_1,col_2
0,0,SF,10,4,5
0,0,SF,10,6,7
1,1,SF,20,8,9
3,3,LA,20,0,1


### merging dataframes using .join()

In [16]:
# DataFrame.join() by default aligns the two frames on their INDEX (left join)
# dframe1 is indexed 0..4 while dframe2 is indexed X / Y, so no label matches
# and grp_data comes out as NaN on every row
# to match dframe1's 'key' column against dframe2's index use: dframe1.join(dframe2, on='key')
# NOTE: join() has its own arguments (on, how, lsuffix, rsuffix, sort), not the full pd.merge() signature

dframe1.join(dframe2)

Unnamed: 0,data,key,grp_data
0,0,X,
1,1,Y,
2,2,Z,
3,3,X,
4,4,Y,


### concatenation of numpy arrays using np.concatenate()

In [18]:
# 3x3 integer matrix holding 0..8, filled row by row
arr1 = np.reshape(np.arange(9), (3, 3))
arr1

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [19]:
# concatenate the matrix with itself along axis 0 (the default):
# the second copy is stacked below the first, (3,3) + (3,3) -> (6,3)

np.concatenate([arr1,arr1],axis=0)

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8],
       [0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [20]:
# concatenate the matrix with itself along axis 1:
# the second copy is placed to the right of the first, (3,3) + (3,3) -> (3,6)

np.concatenate([arr1,arr1],axis=1)

array([[0, 1, 2, 0, 1, 2],
       [3, 4, 5, 3, 4, 5],
       [6, 7, 8, 6, 7, 8]])

### concatenation of series using pd.concat()

In [3]:
# two integer series with disjoint index labels (T, U, V versus X, Y)
series1 = pd.Series(data=[0,1,2],
                    index=['T','U','V'])
series2 = pd.Series(data=[3,4],
                    index=['X','Y'])

In [4]:
# display the first series (labels T, U, V)
series1

T    0
U    1
V    2
dtype: int64

In [5]:
# display the second series (labels X, Y)
series2

X    3
Y    4
dtype: int64

In [6]:
# axis=1 puts each series in its own column, producing a dataframe
# the rows are the union of both indexes; a label missing from a series becomes NaN
# (series1 has no X, Y values and series2 has no T, U, V values)

pd.concat([series1,series2],axis=1)

Unnamed: 0,0,1
T,0.0,
U,1.0,
V,2.0,
X,,3.0
Y,,4.0


In [10]:
# default axis (axis=0): the series are stacked end to end into one longer series

pd.concat([series1,series2])

T    0
U    1
V    2
X    3
Y    4
dtype: int64

In [11]:
# default axis concatenation again, but keys=[...] labels each input series:
# the labels become an outer index level (cat1 / cat2) above the original index

pd.concat([series1,series2],keys=['cat1','cat2'])

cat1  T    0
      U    1
      V    2
cat2  X    3
      Y    4
dtype: int64

### concatenation of dataframes using pd.concat()

In [4]:
# Two 4x3 frames of standard-normal noise with overlapping but different columns
# (both have X and Y; only df1 has Z, only df2 has Q).
# The numbers come from a locally seeded generator rather than the global
# np.random state, so re-running the notebook reproduces the same values.
rng = np.random.RandomState(0)

df1 = pd.DataFrame(rng.randn(4,3),columns=['X','Y','Z'])
df2 = pd.DataFrame(rng.randn(4,3),columns=['Y','Q','X'])

df1

Unnamed: 0,X,Y,Z
0,-0.384537,-0.016669,-1.188617
1,0.314467,-0.642357,-1.23748
2,-0.449153,0.27742,0.371142
3,-0.196756,0.106224,0.253406


In [5]:
# display the second frame (columns Y, Q, X)
df2

Unnamed: 0,Y,Q,X
0,-1.679106,-0.44649,-0.182655
1,-1.660777,1.455062,-0.381497
2,-1.917147,-0.067094,0.346649
3,-0.400267,0.554554,-0.609612


In [7]:
# stack df2 underneath df1 (default axis=0), lining the columns up by name
# a column missing from one frame (Q in df1, Z in df2) is filled with NaN
# the original row labels are kept, so the index runs 0-3 twice

pd.concat([df1,df2])

Unnamed: 0,Q,X,Y,Z
0,,-0.384537,-0.016669,-1.188617
1,,0.314467,-0.642357,-1.23748
2,,-0.449153,0.27742,0.371142
3,,-0.196756,0.106224,0.253406
0,-0.44649,-0.182655,-1.679106,
1,1.455062,-0.381497,-1.660777,
2,-0.067094,0.346649,-1.917147,
3,0.554554,-0.609612,-0.400267,


In [8]:
# same stacking as before, with missing values filled with NaN
# ignore_index=True discards the original row labels and
# renumbers the result 0..7

pd.concat([df1,df2], ignore_index=True)

Unnamed: 0,Q,X,Y,Z
0,,-0.384537,-0.016669,-1.188617
1,,0.314467,-0.642357,-1.23748
2,,-0.449153,0.27742,0.371142
3,,-0.196756,0.106224,0.253406
4,-0.44649,-0.182655,-1.679106,
5,1.455062,-0.381497,-1.660777,
6,-0.067094,0.346649,-1.917147,
7,0.554554,-0.609612,-0.400267,


In [10]:
# more info on merging, joining and concatenating
# https://pandas.pydata.org/docs/user_guide/merging.html

### combining series using pd.Series() and np.where() logic

In [12]:
# ser1 has gaps (NaN) at R, T and V;
# ser2 is a fully populated float series 0..5 on the same labels
n = np.nan

ser1 = pd.Series(data=[2,n,4,n,6,n],
                 index=['Q','R','S','T','U','V'])

ser2 = pd.Series(data=np.arange(len(ser1)),
                 index=['Q','R','S','T','U','V'],
                 dtype=np.float64)
ser1

Q     2
R   NaN
S     4
T   NaN
U     6
V   NaN
dtype: float64

In [13]:
# display the fully populated series
ser2

Q    0
R    1
S    2
T    3
U    4
V    5
dtype: float64

In [15]:
# make a new series, choosing element by element:
#   where ser1 is null take the value from ser2, otherwise keep ser1's value
# np.where returns a plain array, so ser1's index is attached again explicitly

pd.Series( np.where(pd.isnull(ser1),ser2,ser1) ,
         index=ser1.index)

Q    2
R    1
S    4
T    3
U    6
V    5
dtype: float64

### combining series using .combine_first()

In [20]:
# the same pair of series as in the np.where() section, rebuilt here:
# ser1 has NaN at R, T and V; ser2 holds 0.0 .. 5.0 on the same labels
n = np.nan

ser1 = pd.Series(data=[2,n,4,n,6,n],
                 index=['Q','R','S','T','U','V'])

ser2 = pd.Series(data=np.arange(len(ser1)),
                 index=['Q','R','S','T','U','V'],
                 dtype=np.float64)
ser1

Q     2
R   NaN
S     4
T   NaN
U     6
V   NaN
dtype: float64

In [21]:
# display the fully populated series
ser2

Q    0
R    1
S    2
T    3
U    4
V    5
dtype: float64

In [23]:
# make a new series in a single call:
# keep ser1's values and, where ser1 is null, use ser2's value at the same label
# the result carries the index labels, so no index has to be passed

ser1.combine_first(ser2)

Q    2
R    1
S    4
T    3
U    6
V    5
dtype: float64

### combining dataframes using .combine_first()

In [25]:
# two frames with gaps (NaN) in different places:
# odds has 4 rows and columns X, Y, Z; evens has 5 rows and columns X, Y only

n = np.nan

odds = pd.DataFrame(data=dict(X=[1,n,3,n],
                              Y=[n,5,n,7],
                              Z=[n,9,n,11]))

evens = pd.DataFrame(data=dict(X=[2,4,n,6,8],
                               Y=[n,10,12,14,16]))

odds

Unnamed: 0,X,Y,Z
0,1.0,,
1,,5.0,9.0
2,3.0,,
3,,7.0,11.0


In [26]:
# display the second frame (5 rows, columns X and Y)
evens

Unnamed: 0,X,Y
0,2.0,
1,4.0,10.0
2,,12.0
3,6.0,14.0
4,8.0,16.0


In [28]:
# combine the two dataframes: values from odds take priority, and evens is used
# only where odds is null or has no such row (row 4 exists only in evens)
# a cell stays null only when neither frame has a value for it

odds.combine_first(evens) 

Unnamed: 0,X,Y,Z
0,1,,
1,4,5.0,9.0
2,3,12.0,
3,6,7.0,11.0
4,8,16.0,
