## Session 15 (pandas part 5)

### Combining DataFrames

In [32]:
import pandas as pd
import numpy as np

In [7]:
# with code below we can display items horizontally in notebook
from IPython.display import HTML
HTML('<style>.output{flex-direction:row;flex-wrap:wrap}</style>')

In [8]:
a = 1
b = 2

display(a)
display(b)

1

2

### combining pandas series

In [12]:
d1 = pd.Series([4, 5, 9, 2])
d2 = pd.Series([6, 6, 3, 3])
d = pd.concat([d1, d2])

display(d1)
display(d2)
display(d)

0    4
1    5
2    9
3    2
dtype: int64

0    6
1    6
2    3
3    3
dtype: int64

0    4
1    5
2    9
3    2
0    6
1    6
2    3
3    3
dtype: int64

In [13]:
d2 = pd.concat([d1, d2], axis=1)
d2

Unnamed: 0,0,1
0,4,6
1,5,6
2,9,3
3,2,3


### combining pandas dataframes

In [15]:
pd.concat([d2, d2]) # axis is default 0

Unnamed: 0,0,1
0,4,6
1,5,6
2,9,3
3,2,3
0,4,6
1,5,6
2,9,3
3,2,3


In [16]:
pd.concat([d2, d2], axis=1)

Unnamed: 0,0,1,0.1,1.1
0,4,6,4,6
1,5,6,5,6
2,9,3,9,3
3,2,3,2,3


In [17]:
pd.concat([d2, d2, d2], axis=1)

Unnamed: 0,0,1,0.1,1.1,0.2,1.2
0,4,6,4,6,4,6
1,5,6,5,6,5,6
2,9,3,9,3,9,3
3,2,3,2,3,2,3


In [18]:
# as you see from above combining results, we may have duplicate indices;
# below we will see how to handle duplicate indices.

In [23]:
# pd.concat([d2, d2], verify_integrity=True) --> raise error
# this parameter will prevent from having duplicate indices

In [25]:
pd.concat([d2, d2], ignore_index=True)
# this parameter will override current indices and starts indexing from 0 to len of object

Unnamed: 0,0,1
0,4,6
1,5,6
2,9,3
3,2,3
4,4,6
5,5,6
6,9,3
7,2,3


In [26]:
# now what if we want to have our indices kept ?
# we can use multi-index

In [30]:
d1 = pd.Series([4, 5, 9, 2], index=['a', 'b', 'c', 'd'])
d2 = pd.Series([6, 6, 3, 3], index=['a', 'b', 'c', 'd'])
d = pd.concat([d1, d2], keys=['d1', 'd2']) # note that keys names are arbitrary
d

d1  a    4
    b    5
    c    9
    d    2
d2  a    6
    b    6
    c    3
    d    3
dtype: int64

In [31]:
# concat dataframes with different columns

In [36]:
df1 = pd.DataFrame(np.random.randint(1, 10, (5, 4)), columns=['A', 'B', 'C', 'D'])
df2 = pd.DataFrame(np.random.randint(1, 10, (5, 4)), columns=['D', 'B', 'E', 'F'])
display(df1)
display(df2)
display(pd.concat([df1, df2]))

Unnamed: 0,A,B,C,D
0,9,3,2,5
1,7,4,5,4
2,7,4,3,5
3,3,5,5,9
4,4,5,2,3


Unnamed: 0,D,B,E,F
0,6,4,1,9
1,6,5,6,3
2,2,4,9,1
3,1,7,5,7
4,2,6,6,7


Unnamed: 0,A,B,C,D,E,F
0,9.0,3,2.0,5,,
1,7.0,4,5.0,4,,
2,7.0,4,3.0,5,,
3,3.0,5,5.0,9,,
4,4.0,5,2.0,3,,
0,,4,,6,1.0,9.0
1,,5,,6,6.0,3.0
2,,4,,2,9.0,1.0
3,,7,,1,5.0,7.0
4,,6,,2,6.0,7.0


In [39]:
# if we want to exclude columns that have missing values, one way is this :
display(pd.concat([df1, df2], join='inner'))

Unnamed: 0,B,D
0,3,5
1,4,4
2,4,5
3,5,9
4,5,3
0,4,6
1,5,6
2,4,2
3,7,1
4,6,2


In [41]:
df1.append(df2) # append method is also available for joining

  df1.append(df2) # append method is also available for joining


Unnamed: 0,A,B,C,D,E,F
0,9.0,3,2.0,5,,
1,7.0,4,5.0,4,,
2,7.0,4,3.0,5,,
3,3.0,5,5.0,9,,
4,4.0,5,2.0,3,,
0,,4,,6,1.0,9.0
1,,5,,6,6.0,3.0
2,,4,,2,9.0,1.0
3,,7,,1,5.0,7.0
4,,6,,2,6.0,7.0


### dataframe relations

In [49]:
df1 = pd.DataFrame({'person': ['babak', 'sara', 'reza', 'saeed'], 
                   'age': [40, 30, 15, 19]})
df2 = pd.DataFrame({'person': ['saeed', 'sara', 'reza', 'babak'], 
                   'weight': [70, 55, 60, 86]})
display(df1)
display(df2)

display(pd.concat([df1, df2]))

Unnamed: 0,person,age
0,babak,40
1,sara,30
2,reza,15
3,saeed,19


Unnamed: 0,person,weight
0,saeed,70
1,sara,55
2,reza,60
3,babak,86


Unnamed: 0,person,age,weight
0,babak,40.0,
1,sara,30.0,
2,reza,15.0,
3,saeed,19.0,
0,saeed,,70.0
1,sara,,55.0
2,reza,,60.0
3,babak,,86.0


In [51]:
# one-to-one join

In [52]:
display(df1)
display(df2)

display(pd.merge(df1, df2))

Unnamed: 0,person,age
0,babak,40
1,sara,30
2,reza,15
3,saeed,19


Unnamed: 0,person,weight
0,saeed,70
1,sara,55
2,reza,60
3,babak,86


Unnamed: 0,person,age,weight
0,babak,40,86
1,sara,30,55
2,reza,15,60
3,saeed,19,70
