# Pandas (continues)

In [1]:
import sys

import numpy as np
import pandas as pd

## Catenating datasets

In [2]:
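# axis=0 catenates vertically (stacks rows) and axis=1 catenates horizontally (places columns side by side).
# pd.concat on DataFrames works similarly to np.concatenate on arrays, but the row indices and the column names require extra attention.
# The handling of these labels is the main difference between np.concatenate and pd.concat.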
## helper for creating dataframes
def makedf(cols, ind):
    """Build a small demo DataFrame whose cells name their own position.

    Parameters
    ----------
    cols : iterable
        Column labels; iterating a string such as "AB" yields one label
        per character.
    ind : sequence
        Row labels (re-iterated once per column), used as the index of
        the result.

    Returns
    -------
    pd.DataFrame
        Frame in which the cell at column ``c`` and row ``i`` holds the
        string ``str(c) + str(i)``, e.g. ``"A0"``.
    """
    cells = {}
    for col in cols:
        # Each cell is the column label immediately followed by the row label.
        cells[col] = ["".join((str(col), str(row))) for row in ind]
    return pd.DataFrame(cells, index=ind)

In [3]:
# Columns A and B, row labels 0 and 1.
a = makedf(cols="AB", ind=[0, 1])
a

Unnamed: 0,A,B
0,A0,B0
1,A1,B1


In [4]:
# Same columns as `a` (A and B), but row labels 2 and 3.
b = makedf(cols="AB", ind=[2, 3])
b

Unnamed: 0,A,B
2,A2,B2
3,A3,B3


In [5]:
# Same row labels as `a` (0 and 1), but columns C and D.
c = makedf(cols="CD", ind=[0, 1])
c

Unnamed: 0,C,D
0,C0,D0
1,C1,D1


In [6]:
# Columns B and C with row labels 2 and 3: shares only column B with `a`.
d = makedf(cols="BC", ind=[2, 3])
d

Unnamed: 0,B,C
2,B2,C2
3,B3,C3


In [7]:
## concat a and b row-wise -- same columns, disjoint row labels, so it works as expected
pd.concat([a, b], axis=0)   # axis=0 is the default; spelled out here for clarity

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [8]:
# Concatenating a frame with itself keeps the original row labels, so 0 and 1 each appear twice.
r = pd.concat([a, a])
r

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
0,A0,B0
1,A1,B1


In [9]:
# Row label 0 is duplicated in r, so this lookup returns a two-element Series instead of a single value.
r.loc[0, "A"]

0    A0
0    A0
Name: A, dtype: object

In [10]:
## Row indices should be unique, so the duplicated labels above are not what we are aiming for.
## verify_integrity=True makes pd.concat check for duplicated indices and raise ValueError if any are found.
try:
    pd.concat([a, a], verify_integrity=True)
except ValueError as err:
    # The exception is the demonstration: show its message rather than a full traceback.
    # `sys` comes from the notebook's top import cell instead of being imported in the handler.
    print(err, file=sys.stderr)

Indexes have overlapping values: Int64Index([0, 1], dtype='int64')


In [11]:
## automatic renumbering of rows: ignore_index=True drops the original labels and numbers the result from 0
pd.concat([a, a], ignore_index=True)

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A0,B0
3,A1,B1


In [12]:
## hierarchical indexing -- multiple level indices
## each key labels the rows coming from one input frame, forming the outer index level
r2 = pd.concat([a, a], keys=["first", "second"])
r2

Unnamed: 0,Unnamed: 1,A,B
first,0,A0,B0
first,1,A1,B1
second,0,A0,B0
second,1,A1,B1


In [13]:
# Chained lookup works as in numpy, one level per bracket:
# column "A" -> outer key "first" -> inner row label 0.
r2["A"]["first"][0]

'A0'

In [14]:
## horizontal catenation works similarly: axis=1 places the columns of a and c side by side
pd.concat([a, c], axis=1)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1


In [15]:
## If you vertically concatenate two DataFrames that don't have the same columns,
## the default is an outer join: all columns are kept (the common ones and the different ones),
## and cells with no source value are left missing.
pd.concat([a, d], join="outer", sort=False)   # join="outer" is the default; sort is passed to silence a deprecation message

Unnamed: 0,A,B,C
0,A0,B0,
1,A1,B1,
2,,B2,C2
3,,B3,C3


In [16]:
## the same catenation with an inner join: only the columns present in both frames (here B) are kept
pd.concat([a, d], join="inner")

Unnamed: 0,B
0,B0
1,B1
2,B2
3,B3
