# Combining and Reshaping

In [1]:
import numpy as np
import pandas as pd
import datetime

# Keep rendered frames compact: show at most 10 columns and 10 rows.
pd.options.display.max_columns = 10
pd.options.display.max_rows = 10

In [2]:
# Two three-element integer Series (0..2 and 5..7); both get the default
# 0..2 index.
s1, s2 = (pd.Series(np.arange(start, start + 3)) for start in (0, 5))
s1

0    0
1    1
2    2
dtype: int64

In [3]:
# Display the second Series for comparison with s1.
s2

0    5
1    6
2    7
dtype: int64

In [4]:
# Stack s2 below s1. Each Series keeps its own index labels, so the labels
# 0-2 appear twice in the result.
pd.concat([s1, s2])

0    0
1    1
2    2
0    5
1    6
2    7
dtype: int64

In [6]:
# Two 3x3 frames with identical column labels; values run 0-8 and 9-17.
df1 = pd.DataFrame(np.arange(9).reshape(3, 3), columns=list('abc'))
df2 = pd.DataFrame(np.arange(9, 18).reshape(3, 3), columns=list('abc'))
df1

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8


In [7]:
# Display the second frame.
df2

Unnamed: 0,a,b,c
0,9,10,11
1,12,13,14
2,15,16,17


In [8]:
# Row-wise concatenation: df2's rows are appended below df1's, and the
# original index labels (0-2) are repeated.
pd.concat([df1, df2])

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8
0,9,10,11
1,12,13,14
2,15,16,17


In [11]:
# Rebuild the two frames so their columns only partly overlap:
# df1 has a, b, c while df2 has a, c, d.
first_values = np.arange(9).reshape(3, 3)
second_values = np.arange(9, 18).reshape(3, 3)
df1 = pd.DataFrame(first_values, columns=list('abc'))
df2 = pd.DataFrame(second_values, columns=list('acd'))
df1

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8


In [12]:
# Display the second frame (columns a, c, d).
df2

Unnamed: 0,a,c,d
0,9,10,11
1,12,13,14
2,15,16,17


In [13]:
# The result carries the union of the columns (a, b, c, d); where a frame has
# no such column, its rows hold missing values.
pd.concat([df1, df2])

Unnamed: 0,a,b,c,d
0,0,1.0,2,
1,3,4.0,5,
2,6,7.0,8,
0,9,,10,11.0
1,12,,13,14.0
2,15,,16,17.0


In [15]:
# keys= adds an outer index level ('df1' / 'df2') recording which input frame
# each row came from.
c = pd.concat([df1, df2], keys=['df1', 'df2'])
c

Unnamed: 0,Unnamed: 1,a,b,c,d
df1,0,0,1.0,2,
df1,1,3,4.0,5,
df1,2,6,7.0,8,
df2,0,9,,10,11.0
df2,1,12,,13,14.0
df2,2,15,,16,17.0


In [16]:
# Select the rows that came from df2 via the outer index level.
# .loc is used because the .ix indexer was deprecated and later removed
# from pandas.
c.loc['df2']

Unnamed: 0,a,b,c,d
0,9,,10,11
1,12,,13,14
2,15,,16,17


In [17]:
# axis=1 places the frames side by side, aligned on the row index. Column
# labels present in both frames ('a' and 'c') appear twice in the result.
pd.concat([df1, df2], axis=1)

Unnamed: 0,a,b,c,a.1,c.1,d
0,0,1,2,9,10,11
1,3,4,5,12,13,14
2,6,7,8,15,16,17


In [18]:
# A 3x2 frame whose index (2, 3, 4) overlaps df1's index only at label 2.
df3_values = np.arange(20, 26).reshape(3, 2)
df3 = pd.DataFrame(df3_values, index=[2, 3, 4], columns=['a', 'd'])
df3

Unnamed: 0,a,d
2,20,21
3,22,23
4,24,25


In [20]:
# Rows are aligned on index label. Labels found in only one frame
# (0, 1, 3, 4) are kept and padded with missing values.
pd.concat([df1, df3], axis=1)

Unnamed: 0,a,b,c,a.1,d
0,0.0,1.0,2.0,,
1,3.0,4.0,5.0,,
2,6.0,7.0,8.0,20.0,21.0
3,,,,22.0,23.0
4,,,,24.0,25.0


In [21]:
# join='inner' keeps only the index labels present in both frames, which
# here is just label 2.
pd.concat([df1, df3], axis=1, join='inner')

Unnamed: 0,a,b,c,a.1,d
2,6,7,8,20,21


In [22]:
# With axis=1, keys= adds an outer column level ('df1' / 'df2') so the
# columns of each source frame can be addressed as a group.
df = pd.concat([df1, df2],
               axis=1,
               keys=['df1', 'df2'])
df

Unnamed: 0_level_0,df1,df1,df1,df2,df2,df2
Unnamed: 0_level_1,a,b,c,a,c,d
0,0,1,2,9,10,11
1,3,4,5,12,13,14
2,6,7,8,15,16,17


In [24]:
# Select every row and the column group labelled 'df2' on the outer level.
# .loc is used because the .ix indexer was deprecated and later removed
# from pandas.
df.loc[:, 'df2']

Unnamed: 0,a,c,d
0,9,10,11
1,12,13,14
2,15,16,17


In [25]:
# Append df2's rows below df1's. pd.concat is used because DataFrame.append
# was deprecated and later removed from pandas; the result keeps each
# frame's original index labels.
pd.concat([df1, df2])

Unnamed: 0,a,b,c,d
0,0,1.0,2,
1,3,4.0,5,
2,6,7.0,8,
0,9,,10,11.0
1,12,,13,14.0
2,15,,16,17.0


In [27]:
# Append df2's rows below df1's and renumber the result 0..n-1.
# pd.concat is used because DataFrame.append was deprecated and later
# removed from pandas.
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,a,b,c,d
0,0,1.0,2,
1,3,4.0,5,
2,6,7.0,8,
3,9,,10,11.0
4,12,,13,14.0
5,15,,16,17.0


## Merging and Joining

In [30]:
# Customer lookup table: one row per customer, keyed by CustomerID.
customers = pd.DataFrame({
    'CustomerID': [10, 11],
    'Name': ['Mike', 'Marcia'],
    'Address': ['Address for Mike', 'Address for Marcia'],
})
customers

Unnamed: 0,Address,CustomerID,Name
0,Address for Mike,10,Mike
1,Address for Marcia,11,Marcia


In [32]:
# Orders table: three orders, all dated 2014-12-01; customer 10 has two.
orders = pd.DataFrame({
    'CustomerID': [10, 11, 10],
    'OrderDate': [datetime.date(2014, 12, 1)] * 3,
})
orders

Unnamed: 0,CustomerID,OrderDate
0,10,2014-12-01
1,11,2014-12-01
2,10,2014-12-01


In [33]:
# With no arguments, merge joins on the column the two frames share
# (CustomerID); each customer row is repeated once per matching order.
customers.merge(orders)

Unnamed: 0,Address,CustomerID,Name,OrderDate
0,Address for Mike,10,Mike,2014-12-01
1,Address for Mike,10,Mike,2014-12-01
2,Address for Marcia,11,Marcia,2014-12-01


In [39]:
# Two frames sharing the key columns key1/key2. They disagree on key2 for the
# 'b' row, and their indexes (0-2 vs 1-3) overlap only partly.
left_data = dict(key1=list('abc'), key2=list('xyz'), lval1=[0, 1, 2])
right_data = dict(key1=list('abc'), key2=list('xaz'), rval1=[6, 7, 8])

left = pd.DataFrame(left_data, index=[0, 1, 2])
right = pd.DataFrame(right_data, index=[1, 2, 3])
left

Unnamed: 0,key1,key2,lval1
0,a,x,0
1,b,y,1
2,c,z,2


In [40]:
# Display the right-hand frame.
right

Unnamed: 0,key1,key2,rval1
1,a,x,6
2,b,a,7
3,c,z,8


In [42]:
# Default merge: inner join on all shared columns (key1 and key2). Only the
# (a, x) and (c, z) rows match on both keys.
left.merge(right)

Unnamed: 0,key1,key2,lval1,rval1
0,a,x,0,6
1,c,z,2,8


In [43]:
# Join on key1 only. key2 is then an ordinary overlapping column, so it is
# returned twice with the _x / _y suffixes.
left.merge(right, on='key1')

Unnamed: 0,key1,key2_x,lval1,key2_y,rval1
0,a,x,0,x,6
1,b,y,1,a,7
2,c,z,2,z,8


In [46]:
# Naming both key columns explicitly gives the same result as the default
# merge above.
left.merge(right, on=['key1', 'key2'])

Unnamed: 0,key1,key2,lval1,rval1
0,a,x,0,6
1,c,z,2,8


In [47]:
# Join on the row index instead of on columns. Only index labels 1 and 2
# exist in both frames; all overlapping columns get _x / _y suffixes.
pd.merge(left, right, left_index=True, right_index=True)

Unnamed: 0,key1_x,key2_x,lval1,key1_y,key2_y,rval1
1,b,y,1,a,x,6
2,c,z,2,b,a,7


In [51]:
# Outer join: keep key combinations from both sides and fill the value
# column of the side without a match with a missing value.
left.merge(right, how='outer')

Unnamed: 0,key1,key2,lval1,rval1
0,a,x,0.0,6.0
1,b,y,1.0,
2,c,z,2.0,8.0
3,b,a,,7.0


In [52]:
# Left join: keep every row of `left`; rval1 is missing where `right` has no
# matching keys.
left.merge(right, how='left')

Unnamed: 0,key1,key2,lval1,rval1
0,a,x,0,6.0
1,b,y,1,
2,c,z,2,8.0


In [53]:
# Right join: keep every row of `right`; lval1 is missing where `left` has no
# matching keys.
left.merge(right, how='right')

Unnamed: 0,key1,key2,lval1,rval1
0,a,x,0.0,6
1,c,z,2.0,8
2,b,a,,7


In [54]:
# join() aligns on the row index and, as the output shows, keeps every row of
# `left` here. The suffixes disambiguate the column names shared by both.
left.join(right, lsuffix='_left', rsuffix='_right')

Unnamed: 0,key1_left,key2_left,lval1,key1_right,key2_right,rval1
0,a,x,0,,,
1,b,y,1,a,x,6.0
2,c,z,2,b,a,7.0


In [56]:
# how='inner' restricts the join to index labels present in both frames
# (1 and 2).
left.join(right, lsuffix='_left', rsuffix='_right', how='inner')

Unnamed: 0,key1_left,key2_left,lval1,key1_right,key2_right,rval1
1,b,y,1,a,x,6
2,c,z,2,b,a,7


## Pivoting

In [111]:
# Sample sensor data in long format: one row per (interval, axis) pair.
# Each tuple below holds the X, Y and Z readings of one interval, listed in
# interval order 0..3.
xyz_by_interval = [(0.0, 0.5, 1.0),
                   (0.1, 0.4, 0.9),
                   (0.2, 0.3, 0.8),
                   (0.3, 0.2, 0.7)]
sensor_readings = pd.DataFrame(
    [{'interval': step, 'axis': label, 'reading': value}
     for step, xyz in enumerate(xyz_by_interval)
     for label, value in zip('XYZ', xyz)])
sensor_readings

Unnamed: 0,axis,interval,reading
0,X,0,0.0
1,Y,0,0.5
2,Z,0,1.0
3,X,1,0.1
4,Y,1,0.4
...,...,...,...
7,Y,2,0.3
8,Z,2,0.8
9,X,3,0.3
10,Y,3,0.2


In [112]:
# Disabled export of the readings to data_accel.csv. The following cell reads
# that file, so uncomment and run this line once if the file is not present
# (presumably this is how the file was originally produced -- verify).
# sensor_readings.to_csv("data_accel.csv", index = False)

In [113]:
# Load the readings from data_accel.csv, rebinding sensor_readings to the
# frame read from disk. Requires the file to exist in the working directory.
sensor_readings = pd.read_csv("data_accel.csv")
sensor_readings

Unnamed: 0,axis,interval,reading
0,X,0,0.0
1,Y,0,0.5
2,Z,0,1.0
3,X,1,0.1
4,Y,1,0.4
...,...,...,...
7,Y,2,0.3
8,Z,2,0.8
9,X,3,0.3
10,Y,3,0.2


In [114]:
# Boolean-mask selection: keep only the rows whose axis is 'X'.
sensor_readings[sensor_readings['axis'] =='X']

Unnamed: 0,axis,interval,reading
0,X,0,0.0
3,X,1,0.1
6,X,2,0.2
9,X,3,0.3


## pivot

In [115]:
# Reshape long to wide: one row per interval, one column per axis, with the
# reading as the cell value.
sensor_readings.pivot(index='interval',
                       columns='axis',
                      values='reading')

axis,X,Y,Z
interval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.0,0.5,1.0
1,0.1,0.4,0.9
2,0.2,0.3,0.8
3,0.3,0.2,0.7


## Stacking and Unstacking

In [116]:
# Single-column frame used to demonstrate stack(). The index is passed as a
# list rather than a set: a set has no defined order, so the row order would
# not be reproducible, and newer pandas releases refuse a set index.
# With this order 'one' maps to 2 and 'two' maps to 1.
df = pd.DataFrame({'a': [2, 1]},
                  index=['one', 'two'])
df

Unnamed: 0,a
one,2
two,1


In [117]:
# Stacking moves the column labels into a new innermost level of the row
# index, so this one-column frame becomes a Series keyed by (row, column).
stacked1 = df.stack()
stacked1

one  a    2
two  a    1
dtype: int64

In [118]:
# Look up a single value with a (row label, column label) tuple.
stacked1[('one', 'a')]

2

In [119]:
# Two-column frame for the next stack() example. The index is a list rather
# than a set so the row order is fixed: 'two' first, then 'one'.
df = pd.DataFrame({'a': [1, 2],
                   'b': [3, 4]},
                   index=['two', 'one'])
df

Unnamed: 0,a,b
two,1,3
one,2,4


In [120]:
# With two columns, stacking yields two entries (a and b) per original row.
stacked2 = df.stack()
stacked2

two  a    1
     b    3
one  a    2
     b    4
dtype: int64

In [121]:
# Look up the value stored for row 'one', column 'b'.
stacked2[('one', 'b')]

4

In [122]:
# Build one copy of the readings per user. Mike's values are unchanged and
# Mikael's are multiplied by 100, which makes the two users easy to tell
# apart in the reshaping examples that follow.
user1 = sensor_readings.assign(who='Mike')
user2 = sensor_readings.assign(who='Mikael',
                               reading=sensor_readings['reading'] * 100)

# Stack both users and index the rows by (who, interval, axis), leaving
# 'reading' as the only data column.
multi_user_sensor_data = (
    pd.concat([user1, user2])
      .set_index(['who', 'interval', 'axis'])
)
multi_user_sensor_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,reading
who,interval,axis,Unnamed: 3_level_1
Mike,0,X,0.0
Mike,0,Y,0.5
Mike,0,Z,1.0
Mike,1,X,0.1
Mike,1,Y,0.4
...,...,...,...
Mikael,2,Y,30.0
Mikael,2,Z,80.0
Mikael,3,X,30.0
Mikael,3,Y,20.0


In [123]:
# Select all of Mike's rows via the outermost index level ('who').
# .loc is used because the .ix indexer was deprecated and later removed
# from pandas.
multi_user_sensor_data.loc['Mike']

Unnamed: 0_level_0,Unnamed: 1_level_0,reading
interval,axis,Unnamed: 2_level_1
0,X,0.0
0,Y,0.5
0,Z,1.0
1,X,0.1
1,Y,0.4
...,...,...
2,Y,0.3
2,Z,0.8
3,X,0.3
3,Y,0.2


In [124]:
# Cross-section: every row whose 'interval' level equals 1, for all users and
# axes. The selected level is dropped from the result's index.
multi_user_sensor_data.xs(1, level='interval')

Unnamed: 0_level_0,Unnamed: 1_level_0,reading
who,axis,Unnamed: 2_level_1
Mike,X,0.1
Mike,Y,0.4
Mike,Z,0.9
Mikael,X,10.0
Mikael,Y,40.0
Mikael,Z,90.0


In [126]:
# unstack() with no arguments moves the innermost index level (axis) into
# the columns.
multi_user_sensor_data.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,reading,reading,reading
Unnamed: 0_level_1,axis,X,Y,Z
who,interval,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Mikael,0,0.0,50.0,100.0
Mikael,1,10.0,40.0,90.0
Mikael,2,20.0,30.0,80.0
Mikael,3,30.0,20.0,70.0
Mike,0,0.0,0.5,1.0
Mike,1,0.1,0.4,0.9
Mike,2,0.2,0.3,0.8
Mike,3,0.3,0.2,0.7


In [128]:
# level=0 moves the outermost index level (who) into the columns instead.
multi_user_sensor_data.unstack(level=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,reading,reading
Unnamed: 0_level_1,who,Mikael,Mike
interval,axis,Unnamed: 2_level_2,Unnamed: 3_level_2
0,X,0,0.0
0,Y,50,0.5
0,Z,100,1.0
1,X,10,0.1
1,Y,40,0.4
...,...,...,...
2,Y,30,0.3
2,Z,80,0.8
3,X,30,0.3
3,Y,20,0.2


In [129]:
# Unstack two levels at once: 'who' and 'axis' become column levels and
# 'interval' is left as the only row index level.
unstacked = multi_user_sensor_data.unstack(['who', 'axis'])
unstacked

Unnamed: 0_level_0,reading,reading,reading,reading,reading,reading
who,Mike,Mike,Mike,Mikael,Mikael,Mikael
axis,X,Y,Z,X,Y,Z
interval,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
0,0.0,0.5,1.0,0,50,100
1,0.1,0.4,0.9,10,40,90
2,0.2,0.3,0.8,20,30,80
3,0.3,0.2,0.7,30,20,70


In [130]:
# Move the 'who' column level back into the row index; 'axis' stays in the
# columns.
unstacked.stack(level='who')

Unnamed: 0_level_0,Unnamed: 1_level_0,reading,reading,reading
Unnamed: 0_level_1,axis,X,Y,Z
interval,who,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,Mikael,0.0,50.0,100.0
0,Mike,0.0,0.5,1.0
1,Mikael,10.0,40.0,90.0
1,Mike,0.1,0.4,0.9
2,Mikael,20.0,30.0,80.0
2,Mike,0.2,0.3,0.8
3,Mikael,30.0,20.0,70.0
3,Mike,0.3,0.2,0.7


## Melting

In [133]:
# Wide-format sample: one row per person, one column per measurement.
data = pd.DataFrame(dict(Name=['Mike', 'Mikael'],
                         Height=[6.1, 6.0],
                         Weight=[220, 185]))
data

Unnamed: 0,Height,Name,Weight
0,6.1,Mike,220
1,6.0,Mikael,185


In [134]:
# Wide to long: Name is kept as the identifier column, and each Height /
# Weight cell becomes its own row with 'variable' and 'value' columns.
pd.melt(data,
        id_vars=['Name'],
        value_vars=['Height', 'Weight'])

Unnamed: 0,Name,variable,value
0,Mike,Height,6.1
1,Mikael,Height,6.0
2,Mike,Weight,220.0
3,Mikael,Weight,185.0


In [135]:
import timeit

In [None]:
# Compare the cost of three ways of reading one scalar. Each lookup is run
# N_LOOKUPS times and timeit returns the total elapsed time in seconds.
# The unused timeit.Timer object that used to be built here was dead code
# (it was never timed) and has been removed.
N_LOOKUPS = 10000

# r1: tuple lookup on the stacked Series.
r1 = timeit.timeit(lambda: stacked1.loc[('one', 'a')], number=N_LOOKUPS)
# r2: chained label lookup (row first, then column) on the frame.
r2 = timeit.timeit(lambda: df.loc['one']['a'], number=N_LOOKUPS)
# r3: positional lookup (second row, first column) on the frame.
r3 = timeit.timeit(lambda: df.iloc[1, 0], number=N_LOOKUPS)

r1, r2, r3