# PANDAS IN DEPTH - DATA MANIPULATION

## Merging

In [1]:
import numpy as np
import pandas as pd

In [2]:
frame1 = pd.DataFrame( {'id':['ball','pencil','pen','mug','ashtray'],
                        'price': [12.33,11.44,33.21,13.23,33.62]})
frame1

Unnamed: 0,id,price
0,ball,12.33
1,pencil,11.44
2,pen,33.21
3,mug,13.23
4,ashtray,33.62


In [3]:
frame2 = pd.DataFrame( {'id': ['pencil','pencil','ball','pen'],
                        'color':['white','red','red','black']})
frame2

Unnamed: 0,color,id
0,white,pencil
1,red,pencil
2,red,ball
3,black,pen


In [4]:
pd.merge(frame1,frame2)

Unnamed: 0,id,price,color
0,ball,12.33,red
1,pencil,11.44,white
2,pencil,11.44,red
3,pen,33.21,black


In [5]:
frame1 = pd.DataFrame( {'id': ['ball','pencil','pen',',mug','ashtray'],
                        'color': ['white','red','red','black','green'],
                        'brand': ['OMG','ABC','ABC','POD','PPOD']})
frame1

Unnamed: 0,brand,color,id
0,OMG,white,ball
1,ABC,red,pencil
2,ABC,red,pen
3,POD,black,",mug"
4,PPOD,green,ashtray


In [6]:
frame2 = pd.DataFrame( {'id':['pencil','pencil','ball','pen'],
                        'brand': ['OMG','POD','ABC','POD']})
frame2

Unnamed: 0,brand,id
0,OMG,pencil
1,POD,pencil
2,ABC,ball
3,POD,pen


In [7]:
pd.merge(frame1,frame2)

Unnamed: 0,brand,color,id


In [8]:
pd.merge(frame1, frame2, on='id')

Unnamed: 0,brand_x,color,id,brand_y
0,OMG,white,ball,ABC
1,ABC,red,pencil,OMG
2,ABC,red,pencil,POD
3,ABC,red,pen,POD


In [9]:
pd.merge(frame1, frame2, on='brand')

Unnamed: 0,brand,color,id_x,id_y
0,OMG,white,ball,pencil
1,ABC,red,pencil,ball
2,ABC,red,pen,ball
3,POD,black,",mug",pencil
4,POD,black,",mug",pen


In [10]:
frame2.columns = ['brand','sid']
frame2

Unnamed: 0,brand,sid
0,OMG,pencil
1,POD,pencil
2,ABC,ball
3,POD,pen


In [11]:
pd.merge(frame1, frame2, left_on='id', right_on='sid')

Unnamed: 0,brand_x,color,id,brand_y,sid
0,OMG,white,ball,ABC,ball
1,ABC,red,pencil,OMG,pencil
2,ABC,red,pencil,POD,pencil
3,ABC,red,pen,POD,pen


In [12]:
frame2.columns = ['brand','id']
pd.merge(frame1,frame2,on='id')

Unnamed: 0,brand_x,color,id,brand_y
0,OMG,white,ball,ABC
1,ABC,red,pencil,OMG
2,ABC,red,pencil,POD
3,ABC,red,pen,POD


In [13]:
pd.merge(frame1,frame2,on='id',how='outer')

Unnamed: 0,brand_x,color,id,brand_y
0,OMG,white,ball,ABC
1,ABC,red,pencil,OMG
2,ABC,red,pencil,POD
3,ABC,red,pen,POD
4,POD,black,",mug",
5,PPOD,green,ashtray,


In [14]:
pd.merge(frame1,frame2,on='id',how='left')

Unnamed: 0,brand_x,color,id,brand_y
0,OMG,white,ball,ABC
1,ABC,red,pencil,OMG
2,ABC,red,pencil,POD
3,ABC,red,pen,POD
4,POD,black,",mug",
5,PPOD,green,ashtray,


In [15]:
pd.merge(frame1,frame2,on='id',how='right')

Unnamed: 0,brand_x,color,id,brand_y
0,OMG,white,ball,ABC
1,ABC,red,pencil,OMG
2,ABC,red,pencil,POD
3,ABC,red,pen,POD


In [16]:
pd.merge(frame1,frame2,on=['id','brand'], how='outer')

Unnamed: 0,brand,color,id
0,OMG,white,ball
1,ABC,red,pencil
2,ABC,red,pen
3,POD,black,",mug"
4,PPOD,green,ashtray
5,OMG,,pencil
6,POD,,pencil
7,ABC,,ball
8,POD,,pen


### Merging on Index

In [17]:
pd.merge(frame1,frame2,right_index=True, left_index=True)

Unnamed: 0,brand_x,color,id_x,brand_y,id_y
0,OMG,white,ball,OMG,pencil
1,ABC,red,pencil,POD,pencil
2,ABC,red,pen,ABC,ball
3,POD,black,",mug",POD,pen


In [18]:
frame1.join(frame2)

ValueError: columns overlap but no suffix specified: Index(['brand', 'id'], dtype='object')

In [None]:
frame2.columns = ['brand2','id2']
frame1.join(frame2)

## Concatenating

In [None]:
array1 = np.arange(9).reshape((3,3))
array1

In [None]:
array2 = np.arange(9).reshape((3,3))+6
array2

In [None]:
np.concatenate([array1,array2],axis=1)

In [19]:
np.concatenate([array1,array2],axis=0)

NameError: name 'array1' is not defined

In [20]:
ser1 = pd.Series(np.random.rand(4), index=[1,2,3,4])
ser1

1    0.953608
2    0.929539
3    0.036994
4    0.010650
dtype: float64

In [21]:
ser2 = pd.Series(np.random.rand(4), index=[5,6,7,8])
ser2

5    0.200771
6    0.709060
7    0.813766
8    0.218998
dtype: float64

In [22]:
pd.concat([ser1,ser2])

1    0.953608
2    0.929539
3    0.036994
4    0.010650
5    0.200771
6    0.709060
7    0.813766
8    0.218998
dtype: float64

In [23]:
pd.concat([ser1,ser2], axis=1)

Unnamed: 0,0,1
1,0.953608,
2,0.929539,
3,0.036994,
4,0.01065,
5,,0.200771
6,,0.70906
7,,0.813766
8,,0.218998


In [24]:
pd.concat([ser1,ser2], axis=1, join='inner')

Unnamed: 0,0,1


In [25]:
pd.concat([ser1,ser2], keys=[1,2])

1  1    0.953608
   2    0.929539
   3    0.036994
   4    0.010650
2  5    0.200771
   6    0.709060
   7    0.813766
   8    0.218998
dtype: float64

In [26]:
pd.concat([ser1,ser2], axis=1, keys=[1,2])

Unnamed: 0,1,2
1,0.953608,
2,0.929539,
3,0.036994,
4,0.01065,
5,,0.200771
6,,0.70906
7,,0.813766
8,,0.218998


In [27]:
frame1 = pd.DataFrame(np.random.rand(9).reshape(3,3), index=[1,2,3], columns=['A','B','C'])
frame2 = pd.DataFrame(np.random.rand(9).reshape(3,3), index=[4,5,6], columns=['A','B','C'])
pd.concat([frame1, frame2])

Unnamed: 0,A,B,C
1,0.231057,0.024329,0.843888
2,0.72748,0.296619,0.367309
3,0.282516,0.524227,0.462
4,0.078044,0.751505,0.832853
5,0.843225,0.945914,0.141331
6,0.189217,0.799631,0.308749


In [28]:
pd.concat([frame1, frame2], axis=1)

Unnamed: 0,A,B,C,A.1,B.1,C.1
1,0.231057,0.024329,0.843888,,,
2,0.72748,0.296619,0.367309,,,
3,0.282516,0.524227,0.462,,,
4,,,,0.078044,0.751505,0.832853
5,,,,0.843225,0.945914,0.141331
6,,,,0.189217,0.799631,0.308749


### Combining

In [29]:
ser1 = pd.Series(np.random.rand(5), index=[1,2,3,4,5])
ser1

1    0.075815
2    0.332282
3    0.884463
4    0.518336
5    0.089025
dtype: float64

In [30]:
ser2 = pd.Series(np.random.rand(4), index=[2,4,5,6])
ser2

2    0.315847
4    0.275937
5    0.352538
6    0.865549
dtype: float64

In [31]:
ser1.combine_first(ser2)

1    0.075815
2    0.332282
3    0.884463
4    0.518336
5    0.089025
6    0.865549
dtype: float64

In [32]:
ser2.combine_first(ser1)

1    0.075815
2    0.315847
3    0.884463
4    0.275937
5    0.352538
6    0.865549
dtype: float64

In [33]:
ser1[:3].combine_first(ser2[:3])

1    0.075815
2    0.332282
3    0.884463
4    0.275937
5    0.352538
dtype: float64

## Pivoting

### Pivoting with Hierarchical Indexing

In [34]:
frame1 = pd.DataFrame(np.arange(9).reshape(3,3),
                     index=['white','black','red'],
                     columns=['ball','pen','pencil'])
frame1

Unnamed: 0,ball,pen,pencil
white,0,1,2
black,3,4,5
red,6,7,8


In [35]:
ser = frame1.stack()
ser

white  ball      0
       pen       1
       pencil    2
black  ball      3
       pen       4
       pencil    5
red    ball      6
       pen       7
       pencil    8
dtype: int32

In [36]:
ser.unstack()

Unnamed: 0,ball,pen,pencil
white,0,1,2
black,3,4,5
red,6,7,8


In [37]:
ser.unstack(0)

Unnamed: 0,white,black,red
ball,0,3,6
pen,1,4,7
pencil,2,5,8


### Pivoting from "Long" to "Wide" Format

In [38]:
longframe = pd.DataFrame({ 'color':['white','white','white',
                                    'red','red','red',
                                    'black','black','black'],
                           'item':['ball','pen','mug',
                                   'ball','pen','mug',
                                   'ball','pen','mug'],
                           'value': np.random.rand(9)})
longframe

Unnamed: 0,color,item,value
0,white,ball,0.896313
1,white,pen,0.344864
2,white,mug,0.101891
3,red,ball,0.697267
4,red,pen,0.852835
5,red,mug,0.145385
6,black,ball,0.738799
7,black,pen,0.78387
8,black,mug,0.017153


In [39]:
wideframe = longframe.pivot('color','item')
wideframe

Unnamed: 0_level_0,value,value,value
item,ball,mug,pen
color,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
black,0.738799,0.017153,0.78387
red,0.697267,0.145385,0.852835
white,0.896313,0.101891,0.344864


### Removing

In [40]:
frame1 = pd.DataFrame(np.arange(9).reshape(3,3),
                       index=['white','black','red'],
                       columns=['ball','pen','pencil'])
frame1

Unnamed: 0,ball,pen,pencil
white,0,1,2
black,3,4,5
red,6,7,8


In [41]:
del frame1['ball']
frame1

Unnamed: 0,pen,pencil
white,1,2
black,4,5
red,7,8


In [42]:
frame1.drop('white')

Unnamed: 0,pen,pencil
black,4,5
red,7,8


## Data Transformation

### Removing Duplicates

In [43]:
dframe = pd.DataFrame({ 'color': ['white','white','red','red','white'],
                        'value': [2,1,3,3,2]})
dframe

Unnamed: 0,color,value
0,white,2
1,white,1
2,red,3
3,red,3
4,white,2


In [44]:
dframe.duplicated()

0    False
1    False
2    False
3     True
4     True
dtype: bool

In [45]:
dframe[dframe.duplicated()]

Unnamed: 0,color,value
3,red,3
4,white,2


### Replacing Values via Mapping

In [46]:
frame = pd.DataFrame({ 'item':['ball','mug','pen','pencil','ashtray'],
                       'color':['white','rosso','verde','black','yellow'],
                       'price':[5.56,4.20,1.30,0.56,2.75]})
frame

Unnamed: 0,color,item,price
0,white,ball,5.56
1,rosso,mug,4.2
2,verde,pen,1.3
3,black,pencil,0.56
4,yellow,ashtray,2.75


In [47]:
newcolors = {
    'rosso': 'red',
    'verde': 'green'
}

In [48]:
frame.replace(newcolors)

Unnamed: 0,color,item,price
0,white,ball,5.56
1,red,mug,4.2
2,green,pen,1.3
3,black,pencil,0.56
4,yellow,ashtray,2.75


In [49]:
ser = pd.Series([1,3,np.nan,4,6,np.nan,3])
ser

0    1.0
1    3.0
2    NaN
3    4.0
4    6.0
5    NaN
6    3.0
dtype: float64

In [50]:
ser.replace(np.nan,0)

0    1.0
1    3.0
2    0.0
3    4.0
4    6.0
5    0.0
6    3.0
dtype: float64

### Adding Values via Mapping

In [51]:
frame = pd.DataFrame({'item':['ball','mug','pen','pencil','ashtray'],
                      'color':['white','red','green','black','yellow']})
frame

Unnamed: 0,color,item
0,white,ball
1,red,mug
2,green,pen
3,black,pencil
4,yellow,ashtray


In [52]:
price = {
    'ball': 5.56,
    'mug': 4.20,
    'bottle': 1.30,
    'scissors': 3.41,
    'pen': 1.30,
    'pencil': 0.56,
    'ashtray': 2.75
}

In [53]:
frame['price'] = frame['item'].map(price)
frame

Unnamed: 0,color,item,price
0,white,ball,5.56
1,red,mug,4.2
2,green,pen,1.3
3,black,pencil,0.56
4,yellow,ashtray,2.75


### Rename the Indexes of the Axes

In [54]:
frame

Unnamed: 0,color,item,price
0,white,ball,5.56
1,red,mug,4.2
2,green,pen,1.3
3,black,pencil,0.56
4,yellow,ashtray,2.75


In [55]:
reindex = {
    0: 'first',
    1: 'second',
    2: 'third',
    3: 'fourth',
    4: 'fifth'
}
frame.rename(reindex)

Unnamed: 0,color,item,price
first,white,ball,5.56
second,red,mug,4.2
third,green,pen,1.3
fourth,black,pencil,0.56
fifth,yellow,ashtray,2.75


In [56]:
recolumn = {
    'item': 'object',
    'prince': 'value'
}
frame.rename(index=reindex, columns=recolumn)

Unnamed: 0,color,object,price
first,white,ball,5.56
second,red,mug,4.2
third,green,pen,1.3
fourth,black,pencil,0.56
fifth,yellow,ashtray,2.75


In [57]:
frame.rename(index={1:'first'}, columns={'item':'object'})

Unnamed: 0,color,object,price
0,white,ball,5.56
first,red,mug,4.2
2,green,pen,1.3
3,black,pencil,0.56
4,yellow,ashtray,2.75


In [58]:
frame.rename(columns={'item':'object'}, inplace=True)

In [59]:
frame

Unnamed: 0,color,object,price
0,white,ball,5.56
1,red,mug,4.2
2,green,pen,1.3
3,black,pencil,0.56
4,yellow,ashtray,2.75


## Discretization and Binning

In [60]:
results = [12,34,67,55,28,90,99,12,3,56,74,44,87,23,49,89,87]
bins = [0,25,50,75,100]
cat = pd.cut(results, bins)
cat

[(0, 25], (25, 50], (50, 75], (50, 75], (25, 50], ..., (75, 100], (0, 25], (25, 50], (75, 100], (75, 100]]
Length: 17
Categories (4, interval[int64]): [(0, 25] < (25, 50] < (50, 75] < (75, 100]]

In [61]:
cat.categories

IntervalIndex([(0, 25], (25, 50], (50, 75], (75, 100]]
              closed='right',
              dtype='interval[int64]')

In [62]:
cat.codes

array([0, 1, 2, 2, 1, 3, 3, 0, 0, 2, 2, 1, 3, 0, 1, 3, 3], dtype=int8)

In [63]:
pd.value_counts(cat)

(75, 100]    5
(50, 75]     4
(25, 50]     4
(0, 25]      4
dtype: int64

In [64]:
bin_names = ['unlikely','less likely','likely','highly likely']
pd.cut(results, bins, labels=bin_names)

[unlikely, less likely, likely, likely, less likely, ..., highly likely, unlikely, less likely, highly likely, highly likely]
Length: 17
Categories (4, object): [unlikely < less likely < likely < highly likely]

In [65]:
pd.cut(results, 5)

[(2.904, 22.2], (22.2, 41.4], (60.6, 79.8], (41.4, 60.6], (22.2, 41.4], ..., (79.8, 99.0], (22.2, 41.4], (41.4, 60.6], (79.8, 99.0], (79.8, 99.0]]
Length: 17
Categories (5, interval[float64]): [(2.904, 22.2] < (22.2, 41.4] < (41.4, 60.6] < (60.6, 79.8] < (79.8, 99.0]]

In [66]:
quintiles = pd.qcut(results, 5)
quintiles

[(2.999, 24.0], (24.0, 46.0], (62.6, 87.0], (46.0, 62.6], (24.0, 46.0], ..., (62.6, 87.0], (2.999, 24.0], (46.0, 62.6], (87.0, 99.0], (62.6, 87.0]]
Length: 17
Categories (5, interval[float64]): [(2.999, 24.0] < (24.0, 46.0] < (46.0, 62.6] < (62.6, 87.0] < (87.0, 99.0]]

In [67]:
pd.value_counts(quintiles)

(62.6, 87.0]     4
(2.999, 24.0]    4
(87.0, 99.0]     3
(46.0, 62.6]     3
(24.0, 46.0]     3
dtype: int64

### Detecting and Filtering Outliers

In [68]:
randframe = pd.DataFrame(np.random.randn(1000,3))
randframe.describe()

Unnamed: 0,0,1,2
count,1000.0,1000.0,1000.0
mean,-0.036163,0.037203,0.018722
std,1.038703,0.986338,1.011587
min,-3.591217,-3.816239,-3.586733
25%,-0.729458,-0.581396,-0.665261
50%,-0.025864,0.005155,-0.002774
75%,0.674396,0.706958,0.731404
max,3.115554,2.899073,3.4254


In [69]:
randframe.std()

0    1.038703
1    0.986338
2    1.011587
dtype: float64

In [70]:
randframe[(np.abs(randframe) > (3*randframe.std())).any(1)]

Unnamed: 0,0,1,2
87,-2.106846,-3.408329,-0.067435
129,-3.591217,0.791474,0.243038
133,1.149396,-3.816239,0.328653
717,0.434665,-1.248411,-3.586733
726,1.68233,1.252479,-3.090042
955,0.272374,2.224856,3.4254


## Permutation

In [71]:
nframe = pd.DataFrame(np.arange(25).reshape(5,5))
nframe

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


In [72]:
new_order = np.random.permutation(5)
new_order

array([2, 3, 0, 4, 1])

In [73]:
nframe.take(new_order)

Unnamed: 0,0,1,2,3,4
2,10,11,12,13,14
3,15,16,17,18,19
0,0,1,2,3,4
4,20,21,22,23,24
1,5,6,7,8,9


In [74]:
new_order = [3,4,2]
nframe.take(new_order)

Unnamed: 0,0,1,2,3,4
3,15,16,17,18,19
4,20,21,22,23,24
2,10,11,12,13,14


### Random Sampling

In [75]:
sample = np.random.randint(0, len(nframe), size=3)
sample

array([3, 3, 3])

In [76]:
nframe.take(sample)

Unnamed: 0,0,1,2,3,4
3,15,16,17,18,19
3,15,16,17,18,19
3,15,16,17,18,19


## String Manipulation

### Built-in Methods for Manipulation of Strings

In [77]:
text = '16 Bolton Avenue , Boston'
text.split(',')

['16 Bolton Avenue ', ' Boston']

In [78]:
tokens = [s.strip() for s in text.split(',')]
tokens

['16 Bolton Avenue', 'Boston']

In [79]:
address, city = [s.strip() for s in text.split(',')]
address

'16 Bolton Avenue'

In [80]:
city

'Boston'

In [81]:
address + ',' + city

'16 Bolton Avenue,Boston'

In [82]:
strings = ['A+', 'A', 'A-', 'B', 'BB', 'BBB', 'C+']
';'.join(strings)

'A+;A;A-;B;BB;BBB;C+'

In [83]:
'Boston' in text

True

In [84]:
text.index('Boston')

19

In [85]:
text.find('Boston')

19

In [86]:
text.index('New York')

ValueError: substring not found

In [None]:
text.find('New York')

In [None]:
text.count('e')

In [None]:
text.count('Avenue')

In [87]:
text.replace('Avenue','Street')

'16 Bolton Street , Boston'

In [88]:
text.replace('1','')

'6 Bolton Avenue , Boston'

### Regular Expressions

In [89]:
import re

In [90]:
text = "This is      an\t odd   \n text!"
re.split('\s+', text)

['This', 'is', 'an', 'odd', 'text!']

In [91]:
regex = re.compile('\s+')

In [92]:
regex.split(text)

['This', 'is', 'an', 'odd', 'text!']

In [93]:
text = 'This is my address: 16 Bolton Avenue, Boston'
re.findall('A\w+', text)

['Avenue']

In [94]:
re.findall('[A,a]\w+', text)

['address', 'Avenue']

In [95]:
re.search('[A,a]\w+', text)

<_sre.SRE_Match object; span=(11, 18), match='address'>

In [96]:
search = re.search('[A,a]\w+', text)
search.start()

11

In [97]:
search.end()

18

In [98]:
text[search.start():search.end()]

'address'

In [99]:
re.match('[A,a]\w+', text)

In [100]:
re.match('T\w+', text)

<_sre.SRE_Match object; span=(0, 4), match='This'>

In [101]:
match = re.match('T\w+', text)
text[match.start():match.end()]

'This'

## Data Aggregation

### A Practical Example

In [102]:
frame = pd.DataFrame({ 'color': ['white','red','green','red','green'],
                       'object': ['pen','pencil','pencil','ashtray','pen'],
                       'price1': [5.56, 4.20, 1.30, 0.56, 2.75],
                       'price2': [4.75,4.12,1.60,0.75,3.15]})
frame

Unnamed: 0,color,object,price1,price2
0,white,pen,5.56,4.75
1,red,pencil,4.2,4.12
2,green,pencil,1.3,1.6
3,red,ashtray,0.56,0.75
4,green,pen,2.75,3.15


In [103]:
group = frame['price1'].groupby(frame['color'])
group

<pandas.core.groupby.SeriesGroupBy object at 0x0000025216B020B8>

In [104]:
group.groups

{'green': Int64Index([2, 4], dtype='int64'),
 'red': Int64Index([1, 3], dtype='int64'),
 'white': Int64Index([0], dtype='int64')}

In [105]:
group.mean()

color
green    2.025
red      2.380
white    5.560
Name: price1, dtype: float64

In [106]:
group.sum()

color
green    4.05
red      4.76
white    5.56
Name: price1, dtype: float64

### Hierarchical Grouping

In [107]:
ggroup = frame['price1'].groupby([frame['color'],frame['object']])
ggroup.groups

{('green', 'pen'): Int64Index([4], dtype='int64'),
 ('green', 'pencil'): Int64Index([2], dtype='int64'),
 ('red', 'ashtray'): Int64Index([3], dtype='int64'),
 ('red', 'pencil'): Int64Index([1], dtype='int64'),
 ('white', 'pen'): Int64Index([0], dtype='int64')}

In [108]:
ggroup.sum()

color  object 
green  pen        2.75
       pencil     1.30
red    ashtray    0.56
       pencil     4.20
white  pen        5.56
Name: price1, dtype: float64

In [109]:
frame[['price1','price2']].groupby(frame['color']).mean()

Unnamed: 0_level_0,price1,price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,2.025,2.375
red,2.38,2.435
white,5.56,4.75


In [110]:
frame.groupby(frame['color']).mean()

Unnamed: 0_level_0,price1,price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,2.025,2.375
red,2.38,2.435
white,5.56,4.75


## Group Iteration

In [111]:
for name, group in frame.groupby('color'):
    print(name)
    print(group)

green
   color  object  price1  price2
2  green  pencil    1.30    1.60
4  green     pen    2.75    3.15
red
  color   object  price1  price2
1   red   pencil    4.20    4.12
3   red  ashtray    0.56    0.75
white
   color object  price1  price2
0  white    pen    5.56    4.75


### Chain of Transformations

In [112]:
result1 = frame['price1'].groupby(frame['color']).mean()
type(result1)

pandas.core.series.Series

In [113]:
result2 = frame.groupby(frame['color']).mean()
type(result2)

pandas.core.frame.DataFrame

In [114]:
frame['price1'].groupby(frame['color']).mean()

color
green    2.025
red      2.380
white    5.560
Name: price1, dtype: float64

In [115]:
frame.groupby(frame['color'])['price1'].mean()

color
green    2.025
red      2.380
white    5.560
Name: price1, dtype: float64

In [116]:
(frame.groupby(frame['color']).mean())['price1']

color
green    2.025
red      2.380
white    5.560
Name: price1, dtype: float64

In [117]:
means = frame.groupby('color').mean().add_prefix('mean_')
means

Unnamed: 0_level_0,mean_price1,mean_price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,2.025,2.375
red,2.38,2.435
white,5.56,4.75


### Functions on Groups

In [118]:
group = frame.groupby('color')
group['price1'].quantile(0.6)

color
green    2.170
red      2.744
white    5.560
Name: price1, dtype: float64

In [119]:
def range(series):
    return series.max() - series.min()
group['price1'].agg(range)

color
green    1.45
red      3.64
white    0.00
Name: price1, dtype: float64

In [120]:
group.agg(range)

Unnamed: 0_level_0,price1,price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,1.45,1.55
red,3.64,3.37
white,0.0,0.0


In [121]:
group['price1'].agg(['mean','std',range])

Unnamed: 0_level_0,mean,std,range
color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
green,2.025,1.025305,1.45
red,2.38,2.573869,3.64
white,5.56,,0.0


## Advanced Data Aggregation

In [122]:
frame = pd.DataFrame({ 'color': ['white','red','green','red','green'],
                       'price1': [5.56, 4.20, 1.30, 0.56, 2.75],
                       'price2': [4.75,4.12,1.60,0.75,3.15]})
frame

Unnamed: 0,color,price1,price2
0,white,5.56,4.75
1,red,4.2,4.12
2,green,1.3,1.6
3,red,0.56,0.75
4,green,2.75,3.15


In [123]:
sums = frame.groupby('color').sum().add_prefix('tot_')
sums

Unnamed: 0_level_0,tot_price1,tot_price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,4.05,4.75
red,4.76,4.87
white,5.56,4.75


In [124]:
pd.merge(frame, sums, left_on='color', right_index=True)

Unnamed: 0,color,price1,price2,tot_price1,tot_price2
0,white,5.56,4.75,5.56,4.75
1,red,4.2,4.12,4.76,4.87
3,red,0.56,0.75,4.76,4.87
2,green,1.3,1.6,4.05,4.75
4,green,2.75,3.15,4.05,4.75


In [125]:
frame.groupby('color').transform(np.sum).add_prefix('tot_')

Unnamed: 0,tot_price1,tot_price2
0,5.56,4.75
1,4.76,4.87
2,4.05,4.75
3,4.76,4.87
4,4.05,4.75


In [126]:
frame = pd.DataFrame({ 'color': ['white','black','white','white','black','black'],
                       'status': ['up','up','down','down','down','up'],
                       'price1': [12.33,14.55,22.34,27.84,23.40,18.33],
                       'price2': [11.23,31.80,29.99,31.18,18.25,22.44]})
frame

Unnamed: 0,color,price1,price2,status
0,white,12.33,11.23,up
1,black,14.55,31.8,up
2,white,22.34,29.99,down
3,white,27.84,31.18,down
4,black,23.4,18.25,down
5,black,18.33,22.44,up


In [127]:
frame.groupby(['color','status']).apply( lambda x: x.max())

Unnamed: 0_level_0,Unnamed: 1_level_0,color,price1,price2,status
color,status,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
black,down,black,23.4,18.25,down
black,up,black,18.33,31.8,up
white,down,white,27.84,31.18,down
white,up,white,12.33,11.23,up


In [128]:
frame.rename(index=reindex, columns=recolumn)

Unnamed: 0,color,price1,price2,status
first,white,12.33,11.23,up
second,black,14.55,31.8,up
third,white,22.34,29.99,down
fourth,white,27.84,31.18,down
fifth,black,23.4,18.25,down
5,black,18.33,22.44,up


In [129]:
temp = pd.date_range('1/1/2015', periods=10, freq= 'H')
temp

DatetimeIndex(['2015-01-01 00:00:00', '2015-01-01 01:00:00',
               '2015-01-01 02:00:00', '2015-01-01 03:00:00',
               '2015-01-01 04:00:00', '2015-01-01 05:00:00',
               '2015-01-01 06:00:00', '2015-01-01 07:00:00',
               '2015-01-01 08:00:00', '2015-01-01 09:00:00'],
              dtype='datetime64[ns]', freq='H')

In [130]:
timeseries = pd.Series(np.random.rand(10), index=temp)
timeseries

2015-01-01 00:00:00    0.317051
2015-01-01 01:00:00    0.628468
2015-01-01 02:00:00    0.829405
2015-01-01 03:00:00    0.792059
2015-01-01 04:00:00    0.486475
2015-01-01 05:00:00    0.707027
2015-01-01 06:00:00    0.293156
2015-01-01 07:00:00    0.091072
2015-01-01 08:00:00    0.146105
2015-01-01 09:00:00    0.500388
Freq: H, dtype: float64

In [131]:
timetable = pd.DataFrame( {'date': temp, 'value1': np.random.rand(10),
                                      'value2': np.random.rand(10)})
timetable

Unnamed: 0,date,value1,value2
0,2015-01-01 00:00:00,0.125229,0.995517
1,2015-01-01 01:00:00,0.597289,0.160828
2,2015-01-01 02:00:00,0.231104,0.076982
3,2015-01-01 03:00:00,0.86294,0.270581
4,2015-01-01 04:00:00,0.534056,0.306486
5,2015-01-01 05:00:00,0.16204,0.979835
6,2015-01-01 06:00:00,0.400413,0.486397
7,2015-01-01 07:00:00,0.157052,0.246959
8,2015-01-01 08:00:00,0.835632,0.572664
9,2015-01-01 09:00:00,0.812283,0.388435


In [132]:
timetable['cat'] = ['up','down','left','left','up','up','down','right','right','up']
timetable

Unnamed: 0,date,value1,value2,cat
0,2015-01-01 00:00:00,0.125229,0.995517,up
1,2015-01-01 01:00:00,0.597289,0.160828,down
2,2015-01-01 02:00:00,0.231104,0.076982,left
3,2015-01-01 03:00:00,0.86294,0.270581,left
4,2015-01-01 04:00:00,0.534056,0.306486,up
5,2015-01-01 05:00:00,0.16204,0.979835,up
6,2015-01-01 06:00:00,0.400413,0.486397,down
7,2015-01-01 07:00:00,0.157052,0.246959,right
8,2015-01-01 08:00:00,0.835632,0.572664,right
9,2015-01-01 09:00:00,0.812283,0.388435,up
