# PANDAS IN DEPTH - DATA MANIPULATION

## Merging

In [106]:
import numpy as np
import pandas as pd

In [107]:
frame1 = pd.DataFrame( {'id':['ball','pencil','pen','mug','ashtray'],
                        'price': [12.33,11.44,33.21,13.23,33.62]})
frame1

Unnamed: 0,id,price
0,ball,12.33
1,pencil,11.44
2,pen,33.21
3,mug,13.23
4,ashtray,33.62


In [108]:
frame2 = pd.DataFrame( {'id': ['pencil','pencil','ball','pen'],
                        'color':['white','red','red','black']})
frame2

Unnamed: 0,id,color
0,pencil,white
1,pencil,red
2,ball,red
3,pen,black


In [109]:
pd.merge(frame1,frame2)

Unnamed: 0,id,price,color
0,ball,12.33,red
1,pencil,11.44,white
2,pencil,11.44,red
3,pen,33.21,black


In [110]:
frame1 = pd.DataFrame( {'id': ['ball','pencil','pen',',mug','ashtray'],
                        'color': ['white','red','red','black','green'],
                        'brand': ['OMG','ABC','ABC','POD','PPOD']})
frame1

Unnamed: 0,id,color,brand
0,ball,white,OMG
1,pencil,red,ABC
2,pen,red,ABC
3,",mug",black,POD
4,ashtray,green,PPOD


In [111]:
frame2 = pd.DataFrame( {'id':['pencil','pencil','ball','pen'],
                        'brand': ['OMG','POD','ABC','POD']})
frame2

Unnamed: 0,id,brand
0,pencil,OMG
1,pencil,POD
2,ball,ABC
3,pen,POD


In [112]:
pd.merge(frame1,frame2)

Unnamed: 0,id,color,brand


In [113]:
pd.merge(frame1, frame2, on='id')

Unnamed: 0,id,color,brand_x,brand_y
0,ball,white,OMG,ABC
1,pencil,red,ABC,OMG
2,pencil,red,ABC,POD
3,pen,red,ABC,POD


In [114]:
pd.merge(frame1, frame2, on='brand')

Unnamed: 0,id_x,color,brand,id_y
0,ball,white,OMG,pencil
1,pencil,red,ABC,ball
2,pen,red,ABC,ball
3,",mug",black,POD,pencil
4,",mug",black,POD,pen


In [115]:
frame2.columns = ['brand','sid']
frame2

Unnamed: 0,brand,sid
0,pencil,OMG
1,pencil,POD
2,ball,ABC
3,pen,POD


In [116]:
pd.merge(frame1, frame2, left_on='id', right_on='sid')

Unnamed: 0,id,color,brand_x,brand_y,sid


In [117]:
frame2.columns = ['brand','id']
pd.merge(frame1,frame2,on='id')

Unnamed: 0,id,color,brand_x,brand_y


In [118]:
pd.merge(frame1,frame2,on='id',how='outer')

Unnamed: 0,id,color,brand_x,brand_y
0,ball,white,OMG,
1,pencil,red,ABC,
2,pen,red,ABC,
3,",mug",black,POD,
4,ashtray,green,PPOD,
5,OMG,,,pencil
6,POD,,,pencil
7,POD,,,pen
8,ABC,,,ball


In [119]:
pd.merge(frame1,frame2,on='id',how='left')

Unnamed: 0,id,color,brand_x,brand_y
0,ball,white,OMG,
1,pencil,red,ABC,
2,pen,red,ABC,
3,",mug",black,POD,
4,ashtray,green,PPOD,


In [120]:
pd.merge(frame1,frame2,on='id',how='right')

Unnamed: 0,id,color,brand_x,brand_y
0,OMG,,,pencil
1,POD,,,pencil
2,POD,,,pen
3,ABC,,,ball


In [121]:
pd.merge(frame1,frame2,on=['id','brand'], how='outer')

Unnamed: 0,id,color,brand
0,ball,white,OMG
1,pencil,red,ABC
2,pen,red,ABC
3,",mug",black,POD
4,ashtray,green,PPOD
5,OMG,,pencil
6,POD,,pencil
7,ABC,,ball
8,POD,,pen


## Concatenating

In [122]:
array1 = np.arange(9).reshape((3,3))
array1

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [123]:
array2 = np.arange(9).reshape((3,3))+6
array2

array([[ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [124]:
np.concatenate([array1,array2],axis=1)

array([[ 0,  1,  2,  6,  7,  8],
       [ 3,  4,  5,  9, 10, 11],
       [ 6,  7,  8, 12, 13, 14]])

In [125]:
np.concatenate([array1,array2],axis=0)

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [126]:
ser1 = pd.Series(np.random.rand(4), index=[1,2,3,4])
ser1

1    0.301458
2    0.618564
3    0.767719
4    0.424583
dtype: float64

In [127]:
ser2 = pd.Series(np.random.rand(4), index=[5,6,7,8])
ser2

5    0.455294
6    0.741248
7    0.622215
8    0.064001
dtype: float64

In [128]:
pd.concat([ser1,ser2])

1    0.301458
2    0.618564
3    0.767719
4    0.424583
5    0.455294
6    0.741248
7    0.622215
8    0.064001
dtype: float64

In [129]:
pd.concat([ser1,ser2], axis=1)

Unnamed: 0,0,1
1,0.301458,
2,0.618564,
3,0.767719,
4,0.424583,
5,,0.455294
6,,0.741248
7,,0.622215
8,,0.064001


In [130]:
pd.concat([ser1,ser2], axis=1, join='inner')

Unnamed: 0,0,1


In [131]:
pd.concat([ser1,ser2], keys=[1,2])

1  1    0.301458
   2    0.618564
   3    0.767719
   4    0.424583
2  5    0.455294
   6    0.741248
   7    0.622215
   8    0.064001
dtype: float64

In [132]:
pd.concat([ser1,ser2], axis=1, keys=[1,2])

Unnamed: 0,1,2
1,0.301458,
2,0.618564,
3,0.767719,
4,0.424583,
5,,0.455294
6,,0.741248
7,,0.622215
8,,0.064001


In [133]:
frame1 = pd.DataFrame(np.random.rand(9).reshape(3,3), index=[1,2,3], columns=['A','B','C'])
frame2 = pd.DataFrame(np.random.rand(9).reshape(3,3), index=[4,5,6], columns=['A','B','C'])
pd.concat([frame1, frame2])

Unnamed: 0,A,B,C
1,0.119462,0.988917,0.053551
2,0.823311,0.367955,0.625359
3,0.308515,0.116837,0.143324
4,0.221465,0.552405,0.411732
5,0.17467,0.108201,0.542668
6,0.575452,0.106296,0.389967


In [134]:
pd.concat([frame1, frame2], axis=1)

Unnamed: 0,A,B,C,A.1,B.1,C.1
1,0.119462,0.988917,0.053551,,,
2,0.823311,0.367955,0.625359,,,
3,0.308515,0.116837,0.143324,,,
4,,,,0.221465,0.552405,0.411732
5,,,,0.17467,0.108201,0.542668
6,,,,0.575452,0.106296,0.389967


### Combining

In [135]:
ser1 = pd.Series(np.random.rand(5), index=[1,2,3,4,5])
ser1

1    0.453183
2    0.986568
3    0.128485
4    0.625676
5    0.977553
dtype: float64

In [136]:
ser2 = pd.Series(np.random.rand(4), index=[2,4,5,6])
ser2

2    0.057887
4    0.924813
5    0.981416
6    0.691673
dtype: float64

In [137]:
ser1.combine_first(ser2)

1    0.453183
2    0.986568
3    0.128485
4    0.625676
5    0.977553
6    0.691673
dtype: float64

In [138]:
ser2.combine_first(ser1)

1    0.453183
2    0.057887
3    0.128485
4    0.924813
5    0.981416
6    0.691673
dtype: float64

In [139]:
ser1[:3].combine_first(ser2[:3])

1    0.453183
2    0.986568
3    0.128485
4    0.924813
5    0.981416
dtype: float64

## Pivoting

### Pivoting with Hierarchical Indexing

In [140]:
frame1 = pd.DataFrame(np.arange(9).reshape(3,3),
                     index=['white','black','red'],
                     columns=['ball','pen','pencil'])
frame1

Unnamed: 0,ball,pen,pencil
white,0,1,2
black,3,4,5
red,6,7,8


In [141]:
ser = frame1.stack()
ser

white  ball      0
       pen       1
       pencil    2
black  ball      3
       pen       4
       pencil    5
red    ball      6
       pen       7
       pencil    8
dtype: int64

In [142]:
ser.unstack()

Unnamed: 0,ball,pen,pencil
white,0,1,2
black,3,4,5
red,6,7,8


In [143]:
ser.unstack(0)

Unnamed: 0,white,black,red
ball,0,3,6
pen,1,4,7
pencil,2,5,8


### Pivoting from "Long" to "Wide" Format

In [144]:
longframe = pd.DataFrame({ 'color':['white','white','white',
                                    'red','red','red',
                                    'black','black','black'],
                           'item':['ball','pen','mug',
                                   'ball','pen','mug',
                                   'ball','pen','mug'],
                           'value': np.random.rand(9)})
longframe

Unnamed: 0,color,item,value
0,white,ball,0.083058
1,white,pen,0.600285
2,white,mug,0.477206
3,red,ball,0.575584
4,red,pen,0.766005
5,red,mug,0.892188
6,black,ball,0.754407
7,black,pen,0.180867
8,black,mug,0.184051


In [145]:
wideframe = longframe.pivot('color','item')
wideframe

Unnamed: 0_level_0,value,value,value
item,ball,mug,pen
color,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
black,0.754407,0.184051,0.180867
red,0.575584,0.892188,0.766005
white,0.083058,0.477206,0.600285


### Removing

In [146]:
frame1 = pd.DataFrame(np.arange(9).reshape(3,3),
                       index=['white','black','red'],
                       columns=['ball','pen','pencil'])
frame1

Unnamed: 0,ball,pen,pencil
white,0,1,2
black,3,4,5
red,6,7,8


In [147]:
del frame1['ball']
frame1

Unnamed: 0,pen,pencil
white,1,2
black,4,5
red,7,8


In [148]:
frame1.drop('white')

Unnamed: 0,pen,pencil
black,4,5
red,7,8


## Data Transformation

### Removing Duplicates

In [149]:
dframe = pd.DataFrame({ 'color': ['white','white','red','red','white'],
                        'value': [2,1,3,3,2]})
dframe

Unnamed: 0,color,value
0,white,2
1,white,1
2,red,3
3,red,3
4,white,2


In [150]:
dframe.duplicated()

0    False
1    False
2    False
3     True
4     True
dtype: bool

In [151]:
dframe[dframe.duplicated()]

Unnamed: 0,color,value
3,red,3
4,white,2


### Replacing Values via Mapping

In [152]:
frame = pd.DataFrame({ 'item':['ball','mug','pen','pencil','ashtray'],
                       'color':['white','rosso','verde','black','yellow'],
                       'price':[5.56,4.20,1.30,0.56,2.75]})
frame

Unnamed: 0,item,color,price
0,ball,white,5.56
1,mug,rosso,4.2
2,pen,verde,1.3
3,pencil,black,0.56
4,ashtray,yellow,2.75


In [153]:
newcolors = {
    'rosso': 'red',
    'verde': 'green'
}

In [154]:
frame.replace(newcolors)

Unnamed: 0,item,color,price
0,ball,white,5.56
1,mug,red,4.2
2,pen,green,1.3
3,pencil,black,0.56
4,ashtray,yellow,2.75


In [155]:
ser = pd.Series([1,3,np.nan,4,6,np.nan,3])
ser

0    1.0
1    3.0
2    NaN
3    4.0
4    6.0
5    NaN
6    3.0
dtype: float64

In [156]:
ser.replace(np.nan,0)

0    1.0
1    3.0
2    0.0
3    4.0
4    6.0
5    0.0
6    3.0
dtype: float64

### Adding Values via Mapping

In [157]:
frame = pd.DataFrame({'item':['ball','mug','pen','pencil','ashtray'],
                      'color':['white','red','green','black','yellow']})
frame

Unnamed: 0,item,color
0,ball,white
1,mug,red
2,pen,green
3,pencil,black
4,ashtray,yellow


In [158]:
price = {
    'ball': 5.56,
    'mug': 4.20,
    'bottle': 1.30,
    'scissors': 3.41,
    'pen': 1.30,
    'pencil': 0.56,
    'ashtray': 2.75
}

In [159]:
frame['price'] = frame['item'].map(price)
frame

Unnamed: 0,item,color,price
0,ball,white,5.56
1,mug,red,4.2
2,pen,green,1.3
3,pencil,black,0.56
4,ashtray,yellow,2.75


### Rename the Indexes of the Axes

In [160]:
frame

Unnamed: 0,item,color,price
0,ball,white,5.56
1,mug,red,4.2
2,pen,green,1.3
3,pencil,black,0.56
4,ashtray,yellow,2.75


In [161]:
reindex = {
    0: 'first',
    1: 'second',
    2: 'third',
    3: 'fourth',
    4: 'fifth'
}
frame.rename(reindex)

Unnamed: 0,item,color,price
first,ball,white,5.56
second,mug,red,4.2
third,pen,green,1.3
fourth,pencil,black,0.56
fifth,ashtray,yellow,2.75


In [162]:
recolumn = {
    'item': 'object',
    'prince': 'value'
}
frame.rename(index=reindex, columns=recolumn)

Unnamed: 0,object,color,price
first,ball,white,5.56
second,mug,red,4.2
third,pen,green,1.3
fourth,pencil,black,0.56
fifth,ashtray,yellow,2.75


In [163]:
frame.rename(index={1:'first'}, columns={'item':'object'})

Unnamed: 0,object,color,price
0,ball,white,5.56
first,mug,red,4.2
2,pen,green,1.3
3,pencil,black,0.56
4,ashtray,yellow,2.75


In [164]:
frame.rename(columns={'item':'object'}, inplace=True)

In [165]:
frame

Unnamed: 0,object,color,price
0,ball,white,5.56
1,mug,red,4.2
2,pen,green,1.3
3,pencil,black,0.56
4,ashtray,yellow,2.75


## Discretization and Binning

In [166]:
results = [12,34,67,55,28,90,99,12,3,56,74,44,87,23,49,89,87]
bins = [0,25,50,75,100]
cat = pd.cut(results, bins)
cat

[(0, 25], (25, 50], (50, 75], (50, 75], (25, 50], ..., (75, 100], (0, 25], (25, 50], (75, 100], (75, 100]]
Length: 17
Categories (4, interval[int64]): [(0, 25] < (25, 50] < (50, 75] < (75, 100]]

In [167]:
cat.categories

IntervalIndex([(0, 25], (25, 50], (50, 75], (75, 100]],
              closed='right',
              dtype='interval[int64]')

In [168]:
cat.codes

array([0, 1, 2, 2, 1, 3, 3, 0, 0, 2, 2, 1, 3, 0, 1, 3, 3], dtype=int8)

In [169]:
pd.value_counts(cat)

(75, 100]    5
(50, 75]     4
(25, 50]     4
(0, 25]      4
dtype: int64

In [170]:
bin_names = ['unlikely','less likely','likely','highly likely']
pd.cut(results, bins, labels=bin_names)

[unlikely, less likely, likely, likely, less likely, ..., highly likely, unlikely, less likely, highly likely, highly likely]
Length: 17
Categories (4, object): [unlikely < less likely < likely < highly likely]

In [171]:
pd.cut(results, 5)

[(2.904, 22.2], (22.2, 41.4], (60.6, 79.8], (41.4, 60.6], (22.2, 41.4], ..., (79.8, 99.0], (22.2, 41.4], (41.4, 60.6], (79.8, 99.0], (79.8, 99.0]]
Length: 17
Categories (5, interval[float64]): [(2.904, 22.2] < (22.2, 41.4] < (41.4, 60.6] < (60.6, 79.8] < (79.8, 99.0]]

In [172]:
quintiles = pd.qcut(results, 5)
quintiles

[(2.999, 24.0], (24.0, 46.0], (62.6, 87.0], (46.0, 62.6], (24.0, 46.0], ..., (62.6, 87.0], (2.999, 24.0], (46.0, 62.6], (87.0, 99.0], (62.6, 87.0]]
Length: 17
Categories (5, interval[float64]): [(2.999, 24.0] < (24.0, 46.0] < (46.0, 62.6] < (62.6, 87.0] < (87.0, 99.0]]

In [173]:
pd.value_counts(quintiles)

(62.6, 87.0]     4
(2.999, 24.0]    4
(87.0, 99.0]     3
(46.0, 62.6]     3
(24.0, 46.0]     3
dtype: int64

### Detecting and Filtering Outliers

In [174]:
randframe = pd.DataFrame(np.random.randn(1000,3))
randframe.describe()

Unnamed: 0,0,1,2
count,1000.0,1000.0,1000.0
mean,-0.019632,-0.063224,-0.054222
std,1.019545,0.972387,1.002894
min,-3.891435,-3.169112,-2.655789
25%,-0.745428,-0.745926,-0.745387
50%,-0.015208,-0.084051,-0.06812
75%,0.673257,0.608669,0.636319
max,3.654232,2.97101,3.753107


In [175]:
randframe.std()

0    1.019545
1    0.972387
2    1.002894
dtype: float64

In [176]:
randframe[(np.abs(randframe) > (3*randframe.std())).any(1)]

Unnamed: 0,0,1,2
79,-0.88824,-3.169112,0.714598
143,3.191993,1.002597,0.823127
150,-1.32056,2.97101,-2.242545
169,3.654232,-0.852494,-1.619197
663,1.032213,0.69613,3.753107
698,-3.321618,1.022313,-0.780337
938,-3.891435,1.397212,-0.420827


## Permutation

In [177]:
nframe = pd.DataFrame(np.arange(25).reshape(5,5))
nframe

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


In [178]:
new_order = np.random.permutation(5)
new_order

array([2, 1, 3, 0, 4])

In [179]:
nframe.take(new_order)

Unnamed: 0,0,1,2,3,4
2,10,11,12,13,14
1,5,6,7,8,9
3,15,16,17,18,19
0,0,1,2,3,4
4,20,21,22,23,24


In [180]:
new_order = [3,4,2]
nframe.take(new_order)

Unnamed: 0,0,1,2,3,4
3,15,16,17,18,19
4,20,21,22,23,24
2,10,11,12,13,14


### Random Sampling

In [181]:
sample = np.random.randint(0, len(nframe), size=3)
sample

array([4, 0, 1])

In [182]:
nframe.take(sample)

Unnamed: 0,0,1,2,3,4
4,20,21,22,23,24
0,0,1,2,3,4
1,5,6,7,8,9


## String Manipulation

### Built-in Methods for Manipulation of Strings

In [183]:
text = '16 Bolton Avenue , Boston'
text.split(',')

['16 Bolton Avenue ', ' Boston']

In [184]:
tokens = [s.strip() for s in text.split(',')]
tokens

['16 Bolton Avenue', 'Boston']

In [185]:
address, city = [s.strip() for s in text.split(',')]
address

'16 Bolton Avenue'

In [186]:
city

'Boston'

In [187]:
address + ',' + city

'16 Bolton Avenue,Boston'

In [188]:
strings = ['A+', 'A', 'A-', 'B', 'BB', 'BBB', 'C+']
';'.join(strings)

'A+;A;A-;B;BB;BBB;C+'

In [189]:
'Boston' in text

True

In [190]:
text.index('Boston')

19

In [191]:
text.find('Boston')

19

In [192]:
text.find('New York')

-1

In [193]:
text.count('e')

2

In [194]:
text.count('Avenue')

1

In [195]:
text.replace('Avenue','Street')

'16 Bolton Street , Boston'

In [196]:
text.replace('1','')

'6 Bolton Avenue , Boston'

### Regular Expressions

In [197]:
import re

In [198]:
text = "This is      an\t odd   \n text!"
re.split('\s+', text)

['This', 'is', 'an', 'odd', 'text!']

In [199]:
regex = re.compile('\s+')

In [200]:
regex.split(text)

['This', 'is', 'an', 'odd', 'text!']

In [201]:
text = 'This is my address: 16 Bolton Avenue, Boston'
re.findall('A\w+', text)

['Avenue']

In [202]:
re.findall('[A,a]\w+', text)

['address', 'Avenue']

In [203]:
re.search('[A,a]\w+', text)

<re.Match object; span=(11, 18), match='address'>

In [204]:
search = re.search('[A,a]\w+', text)
search.start()

11

In [205]:
search.end()

18

In [206]:
text[search.start():search.end()]

'address'

In [207]:
re.match('[A,a]\w+', text)

In [208]:
re.match('T\w+', text)

<re.Match object; span=(0, 4), match='This'>

In [209]:
match = re.match('T\w+', text)
text[match.start():match.end()]

'This'

## Data Aggregation

### A Practical Example

In [210]:
frame = pd.DataFrame({ 'color': ['white','red','green','red','green'],
                       'object': ['pen','pencil','pencil','ashtray','pen'],
                       'price1': [5.56, 4.20, 1.30, 0.56, 2.75],
                       'price2': [4.75,4.12,1.60,0.75,3.15]})
frame

Unnamed: 0,color,object,price1,price2
0,white,pen,5.56,4.75
1,red,pencil,4.2,4.12
2,green,pencil,1.3,1.6
3,red,ashtray,0.56,0.75
4,green,pen,2.75,3.15


In [211]:
group = frame['price1'].groupby(frame['color'])
group

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fca57f8fb70>

In [212]:
group.groups

{'green': Int64Index([2, 4], dtype='int64'),
 'red': Int64Index([1, 3], dtype='int64'),
 'white': Int64Index([0], dtype='int64')}

In [213]:
group.mean()

color
green    2.025
red      2.380
white    5.560
Name: price1, dtype: float64

In [214]:
group.sum()

color
green    4.05
red      4.76
white    5.56
Name: price1, dtype: float64

### Hierarchical Grouping

In [215]:
ggroup = frame['price1'].groupby([frame['color'],frame['object']])
ggroup.groups

{('green', 'pen'): Int64Index([4], dtype='int64'),
 ('green', 'pencil'): Int64Index([2], dtype='int64'),
 ('red', 'ashtray'): Int64Index([3], dtype='int64'),
 ('red', 'pencil'): Int64Index([1], dtype='int64'),
 ('white', 'pen'): Int64Index([0], dtype='int64')}

In [216]:
ggroup.sum()

color  object 
green  pen        2.75
       pencil     1.30
red    ashtray    0.56
       pencil     4.20
white  pen        5.56
Name: price1, dtype: float64

In [217]:
frame[['price1','price2']].groupby(frame['color']).mean()

Unnamed: 0_level_0,price1,price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,2.025,2.375
red,2.38,2.435
white,5.56,4.75


In [218]:
frame.groupby(frame['color']).mean()

Unnamed: 0_level_0,price1,price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,2.025,2.375
red,2.38,2.435
white,5.56,4.75


## Group Iteration

In [219]:
for name, group in frame.groupby('color'):
    print(name)
    print(group)

green
   color  object  price1  price2
2  green  pencil    1.30    1.60
4  green     pen    2.75    3.15
red
  color   object  price1  price2
1   red   pencil    4.20    4.12
3   red  ashtray    0.56    0.75
white
   color object  price1  price2
0  white    pen    5.56    4.75


### Chain of Transformations

In [220]:
result1 = frame['price1'].groupby(frame['color']).mean()
type(result1)

pandas.core.series.Series

In [221]:
result2 = frame.groupby(frame['color']).mean()
type(result2)

pandas.core.frame.DataFrame

In [222]:
frame['price1'].groupby(frame['color']).mean()

color
green    2.025
red      2.380
white    5.560
Name: price1, dtype: float64

In [223]:
frame.groupby(frame['color'])['price1'].mean()

color
green    2.025
red      2.380
white    5.560
Name: price1, dtype: float64

In [224]:
(frame.groupby(frame['color']).mean())['price1']

color
green    2.025
red      2.380
white    5.560
Name: price1, dtype: float64

In [225]:
means = frame.groupby('color').mean().add_prefix('mean_')
means

Unnamed: 0_level_0,mean_price1,mean_price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,2.025,2.375
red,2.38,2.435
white,5.56,4.75


### Functions on Groups

In [226]:
group = frame.groupby('color')
group['price1'].quantile(0.6)

color
green    2.170
red      2.744
white    5.560
Name: price1, dtype: float64

In [227]:
def range(series):
    return series.max() - series.min()
group['price1'].agg(range)

color
green    1.45
red      3.64
white    0.00
Name: price1, dtype: float64

In [228]:
group.agg(range)

Unnamed: 0_level_0,price1,price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,1.45,1.55
red,3.64,3.37
white,0.0,0.0


In [229]:
group['price1'].agg(['mean','std',range])

Unnamed: 0_level_0,mean,std,range
color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
green,2.025,1.025305,1.45
red,2.38,2.573869,3.64
white,5.56,,0.0


## Advanced Data Aggregation

In [230]:
frame = pd.DataFrame({ 'color': ['white','red','green','red','green'],
                       'price1': [5.56, 4.20, 1.30, 0.56, 2.75],
                       'price2': [4.75,4.12,1.60,0.75,3.15]})
frame

Unnamed: 0,color,price1,price2
0,white,5.56,4.75
1,red,4.2,4.12
2,green,1.3,1.6
3,red,0.56,0.75
4,green,2.75,3.15


In [231]:
sums = frame.groupby('color').sum().add_prefix('tot_')
sums

Unnamed: 0_level_0,tot_price1,tot_price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,4.05,4.75
red,4.76,4.87
white,5.56,4.75


In [232]:
pd.merge(frame, sums, left_on='color', right_index=True)

Unnamed: 0,color,price1,price2,tot_price1,tot_price2
0,white,5.56,4.75,5.56,4.75
1,red,4.2,4.12,4.76,4.87
3,red,0.56,0.75,4.76,4.87
2,green,1.3,1.6,4.05,4.75
4,green,2.75,3.15,4.05,4.75


In [233]:
frame.groupby('color').transform(np.sum).add_prefix('tot_')

Unnamed: 0,tot_price1,tot_price2
0,5.56,4.75
1,4.76,4.87
2,4.05,4.75
3,4.76,4.87
4,4.05,4.75


In [234]:
frame = pd.DataFrame({ 'color': ['white','black','white','white','black','black'],
                       'status': ['up','up','down','down','down','up'],
                       'price1': [12.33,14.55,22.34,27.84,23.40,18.33],
                       'price2': [11.23,31.80,29.99,31.18,18.25,22.44]})
frame

Unnamed: 0,color,status,price1,price2
0,white,up,12.33,11.23
1,black,up,14.55,31.8
2,white,down,22.34,29.99
3,white,down,27.84,31.18
4,black,down,23.4,18.25
5,black,up,18.33,22.44


In [235]:
frame.groupby(['color','status']).apply( lambda x: x.max())

Unnamed: 0_level_0,Unnamed: 1_level_0,color,status,price1,price2
color,status,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
black,down,black,down,23.4,18.25
black,up,black,up,18.33,31.8
white,down,white,down,27.84,31.18
white,up,white,up,12.33,11.23


In [236]:
frame.rename(index=reindex, columns=recolumn)

Unnamed: 0,color,status,price1,price2
first,white,up,12.33,11.23
second,black,up,14.55,31.8
third,white,down,22.34,29.99
fourth,white,down,27.84,31.18
fifth,black,down,23.4,18.25
5,black,up,18.33,22.44


In [237]:
temp = pd.date_range('1/1/2015', periods=10, freq= 'H')
temp

DatetimeIndex(['2015-01-01 00:00:00', '2015-01-01 01:00:00',
               '2015-01-01 02:00:00', '2015-01-01 03:00:00',
               '2015-01-01 04:00:00', '2015-01-01 05:00:00',
               '2015-01-01 06:00:00', '2015-01-01 07:00:00',
               '2015-01-01 08:00:00', '2015-01-01 09:00:00'],
              dtype='datetime64[ns]', freq='H')

In [238]:
timeseries = pd.Series(np.random.rand(10), index=temp)
timeseries

2015-01-01 00:00:00    0.903557
2015-01-01 01:00:00    0.605940
2015-01-01 02:00:00    0.087463
2015-01-01 03:00:00    0.876693
2015-01-01 04:00:00    0.755643
2015-01-01 05:00:00    0.786863
2015-01-01 06:00:00    0.236877
2015-01-01 07:00:00    0.581383
2015-01-01 08:00:00    0.047445
2015-01-01 09:00:00    0.795947
Freq: H, dtype: float64

In [239]:
timetable = pd.DataFrame( {'date': temp, 'value1': np.random.rand(10),
                                      'value2': np.random.rand(10)})
timetable

Unnamed: 0,date,value1,value2
0,2015-01-01 00:00:00,0.152978,0.113088
1,2015-01-01 01:00:00,0.48482,0.363529
2,2015-01-01 02:00:00,0.404091,0.63235
3,2015-01-01 03:00:00,0.426101,0.63025
4,2015-01-01 04:00:00,0.659161,0.469661
5,2015-01-01 05:00:00,0.568437,0.896988
6,2015-01-01 06:00:00,0.105109,0.627511
7,2015-01-01 07:00:00,0.781102,0.448832
8,2015-01-01 08:00:00,0.428939,0.80369
9,2015-01-01 09:00:00,0.382358,0.338802


In [240]:
timetable['cat'] = ['up','down','left','left','up','up','down','right','right','up']
timetable

Unnamed: 0,date,value1,value2,cat
0,2015-01-01 00:00:00,0.152978,0.113088,up
1,2015-01-01 01:00:00,0.48482,0.363529,down
2,2015-01-01 02:00:00,0.404091,0.63235,left
3,2015-01-01 03:00:00,0.426101,0.63025,left
4,2015-01-01 04:00:00,0.659161,0.469661,up
5,2015-01-01 05:00:00,0.568437,0.896988,up
6,2015-01-01 06:00:00,0.105109,0.627511,down
7,2015-01-01 07:00:00,0.781102,0.448832,right
8,2015-01-01 08:00:00,0.428939,0.80369,right
9,2015-01-01 09:00:00,0.382358,0.338802,up
