# Merge

df.merge nos permite combinar dataframes haciendo los joins típicos de bases de datos.

In [3]:
import pandas as pd

In [4]:
df1 =  pd.DataFrame({'data1' : range(5), 'key': list('abcde')})
df1

Unnamed: 0,data1,key
0,0,a
1,1,b
2,2,c
3,3,d
4,4,e


In [5]:
df2 = pd.DataFrame({'data2': range(7,10), 'key': list('ade')
})
df2

Unnamed: 0,data2,key
0,7,a
1,8,d
2,9,e


Por defecto, un inner join

In [6]:
df1.merge(df2)

Unnamed: 0,data1,key,data2
0,0,a,7
1,3,d,8
2,4,e,9


In [7]:
df1.merge(df2, how='left')

Unnamed: 0,data1,key,data2
0,0,a,7.0
1,1,b,
2,2,c,
3,3,d,8.0
4,4,e,9.0


In [6]:
df3 = pd.DataFrame({'data2': range(7,10), 'key': list('ada')
})
df3

Unnamed: 0,data2,key
0,7,a
1,8,d
2,9,a


In [7]:
df1.merge(df3)

Unnamed: 0,data1,key,data2
0,0,a,7
1,0,a,9
2,3,d,8


In [8]:
df4 = pd.DataFrame({'rkey': list('ddeebbfff'), 'values': range(4, 40, 4)})
df4

Unnamed: 0,rkey,values
0,d,4
1,d,8
2,e,12
3,e,16
4,b,20
5,b,24
6,f,28
7,f,32
8,f,36


Outer join conservará todas las filas, tanto de la izquierda como de la derecha.

In [10]:
df1.merge(df4, left_on='key', right_on='rkey', how='outer')

Unnamed: 0,data1,key,rkey,values
0,0.0,a,,
1,1.0,b,b,20.0
2,1.0,b,b,24.0
3,2.0,c,,
4,3.0,d,d,4.0
5,3.0,d,d,8.0
6,4.0,e,e,12.0
7,4.0,e,e,16.0
8,,,f,28.0
9,,,f,32.0


Si tenemos dos columnas con el mismo nombre sobre las que no cruzamos, pandas les asignará sufijos.

In [11]:
df1['X'] = 42.0

In [12]:
df4['X'] = 37.0

In [13]:
df1.merge(df4, left_on='key', right_on='rkey', how='outer')

Unnamed: 0,data1,key,X_x,rkey,values,X_y
0,0.0,a,42.0,,,
1,1.0,b,42.0,b,20.0,37.0
2,1.0,b,42.0,b,24.0,37.0
3,2.0,c,42.0,,,
4,3.0,d,42.0,d,4.0,37.0
5,3.0,d,42.0,d,8.0,37.0
6,4.0,e,42.0,e,12.0,37.0
7,4.0,e,42.0,e,16.0,37.0
8,,,,f,28.0,37.0
9,,,,f,32.0,37.0


In [14]:
df1.merge(df4, left_on='key', right_on='rkey', how='outer', suffixes=['_left', '_right'])

Unnamed: 0,data1,key,X_left,rkey,values,X_right
0,0.0,a,42.0,,,
1,1.0,b,42.0,b,20.0,37.0
2,1.0,b,42.0,b,24.0,37.0
3,2.0,c,42.0,,,
4,3.0,d,42.0,d,4.0,37.0
5,3.0,d,42.0,d,8.0,37.0
6,4.0,e,42.0,e,12.0,37.0
7,4.0,e,42.0,e,16.0,37.0
8,,,,f,28.0,37.0
9,,,,f,32.0,37.0


In [15]:
df1.merge(df2, left_index=True, right_on='data2', how='outer')

Unnamed: 0,data1,key_x,X,data2,key_y
2,0.0,a,42.0,0,
2,1.0,b,42.0,1,
2,2.0,c,42.0,2,
2,3.0,d,42.0,3,
2,4.0,e,42.0,4,
0,,,,7,a
1,,,,8,d
2,,,,9,e


## Concatenating along an axis

By default, side to side. The difference with a numpy array is that pandas will try to align the rows of a DataFrame according to their index values, generating NaNs if necessary.

In [16]:
df1

Unnamed: 0,data1,key,X
0,0,a,42.0
1,1,b,42.0
2,2,c,42.0
3,3,d,42.0
4,4,e,42.0


In [17]:
df2

Unnamed: 0,data2,key
0,7,a
1,8,d
2,9,e


In [18]:
pd.concat([df1, df2])

Unnamed: 0,X,data1,data2,key
0,42.0,0.0,,a
1,42.0,1.0,,b
2,42.0,2.0,,c
3,42.0,3.0,,d
4,42.0,4.0,,e
0,,,7.0,a
1,,,8.0,d
2,,,9.0,e


In [19]:
import numpy as np


In [20]:
array_1 = np.arange(36).reshape(6,6)
array_1

array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23],
       [24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35]])

In [21]:
array_2 = np.arange(18).reshape(3,6)
array_3 = np.arange(18).reshape(6,3)

In [22]:
np.concatenate([array_1, array_2])

array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23],
       [24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35],
       [ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17]])

In numpy, arrays must have matching dimensions

In [23]:
np.concatenate([array_1, array_2], axis=1)

ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [24]:
np.concatenate([array_1, array_3], axis=1)

array([[ 0,  1,  2,  3,  4,  5,  0,  1,  2],
       [ 6,  7,  8,  9, 10, 11,  3,  4,  5],
       [12, 13, 14, 15, 16, 17,  6,  7,  8],
       [18, 19, 20, 21, 22, 23,  9, 10, 11],
       [24, 25, 26, 27, 28, 29, 12, 13, 14],
       [30, 31, 32, 33, 34, 35, 15, 16, 17]])

In [25]:
series_1 = df1['data1']

In [26]:
series_2 = df2['data2']

In [27]:
series_3 = pd.Series(range(3,8,2))

Series will concatenated end to end by default

In [28]:
pd.concat([series_1, series_2, series_3])

0    0
1    1
2    2
3    3
4    4
0    7
1    8
2    9
0    3
1    5
2    7
dtype: int64

In [29]:
pd.concat([series_1, series_2, series_3], axis=1)

Unnamed: 0,data1,data2,0
0,0,7.0,3.0
1,1,8.0,5.0
2,2,9.0,7.0
3,3,,
4,4,,


In [30]:
pd.concat([series_1, series_2, series_3], axis=1, ignore_index=True)

Unnamed: 0,0,1,2
0,0,7.0,3.0
1,1,8.0,5.0
2,2,9.0,7.0
3,3,,
4,4,,


In [31]:
pd.concat([series_1, series_2, series_3], axis=1,keys=['s1', 's2', 's3'] )

Unnamed: 0,s1,s2,s3
0,0,7.0,3.0
1,1,8.0,5.0
2,2,9.0,7.0
3,3,,
4,4,,


#### Remove duplicates

In [32]:
df5 = pd.DataFrame({'key1': ['one'] * 3 + ['two'] * 4,
                   'key2': [1,1,2, 3,3,4,4,]})

In [33]:
df5

Unnamed: 0,key1,key2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [34]:
df5.drop_duplicates()

Unnamed: 0,key1,key2
0,one,1
2,one,2
3,two,3
5,two,4


In [35]:
df5

Unnamed: 0,key1,key2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [36]:
df5.duplicated()

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [37]:
df5.drop_duplicates(subset='key1')

Unnamed: 0,key1,key2
0,one,1
3,two,3


In [38]:
df5.drop_duplicates(keep='last')

Unnamed: 0,key1,key2
1,one,1
2,one,2
4,two,3
6,two,4


In [39]:
df5.drop_duplicates(subset='key1', keep='last')

Unnamed: 0,key1,key2
2,one,2
6,two,4


In [40]:
df5.drop_duplicates(subset='key1', keep='first')

Unnamed: 0,key1,key2
0,one,1
3,two,3


#### Renaming indexes 

In [41]:
df5.index[0] = 1

TypeError: Index does not support mutable operations

In [42]:
df5.index = list('abcdefg')
df5

Unnamed: 0,key1,key2
a,one,1
b,one,1
c,one,2
d,two,3
e,two,3
f,two,4
g,two,4


## Vectorized string operations in pandas

We can access vectorized string operations through the .str attribute of a string Series, such as a column in a dataframe. These operations mimic the classical string methods, but they operate on each element of the Series. We can also slice on .str.

In [43]:
a_string = 'muchas anios despues frente al peloton de fusilamiento...'

a_string.upper()

'MUCHAS ANIOS DESPUES FRENTE AL PELOTON DE FUSILAMIENTO...'

In [44]:
a_string[:6]

'muchas'

In [45]:
df5['animals'] = ['giraffe', 'nakedmolerat', 'bear', 'walrus', 'platypus', 'dog', 'cat']

df5

Unnamed: 0,key1,key2,animals
a,one,1,giraffe
b,one,1,nakedmolerat
c,one,2,bear
d,two,3,walrus
e,two,3,platypus
f,two,4,dog
g,two,4,cat


In [46]:
animals = df5['animals']

df5.dtypes

key1       object
key2        int64
animals    object
dtype: object

In [47]:
df5['key2'] ** 2

a     1
b     1
c     4
d     9
e     9
f    16
g    16
Name: key2, dtype: int64

In [48]:
df5['animals'][2:]

c        bear
d      walrus
e    platypus
f         dog
g         cat
Name: animals, dtype: object

In [49]:
animals.str.capitalize()

a         Giraffe
b    Nakedmolerat
c            Bear
d          Walrus
e        Platypus
f             Dog
g             Cat
Name: animals, dtype: object

In [50]:
animals.str.upper()

a         GIRAFFE
b    NAKEDMOLERAT
c            BEAR
d          WALRUS
e        PLATYPUS
f             DOG
g             CAT
Name: animals, dtype: object

In [51]:
animals.str.contains('a')

a     True
b     True
c     True
d     True
e     True
f    False
g     True
Name: animals, dtype: bool

In [52]:
animals.str.len()

a     7
b    12
c     4
d     6
e     8
f     3
g     3
Name: animals, dtype: int64

In [53]:
df5

Unnamed: 0,key1,key2,animals
a,one,1,giraffe
b,one,1,nakedmolerat
c,one,2,bear
d,two,3,walrus
e,two,3,platypus
f,two,4,dog
g,two,4,cat


In [54]:
df5[df5['animals'].str.len()>4]

Unnamed: 0,key1,key2,animals
a,one,1,giraffe
b,one,1,nakedmolerat
d,two,3,walrus
e,two,3,platypus


In [55]:
df5[animals.str.len()>4]

Unnamed: 0,key1,key2,animals
a,one,1,giraffe
b,one,1,nakedmolerat
d,two,3,walrus
e,two,3,platypus


In [56]:
series = pd.Series([True, True, False, False, False, True, True])

In [57]:
df5[series]

  """Entry point for launching an IPython kernel.


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match

# Group by 

Aggregation functions in pandas

In [10]:
df = pd.DataFrame({
    'data1': np.random.rand(5)*50+20,
    'data2': np.random.rand(5)*20+5,
    'key1' : ['one','two','one','two','one'],
    'key2' : list('aabba')
})
df


Unnamed: 0,data1,data2,key1,key2
0,42.147444,18.922851,one,a
1,31.48214,17.086809,two,a
2,27.218335,6.282102,one,b
3,63.449494,17.110029,two,b
4,40.821933,21.855211,one,a


In [11]:
gb = df.groupby('key1')

gb.sum()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
one,110.187712,47.060164
two,94.931634,34.196838


In [12]:
gb.mean()


Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
one,36.729237,15.686721
two,47.465817,17.098419


In [13]:
df.groupby('key2').sum()
df.groupby('key2').std()

Unnamed: 0_level_0,data1,data2
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
a,5.812879,2.405114
b,25.619298,7.656501


In [14]:
df.groupby(['key1','key2']).sum() 

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,82.969377,40.778062
one,b,27.218335,6.282102
two,a,31.48214,17.086809
two,b,63.449494,17.110029


We can apply one transformation to groups defined by two columns

In [15]:
df.groupby(['key1','key2']).size() 

key1  key2
one   a       2
      b       1
two   a       1
      b       1
dtype: int64

### Iterate over groups


In [16]:
for name, group in df.groupby('key1'):
    print(name, group)

one        data1      data2 key1 key2
0  42.147444  18.922851  one    a
2  27.218335   6.282102  one    b
4  40.821933  21.855211  one    a
two        data1      data2 key1 key2
1  31.482140  17.086809  two    a
3  63.449494  17.110029  two    b


In [17]:
 !wget https://github.com/wesm/pydata-book/raw/1st-edition/ch08/tips.csv

--2018-05-29 11:27:13--  https://github.com/wesm/pydata-book/raw/1st-edition/ch08/tips.csv
Resolving github.com (github.com)... 192.30.253.112, 192.30.253.113
Connecting to github.com (github.com)|192.30.253.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/wesm/pydata-book/1st-edition/ch08/tips.csv [following]
--2018-05-29 11:27:13--  https://raw.githubusercontent.com/wesm/pydata-book/1st-edition/ch08/tips.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.132.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.132.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7943 (7,8K) [text/plain]
Saving to: ‘tips.csv’


2018-05-29 11:27:14 (34,7 MB/s) - ‘tips.csv’ saved [7943/7943]



In [18]:
tips = pd.read_csv('tips.csv')
tips.shape

(244, 7)

In [19]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [60]:
# La media de propinas en función del género
tips.groupby('sex').mean()


Unnamed: 0_level_0,total_bill,tip,size
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,18.056897,2.833448,2.45977
Male,20.744076,3.089618,2.630573


In [20]:
tips.groupby('sex')['tip'].mean()

sex
Female    2.833448
Male      3.089618
Name: tip, dtype: float64

In [22]:
tips.groupby('sex').mean()['tip'] 

sex
Female    2.833448
Male      3.089618
Name: tip, dtype: float64

In [23]:
#Calculamos el % de la propina
tips['tip_pct']= tips['tip']/tips['total_bill']
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


We can also do several aggregations in one go.

In [64]:
tips.groupby(['sex'])['tip_pct'].agg(['mean','std'])

Unnamed: 0_level_0,mean,std
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,0.166491,0.053632
Male,0.157651,0.064778


#### Exercise

Calculate z-score for each tip_pct, relative to the distribution of tip_pct for each gender.

$$ z_i = \frac{X_i - \mu_X}{\sigma_X}$$

In [65]:
stats= tips.groupby(['sex'])['tip_pct'].agg(['mean','std'])
stats


Unnamed: 0_level_0,mean,std
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,0.166491,0.053632
Male,0.157651,0.064778


In [66]:
annotated=tips.merge(stats, left_on='sex', right_index=True)

In [67]:
annotated['z-score']= (annotated['tip_pct']- annotated['mean'])/ annotated['std']
annotated.sample(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct,mean,std,z-score
133,12.26,2.0,Female,No,Thur,Lunch,2,0.163132,0.166491,0.053632,-0.062623
9,14.78,3.23,Male,No,Sun,Dinner,2,0.218539,0.157651,0.064778,0.939951
96,27.28,4.0,Male,Yes,Fri,Dinner,2,0.146628,0.157651,0.064778,-0.170166
97,12.03,1.5,Male,Yes,Fri,Dinner,2,0.124688,0.157651,0.064778,-0.508851
102,44.3,2.5,Female,Yes,Sat,Dinner,3,0.056433,0.166491,0.053632,-2.052094


If we need to apply different sets of aggregations to specific columns, we can pass a dictionary to .agg. In this dictionary, keys will be column names and values will be aggregations. They can be specified with one of the valid strings, or passing a function to apply to a whole Series.

In [68]:
functions_to_use_to_aggreate= {
    'tip': ['mean','std',np.sum],
    'total_bill': 'sum'}
functions_to_use_to_aggreate

{'tip': ['mean', 'std', <function numpy.core.fromnumeric.sum>],
 'total_bill': 'sum'}

In [70]:
multiple_aggregations=tips.groupby('smoker').agg(functions_to_use_to_aggreate)

In [71]:
multiple_aggregations

Unnamed: 0_level_0,tip,tip,tip,total_bill
Unnamed: 0_level_1,mean,std,sum,sum
smoker,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
No,2.991854,1.37719,451.77,2897.43
Yes,3.00871,1.401468,279.81,1930.34


In [72]:
multiple_aggregations['tip']['mean']

smoker
No     2.991854
Yes    3.008710
Name: mean, dtype: float64

In [73]:
gb= tips.groupby('smoker')
gb

<pandas.core.groupby.DataFrameGroupBy object at 0x7f530c318f60>

Group by objects also have an apply method, like Series. It processes each group with a function that takes a single argument: a DataFrame or Series. In this example, this function is top().

In [24]:
def top(df, n=2, column='tip'):
    return df.sort_values(by='tip')[-n:]

top(tips, n=10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
239,29.03,5.92,Male,No,Sat,Dinner,3,0.203927
47,32.4,6.0,Male,No,Sun,Dinner,4,0.185185
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
214,28.17,6.5,Female,Yes,Sat,Dinner,3,0.230742
141,34.3,6.7,Male,No,Thur,Lunch,6,0.195335
59,48.27,6.73,Male,No,Sat,Dinner,4,0.139424
23,39.42,7.58,Male,No,Sat,Dinner,4,0.192288
212,48.33,9.0,Male,No,Sat,Dinner,4,0.18622
170,50.81,10.0,Male,Yes,Sat,Dinner,3,0.196812


In [75]:
gb.apply(top) 

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,23,39.42,7.58,Male,No,Sat,Dinner,4,0.192288
No,212,48.33,9.0,Male,No,Sat,Dinner,4,0.18622
Yes,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
Yes,170,50.81,10.0,Male,Yes,Sat,Dinner,3,0.196812


If we need to modify one of the default arguments of top(), we need to generate a new function that takes a single argument. We can do this with a lambda, an anonymous function. This is known as [Currying](https://en.wikipedia.org/wiki/Currying) in CS.

In [76]:
gb.apply(lambda df: top(df,n=5)) 

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,47,32.4,6.0,Male,No,Sun,Dinner,4,0.185185
No,141,34.3,6.7,Male,No,Thur,Lunch,6,0.195335
No,59,48.27,6.73,Male,No,Sat,Dinner,4,0.139424
No,23,39.42,7.58,Male,No,Sat,Dinner,4,0.192288
No,212,48.33,9.0,Male,No,Sat,Dinner,4,0.18622
Yes,211,25.89,5.16,Male,Yes,Sat,Dinner,4,0.199305
Yes,181,23.33,5.65,Male,Yes,Sun,Dinner,2,0.242177
Yes,214,28.17,6.5,Female,Yes,Sat,Dinner,3,0.230742
Yes,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
Yes,170,50.81,10.0,Male,Yes,Sat,Dinner,3,0.196812


Or, we can give apply the arguments it will have to use when it uses top

In [77]:
gb.apply(top, n=5, column='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,47,32.4,6.0,Male,No,Sun,Dinner,4,0.185185
No,141,34.3,6.7,Male,No,Thur,Lunch,6,0.195335
No,59,48.27,6.73,Male,No,Sat,Dinner,4,0.139424
No,23,39.42,7.58,Male,No,Sat,Dinner,4,0.192288
No,212,48.33,9.0,Male,No,Sat,Dinner,4,0.18622
Yes,211,25.89,5.16,Male,Yes,Sat,Dinner,4,0.199305
Yes,181,23.33,5.65,Male,Yes,Sun,Dinner,2,0.242177
Yes,214,28.17,6.5,Female,Yes,Sat,Dinner,3,0.230742
Yes,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
Yes,170,50.81,10.0,Male,Yes,Sat,Dinner,3,0.196812
