# Aggregations, Joins, Subqueries in Pandas
PANDAS COMPARISON WITH SQL: https://pandas.pydata.org/docs/getting_started/comparison/comparison_with_sql.html

__Groupby__ means:
* _Split_ the original object into sets
* _Apply_ function on each subset
* _Combine_ the results

__Apply__ may do the following:
* _Aggregate_ − summary stats
* Group-specific _transformations_
* _Filter_ on condition

In [20]:
import numpy as np
import pandas as pd

## 1. Aggregations

### 1a. Aggregations on entire dataframe
__WINDOW FUNCTIONS__ - used to find trends in data graphically by smoothing the curve (helpful when there is a lot of data)
* __df.rolling()__ - rolling window calculations; __window__=window size, __min_periods__=min num observations in window required to have a value.
* __df.expanding()__ - same as rolling, but uses all the data up to that point in time. These two statements are equivalent: [df.rolling(window=len(df), min_periods=1).mean()] = [df.expanding(min_periods=1).mean()]
* __df.ewm()__ - exponentially weighted window similar to expanding window, but each prior point is exponentially weighted down relative to the current point

In [69]:
# Sample season records, written one tuple per row: (Team, Rank, Year, Points).
ipl_columns = ('Team', 'Rank', 'Year', 'Points')
ipl_rows = [
    (1, 1, 2014, 876),
    (1, 2, 2015, 789),
    (2, 2, 2014, 863),
    (2, 3, 2015, 673),
    (3, 3, 2014, 741),
    (4, 4, 2015, 812),
    (3, 1, 2016, 756),
    (3, 1, 2017, 788),
    (1, 2, 2016, 694),
    (5, 4, 2014, 701),
    (5, 1, 2015, 804),
    (1, 2, 2017, 690),
]
# Pivot the records into the column -> list-of-values mapping used to build the frame.
ipl_data = {name: list(values) for name, values in zip(ipl_columns, zip(*ipl_rows))}
df = pd.DataFrame(ipl_data)
df

Unnamed: 0,Team,Rank,Year,Points
0,1,1,2014,876
1,1,2,2015,789
2,2,2,2014,863
3,2,3,2015,673
4,3,3,2014,741
5,4,4,2015,812
6,3,1,2016,756
7,3,1,2017,788
8,1,2,2016,694
9,5,4,2014,701


In [72]:
# Apply an aggregation to every column of the DataFrame.
# The window covers the current row plus up to two preceding rows; min_periods=1
# lets the first rows produce a value instead of NaN.
r = df.rolling(window=3, min_periods=1)
# Use the string alias "sum": passing np.sum is deprecated (pandas >= 2.1 emits a
# FutureWarning) because pandas swaps in its own implementation anyway.
r.agg("sum")

Unnamed: 0,Team,Rank,Year,Points
0,1.0,1.0,2014.0,876.0
1,2.0,3.0,4029.0,1665.0
2,4.0,5.0,6043.0,2528.0
3,5.0,7.0,6044.0,2325.0
4,7.0,8.0,6043.0,2277.0
5,9.0,10.0,6044.0,2226.0
6,10.0,8.0,6045.0,2309.0
7,10.0,6.0,6048.0,2356.0
8,7.0,4.0,6049.0,2238.0
9,9.0,7.0,6047.0,2183.0


In [70]:
# Aggregate a single column of the rolling window (3-row window, partial windows allowed).
r = df.rolling(window=3, min_periods=1)
# String alias instead of np.sum: same result, without the pandas >= 2.1 FutureWarning.
r['Points'].agg("sum")

0      876.0
1     1665.0
2     2528.0
3     2325.0
4     2277.0
5     2226.0
6     2309.0
7     2356.0
8     2238.0
9     2183.0
10    2199.0
11    2195.0
Name: Points, dtype: float64

In [73]:
# Aggregate several selected columns of the rolling window at once.
r = df.rolling(window=3, min_periods=1)
# String alias instead of np.sum: same result, without the pandas >= 2.1 FutureWarning.
r[['Points', 'Rank']].agg("sum")

Unnamed: 0,Points,Rank
0,876.0,1.0
1,1665.0,3.0
2,2528.0,5.0
3,2325.0,7.0
4,2277.0,8.0
5,2226.0,10.0
6,2309.0,8.0
7,2356.0,6.0
8,2238.0,4.0
9,2183.0,7.0


In [74]:
# Apply several aggregation functions to one column; each function becomes an output column.
r = df.rolling(window=3, min_periods=1)
# String aliases name the output columns "sum" and "mean" exactly as np.sum/np.mean did,
# but avoid the FutureWarning pandas >= 2.1 raises for NumPy callables in agg().
r['Points'].agg(["sum", "mean"])

Unnamed: 0,sum,mean
0,876.0,876.0
1,1665.0,832.5
2,2528.0,842.666667
3,2325.0,775.0
4,2277.0,759.0
5,2226.0,742.0
6,2309.0,769.666667
7,2356.0,785.333333
8,2238.0,746.0
9,2183.0,727.666667


In [75]:
# Apply several aggregation functions to several columns; the result has
# two-level column labels (column name, function name).
r = df.rolling(window=3, min_periods=1)
# String aliases replace np.sum/np.mean, which are deprecated as agg() arguments in pandas >= 2.1.
r[['Points', 'Rank']].aggregate(["sum", "mean"])

Unnamed: 0_level_0,Points,Points,Rank,Rank
Unnamed: 0_level_1,sum,mean,sum,mean
0,876.0,876.0,1.0,1.0
1,1665.0,832.5,3.0,1.5
2,2528.0,842.666667,5.0,1.666667
3,2325.0,775.0,7.0,2.333333
4,2277.0,759.0,8.0,2.666667
5,2226.0,742.0,10.0,3.333333
6,2309.0,769.666667,8.0,2.666667
7,2356.0,785.333333,6.0,2.0
8,2238.0,746.0,4.0,1.333333
9,2183.0,727.666667,7.0,2.333333


In [76]:
# Apply a different aggregation to each column via a {column: function} mapping.
r = df.rolling(window=3, min_periods=1)
# String aliases replace np.sum/np.mean, which are deprecated as agg() arguments in pandas >= 2.1.
r.aggregate({'Points': "sum", 'Rank': "mean"})

Unnamed: 0,Points,Rank
0,876.0,1.0
1,1665.0,1.5
2,2528.0,1.666667
3,2325.0,2.333333
4,2277.0,2.666667
5,2226.0,3.333333
6,2309.0,2.666667
7,2356.0,2.0
8,2238.0,1.333333
9,2183.0,2.333333


### 1b. Aggregations with Groupby()

In [29]:
# Season records with team names, one tuple per row: (Team, Rank, Year, Points).
# Note that 'kings' (lower case) is a distinct label from 'Kings'.
ipl_columns = ('Team', 'Rank', 'Year', 'Points')
ipl_rows = [
    ('Riders', 1, 2014, 876),
    ('Riders', 2, 2015, 789),
    ('Angels', 2, 2014, 863),
    ('Angels', 3, 2015, 673),
    ('Kings', 3, 2014, 741),
    ('kings', 4, 2015, 812),
    ('Kings', 1, 2016, 756),
    ('Kings', 1, 2017, 788),
    ('Riders', 2, 2016, 694),
    ('Royals', 4, 2014, 701),
    ('Royals', 1, 2015, 804),
    ('Riders', 2, 2017, 690),
]
# Pivot the records into the column -> list-of-values mapping used to build the frame.
ipl_data = {name: list(values) for name, values in zip(ipl_columns, zip(*ipl_rows))}
df = pd.DataFrame(ipl_data)
df

Unnamed: 0,Team,Rank,Year,Points
0,Riders,1,2014,876
1,Riders,2,2015,789
2,Angels,2,2014,863
3,Angels,3,2015,673
4,Kings,3,2014,741
5,kings,4,2015,812
6,Kings,1,2016,756
7,Kings,1,2017,788
8,Riders,2,2016,694
9,Royals,4,2014,701


### Groupby() returns groups

In [13]:
# Grouping by ONE COLUMN: show the lazy GroupBy object, then its
# {group key: row labels} mapping.
team_groups = df.groupby('Team')
print(team_groups, '\n')
print(team_groups.groups, '\n')

# Grouping by SEVERAL COLUMNS: the group keys become tuples.
team_year_groups = df.groupby(['Team', 'Year'])
print(team_year_groups.groups, '\n')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fe56030d2d0> 

{'Angels': [2, 3], 'Kings': [4, 6, 7], 'Riders': [0, 1, 8, 11], 'Royals': [9, 10], 'kings': [5]} 

{('Angels', 2014): [2], ('Angels', 2015): [3], ('Kings', 2014): [4], ('Kings', 2016): [6], ('Kings', 2017): [7], ('Riders', 2014): [0], ('Riders', 2015): [1], ('Riders', 2016): [8], ('Riders', 2017): [11], ('Royals', 2014): [9], ('Royals', 2015): [10], ('kings', 2015): [5]} 



In [49]:
# COMPOSITION OF GROUPED OBJECT
# Iterating a GroupBy yields 2-tuples of (group key, sub-DataFrame).
# The key has the type of the grouping column's values: an int here, since we group by 'Year'.
grouped = df.groupby('Year')
for group in grouped:
    key, frame = group
    # One item per line, followed by a blank line between groups.
    print(type(group), type(key), type(frame), len(group), group, '', sep='\n')

<class 'tuple'>
<class 'int'>
<class 'pandas.core.frame.DataFrame'>
2
(2014,      Team  Rank  Year  Points
0  Riders     1  2014     876
2  Angels     2  2014     863
4   Kings     3  2014     741
9  Royals     4  2014     701)

<class 'tuple'>
<class 'int'>
<class 'pandas.core.frame.DataFrame'>
2
(2015,       Team  Rank  Year  Points
1   Riders     2  2015     789
3   Angels     3  2015     673
5    kings     4  2015     812
10  Royals     1  2015     804)

<class 'tuple'>
<class 'int'>
<class 'pandas.core.frame.DataFrame'>
2
(2016,      Team  Rank  Year  Points
6   Kings     1  2016     756
8  Riders     2  2016     694)

<class 'tuple'>
<class 'int'>
<class 'pandas.core.frame.DataFrame'>
2
(2017,       Team  Rank  Year  Points
7    Kings     1  2017     788
11  Riders     2  2017     690)



In [31]:
# ITERATE OVER GROUPS
# Unpack each (key, sub-frame) pair directly in the loop header.
grouped = df.groupby('Year')

for name, group in grouped:
    # name: the group key (the Year value); group: DataFrame holding that year's rows
    print(name, group, sep='\n', end=' \n\n')

2014
     Team  Rank  Year  Points
0  Riders     1  2014     876
2  Angels     2  2014     863
4   Kings     3  2014     741
9  Royals     4  2014     701 

2015
      Team  Rank  Year  Points
1   Riders     2  2015     789
3   Angels     3  2015     673
5    kings     4  2015     812
10  Royals     1  2015     804 

2016
     Team  Rank  Year  Points
6   Kings     1  2016     756
8  Riders     2  2016     694 

2017
      Team  Rank  Year  Points
7    Kings     1  2017     788
11  Riders     2  2017     690 



In [32]:
# Fetch a single group by its key; the result is an ordinary DataFrame.
temp = grouped.get_group(2015)
print(type(temp), temp, sep='\n')

<class 'pandas.core.frame.DataFrame'>
      Team  Rank  Year  Points
1   Riders     2  2015     789
3   Angels     3  2015     673
5    kings     4  2015     812
10  Royals     1  2015     804


In [18]:
# SAME ROWS WITHOUT GROUPBY: select them with a boolean mask
is_2015 = df['Year'] == 2015
df.loc[is_2015]

Unnamed: 0,Team,Rank,Year,Points
1,Riders,2,2015,789
3,Angels,3,2015,673
5,kings,4,2015,812
10,Royals,1,2015,804


### Aggregation: df.groupby().agg(), aggregated stats by group
__Several aggregation operations__ can be performed on the grouped data

In [52]:
# agg() applied to every non-key column of each group's DataFrame.
# np.size is passed as a callable, giving one count per column per team.
grouped = df.groupby('Team')
grouped.aggregate(np.size)

Unnamed: 0_level_0,Rank,Year,Points
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Angels,2,2,2
Kings,3,3,3
Riders,4,4,4
Royals,2,2,2
kings,1,1,1


In [53]:
# agg() on one column of each group's DataFrame: mean Points per team.
grouped = df.groupby('Team')
# String alias instead of np.mean: same result, without the FutureWarning pandas >= 2.1
# raises for NumPy callables passed to agg().
grouped['Points'].agg("mean")

Team
Angels    768.000000
Kings     761.666667
Riders    762.250000
Royals    752.500000
kings     812.000000
Name: Points, dtype: float64

In [55]:
# MULTIPLE AGGREGATION FUNCTIONS ON ONE COL
# Each function becomes one output column, labelled by the function name.
grouped = df.groupby('Team')
# String aliases instead of np.sum/np.mean/np.std (deprecated as agg() arguments in
# pandas >= 2.1). "std" is the pandas sample standard deviation (ddof=1), which is what
# the np.std spelling already produced here; a single-row group therefore yields NaN.
grouped['Points'].agg(["sum", "mean", "std"])

Unnamed: 0_level_0,sum,mean,std
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Angels,1536,768.0,134.350288
Kings,2285,761.666667,24.006943
Riders,3049,762.25,88.567771
Royals,1505,752.5,72.831998
kings,812,812.0,


In [57]:
# MULTIPLE AGGREGATION FUNCTIONS ON MANY COLs
# The result has two-level column labels: (column name, function name).
grouped = df.groupby('Team')
# String aliases instead of np.sum/np.mean/np.std (deprecated as agg() arguments in
# pandas >= 2.1); "std" is the pandas sample standard deviation (ddof=1).
grouped[['Points', 'Rank']].agg(["sum", "mean", "std"])

Unnamed: 0_level_0,Points,Points,Points,Rank,Rank,Rank
Unnamed: 0_level_1,sum,mean,std,sum,mean,std
Team,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Angels,1536,768.0,134.350288,5,2.5,0.707107
Kings,2285,761.666667,24.006943,5,1.666667,1.154701
Riders,3049,762.25,88.567771,7,1.75,0.5
Royals,1505,752.5,72.831998,5,2.5,2.12132
kings,812,812.0,,4,4.0,


### Transformation: df.groupby().transform(), returns same data size as original df
Applied to __group__ or __column__, returns an obj __w/same index size__

In [35]:
# Re-display the current DataFrame for reference.
df

Unnamed: 0,Team,Rank,Year,Points
0,Riders,1,2014,876
1,Riders,2,2015,789
2,Angels,2,2014,863
3,Angels,3,2015,673
4,Kings,3,2014,741
5,kings,4,2015,812
6,Kings,1,2016,756
7,Kings,1,2017,788
8,Riders,2,2016,694
9,Royals,4,2014,701


In [36]:
def score(x):
    """Standardise values within a group: (x - group mean) / group std, scaled by 10."""
    centered = x - x.mean()
    return centered / x.std() * 10


# transform() returns one row per original row, so the result aligns with df's index.
grouped = df.groupby('Team')
print(grouped.transform(score))

         Rank       Year     Points
0  -15.000000 -11.618950  12.843272
1    5.000000  -3.872983   3.020286
2   -7.071068  -7.071068   7.071068
3    7.071068   7.071068  -7.071068
4   11.547005 -10.910895  -8.608621
5         NaN        NaN        NaN
6   -5.773503   2.182179  -2.360428
7   -5.773503   8.728716  10.969049
8    5.000000   3.872983  -7.705963
9    7.071068  -7.071068  -7.071068
10  -7.071068   7.071068   7.071068
11   5.000000  11.618950  -8.157595


### Filtration: df.groupby().filter(), returns subset of df
Filter data on criteria, return subset

In [47]:
# Iterating a GroupBy yields 2-tuples of (group key, sub-DataFrame).
# Grouping by 'Team' makes each key a str (the team name).
grouped = df.groupby('Team')
for group in grouped:
    key, frame = group
    # One item per line, followed by a blank line between groups.
    print(type(group), type(key), type(frame), len(group), group, '', sep='\n')

<class 'tuple'>
<class 'str'>
<class 'pandas.core.frame.DataFrame'>
2
('Angels',      Team  Rank  Year  Points
2  Angels     2  2014     863
3  Angels     3  2015     673)

<class 'tuple'>
<class 'str'>
<class 'pandas.core.frame.DataFrame'>
2
('Kings',     Team  Rank  Year  Points
4  Kings     3  2014     741
6  Kings     1  2016     756
7  Kings     1  2017     788)

<class 'tuple'>
<class 'str'>
<class 'pandas.core.frame.DataFrame'>
2
('Riders',       Team  Rank  Year  Points
0   Riders     1  2014     876
1   Riders     2  2015     789
8   Riders     2  2016     694
11  Riders     2  2017     690)

<class 'tuple'>
<class 'str'>
<class 'pandas.core.frame.DataFrame'>
2
('Royals',       Team  Rank  Year  Points
9   Royals     4  2014     701
10  Royals     1  2015     804)

<class 'tuple'>
<class 'str'>
<class 'pandas.core.frame.DataFrame'>
2
('kings',     Team  Rank  Year  Points
5  kings     4  2015     812)



In [58]:
# RETURN TEAMS THAT PARTICIPATED 3 TIMES OR MORE
# The predicate receives each group's DataFrame (not the group name seen when
# iterating a groupby) and must return a single True/False for the whole group.
df.groupby('Team').filter(lambda team_rows: len(team_rows.index) >= 3)

Unnamed: 0,Team,Rank,Year,Points
0,Riders,1,2014,876
1,Riders,2,2015,789
4,Kings,3,2014,741
6,Kings,1,2016,756
7,Kings,1,2017,788
8,Riders,2,2016,694
11,Riders,2,2017,690


## 2. Merges

In [None]:
# REFERENCE ONLY: signature of DataFrame.merge with its defaults.
# Kept as an unused string literal so the cell survives "Restart & Run All":
# executed as real code it raises NameError, because the notebook imports pandas
# as `pd` and never defines `DataFrame` or `right_df`.
'''
DataFrame.merge( right_df,
                 how='inner',               # default ‘inner’, {‘left’, ‘right’, ‘outer’, ‘inner’, ‘cross’}
                 on=None,                   # join key(s)
                 left_on=None,              # if join key(s) have different names in the two dfs (but are same)
                 right_on=None,
                 left_index=False,          # use index as join key(s)
                 right_index=False,
                 sort=False,                # Sort join keys lexicographically
                 suffixes=('_x', '_y'),
                 copy=True,
                 indicator=False,           # col _merge saying source of each row
                 validate=None,             # check merge keys if they are “1:1”, “1:m”, “m:1”, “m:m”
               )
'''
pass

In [83]:
# Two small frames sharing a "key" column, used by the merge examples.
# df2 lists key "D" twice, so a join on "key" yields two rows for D.
# A seeded generator makes the random "value" columns identical on every
# Restart & Run All (the unseeded np.random.randn gave new numbers each run).
rng = np.random.default_rng(42)
df1 = pd.DataFrame({"key": ["A", "B", "C", "D"], "value": rng.standard_normal(4)})
df2 = pd.DataFrame({"key": ["B", "D", "D", "E"], "value": rng.standard_normal(4)})

In [84]:
# With no `how` argument, merge performs an INNER JOIN: only keys present in both frames survive.
df1.merge(df2, on="key")

Unnamed: 0,key,value_x,value_y
0,B,0.934074,-0.074744
1,D,0.870659,-0.139245
2,D,0.870659,-0.418643


In [85]:
# Merge one DataFrame's column with another DataFrame's index:
# df1's "key" column is matched against the index of indexed_df2.
indexed_df2 = df2.set_index("key")
df1.merge(indexed_df2, left_on="key", right_index=True)

Unnamed: 0,key,value_x,value_y
1,B,0.934074,-0.074744
3,D,0.870659,-0.139245
3,D,0.870659,-0.418643


In [86]:
# LEFT JOIN: keep every row of df1; keys missing from df2 get NaN in value_y.
df1.merge(df2, how="left", on="key")

Unnamed: 0,key,value_x,value_y
0,A,-0.679279,
1,B,0.934074,-0.074744
2,C,0.064813,
3,D,0.870659,-0.139245
4,D,0.870659,-0.418643


In [87]:
# RIGHT JOIN: keep every row of df2; keys missing from df1 get NaN in value_x.
df1.merge(df2, how="right", on="key")


Unnamed: 0,key,value_x,value_y
0,B,0.934074,-0.074744
1,D,0.870659,-0.139245
2,D,0.870659,-0.418643
3,E,,0.860307


In [88]:
# FULL OUTER JOIN: keep the keys of both frames, filling the missing side with NaN.
df1.merge(df2, how="outer", on="key")

Unnamed: 0,key,value_x,value_y
0,A,-0.679279,
1,B,0.934074,-0.074744
2,C,0.064813,
3,D,0.870659,-0.139245
4,D,0.870659,-0.418643
5,E,,0.860307


## 3. Subquery

In [77]:
# Who owns what, and what it is worth; one tuple per row: (Person, Belonging, Value).
belonging_rows = [
    ('Adam', 'House', 300),
    ('Adam', 'Car', 10),
    ('Cesar', 'Car', 12),
    ('Diana', 'House', 450),
    ('Diana', 'Car', 15),
    ('Diana', 'Bike', 2),
    ('Erika', 'House', 600),
    ('Erika', 'Car', 11),
]
df = pd.DataFrame(belonging_rows, columns=['Person', 'Belonging', 'Value'])
df

Unnamed: 0,Person,Belonging,Value
0,Adam,House,300
1,Adam,Car,10
2,Cesar,Car,12
3,Diana,House,450
4,Diana,Car,15
5,Diana,Bike,2
6,Erika,House,600
7,Erika,Car,11


__Task: find the value of each person's car, for people whose house is worth more than 400__

In [78]:
# SUBQUERY IN SQL
# The query and its expected result are kept in an unused string literal for reference only.
# The trailing `pass` means the string is not the cell's last expression, so it is not echoed.
'''
SELECT * 
FROM df 
WHERE person IN 
    (SELECT person 
        FROM df 
        WHERE belonging='House' AND value>400)
AND belonging='Car';

person      belonging   value     
----------  ----------  ----------
Diana       Car         15        
Erika       Car         11           
'''
pass

In [81]:
# Inner "subquery": people who own a house worth more than 400.
owns_pricey_house = (df['Belonging'] == 'House') & (df['Value'] > 400)
persons = df.loc[owns_pricey_house, 'Person'].values

# Outer query: the car rows belonging to those people.
is_car = df['Belonging'] == 'Car'
df[df['Person'].isin(persons) & is_car]

Unnamed: 0,Person,Belonging,Value
4,Diana,Car,15
7,Erika,Car,11
