In [4]:
import pandas as pd
import numpy as np
# Demo data: a 10-row, 4-column frame filled with draws from np.random.randn.
# No column names or index are passed, so pandas assigns default integer labels.
df = pd.DataFrame(data=np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,-0.158254,0.615733,0.453551,-0.254799
1,1.766605,-1.291921,0.546628,0.146044
2,-0.166655,-0.19646,-0.70472,-0.587469
3,1.167039,0.685617,-1.510551,0.709407
4,0.825522,0.431008,-0.272529,-1.750112
5,0.806591,-0.748442,2.422439,-1.088241
6,1.734249,-1.321831,-1.678965,0.443713
7,-0.445711,0.249139,0.162587,0.719053
8,-0.5125,1.254864,0.688696,-0.529149
9,1.48722,1.335136,0.612478,-1.341393


In [5]:
# Split the frame into three consecutive row blocks by position:
# rows 0-2, rows 3-6, and rows 7 through the end (a stop of None means "to the end").
pieces = [
    df.iloc[start:stop]
    for start, stop in ((0, 3), (3, 7), (7, None))
]
pieces

[          0         1         2         3
 0 -0.158254  0.615733  0.453551 -0.254799
 1  1.766605 -1.291921  0.546628  0.146044
 2 -0.166655 -0.196460 -0.704720 -0.587469,
           0         1         2         3
 3  1.167039  0.685617 -1.510551  0.709407
 4  0.825522  0.431008 -0.272529 -1.750112
 5  0.806591 -0.748442  2.422439 -1.088241
 6  1.734249 -1.321831 -1.678965  0.443713,
           0         1         2         3
 7 -0.445711  0.249139  0.162587  0.719053
 8 -0.512500  1.254864  0.688696 -0.529149
 9  1.487220  1.335136  0.612478 -1.341393]

In [7]:
# Stack the row blocks back on top of each other (axis=0 is row-wise, the default)
# to rebuild a single frame from the list of pieces.
pd.concat(pieces, axis=0)

Unnamed: 0,0,1,2,3
0,-0.158254,0.615733,0.453551,-0.254799
1,1.766605,-1.291921,0.546628,0.146044
2,-0.166655,-0.19646,-0.70472,-0.587469
3,1.167039,0.685617,-1.510551,0.709407
4,0.825522,0.431008,-0.272529,-1.750112
5,0.806591,-0.748442,2.422439,-1.088241
6,1.734249,-1.321831,-1.678965,0.443713
7,-0.445711,0.249139,0.162587,0.719053
8,-0.5125,1.254864,0.688696,-0.529149
9,1.48722,1.335136,0.612478,-1.341393


To join two data-frames, we use merge() in Pandas (equivalent to JOIN in SQL).

In [8]:
# Two small frames that share a 'key' column, used below to demonstrate merge().
# Each carries one value column of its own: 'lval' on the left, 'rval' on the right.
left = pd.DataFrame(dict(key=['foo', 'bar'], lval=[1, 2]))
right = pd.DataFrame(dict(key=['foo', 'bar'], rval=[4, 5]))

In [9]:
# Display the left-hand frame (columns 'key' and 'lval') before merging.
left

Unnamed: 0,key,lval
0,foo,1
1,bar,2


In [10]:
# Display the right-hand frame (columns 'key' and 'rval') before merging.
right

Unnamed: 0,key,rval
0,foo,4
1,bar,5


In [11]:
# Inner join on the shared 'key' column, like SQL's INNER JOIN.
# how="inner" is merge's default; it is spelled out here to make the join type explicit.
left.merge(right, on="key", how="inner")

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


In [12]:
# Outer join on 'key'. The `how` argument selects the join type
# (for example 'inner', 'outer', 'left' or 'right').
left.merge(right, how='outer', on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


# Grouping

In [15]:
# Demo frame for grouping: two text columns (A, B) that serve as group keys,
# plus two columns (C, D) of random values from np.random.randn to aggregate.
# C is drawn before D, so the order of the random draws is unchanged.
df = pd.DataFrame(dict(
    A=['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    B=['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    C=np.random.randn(8),
    D=np.random.randn(8),
))
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.154522,-1.070244
1,bar,one,-0.648433,-1.016505
2,foo,two,0.751076,0.360028
3,bar,three,1.373432,0.8644
4,foo,two,1.290616,-0.268787
5,bar,two,-0.928037,0.073151
6,foo,one,2.404792,-0.959017
7,foo,three,0.267901,0.275567


In [16]:
# Group the rows by A and sum the numeric columns (C and D) within each group.
# numeric_only=True keeps the text column B out of the aggregation; without it,
# pandas versions whose default is numeric_only=False would also "sum" B by
# concatenating its strings, adding an unwanted column to the result.
df.groupby("A").sum(numeric_only=True)

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-0.203038,-0.078955
foo,4.559862,-1.662453


## Group by multiple columns!

In [18]:
# Group on the (A, B) pair: every distinct combination becomes one row of the
# result, and the remaining columns are summed within it.
df.groupby(by=["A", "B"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.648433,-1.016505
bar,three,1.373432,0.8644
bar,two,-0.928037,0.073151
foo,one,2.250269,-2.029261
foo,three,0.267901,0.275567
foo,two,2.041692,0.091241


You cannot apply two different aggregation functions in one groupby statement by calling a single method such as sum() in Pandas. However, there is an equivalent to
select A
  ,sum(C) as C
  ,max(D) as D
from df
group by A;
using the .agg() method:

In [19]:
# Per-column aggregation: within each group of A, sum column C and take the
# maximum of column D (the pandas counterpart of SUM(C), MAX(D) ... GROUP BY A).
# The aggregations are named by string instead of passing np.sum / np.max:
# recent pandas releases emit a FutureWarning for the NumPy callables, and the
# string names select the same groupby sum / max implementations.
df.groupby('A').agg({'C': 'sum', 'D': 'max'})

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-0.203038,0.8644
foo,4.559862,0.360028
