<h1>Pandas-Groupby</h1>

<h3>Groupby allows you to group rows together based on the values in a column, and then apply an aggregate function to each group.</h3>

In [1]:
import numpy as np
import pandas as pd

In [6]:
# Sample sales records held as column-oriented lists:
# position i in each list belongs to the same record.
data = dict(
    Company=['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
    Person=['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
    Sales=[200, 120, 340, 124, 243, 350],
)

In [7]:
# Display the dictionary to check its contents: three keys, six entries each
data

{'Company': ['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
 'Person': ['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
 'Sales': [200, 120, 340, 124, 243, 350]}

In [9]:
# Build the DataFrame from the dictionary: each key becomes a column
df = pd.DataFrame(data=data)

In [10]:
# Display the DataFrame: one row per person, with their company and sales figure
df

Unnamed: 0,Company,Person,Sales
0,GOOG,Sam,200
1,GOOG,Charlie,120
2,MSFT,Amy,340
3,MSFT,Vanessa,124
4,FB,Carl,243
5,FB,Sarah,350


<h3>Grouping by a column name</h3>

In [20]:
# Usage: df.groupby('Some column name')
# On its own this returns a DataFrameGroupBy object, not a table, so the output
# only shows where the object is stored in memory.
# Assign it to a variable and call an aggregate method on it to see results.
df.groupby(by='Company')

<pandas.core.groupby.DataFrameGroupBy object at 0x7f2c4c7695c0>

In [18]:
# Keep the groupby object in a variable (here: byComp) so that
# several aggregate methods can be applied to it
byComp = df.groupby(by='Company')

In [22]:
# Apply an aggregate function: the mean of the numeric columns per company.
# numeric_only=True skips the text column 'Person'; without it, pandas 2.0+
# raises a TypeError because strings cannot be averaged.
byComp.mean(numeric_only=True)

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,296.5
GOOG,160.0
MSFT,232.0


In [24]:
# Total of the numeric columns per company.
# numeric_only=True keeps the text column 'Person' out of the result; without it,
# pandas 2.0+ concatenates the names in each group instead of dropping the column.
byComp.sum(numeric_only=True)

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,593
GOOG,320
MSFT,464


In [25]:
# Standard deviation of the numeric columns per company.
# numeric_only=True skips the text column 'Person'; without it, pandas 2.0+
# raises a TypeError because a standard deviation of strings is undefined.
byComp.std(numeric_only=True)

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,75.660426
GOOG,56.568542
MSFT,152.735065


In [27]:
# The aggregated frame is indexed by company, so .loc picks out a single group.
# In this example: the total for the 'FB' row.
# numeric_only=True restricts the sum to numeric columns, so the text column
# 'Person' is not concatenated into the result on pandas 2.0+.
# Usage: byComp.sum(numeric_only=True).loc['FB']
byComp.sum(numeric_only=True).loc['FB']

Sales    593
Name: FB, dtype: int64

In [28]:
# The whole pipeline as a single expression:
#   1. group the rows by 'Company'
#   2. sum the numeric columns of each group (numeric_only=True keeps the text
#      column 'Person' from being concatenated on pandas 2.0+)
#   3. pick one group's row with .loc[]
df.groupby('Company').sum(numeric_only=True).loc['FB']

Sales    593
Name: FB, dtype: int64

In [29]:
# Total of the numeric columns for the 'GOOG' group.
# numeric_only=True keeps the text column 'Person' out of the sum on pandas 2.0+.
df.groupby('Company').sum(numeric_only=True).loc['GOOG']

Sales    320
Name: GOOG, dtype: int64

In [30]:
# Total of the numeric columns for the 'MSFT' group.
# numeric_only=True keeps the text column 'Person' out of the sum on pandas 2.0+.
df.groupby('Company').sum(numeric_only=True).loc['MSFT']

Sales    464
Name: MSFT, dtype: int64

In [33]:
# count() reports, for every other column, how many non-null entries each company has
df.groupby(by='Company').count()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,2,2
GOOG,2,2
MSFT,2,2


In [34]:
# max() takes the largest value of each column within each company.
# Columns are reduced independently (strings compare alphabetically), so one
# result row can mix values from different people: for MSFT it pairs
# 'Vanessa' with 340, which is Amy's sale.
df.groupby(by='Company').max()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,Sarah,350
GOOG,Sam,200
MSFT,Vanessa,340


In [35]:
# min() takes the smallest value of each column within each company.
# Columns are reduced independently (strings compare alphabetically), so one
# result row can mix values from different people: for MSFT it pairs
# 'Amy' with 124, which is Vanessa's sale.
df.groupby(by='Company').min()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,Carl,243
GOOG,Charlie,120
MSFT,Amy,124


In [36]:
# describe() produces summary statistics (count, mean, std, min, quartiles, max)
# of the numeric data for each company
df.groupby(by='Company').describe()

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
FB,2.0,296.5,75.660426,243.0,269.75,296.5,323.25,350.0
GOOG,2.0,160.0,56.568542,120.0,140.0,160.0,180.0,200.0
MSFT,2.0,232.0,152.735065,124.0,178.0,232.0,286.0,340.0


In [40]:
# .T is shorthand for .transpose(): the companies become the columns
# and the statistics become the rows
df.groupby(by='Company').describe().T

Unnamed: 0,Company,FB,GOOG,MSFT
Sales,count,2.0,2.0,2.0
Sales,mean,296.5,160.0,232.0
Sales,std,75.660426,56.568542,152.735065
Sales,min,243.0,120.0,124.0
Sales,25%,269.75,140.0,178.0
Sales,50%,296.5,160.0,232.0
Sales,75%,323.25,180.0,286.0
Sales,max,350.0,200.0,340.0


In [44]:
# After transposing, every company is a column,
# so ['FB'] selects the statistics for a single company
df.groupby(by='Company').describe().T['FB']

Sales  count      2.000000
       mean     296.500000
       std       75.660426
       min      243.000000
       25%      269.750000
       50%      296.500000
       75%      323.250000
       max      350.000000
Name: FB, dtype: float64

In [46]:
# Summary statistics for the 'GOOG' column of the transposed describe() table
df.groupby(by='Company').describe().T['GOOG']

Sales  count      2.000000
       mean     160.000000
       std       56.568542
       min      120.000000
       25%      140.000000
       50%      160.000000
       75%      180.000000
       max      200.000000
Name: GOOG, dtype: float64

In [47]:
# Summary statistics for the 'MSFT' column of the transposed describe() table
df.groupby(by='Company').describe().T['MSFT']

Sales  count      2.000000
       mean     232.000000
       std      152.735065
       min      124.000000
       25%      178.000000
       50%      232.000000
       75%      286.000000
       max      340.000000
Name: MSFT, dtype: float64

<h3>Some groupby examples/exercises</h3>

In [49]:
# Every person appears exactly once in the data, so each group holds a single
# row and max() simply returns that row's values, ordered by name
df.groupby(by='Person').max()

Unnamed: 0_level_0,Company,Sales
Person,Unnamed: 1_level_1,Unnamed: 2_level_1
Amy,MSFT,340
Carl,FB,243
Charlie,GOOG,120
Sam,GOOG,200
Sarah,FB,350
Vanessa,MSFT,124


In [61]:
# Every sales figure is unique in the data, so each group holds a single row;
# the result is the original rows indexed and ordered by 'Sales'
df.groupby(by='Sales').max()

Unnamed: 0_level_0,Company,Person
Sales,Unnamed: 1_level_1,Unnamed: 2_level_1
120,GOOG,Charlie
124,MSFT,Vanessa
200,GOOG,Sam
243,FB,Carl
340,MSFT,Amy
350,FB,Sarah


In [56]:
# Select the 'Person' column before aggregating, so max() is computed only for
# the column that is kept rather than for every column and then discarded.
# The result is the same Series: 'Person' values indexed by 'Sales'.
df.groupby('Sales')['Person'].max()

Sales
120    Charlie
124    Vanessa
200        Sam
243       Carl
340        Amy
350      Sarah
Name: Person, dtype: object

In [59]:
# Count the non-null entries per person; each person appears once in the data,
# so every count is 1
df.groupby(by='Person').count()

Unnamed: 0_level_0,Company,Sales
Person,Unnamed: 1_level_1,Unnamed: 2_level_1
Amy,1,1
Carl,1,1
Charlie,1,1
Sam,1,1
Sarah,1,1
Vanessa,1,1
