In [3]:
import pandas as pd

technologies   = ({
    'Courses':["Spark","PySpark","Hadoop","Python","Pandas","Hadoop","Spark","Python"],
    'Fee' :[22000,25000,23000,24000,26000,25000,25000,22000],
    'Duration':['30days','50days','55days','40days','60days','35days','55days','50days'],
    'Discount':[1000,2300,1000,1200,2500,1300,1400,1600]})

In [7]:
df = pd.DataFrame(technologies, columns = ['Courses', 'Fee', 'Duration', 'Discount'], index = [1,2,3,4,5,6,7,8])
print(df)

   Courses    Fee Duration  Discount
1    Spark  22000   30days      1000
2  PySpark  25000   50days      2300
3   Hadoop  23000   55days      1000
4   Python  24000   40days      1200
5   Pandas  26000   60days      2500
6   Hadoop  25000   35days      1300
7    Spark  25000   55days      1400
8   Python  22000   50days      1600


In [8]:
df2 = df.groupby('Courses').sum()
print(df2)

           Fee  Discount
Courses                 
Hadoop   48000      2300
Pandas   26000      2500
PySpark  25000      2300
Python   46000      2800
Spark    47000      2400


You can also explicitly specify on which column you wanted to do a sum operation. The below
example applies the sum on the Fee column.

In [9]:
# Use groupby() and compute sum on specific column
df2 = df.groupby('Courses')['Fee'].sum()
print(df2)

Courses
Hadoop     48000
Pandas     26000
PySpark    25000
Python     46000
Spark      47000
Name: Fee, dtype: int64


You can also send a list of columns you wanted group to groupby() method, using this you 
can apply a group by on multiple columns and calculate a sum over each combination group.
For example, df.groupby(['Course', 'Duration']).['Fee'].sum() does group on Courses and 
Duration column and finally calculates the sum

In [14]:
df2 = df.groupby(['Courses', 'Duration',])['Fee'].sum()
print(df2)

Courses  Duration
Hadoop   35days      25000
         55days      23000
Pandas   60days      26000
PySpark  50days      25000
Python   40days      24000
         50days      22000
Spark    30days      22000
         55days      25000
Name: Fee, dtype: int64


Groupby and Get Sum and Count
You can also use df.groupby('Courses')['Fee'].agg(['sum', 'count']) to get both sum() and 
count() on groupby(), you dont want to reset the index

In [16]:
# Groupby and get sum() and count()
df2 = df.groupby('Courses')['Fee'].agg(['sum', 'count'])
print(df2)

           sum  count
Courses              
Hadoop   48000      2
Pandas   26000      1
PySpark  25000      1
Python   46000      2
Spark    47000      2


In [17]:
# Pandas groupby get sum() and count()
df2 = df.groupby('Courses').agg({'Fee':['sum', 'count']})
print(df2)

           Fee      
           sum count
Courses             
Hadoop   48000     2
Pandas   26000     1
PySpark  25000     1
Python   46000     2
Spark    47000     2


Sort Descending order Group By Keys
By default groupby() method sorts results by group key hence it will take additional time, 
if you have a performance issue and don’t want to sort the group by the result, you can turn this off by using the sort=False param.

In [20]:
# Remove sorting on grouped results
df2 = df.groupby(['Courses'], sort=False).sum()
print(df2)

           Fee  Discount
Courses                 
Spark    47000      2400
PySpark  25000      2300
Hadoop   48000      2300
Python   46000      2800
Pandas   26000      2500


# If you wanted to sort key descending order, use below

In [25]:
# Sorting group keys on descending order
groupedDF = df.groupby('Courses', sort=False).sum()
sortedDF = groupedDF.sort_values('Courses', ascending=False)
print(groupedDF)

           Fee  Discount
Courses                 
Spark    47000      2400
PySpark  25000      2300
Hadoop   48000      2300
Python   46000      2800
Pandas   26000      2500
