# Aggregation and Grouping Example - South Carolina Pay Dataset

Demonstration of querying the dataset for "interesting" information.  

In [None]:
import pandas as pd
def show(data, show_data=0):
    """Print a short structural summary of a pandas object.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        The object to summarize.
    show_data : bool or int, optional
        When truthy, the underlying values array is printed as well.
        Defaults to 0 (values are not printed).

    Notes
    -----
    A Series has no ``columns`` attribute, so the "Columns" line is
    skipped for it rather than raising AttributeError.
    """
    print ("  Index: {:}".format(data.index))
    # Only DataFrame-like objects expose .columns; guard so a Series works too.
    if hasattr(data, "columns"):
        print ("Columns: {:}".format(data.columns))
    print ("  Shape: {:}".format(data.shape))
    if show_data:
        print(data.values)

# Report the installed pandas version so results can be reproduced
pandas_version = pd.__version__
print("Pandas version: {:}".format(pandas_version))

In [None]:
# Load the salary database CSV into a data frame (df), summarize it, and preview the first rows
salary_csv_path = "../data/04_South_Carolina_State_Employee_Salary_Database.csv"
df = pd.read_csv(salary_csv_path)
show(df)
df.head(n=10)

## Using the whole dataset

In [None]:
# Mean of the 'Base Salary' column over the whole dataset
base_salary = df['Base Salary']
base_salary.mean()

In [None]:
# Same average, rendered as dollars with thousands separators and two decimals
mean_salary = df['Base Salary'].mean()
"Average salary: ${:,.2f}".format(mean_salary)

In [None]:
# Medians are often more informative than means for highly skewed data
median_salary = df['Base Salary'].median()
"Median salary: ${:,.2f}".format(median_salary)

In [None]:
# Percentiles can also be very useful
p = 0.75
# Round before converting to int: p*100 can land just below the intended integer
# (e.g. 0.57*100 == 56.99999999999999), and a bare int() would truncate the label to 56.
percentile_label = int(round(p * 100))
"{:}th percentile salary: ${:,.2f}".format(percentile_label, df['Base Salary'].quantile(p))

## Using Aggregation and Grouping

In [None]:
# Mean base salary per agency: split rows by 'Agency', apply mean(), combine into one frame
# (split-apply-combine -- see slide)
salary_by_agency = df.groupby('Agency')[['Base Salary']]
salary_by_agency.mean()

In [None]:
# Who pays the most?  Top 10 (of the 96 agencies)
# How it works: split-apply-combine (groupby + mean), then sort descending, then keep the first 10 rows
agency_means = df.groupby('Agency')[['Base Salary']].mean()
agency_means.sort_values(by='Base Salary', ascending=False).head(10)

In [None]:
# Who are those Retirement System people?  Select their rows with a boolean mask
is_retirement_comm = df['Agency'] == "RETIREMENT SYS INVESTMENT COMM"
df[is_retirement_comm]

In [None]:
# Same retirement-commission rows, ordered from highest to lowest base salary
retirement_mask = df['Agency'] == "RETIREMENT SYS INVESTMENT COMM"
df[retirement_mask].sort_values(by='Base Salary', ascending=False)

In [None]:
# Average may be deceiving -- compare it with the median for the same agency
# Select the agency's 'Base Salary' values once and reuse them for both statistics
retirement_base_salary = df.loc[df['Agency'] == "RETIREMENT SYS INVESTMENT COMM", 'Base Salary']
avg = retirement_base_salary.mean()
med = retirement_base_salary.median()
"Average: ${:,.2f}; Median: ${:,.2f}".format(avg, med)

In [None]:
# What about Total Compensation?  Same mean-vs-median comparison on a different column
retirement_total_comp = df.loc[df['Agency'] == "RETIREMENT SYS INVESTMENT COMM", 'Total Compensation']
avg = retirement_total_comp.mean()
med = retirement_total_comp.median()
"Average: ${:,.2f}; Median: ${:,.2f}".format(avg, med)

In [None]:
# Who pays the least?  Bottom 10: sort agency means descending and keep the last 10 rows
agency_means = df.groupby('Agency')[['Base Salary']].mean()
agency_means.sort_values(by='Base Salary', ascending=False).tail(10)

In [None]:
# By job title - Who pays the most?  Top 20 titles by mean base salary
title_means = df.groupby('Title')[['Base Salary']].mean()
title_means.sort_values(by='Base Salary', ascending=False).head(20)

In [None]:
# Who are these Vice Provost people?  List them from highest to lowest base salary
is_vice_provost = df['Title'] == "VICE PROVOST"
df[is_vice_provost].sort_values(by='Base Salary', ascending=False)

In [None]:
# By job title - Who pays the least?  Bottom 20: sort title means descending and keep the last 20 rows
title_means = df.groupby('Title')[['Base Salary']].mean()
title_means.sort_values(by='Base Salary', ascending=False).tail(20)

In [None]:
# What about professors?  Mean professor base salary per agency, highest first
professors = df[df['Title'] == 'PROFESSOR']
professor_means = professors.groupby('Agency')[['Base Salary']].mean()
professor_means.sort_values(by='Base Salary', ascending=False)

In [None]:
# Top n professors (by pay :-)): sort descending and keep the first n rows
n = 25
professors_by_pay = df[df['Title'] == 'PROFESSOR'].sort_values(by='Base Salary', ascending=False)
professors_by_pay.head(n)

In [None]:
# Bottom n professors (by pay :-)): sort descending and keep the last n rows
n = 25
professors_by_pay = df[df['Title'] == 'PROFESSOR'].sort_values(by='Base Salary', ascending=False)
professors_by_pay.tail(n)