# Demo 3.6 & 3.7 *groupby()*: Aggregating On *Two* Columns

 
**Requires data files:** `Cars.csv` and `Sample - Superstore.xlsx`


In [1]:
import pandas as pd 

### Read the Data File into a *pandas* DataFrame  

In [2]:
# Load the Cars dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('Cars.csv')

# Quick sanity check: print (rows, columns), then preview the first two rows.
print(df.shape)
df.head(2)

(428, 13)


Unnamed: 0,Vehicle_Make,Vehicle_Model,Vehicle_Type,Manufacturing_Origin,MPG_City,MPG_Hwy,MSRP,Invoice,Weight,Wheelbase,DriveTrain,EngineSize,Horsepower
0,Acura,MDX,SUV,Asia,17,23,36945,33337,4451,106,All,3.5,265
1,Acura,RSX Type S 2dr,Sedan,Asia,24,31,23820,21761,2778,101,Front,2.0,200


# Change Data Types as Needed  

In [3]:
# Column data types *before* any conversion
df.dtypes

Vehicle_Make             object
Vehicle_Model            object
Vehicle_Type             object
Manufacturing_Origin     object
MPG_City                  int64
MPG_Hwy                   int64
MSRP                      int64
Invoice                   int64
Weight                    int64
Wheelbase                 int64
DriveTrain               object
EngineSize              float64
Horsepower                int64
dtype: object

In [4]:
# Cast the price columns (MSRP, Invoice) and the fuel-economy columns
# (MPG_City, MPG_Hwy) to float, one column at a time.
for column in ['MSRP', 'Invoice', 'MPG_City', 'MPG_Hwy']:
    df[column] = df[column].astype(float)

In [5]:
# Column data types *after* the float conversion
df.dtypes

Vehicle_Make             object
Vehicle_Model            object
Vehicle_Type             object
Manufacturing_Origin     object
MPG_City                float64
MPG_Hwy                 float64
MSRP                    float64
Invoice                 float64
Weight                    int64
Wheelbase                 int64
DriveTrain               object
EngineSize              float64
Horsepower                int64
dtype: object

# Question:  What is the Average City MPG By Vehicle Type?  
- Categorical Variable to Group On:    
- Continuous Variable We're Interested In:   
- Aggregation Function:   



In [6]:
# Optional: list the distinct values of the column we want to group on,
# i.e. the groups that groupby('Vehicle_Type') will produce.
df['Vehicle_Type'].unique()

array(['SUV', 'Sedan', 'Sports', 'Wagon', 'Truck', 'Hybrid'], dtype=object)

# Group on One Column:  *Vehicle_Type* 


In [7]:
# Average every numeric column within each vehicle type.
# as_index=False keeps Vehicle_Type as a regular column instead of the index.
df_grouped = df.groupby('Vehicle_Type', as_index=False).mean(numeric_only=True)

# Confirm that the aggregation result is a DataFrame, then display it.
print(type(df_grouped))
df_grouped

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Vehicle_Type,MPG_City,MPG_Hwy,MSRP,Invoice,Weight,Wheelbase,EngineSize,Horsepower
0,Hybrid,55.0,56.0,19920.0,18429.333333,2490.666667,101.333333,1.633333,92.0
1,SUV,16.1,20.5,34790.25,31625.35,4444.433333,111.083333,3.92,235.816667
2,Sedan,21.083969,28.629771,29773.618321,27389.79771,3399.064885,108.0,2.970992,201.656489
3,Sports,18.408163,25.489796,53387.061224,48473.163265,3295.693878,99.979592,3.442857,284.163265
4,Truck,16.5,21.0,24941.375,22616.75,4250.75,123.0,4.079167,224.833333
5,Wagon,21.1,27.9,28840.533333,26645.633333,3438.8,105.8,2.77,194.0


In [8]:
# Average City MPG by vehicle type.
# Select the column of interest *before* aggregating, rather than averaging
# every numeric column and then dropping the unwanted ones with a hard-coded
# list. That list would silently go stale if the dataset gained or lost a
# numeric column, and the in-place drop made the cell harder to re-run safely.
df_grouped = df.groupby('Vehicle_Type')['MPG_City'].mean().reset_index()

print("3.6 Demo:  Grouping on One Column")
print(df_grouped.shape)
df_grouped.head()

3.6 Demo:  Grouping on One Column
(6, 2)


Unnamed: 0,Vehicle_Type,MPG_City
0,Hybrid,55.0
1,SUV,16.1
2,Sedan,21.083969
3,Sports,18.408163
4,Truck,16.5


---

# Demo Question 1:  What is the Average City MPG By Vehicle Type and Manufacturing Origin?  

In [9]:
# List the column names available for grouping and aggregation.
df.columns

Index(['Vehicle_Make', 'Vehicle_Model', 'Vehicle_Type', 'Manufacturing_Origin',
       'MPG_City', 'MPG_Hwy', 'MSRP', 'Invoice', 'Weight', 'Wheelbase',
       'DriveTrain', 'EngineSize', 'Horsepower'],
      dtype='object')

In [10]:
# Mean city MPG for every (vehicle type, manufacturing origin) pair.
# as_index=False returns the two grouping keys as ordinary columns.
df_Q1 = (
    df.groupby(['Vehicle_Type', 'Manufacturing_Origin'], as_index=False)['MPG_City']
    .mean()
)

print("3.7 Demo Questions 1:  Grouping on Two Columns")
print(df_Q1.shape)
df_Q1.head(10)

3.7 Demo Questions 1:  Grouping on Two Columns
(15, 3)


Unnamed: 0,Vehicle_Type,Manufacturing_Origin,MPG_City
0,Hybrid,Asia,55.0
1,SUV,Asia,17.32
2,SUV,Europe,14.5
3,SUV,USA,15.52
4,Sedan,Asia,22.840426
5,Sedan,Europe,19.512821
6,Sedan,USA,20.611111
7,Sports,Asia,20.235294
8,Sports,Europe,17.652174
9,Sports,USA,16.888889


# Demo Question 2: How many Vehicle Makes are there by Vehicle Type and Manufacturing Origin?

In [11]:
# Number of non-null Vehicle_Make entries per (vehicle type, origin) pair.
# Note: 'count' counts rows, not distinct makes; use 'nunique' for distinct makes.
# Named aggregation sets the output column name directly, so no positional
# overwrite of .columns is needed.
df_Q2 = df.groupby(['Vehicle_Type', 'Manufacturing_Origin'], as_index=False).agg(
    Count_of_Vehicle_Make=('Vehicle_Make', 'count')
)

print("3.7 Demo Questions 2:  Grouping on Two Columns")
print(df_Q2.shape)
df_Q2

3.7 Demo Questions 2:  Grouping on Two Columns
(15, 3)


Unnamed: 0,Vehicle_Type,Manufacturing_Origin,Count_of_Vehicle_Make
0,Hybrid,Asia,3
1,SUV,Asia,25
2,SUV,Europe,10
3,SUV,USA,25
4,Sedan,Asia,94
5,Sedan,Europe,78
6,Sedan,USA,90
7,Sports,Asia,17
8,Sports,Europe,23
9,Sports,USA,9


# Demo Question 3: What is the average Horsepower by Vehicle Make and Vehicle Type?
- Display the groups where the average horsepower is above 300.

In [12]:
# Mean horsepower per (make, vehicle type) pair, keeping only the groups whose
# average exceeds 300. The filter is chained, so the surviving rows retain
# their index labels from the full grouped table.
df_Q3 = (
    df.groupby(['Vehicle_Make', 'Vehicle_Type'])['Horsepower']
    .mean()
    .reset_index()
    .query('Horsepower > 300')
)

print("3.7 Demo Questions 3:  Grouping on Two Columns")
print(df_Q3.shape)
df_Q3

3.7 Demo Questions 3:  Grouping on Two Columns
(12, 3)


Unnamed: 0,Vehicle_Make,Vehicle_Type,Horsepower
12,Cadillac,SUV,307.5
14,Cadillac,Sports,320.0
15,Cadillac,Truck,345.0
18,Chevrolet,Sports,350.0
26,Dodge,Sports,500.0
40,Hummer,SUV,316.0
48,Jaguar,Sports,342.0
58,Lincoln,SUV,301.0
67,Mercedes-Benz,Sports,365.8
84,Pontiac,Sports,340.0


# Demo Question 4: What is the maximum MSRP by Manufacturing Origin and Drive Train?

In [13]:
# Highest MSRP within each (manufacturing origin, drive train) pair.
df_Q4 = (
    df.groupby(['Manufacturing_Origin', 'DriveTrain'], as_index=False)['MSRP']
    .max()
)

print("3.7 Demo Questions 4:  Grouping on Two Columns")
print(df_Q4.shape)
df_Q4

3.7 Demo Questions 4:  Grouping on Two Columns
(9, 3)


Unnamed: 0,Manufacturing_Origin,DriveTrain,MSRP
0,Asia,All,64800.0
1,Asia,Front,46100.0
2,Asia,Rear,89765.0
3,Europe,All,86970.0
4,Europe,Front,84600.0
5,Europe,Rear,192465.0
6,USA,All,52975.0
7,USA,Front,52795.0
8,USA,Rear,81795.0


In [14]:
# Load the 'Orders' sheet of the Superstore workbook.
# NOTE: this rebinds `df`, which held the Cars data until now. To re-run any of
# the Cars cells, first re-run the cell that reads 'Cars.csv'.
df = pd.read_excel('Sample - Superstore.xlsx', sheet_name='Orders')

# Quick sanity check: print (rows, columns), then preview the first two rows.
print(df.shape)
df.head(2)

(9994, 21)


  warn("""Cannot parse header or footer so it will be ignored""")


Unnamed: 0,Row ID,Order ID,Order_Date,Ship Date,Ship Mode,Customer ID,Customer Name,Business_Segment,Country/Region,City,...,Zipcode,Region,Product_ID,Category,Subcategory,Product_Name,Sales,Quantity,Discount,Profit
0,1,CA-2019-152156,2019-11-08,2019-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420.0,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2019-152156,2019-11-08,2019-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420.0,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582


In [15]:
# NOTE(review): the label mentions only sales, but the aggregation below
# averages both Sales and Profit -- confirm which one is intended.
print("3.7 Part 1: Average sales by Category and Segment")

# Mean Sales and Profit for every (category, business segment) pair.
# This reuses the name df_Q1, replacing the earlier Cars result of that name.
df_Q1 = (
    df.groupby(['Category', 'Business_Segment'], as_index=False)[['Sales', 'Profit']]
    .mean()
)

print(df_Q1.shape)
df_Q1

3.7 Part 1: Average sales by Category and Segment
(9, 4)


Unnamed: 0,Category,Business_Segment,Sales,Profit
0,Furniture,Consumer,351.347091,6.281293
1,Furniture,Corporate,354.519792,11.741201
2,Furniture,Home Office,336.825131,10.705465
3,Office Supplies,Consumer,116.390194,18.014174
4,Office Supplies,Corporate,126.745309,22.102923
5,Office Supplies,Home Office,115.309021,24.034439
6,Technology,Consumer,427.339534,74.445646
7,Technology,Corporate,444.85581,79.723823
8,Technology,Home Office,535.976658,89.152458


In [16]:
print("3.7 Part 2: Average Sales and Profit by Region and Category")

# Mean Sales and Profit for every (region, category) pair; the grouping keys
# are returned as ordinary columns.
df_Q2 = (
    df.groupby(['Region', 'Category'], as_index=False)[['Sales', 'Profit']]
    .mean()
)

print(df_Q2.shape)
df_Q2

3.7 Part 2: Average Sales and Profit by Region and Category
(12, 4)


Unnamed: 0,Region,Category,Sales,Profit
0,Central,Furniture,340.534644,-5.968918
1,Central,Office Supplies,117.458801,6.244712
2,Central,Technology,405.753124,80.231981
3,East,Furniture,346.574383,5.068496
4,East,Office Supplies,120.044425,23.957114
5,East,Technology,495.278469,88.714084
6,South,Furniture,353.309289,20.395199
7,South,Office Supplies,126.282727,20.086827
8,South,Technology,507.753952,68.231506
9,West,Furniture,357.302325,16.272914


In [17]:
print("3.7 Part 3: Total Sales and Profit by Category and Subcategory")

# Summed Sales and Profit for every (category, subcategory) pair; the grouping
# keys are returned as ordinary columns.
df_Q3 = (
    df.groupby(['Category', 'Subcategory'], as_index=False)[['Sales', 'Profit']]
    .sum()
)

print(df_Q3.shape)
df_Q3

3.7 Part 3: Total Sales and Profit by Category and Subcategory
(17, 4)


Unnamed: 0,Category,Subcategory,Sales,Profit
0,Furniture,Bookcases,114879.9963,-3472.556
1,Furniture,Chairs,328449.103,26590.1663
2,Furniture,Furnishings,91705.164,13059.1436
3,Furniture,Tables,206965.532,-17725.4811
4,Office Supplies,Appliances,107532.161,18138.0054
5,Office Supplies,Art,27118.792,6527.787
6,Office Supplies,Binders,203412.733,30221.7633
7,Office Supplies,Envelopes,16476.402,6964.1767
8,Office Supplies,Fasteners,3024.28,949.5182
9,Office Supplies,Labels,12486.312,5546.254
