In [1]:
import  pandas as pd

In [13]:
# load the car-sales table; the path is relative to the notebook's working directory
CAR_SALES_CSV = "Car_sales.csv"
df_cars = pd.read_csv(CAR_SALES_CSV)

In [14]:
# keep only the columns that the rest of the notebook works with
selected_columns = [
    'Manufacturer',
    'Sales_in_thousands',
    'Vehicle_type',
    'Price_in_thousands',
    'Engine_size',
    'Horsepower',
    'Fuel_capacity',
]
df_cars = df_cars[selected_columns]

In [15]:
# preview the first five rows
df_cars.head(n=5)

Unnamed: 0,Manufacturer,Sales_in_thousands,Vehicle_type,Price_in_thousands,Engine_size,Horsepower,Fuel_capacity
0,Acura,16.919,Passenger,21.5,1.8,140.0,13.2
1,Acura,39.384,Passenger,28.4,3.2,225.0,17.2
2,Acura,14.114,Passenger,,3.2,225.0,17.2
3,Acura,8.588,Passenger,42.0,3.5,210.0,18.0
4,Audi,20.397,Passenger,23.99,1.8,150.0,16.4


# agg() Function

In [10]:
# sum every column down the rows; note that text columns
# (Manufacturer, Vehicle_type) are concatenated rather than added
df_cars.agg('sum', axis='index')

Manufacturer          AcuraAcuraAcuraAcuraAudiAudiAudiBMWBMWBMWBuick...
Sales_in_thousands                                             8320.698
Vehicle_type          PassengerPassengerPassengerPassengerPassengerP...
Price_in_thousands                                             4245.567
Engine_size                                                       477.5
Horsepower                                                      29008.0
Fuel_capacity                                                    2800.5
dtype: object

In [23]:
# Different aggregations per column:
#   Sales_in_thousands -> sum and mean
#   Price_in_thousands -> sum and max
# A function that was not requested for a column shows up as NaN in the result.
df_cars.agg({
    'Sales_in_thousands': ['sum', 'mean'],
    'Price_in_thousands': ['sum', 'max'],
})

Unnamed: 0,Sales_in_thousands,Price_in_thousands
sum,8320.698,4245.567
mean,52.998076,
max,,85.5


In [17]:
# aggregate across the columns: one sum per row (a missing price is skipped)
df_cars[['Sales_in_thousands', 'Price_in_thousands']].agg('sum', axis='columns')

0      38.419
1      67.784
2      14.114
3      50.588
4      44.387
        ...  
152    27.945
153    42.745
154    46.331
155    48.993
156    54.969
Length: 157, dtype: float64

In [18]:
# same row-wise total via element-wise addition, except that here a NaN
# in either column makes the result NaN instead of being skipped
df_cars['Sales_in_thousands'].add(df_cars['Price_in_thousands'])

0      38.419
1      67.784
2         NaN
3      50.588
4      44.387
        ...  
152    27.945
153    42.745
154    46.331
155    48.993
156    54.969
Length: 157, dtype: float64

In [19]:
# named aggregation: each keyword is (column, function) and the keyword
# itself becomes the row label of the resulting DataFrame
df_cars.agg(
    x=('Sales_in_thousands', 'sum'),
    y=('Price_in_thousands', 'sum'),
)

Unnamed: 0,Sales_in_thousands,Price_in_thousands
x,8320.698,
y,,4245.567


# The Split-Apply-Combine Strategy

In [22]:
# split: build one boolean mask per vehicle type
car_filter = df_cars['Vehicle_type'].eq('Car')
passenger_filter = df_cars['Vehicle_type'].eq('Passenger')

In [25]:
# apply: mean sales within each group
# .loc picks the rows and the column in one indexing step, avoiding the
# chained df[mask][col] lookup that first materialises a filtered frame
car_avg = df_cars.loc[car_filter, 'Sales_in_thousands'].mean()
passenger_avg = df_cars.loc[passenger_filter, 'Sales_in_thousands'].mean()

In [26]:
# combine: one row per vehicle type, labelled through a named index
group_labels = pd.Index(['Car', 'Passenger'], name='Vehicle_type')
pd.DataFrame({'Sales_in_thousands': [car_avg, passenger_avg]}, index=group_labels)

Unnamed: 0_level_0,Sales_in_thousands
Vehicle_type,Unnamed: 1_level_1
Car,80.622293
Passenger,43.234345


# groupby()

In [32]:
# split-apply-combine in one expression: mean sales per vehicle type
sales_by_type = df_cars.groupby('Vehicle_type')['Sales_in_thousands']
sales_by_type.mean()

Vehicle_type
Car          80.622293
Passenger    43.234345
Name: Sales_in_thousands, dtype: float64

In [36]:
# mean of every numeric column per manufacturer
# (Index.difference also hands the column labels back in sorted order)
numeric_cols = df_cars.select_dtypes('number').columns.difference(['Vehicle_type'])
df_cars.groupby('Manufacturer')[numeric_cols].mean()


Unnamed: 0_level_0,Engine_size,Fuel_capacity,Horsepower,Price_in_thousands,Sales_in_thousands
Manufacturer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Acura,2.925,16.4,200.0,30.633333,19.75125
Audi,2.933333,19.533333,220.0,39.98,13.519
BMW,2.7,17.233333,185.333333,33.096667,15.501667
Buick,3.625,17.75,206.25,26.78125,60.50475
Cadillac,4.5,20.8,256.0,40.254,22.4356
Chevrolet,3.055556,15.477778,171.111111,20.022778,61.596111
Chrysler,2.783333,16.483333,194.833333,23.430833,28.817286
Dodge,3.709091,21.581818,199.545455,24.213636,82.740818
Ford,3.327273,19.063636,170.090909,21.047273,183.875909
Honda,2.52,17.08,160.4,20.277,118.5348


In [37]:
# sum of every numeric column per manufacturer
# (Index.difference also hands the column labels back in sorted order)
numeric_cols = df_cars.select_dtypes('number').columns.difference(['Vehicle_type'])
df_cars.groupby('Manufacturer')[numeric_cols].sum()


Unnamed: 0_level_0,Engine_size,Fuel_capacity,Horsepower,Price_in_thousands,Sales_in_thousands
Manufacturer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Acura,11.7,65.6,800.0,91.9,79.005
Audi,8.8,58.6,660.0,119.94,40.557
BMW,8.1,51.7,556.0,99.29,46.505
Buick,14.5,71.0,825.0,107.125,242.019
Cadillac,22.5,104.0,1280.0,201.27,112.178
Chevrolet,27.5,139.3,1540.0,180.205,554.365
Chrysler,16.7,98.9,1169.0,140.585,201.721
Dodge,40.8,237.4,2195.0,266.35,910.149
Ford,36.6,209.7,1871.0,231.52,2022.635
Honda,12.6,85.4,802.0,101.385,592.674


In [38]:
# group by Vehicle_type and count the non-null values in each column
df_cars.groupby(by='Vehicle_type').count()

Unnamed: 0_level_0,Manufacturer,Sales_in_thousands,Price_in_thousands,Engine_size,Horsepower,Fuel_capacity
Vehicle_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Car,41,41,40,40,40,40
Passenger,116,116,115,116,116,116


In [39]:
# number of missing values in each column
df_cars.isna().sum()

Manufacturer          0
Sales_in_thousands    0
Vehicle_type          0
Price_in_thousands    2
Engine_size           1
Horsepower            1
Fuel_capacity         1
dtype: int64

In [40]:
# group by Engine_size and count values; dropna=False keeps rows whose
# Engine_size is NaN as a group of their own instead of discarding them
df_cars.groupby(by='Engine_size', dropna=False).count()

Unnamed: 0_level_0,Manufacturer,Sales_in_thousands,Vehicle_type,Price_in_thousands,Horsepower,Fuel_capacity
Engine_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1.0,1,1,1,1,1,1
1.5,1,1,1,1,1,1
1.6,1,1,1,1,1,1
1.8,8,8,8,8,8,8
1.9,5,5,5,5,5,5
2.0,17,17,17,17,17,17
2.2,4,4,4,4,4,4
2.3,6,6,6,6,6,6
2.4,11,11,11,11,11,11
2.5,11,11,11,11,11,11


# groupby() and agg()

In [41]:
# minimum and maximum of every column within each vehicle type;
# passing a list of functions yields two-level (column, function) headers
df_cars.groupby(by='Vehicle_type').agg(['min', 'max'])

Unnamed: 0_level_0,Manufacturer,Manufacturer,Sales_in_thousands,Sales_in_thousands,Price_in_thousands,Price_in_thousands,Engine_size,Engine_size,Horsepower,Horsepower,Fuel_capacity,Fuel_capacity
Unnamed: 0_level_1,min,max,min,max,min,max,min,max,min,max,min,max
Vehicle_type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Car,Cadillac,Toyota,9.126,540.561,11.528,60.105,2.0,5.7,119.0,300.0,15.1,32.0
Passenger,Acura,Volvo,0.11,247.994,9.235,85.5,1.0,8.0,55.0,450.0,10.3,23.7


In [42]:

# named aggregation: output_name=(source column, function)
# smallest Engine_size and largest Horsepower per vehicle type
df_cars.groupby(by='Vehicle_type').agg(
    min_engine_size=('Engine_size', 'min'),
    max_horsepower=('Horsepower', 'max'),
)

Unnamed: 0_level_0,min_engine_size,max_horsepower
Vehicle_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Car,2.0,300.0
Passenger,1.0,450.0


In [47]:
# total sales and average price per manufacturer (first five rows)
(
    df_cars
    .groupby(by='Manufacturer')
    .agg(
        sum_sales=('Sales_in_thousands', 'sum'),
        mean_price=('Price_in_thousands', 'mean'),
    )
    .head()
)

Unnamed: 0_level_0,sum_sales,mean_price
Manufacturer,Unnamed: 1_level_1,Unnamed: 2_level_1
Acura,79.005,30.633333
Audi,40.557,39.98
BMW,46.505,33.096667
Buick,242.019,26.78125
Cadillac,112.178,40.254


In [46]:
# Per-manufacturer summary mixing a custom function with built-in aggregations.
df_cars.groupby("Manufacturer").agg({
    # no built-in name for "range", so a lambda is needed here
    "Horsepower": lambda x: x.max() - x.min(),  # horsepower range
    # built-in aggregation name instead of a lambda wrapping x.median()
    "Price_in_thousands": "median",             # median price
    "Sales_in_thousands": "sum"
}).head()


Unnamed: 0_level_0,Horsepower,Price_in_thousands,Sales_in_thousands
Manufacturer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Acura,85.0,28.4,79.005
Audi,160.0,33.95,40.557
BMW,23.0,33.4,46.505
Buick,65.0,26.5925,242.019
Cadillac,75.0,39.895,112.178


In [49]:
# Total sales and mean horsepower per manufacturer, plus the ratio of the two.
summary = (
    df_cars
    .groupby("Manufacturer")
    .agg({"Sales_in_thousands": "sum", "Horsepower": "mean"})
    .reset_index()
    .assign(
        Sales_per_HP=lambda frame: frame["Sales_in_thousands"] / frame["Horsepower"]
    )
)

# rank manufacturers from the highest ratio to the lowest
summary.sort_values(by="Sales_per_HP", ascending=False)


Unnamed: 0,Manufacturer,Sales_in_thousands,Horsepower,Sales_per_HP
8,Ford,2022.635,170.090909,11.891494
27,Toyota,740.205,160.666667,4.607085
7,Dodge,910.149,199.545455,4.561111
9,Honda,592.674,160.4,3.694975
5,Chevrolet,554.365,171.111111,3.239795
19,Nissan,399.635,169.0,2.364704
22,Pontiac,370.534,185.0,2.002886
13,Jeep,293.153,168.333333,1.741503
28,Volkswagen,209.212,120.833333,1.73141
17,Mercury,237.999,163.833333,1.45269


# Lambda


In [50]:
# Per-manufacturer totals converted from thousands to units.
# The two columns are selected BEFORE summing, so columns that would be
# thrown away right afterwards are never aggregated.
(
    df_cars
    .groupby('Manufacturer')[['Sales_in_thousands', 'Price_in_thousands']]
    .sum()
    .apply(lambda x: x * 1000)  # lambda receives each column; scales it by 1000
    .head()
)

Unnamed: 0_level_0,Sales_in_thousands,Price_in_thousands
Manufacturer,Unnamed: 1_level_1,Unnamed: 2_level_1
Acura,79005.0,91900.0
Audi,40557.0,119940.0
BMW,46505.0,99290.0
Buick,242019.0,107125.0
Cadillac,112178.0,201270.0


In [56]:
# Centre sales and price on their manufacturer mean (value minus group average).
# NOTE(review): num_cols is no longer used by this cell; it is kept because
# cells further down the notebook may read it - confirm before removing.
num_cols = df_cars.select_dtypes('number').columns
demean_cols = ['Sales_in_thousands', 'Price_in_thousands']
# transform only the displayed columns rather than running the lambda over
# every numeric column and discarding most of the result
df_cars.groupby('Manufacturer')[demean_cols].transform(lambda x: x - x.mean())



Unnamed: 0,Sales_in_thousands,Price_in_thousands
0,-2.832250,-9.133333
1,19.632750,-2.233333
2,-5.637250,
3,-11.163250,11.366667
4,6.878000,-15.990000
...,...,...
152,-9.078333,-6.533333
153,2.621667,-3.433333
154,4.907667,-2.133333
155,-9.130333,14.566667
