## Exercices : Aggregation and grouping

### Aggregation

In [15]:
#  Aggregation Functions in Pandas with Realistic Product Data

# Import necessary libraries
import pandas as pd
import numpy as np

# Consider Weekly sales of different products
data = {
    'Earphones':  [150, 200, 190, 250, 300],
    'Laptop':     [50,  60,  55,  65,  80 ],
    'Cell Phone': [300, 320, 310, 305, 315],
    'Power Bank': [400, 420, 410, 430, 450]
}

# Index represents the week numbers (Week 1 to Week 5)
index = ['Week 1', 'Week 2', 'Week 3', 'Week 4', 'Week 5']

# Create DataFrame
df = pd.DataFrame(data, index=index)
df


Unnamed: 0,Earphones,Laptop,Cell Phone,Power Bank
Week 1,150,50,300,400
Week 2,200,60,320,420
Week 3,190,55,310,410
Week 4,250,65,305,430
Week 5,300,80,315,450


For the following problems it is possible to answer the questions without writing code, but we want to see the correct code!
3) a) Get the total amount of products sold for each week
   b) Get the total amount of products sold for each category
   c) Get the total amount of sold products


In [32]:
# a) Get the total amount of products sold for each week
df.sum(axis=1)

Week 1     900
Week 2    1000
Week 3     965
Week 4    1050
Week 5    1145
dtype: int64

In [33]:
#    b) Get the total amount of products sold for each category
df.sum()


Earphones     1090
Laptop         310
Cell Phone    1550
Power Bank    2110
dtype: int64

In [38]:
#    c) Get the total amount of sold products
weekly_sum = df.sum(axis=1).sum()
weekly_sum.sum()

5060

2) a) Calculate the mean of sold products across weeks
   b) Calculate the median for each week
   c) Calculate the overall mean of products sold per week

In [40]:
# a) Calculate the mean of sold products across weeks
df.mean(axis=1)

Week 1    225.00
Week 2    250.00
Week 3    241.25
Week 4    262.50
Week 5    286.25
dtype: float64

In [43]:
# b) Calculate the median for each week
df.median(axis=1)

Week 1    225.0
Week 2    260.0
Week 3    250.0
Week 4    277.5
Week 5    307.5
dtype: float64

In [45]:
#    c) Calculate the overall mean of products sold per week
df.mean().mean()

253.0

3) a) What was the maximum number of items sold in one week for Cell Phone's
   b) What is the minimum number of items sold in week 3?
   c) What are the minimum sales for each week across products
   3) What are the maximum sales for each product across weeks

In [62]:
# a) What was the maximum number of items sold in one week for Cell Phone's
df.loc[:, 'Cell Phone'].max()

320

In [63]:
# b) What is the minimum number of items sold in week 3?
df.loc['Week 3'].min()

55

In [69]:
# c) What are the minimum sales for each week across products
df.min(axis=1)

Week 1    50
Week 2    60
Week 3    55
Week 4    65
Week 5    80
dtype: int64

In [67]:
# 3) What are the maximum sales for each product across weeks
df.max()

Earphones     300
Laptop         80
Cell Phone    320
Power Bank    450
dtype: int64

4) Provide an overview of all the statistics, mean, median, standard deviation, min, max with one command

In [70]:
df.describe()

Unnamed: 0,Earphones,Laptop,Cell Phone,Power Bank
count,5.0,5.0,5.0,5.0
mean,218.0,62.0,310.0,422.0
std,58.051701,11.510864,7.905694,19.235384
min,150.0,50.0,300.0,400.0
25%,190.0,55.0,305.0,410.0
50%,200.0,60.0,310.0,420.0
75%,250.0,65.0,315.0,430.0
max,300.0,80.0,320.0,450.0


5) a) Count the number of items in a row
   b) Count the number of items in a column


In [80]:
# a) Count the number of items in a row
df.count(axis=0)

Earphones     5
Laptop        5
Cell Phone    5
Power Bank    5
dtype: int64

In [81]:
# b) Count the number of items in a column
df.count(axis=1)

Week 1    4
Week 2    4
Week 3    4
Week 4    4
Week 5    4
dtype: int64

### Grouping

First consider the dataframe df

In [298]:
import pandas as pd
import numpy as np

# Original data with computer-related products
data = {
    'productName': ['Laptop', 'Monitor', 'Mouse', 'Keyboard', 'External Hard Drive'] * 4,
    'week': [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4],
    'amount_sold': [120, 80, 50, 40, 90, 130, 85, 60, 55, 95, 125, 90, 70, 45, 100, 140, 100, 65, 50, 110],
    'price': [999.99, 199.99, 25.99, 49.99, 79.99, 999.99, 199.99, 25.99, 49.99, 79.99, 999.99, 199.99, 25.99, 49.99, 79.99, 999.99, 199.99, 25.99, 49.99, 79.99]
}

# Additional data
additional_data = {
    'productName': ['Graphics Card', 'Gaming Chair', 'Headset', 'Webcam', 'Router'] * 4,
    'week': [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4],
    'amount_sold': [30, 15, 60, 25, 45, 50, 20, 75, 35, 40, 60, 25, 80, 30, 50, 65, 35, 85, 40, 55],
    'price': [599.99, 150.99, 49.99, 89.99, 129.99, 599.99, 150.99, 49.99, 89.99, 129.99, 599.99, 150.99, 49.99, 89.99, 129.99, 599.99, 150.99, 49.99, 89.99, 129.99]
}

# Creating DataFrames
df_computer = pd.DataFrame(data)
df_additional = pd.DataFrame(additional_data)

# Concatenating the new data with the existing DataFrame
df_combined = pd.concat([df_computer, df_additional], ignore_index=True)

# Shuffling the rows of the combined DataFrame
df = df_combined.sample(frac=1).reset_index(drop=True)

# Displaying the shuffled DataFrame
df.count(axis=0)
# df


productName    40
week           40
amount_sold    40
price          40
dtype: int64

6) a) Calculate the total number of units sold for each product across all weeks.
   b) Calculate the average price for each product.
   c) Calculate the total revenue for each week.

In [89]:
# a) Calculate the total number of units sold for each product across all weeks.
# print(df)
df.groupby('week')['amount_sold'].sum()

week
1    555
2    645
3    675
4    745
Name: amount_sold, dtype: int64

In [93]:
# b) Calculate the average price for each product.
df.groupby('productName')['price'].mean()

productName
External Hard Drive     79.99
Gaming Chair           150.99
Graphics Card          599.99
Headset                 49.99
Keyboard                49.99
Laptop                 999.99
Monitor                199.99
Mouse                   25.99
Router                 129.99
Webcam                  89.99
Name: price, dtype: float64

In [113]:
# c) Calculate the total revenue for each week.
df.groupby('week')['revenu'].sum()

week
1    177859.45
2    204023.55
3    208038.25
4    232267.55
Name: revenu, dtype: float64

7) Find the product that sold the most units in each week.


In [214]:
df_max_sold_per_week = df.loc[df.groupby('week')['amount_sold'].idxmax()]
df_max_sold_per_week[['week', 'amount_sold', 'productName']]

Unnamed: 0,week,amount_sold,productName
17,1,120,Laptop
39,2,130,Laptop
35,3,125,Laptop
11,4,140,Laptop


8) Calculate the average number of units sold per product.

In [217]:
print(df.groupby('productName')['amount_sold'].mean())

productName
External Hard Drive     98.75
Gaming Chair            23.75
Graphics Card           51.25
Headset                 75.00
Keyboard                47.50
Laptop                 128.75
Monitor                 88.75
Mouse                   61.25
Router                  47.50
Webcam                  32.50
Name: amount_sold, dtype: float64


9) For each product provide a list of the minimum amount sold, the average and the maximum amount sold



In [218]:
df.groupby('productName')['amount_sold'].aggregate(['min', 'mean', 'max'])

Unnamed: 0_level_0,min,mean,max
productName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
External Hard Drive,90,98.75,110
Gaming Chair,15,23.75,35
Graphics Card,30,51.25,65
Headset,60,75.0,85
Keyboard,40,47.5,55
Laptop,120,128.75,140
Monitor,80,88.75,100
Mouse,50,61.25,70
Router,40,47.5,55
Webcam,25,32.5,40


10) Provide a table with the productnames and the minimum price for that product the total amount sold and the total revenue for that product

In [227]:
# pd.merge(pd.merge(df.groupby('productName')['price'].min(), df.groupby('productName')['amount_sold'].sum(), left_on='productName', right_on='productName'), df.groupby('productName')['revenu'].sum(), left_on='productName', right_on='productName')
df.groupby('productName').aggregate({'price':'min', 
                                     'amount_sold':'sum',
                                     'revenu':'sum'})
# df

Unnamed: 0_level_0,price,amount_sold,revenu
productName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
External Hard Drive,79.99,395,31596.05
Gaming Chair,150.99,95,14344.05
Graphics Card,599.99,205,122997.95
Headset,49.99,300,14997.0
Keyboard,49.99,190,9498.1
Laptop,999.99,515,514994.85
Monitor,199.99,355,70996.45
Mouse,25.99,245,6367.55
Router,129.99,190,24698.1
Webcam,89.99,130,11698.7


11) Only show the productnames which have  sold less than 150 in total over all the dataset and show also the amount_sold in total

In [271]:
def filter_by_sold(x):
    return x['amount_sold'].sum() < 150

less_150 = df.groupby('productName').filter(filter_by_sold)
less_150.groupby('productName')['amount_sold'].sum()


# less_150 = df.groupby('productName')['amount_sold'].sum() < 150
# # less_150[:, 'amount_sold']
# df_less = pd.DataFrame(less_150)
# df_less['productName'] = df_less.index
# df_less.index = range(10)
# df_less_shortened = df_less.loc[df_less['amount_sold'] == True, 'productName']
# final_df = pd.DataFrame({'productName':df_less_shortened})
# df.groupby('productName')
# print(final_df)

# df[df.groupby('productName')['amount_sold'].sum() < 150]
# df[df['amount_sold'] > 150]

productName
Gaming Chair     95
Webcam          130
Name: amount_sold, dtype: int64

12) For each product calculate the total revenue, based on the price of the product and the total_amount sold.


In [317]:
def total_revenue(x):
    x['total_revenue'] = sum(x['price']*x['amount_sold'])
    return x
    
total_revenue_product = df.groupby('productName').apply(total_revenue, include_groups=False)
print(total_revenue_product[['total_revenue']])

                        total_revenue
productName                          
External Hard Drive 29       31596.05
                    30       31596.05
                    35       31596.05
                    36       31596.05
Gaming Chair        27       14344.05
                    34       14344.05
                    38       14344.05
                    39       14344.05
Graphics Card       5       122997.95
                    7       122997.95
                    10      122997.95
                    24      122997.95
Headset             11       14997.00
                    19       14997.00
                    21       14997.00
                    33       14997.00
Keyboard            6         9498.10
                    8         9498.10
                    14        9498.10
                    25        9498.10
Laptop              9       514994.85
                    13      514994.85
                    16      514994.85
                    31      514994.85
Monitor     