# Practice session on pandas

Reminder: https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf

In [3]:
import numpy as np
import pandas as pd

### Task 0: Read data
Read in the LEGO.csv file and save it in a DataFrame called `LEGO_all`.
Then create a DataFrame `LEGO`, which only contains the current models.

In [4]:
LEGO_all = pd.read_csv("LEGO.csv")
LEGO_all.head()

Unnamed: 0,name,theme,product_code,ageRange,pieceCount,price,minifigureCount,buildHeight,buildWidth,buildDepth,url,date
0,Notre-Dame de Paris,Architecture,21061,18+,4383.0,229.99,,,,,/de-de/product/notre-dame-de-paris-21061,2024-10-12
1,Burg Himeji,Architecture,21060,18+,2125.0,159.99,,,,,/de-de/product/himeji-castle-21060,2024-10-12
2,Cheops-Pyramide,Architecture,21058,18+,1476.0,139.99,,,,,/de-de/product/great-pyramid-of-giza-21058,2024-10-12
3,Freiheitsstatue,Architecture,21042,16+,1685.0,99.99,,,,,/de-de/product/statue-of-liberty-21042,2024-10-12
4,Paris,Architecture,21044,12+,649.0,49.99,,,,,/de-de/product/paris-21044,2024-10-12


In [18]:
LEGO_all["date"] = pd.to_datetime(LEGO_all["date"])
LEGO = LEGO_all[LEGO_all["date"].dt.year >= 2024]
LEGO

Unnamed: 0,name,theme,product_code,ageRange,pieceCount,price,minifigureCount,buildHeight,buildWidth,buildDepth,url,date,merch
0,Notre-Dame de Paris,Architecture,21061,18+,4383.0,229.99,,,,,/de-de/product/notre-dame-de-paris-21061,2024-10-12,False
1,Burg Himeji,Architecture,21060,18+,2125.0,159.99,,,,,/de-de/product/himeji-castle-21060,2024-10-12,False
2,Cheops-Pyramide,Architecture,21058,18+,1476.0,139.99,,,,,/de-de/product/great-pyramid-of-giza-21058,2024-10-12,False
3,Freiheitsstatue,Architecture,21042,16+,1685.0,99.99,,,,,/de-de/product/statue-of-liberty-21042,2024-10-12,False
4,Paris,Architecture,21044,12+,649.0,49.99,,,,,/de-de/product/paris-21044,2024-10-12,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2230,Spinnen-Kran,Technic,42097,10+,920.0,87.72,,,,,/de-de/product/compact-crawler-crane-42097,2024-03-28,False
2231,Löschflugzeug,Technic,42152,10+,1134.0,109.99,,,,,/de-de/product/firefighter-aircraft-42152,2024-03-28,False
2232,Ferrari Daytona SP3 The Sense of Perfection,Technic,5007627,6+,0.0,59.99,,,,,/de-de/product/ferrari-daytona-sp3-the-sense-o...,2024-03-28,False
2233,Rätselspaß für Retter in der Not,City,5007359,6+,0.0,4.99,,,,,/de-de/product/ratselspa-fur-retter-in-der-not...,2024-03-28,False


### Task 1: Identify specific models

Your nephew's birthday is coming up. Since he loves playing police, you want to find all current models that contain the word "Polizei" in the name and cost at most 20€.

(Note that the name is of type `str`. A reference of str-specific functions can be found here:
https://pandas.pydata.org/docs/reference/series.html#string-handling Since the name should **contain** a specific word, you might easily guess the correct function.)

In [19]:
LEGO[(LEGO["name"].str.contains("Polizei")) & (LEGO["price"] < 20)]

Unnamed: 0,name,theme,product_code,ageRange,pieceCount,price,minifigureCount,buildHeight,buildWidth,buildDepth,url,date,merch
121,Mobiles Polizeihunde-Training,City,60369,5+,197.0,19.99,,,,,/de-de/product/mobile-police-dog-training-60369,2024-10-12,False
130,Verfolgungsjagd mit Polizeiauto und Muscle Car,City,60415,6+,213.0,19.99,,,,,/de-de/product/police-car-and-muscle-car-chase...,2024-10-12,False
135,Polizeiauto,City,60312,5+,94.0,9.99,,,,,/de-de/product/police-car-60312,2024-10-12,False
153,Verfolgungsjagd mit dem Polizeimotorrad,City,60392,5+,59.0,9.99,,,,,/de-de/product/police-bike-car-chase-60392,2024-10-12,False
374,Polizeimotorrad,DUPLO®,10967,2+,5.0,9.99,,,,,/de-de/product/police-motorcycle-10967,2024-10-12,False
1330,Polizeiauto,City,60312,5+,94.0,9.99,,,,,/de-de/product/police-car-60312,2024-03-28,False
1331,Verfolgungsjagd mit Polizeiauto und Muscle Car,City,60415,6+,213.0,19.99,,,,,/de-de/product/police-car-and-muscle-car-chase...,2024-03-28,False
1342,Mobiles Polizeihunde-Training,City,60369,5+,197.0,19.99,,,,,/de-de/product/mobile-police-dog-training-60369,2024-03-28,False
1346,Verfolgungsjagd mit dem Polizeimotorrad,City,60392,5+,59.0,9.99,,,,,/de-de/product/police-bike-car-chase-60392,2024-03-28,False
1533,Polizeimotorrad,DUPLO®,10967,2+,5.0,9.99,,,,,/de-de/product/police-motorcycle-10967,2024-03-28,False


### Task 2: Merchandising

Some products are just merchandising. 
Think of a way to identify these rows and add a new boolean column "merch" to `LEGO_all` that indicates whether it contains a merchandising product or not.

In [20]:
themes = LEGO_all["theme"].unique()
print(themes)


['Architecture' 'Batman™' 'Art' 'DC' 'LEGO® Icons'
 'The Botanical Collection' 'Sonstiges' 'Marvel' 'BrickHeadz'
 'LEGO® Wednesday Sets' 'Star Wars™' 'LEGO® Sonic the Hedgehog™'
 'Powered UP' 'City' 'Classic' 'Creator 3-in-1-Sets' 'Creator Expert'
 'Ich – Einfach unverbesserlich 4' 'Ideas' 'Disney™' 'DUPLO®' 'DOTS'
 'LEGO® Education' 'Spider-Man' 'Friends' 'Harry Potter™'
 'Jurassic World™' 'Animal Crossing™' 'LEGO® Avatar' 'LEGO® DREAMZzz™'
 'Fortnite' 'LEGO® Gabbys Puppenhaus' 'LEGO® Indiana Jones™'
 'LEGO® Super Mario™' 'LEGO® The Legend of Zelda™' 'Wicked' 'Minecraft®'
 'Minifiguren' 'Monkie Kid™' 'NINJAGO®' 'Technic' 'SERIOUS PLAY®'
 'Speed Champions' 'Lord of the Rings™' 'Disney Mickey and Friends'
 'LEGO® DUPLO® Peppa Wutz' 'LEGO® Originals' 'Die Eiskönigin'
 'MINDSTORMS®' 'Xtra' 'Brick Sketches™' 'Minions' 'BOOST' 'VIDIYO™'
 'LEGO® Icons™' 'Lightyear von Disney und Pixar' 'Stranger Things'
 'Trolls World Tour']


In [13]:
merch_themes = ['Minifiguren', 'Xtra', 'Brick Sketches™', 'LEGO® Originals','SERIOUS PLAY®', 'LEGO® Education']
LEGO_all["merch"] = LEGO_all["theme"].isin(merch_themes)
LEGO_all.head(10)

Unnamed: 0,name,theme,product_code,ageRange,pieceCount,price,minifigureCount,buildHeight,buildWidth,buildDepth,url,date,merch
0,Notre-Dame de Paris,Architecture,21061,18+,4383.0,229.99,,,,,/de-de/product/notre-dame-de-paris-21061,2024-10-12,False
1,Burg Himeji,Architecture,21060,18+,2125.0,159.99,,,,,/de-de/product/himeji-castle-21060,2024-10-12,False
2,Cheops-Pyramide,Architecture,21058,18+,1476.0,139.99,,,,,/de-de/product/great-pyramid-of-giza-21058,2024-10-12,False
3,Freiheitsstatue,Architecture,21042,16+,1685.0,99.99,,,,,/de-de/product/statue-of-liberty-21042,2024-10-12,False
4,Paris,Architecture,21044,12+,649.0,49.99,,,,,/de-de/product/paris-21044,2024-10-12,False
5,New York City,Architecture,21028,12+,598.0,49.99,,,,,/de-de/product/new-york-city-21028,2024-10-12,False
6,London,Architecture,21034,12+,468.0,39.99,,,,,/de-de/product/london-21034,2024-10-12,False
7,Taj Mahal,Architecture,21056,18+,2022.0,119.99,,20.0,23.0,23.0,/de-de/product/taj-mahal-21056,2024-10-12,False
8,LEGO® House,Architecture,21037,12+,774.0,49.99,,,,,/de-de/product/lego-house-21037,2024-10-12,False
9,Singapur,Architecture,21057,18+,827.0,59.99,,,,,/de-de/product/singapore-21057,2024-10-12,False


Does LEGO have more or less merchandising items in its range today than in the past?
Build a DataFrame that shows for each date the total number of merchandising products and the number of real LEGO models.

In [14]:
merch_vs_real = LEGO_all.groupby("date").apply(
    lambda x: pd.Series({
        "merch_count": x["merch"].sum(),
        "real_count": (~x["merch"]).sum()
    })
).reset_index()
merch_vs_real

  merch_vs_real = LEGO_all.groupby("date").apply(


Unnamed: 0,date,merch_count,real_count
0,2021-10-04,26,798
1,2022-03-22,28,702
2,2022-09-19,46,924
3,2023-02-28,57,931
4,2023-09-21,75,979
5,2024-03-28,81,933
6,2024-10-12,67,1154


### Task 3: Pricing policy

How often do the different prices occur in the current models?
Build a DataFrame that shows the absolute frequency of every price and sort it according to these frequencies.

Also add a column to your table in which the relative frequency (i.e. the percentage share) is shown rather than the absolute frequency.

In [16]:
price_frequency = LEGO["price"].value_counts().reset_index()
price_frequency.columns = ["price", "absolute_frequency"]

price_frequency["relative_frequency"] = (price_frequency["absolute_frequency"]/len(LEGO)) * 100
price_frequency

Unnamed: 0,price,absolute_frequency,relative_frequency
0,19.99,747,10.983679
1,9.99,582,8.557565
2,29.99,476,6.998971
3,49.99,416,6.116748
4,99.99,344,5.058080
...,...,...,...
134,50.00,1,0.014704
135,19.95,1,0.014704
136,10.00,1,0.014704
137,40.00,1,0.014704


### Insertion

So far, we have seen split-apply-combine in a relatively simple form.
In most cases, an explicit column was selected after grouping and an aggregation function was applied to it.
In principle, you can also apply any function to the respective SubDataFrames.

In [17]:
def some_stats(groupdf):
    rows = len(groupdf)
    unique_prices = groupdf["price"].nunique()
    total_price = groupdf["price"].sum()
    return pd.Series([rows, unique_prices, total_price], index= ["num_sets", "unique_prices", "total_price"])

(LEGO.groupby("theme")
    .apply(some_stats, include_groups=False)
    .sort_values("total_price", ascending = False)
    .head(5))

Unnamed: 0_level_0,num_sets,unique_prices,total_price
theme,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Star Wars™,611.0,57.0,53794.57
Technic,273.0,42.0,34677.73
LEGO® Icons,131.0,25.0,28558.69
Harry Potter™,385.0,46.0,27784.69
City,596.0,37.0,26408.07


### Task 4

Which models are permanently in the product range?

In [21]:
# 
permanent_models = [(LEGO_all.groupby('product_code')['date'].nunique()) > 1]
permanent_models

[product_code
 630         True
 2304       False
 10255       True
 10258      False
 10261      False
            ...  
 5008786     True
 5008815    False
 5008877    False
 5008878    False
 5008900    False
 Name: date, Length: 2285, dtype: bool]

Which models are brand new to the product range?

In [23]:
product_appearance_count = LEGO_all.groupby('product_code')['date'].nunique()
brand_new_models_by_code = product_appearance_count[product_appearance_count == 1]
brand_new_models_by_code

product_code
2304       1
10258      1
10261      1
10262      1
10264      1
          ..
5008751    1
5008815    1
5008877    1
5008878    1
5008900    1
Name: date, Length: 557, dtype: int64