In [1]:
import pandas as pd
import numpy as np

# Building a DataFrame

In [2]:
# Sample data set: one row per student record, written row-wise so each
# student's values can be read across a single line.
df = pd.DataFrame(
    data=[
        ("Mike", "London", 20, 84, 66),
        ("Jack", "London", 40, 80, 83),
        ("Diana", "Berlin", 18, 50, 51),
        ("Charles", "London", 24, 36, 35),
        ("Philipp", "London", 37, 44, 43),
        ("Charles", "Berlin", 40, 24, 58),
        ("Kale", "London", 44, 41, 71),
        ("Jack", "Berlin", 20, 35, 65),
    ],
    columns=["Student", "City", "Age", "Maths_Score", "Science_Score"],
)

In [4]:
# Group the student records by city; the grouped object is reused by the
# cells that follow.
df_city_group = df.groupby(by="City")
df_city_group

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7ff0d9e0b850>

In [5]:
# Average the numeric columns per city. `numeric_only=True` is passed
# explicitly because pandas >= 2.0 raises a TypeError when asked to average
# the string "Student" column instead of silently dropping it.
df_city_group.mean(numeric_only=True)

Unnamed: 0_level_0,Age,Maths_Score,Science_Score
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Berlin,26.0,36.333333,58.0
London,33.0,57.0,59.6


We use `.get_group()` to extract the rows that belong to a single city from the grouped object:

In [6]:
# Pull out the rows of the "Berlin" group from the grouped object.
df_city_group.get_group(name="Berlin")

Unnamed: 0,Student,City,Age,Maths_Score,Science_Score
2,Diana,Berlin,18,50,51
5,Charles,Berlin,40,24,58
7,Jack,Berlin,20,35,65


Retrieving a city with `.get_group()` from the existing grouped object is faster than filtering the DataFrame with a boolean mask, as the timings below show:

In [8]:
%%timeit
# Timed baseline: select the Berlin rows by building a boolean mask over the
# "City" column and indexing the DataFrame with it.
df[df['City']=='Berlin']

148 µs ± 912 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [9]:
%%timeit
# Timed alternative: fetch the Berlin rows from the grouped object that was
# already built by `df.groupby("City")`.
df_city_group.get_group("Berlin")

68.1 µs ± 382 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


# Top 3 best and worst students

We want to know the three best students in Maths, so we call `.nlargest()` on the DataFrame:

In [11]:
# The three rows with the highest "Maths_Score", highest first.
df_top3_maths = df.nlargest(n=3, columns="Maths_Score")
df_top3_maths

Unnamed: 0,Student,City,Age,Maths_Score,Science_Score
0,Mike,London,20,84,66
1,Jack,London,40,80,83
2,Diana,Berlin,18,50,51


In [12]:
# The three rows with the lowest "Maths_Score", lowest first.
df_bottom3_maths = df.nsmallest(n=3, columns="Maths_Score")
df_bottom3_maths

Unnamed: 0,Student,City,Age,Maths_Score,Science_Score
5,Charles,Berlin,40,24,58
7,Jack,Berlin,20,35,65
3,Charles,London,24,36,35


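However, on a small DataFrame like this one, `.sort_values(...).head(3)` is faster than `.nlargest()`, as other Python users have pointed out and as the timings below confirm. (The pandas documentation describes `.nlargest()` as the more performant option in general, so re-measure on larger data before choosing.)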

In [13]:
%%timeit
# Timed candidate 1: sort the whole frame by "Maths_Score" in descending
# order, then keep the first three rows.
# NOTE(review): `df1` assigned here is presumably local to the timed statement
# and not kept in the notebook namespace — confirm against the IPython docs.
df1 = df.sort_values(by=['Maths_Score'], ascending = False).head(3)

102 µs ± 1.2 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [14]:
%%timeit
# Timed candidate 2: ask pandas directly for the three rows with the largest
# "Maths_Score".
df1 = df.nlargest(3, "Maths_Score")

288 µs ± 3.65 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


# Comparisons

In [15]:
# 1. Compare every numeric column against a base value.
#    The columns are selected by name rather than by position, so the
#    selection stays correct when columns are added or reordered. The results
#    are stored in named variables: a bare expression that is not the last
#    line of a cell is computed and then discarded.
numeric_cols = ["Age", "Maths_Score", "Science_Score"]
above_50 = df[numeric_cols].gt(50)  # True where the value is greater than 50
below_50 = df[numeric_cols].lt(50)  # True where the value is less than 50

# 2. Slice the DataFrame with a comparison mask.
#    df1: rows whose "Maths_Score" is not equal to 35
#    df2: rows whose "Maths_Score" is equal to 35
df1 = df[df["Maths_Score"].ne(35)]
df2 = df[df["Maths_Score"].eq(35)]

# 3. Create new True/False columns by comparing two columns element-wise.
#    Both columns encode the same condition (Maths_Score >= Science_Score),
#    written once with `.ge` and once with the operands swapped and `.le`.
df["Maths_Student"] = df["Maths_Score"].ge(df["Science_Score"])
df["Maths_Student_1"] = df["Science_Score"].le(df["Maths_Score"])

In [19]:
# Render the two slices and the extended DataFrame, one after another.
for frame in (df1, df2, df):
    display(frame)

Unnamed: 0,Student,City,Age,Maths_Score,Science_Score
0,Mike,London,20,84,66
1,Jack,London,40,80,83
2,Diana,Berlin,18,50,51
3,Charles,London,24,36,35
4,Philipp,London,37,44,43
5,Charles,Berlin,40,24,58
6,Kale,London,44,41,71


Unnamed: 0,Student,City,Age,Maths_Score,Science_Score
7,Jack,Berlin,20,35,65


Unnamed: 0,Student,City,Age,Maths_Score,Science_Score,Maths_Student,Maths_Student_1
0,Mike,London,20,84,66,True,True
1,Jack,London,40,80,83,False,False
2,Diana,Berlin,18,50,51,False,False
3,Charles,London,24,36,35,True,True
4,Philipp,London,37,44,43,True,True
5,Charles,Berlin,40,24,58,False,False
6,Kale,London,44,41,71,False,False
7,Jack,Berlin,20,35,65,False,False
