In [None]:
###########################################################################

In [None]:
# Introduction

# This Jupyter notebook is part of your learning experience in the study of quantiles
# You will work with a simple data set that contains the scores of job applicants in an entrance test

# In this exercise, you will perform the following tasks:
# 1 - Load and study the data
# 2 - View the distributions of the various features in the data set and break them into categories using quantiles
# 3 - Subset the data using the quantiles decided to separate the applicants into three categories

In [None]:
###########################################################################

In [None]:
# Task 1 - Load and study the data

# Load the data and study its features such as:
# The number of applicants
# The number of features
# The types of features

In [None]:
# Load "numpy" and "pandas" for manipulating numbers and data frames
# Load "matplotlib.pyplot" and "seaborn" for data visualisation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read in the "Interview_First_Stage_Exam.csv" file as a Pandas Data Frame
# Note: Make sure the code and the data are in the same folder or specify the appropriate path
# Note: "index_col = 0" uses the first column of the CSV file as the row labels of the data frame,
#       so that column does not appear among the features in "df.columns"
df = pd.read_csv('Interview_First_Stage_Exam.csv', index_col = 0)

In [None]:
# Take a brief look at the data using ".head()"
df.head()

In [None]:
# Study the description of the data
# Note: Make sure the code and the data description are in the same folder or specify the appropriate path
# The file is read in full inside the "with" block (which closes it afterwards) and then printed
with open('Interview_First_Stage_Exam_Feature_Description.txt', 'r') as description_file:
    feature_description = description_file.read()
print(feature_description)

In [None]:
# Get the dimensions of the data frame using ".shape"
df.shape

In [None]:
# Get the row names of the data frame using ".index"
df.index

In [None]:
# Get the column names of the data frame using ".columns"
df.columns

In [None]:
# Look at basic information about the data frame using ".info()"
df.info()

In [None]:
# Observations

# There are 50 rows and 4 columns in the data
# Each row contains the marks obtained out of 100 by a specific applicant in English, Mathematics and Programming

# The features in the data set are:
# The names of the applicants
# Their marks obtained out of 100 in English
# Their marks obtained out of 100 in mathematics
# Their marks obtained out of 100 in programming

In [None]:
###########################################################################

In [None]:
# Task 2 - View the distributions of the various features in the data set and break them into categories using quantiles

# We will now look at the distributions of the marks obtained by the applicants in the three subjects
# We will also use quantiles to divide the distribution into groups

In [None]:
# Calculate the median of the "English" feature using ".median()"
df['English'].median()

In [None]:
# Create a histogram of the "English" feature and also show the median
# Note: "plt.axvline()" draws a vertical line across the full height of the plot,
#       so the line does not depend on a hard-coded guess of the tallest bar
plt.figure(figsize = (6, 3), dpi = 100)
sns.histplot(data = df, x = 'English', color = 'orange', edgecolor = 'linen', alpha = 0.5, bins = 10)
plt.axvline(x = df['English'].median(), color = 'red')
plt.title('Histogram of Marks Obtained in English')
plt.xlabel('Marks Obtained in English out of 100')
plt.ylabel('Count');

In [None]:
# Calculate the median of the "English" feature using ".quantile()"
# Note: The median is the 0.5th quantile, so pass in 0.5 as a parameter to the ".quantile()" function
df['English'].quantile(0.5)

In [None]:
# Calculate the quartiles of the "English" feature using ".quantile()"
# Note: The three quartiles are equal to the 0.25th, 0.5th and 0.75th quantiles
# Note: You can pass a list of numbers to the ".quantile()" function
df['English'].quantile([0.25, 0.5, 0.75])

In [None]:
# Create a histogram of the "English" feature and also show the quartiles
# Note: "plt.axvline()" draws one vertical line across the full height of the plot,
#       so we loop over the three quartile values instead of hard-coding the line height
plt.figure(figsize = (6, 3), dpi = 100)
sns.histplot(data = df, x = 'English', color = 'orange', edgecolor = 'linen', alpha = 0.5, bins = 10)
for quartile in df['English'].quantile([0.25, 0.5, 0.75]):
    plt.axvline(x = quartile, color = 'red')
plt.title('Histogram of Marks Obtained in English')
plt.xlabel('Marks Obtained in English out of 100')
plt.ylabel('Count');

In [None]:
# Create a box plot of the "English" feature
# The figure and its axes are created explicitly and the box plot is drawn on those axes
fig, ax = plt.subplots(figsize = (6, 3), dpi = 100)
sns.boxplot(data = df, x = 'English', color = 'linen', ax = ax)
ax.set_title('Box Plot of Marks Obtained in English')
ax.set_xlabel('Marks Obtained in English out of 100');

In [None]:
# Calculate the quartiles of the "Mathematics" feature using ".quantile()"
# Note: The three quartiles are equal to the 0.25th, 0.5th and 0.75th quantiles
# Note: You can pass a list of numbers to the ".quantile()" function
df['Mathematics'].quantile([0.25, 0.5, 0.75])

In [None]:
# Create a histogram of the "Mathematics" feature and also show the quartiles
# Note: "plt.axvline()" draws one vertical line across the full height of the plot,
#       so we loop over the three quartile values instead of hard-coding the line height
plt.figure(figsize = (6, 3), dpi = 100)
sns.histplot(data = df, x = 'Mathematics', color = 'orange', edgecolor = 'linen', alpha = 0.5, bins = 10)
for quartile in df['Mathematics'].quantile([0.25, 0.5, 0.75]):
    plt.axvline(x = quartile, color = 'red')
plt.title('Histogram of Marks Obtained in Mathematics')
plt.xlabel('Marks Obtained in Mathematics out of 100')
plt.ylabel('Count');

In [None]:
# Create a box plot of the "Mathematics" feature
# The figure and its axes are created explicitly and the box plot is drawn on those axes
fig, ax = plt.subplots(figsize = (6, 3), dpi = 100)
sns.boxplot(data = df, x = 'Mathematics', color = 'linen', ax = ax)
ax.set_title('Box Plot of Marks Obtained in Mathematics')
ax.set_xlabel('Marks Obtained in Mathematics out of 100');

In [None]:
# Calculate the quartiles of the "Programming" feature using ".quantile()"
# Note: The three quartiles are equal to the 0.25th, 0.5th and 0.75th quantiles
# Note: You can pass a list of numbers to the ".quantile()" function
df['Programming'].quantile([0.25, 0.5, 0.75])

In [None]:
# Create a histogram of the "Programming" feature and also show the quartiles
# Note: "plt.axvline()" draws one vertical line across the full height of the plot,
#       so we loop over the three quartile values instead of hard-coding the line height
plt.figure(figsize = (6, 3), dpi = 100)
sns.histplot(data = df, x = 'Programming', color = 'orange', edgecolor = 'linen', alpha = 0.5, bins = 10)
for quartile in df['Programming'].quantile([0.25, 0.5, 0.75]):
    plt.axvline(x = quartile, color = 'red')
plt.title('Histogram of Marks Obtained in Programming')
plt.xlabel('Marks Obtained in Programming out of 100')
plt.ylabel('Count');

In [None]:
# Create a box plot of the "Programming" feature
# The figure and its axes are created explicitly and the box plot is drawn on those axes
fig, ax = plt.subplots(figsize = (6, 3), dpi = 100)
sns.boxplot(data = df, x = 'Programming', color = 'linen', ax = ax)
ax.set_title('Box Plot of Marks Obtained in Programming')
ax.set_xlabel('Marks Obtained in Programming out of 100');

In [None]:
# Create a figure with all the relevant plots
# Layout: one row per subject, with the histogram (and its quartiles) on the left and the box plot on the right
# Note: Looping over the subjects avoids repeating the same plotting code three times
# Note: ".axvline()" draws a vertical line across the full height of the axes,
#       so the quartile lines do not depend on a hard-coded guess of the tallest bar
subjects = ['English', 'Mathematics', 'Programming']
fig, axes = plt.subplots(nrows = len(subjects), ncols = 2, figsize = (12, 6))

# Each row of "axes" holds the two axes (left, right) for one subject
for (hist_ax, box_ax), subject in zip(axes, subjects):
    sns.histplot(data = df, x = subject, color = 'orange', edgecolor = 'linen', alpha = 0.5, bins = 10, ax = hist_ax)
    for quartile in df[subject].quantile([0.25, 0.5, 0.75]):
        hist_ax.axvline(x = quartile, color = 'red')
    hist_ax.set_title(subject)
    hist_ax.set_xlabel('')
    hist_ax.set_ylabel('')

    sns.boxplot(data = df, x = subject, color = 'linen', ax = box_ax)
    box_ax.set_title(subject)
    box_ax.set_xlabel('')

fig.tight_layout();

In [None]:
# Observations

# By observing the distance between the lower and upper quartiles of a distribution (the interquartile range), we get a sense of its spread
# The spread of marks in the mathematics test is the greatest and it is least for the programming test
# The lower and upper quartiles for English are 42.5 and 76.5
# The lower and upper quartiles for mathematics are 37.25 and 77.25
# The lower and upper quartiles for programming are 49 and 75

In [None]:
###########################################################################

In [None]:
# Task 3 - Subset the data using the quantiles decided to separate the applicants into three categories

# We will now use the lower and upper quartiles as thresholds for the interview process
# We will store the lower and upper cut-off marks in a data frame

In [None]:
# Create a Pandas Series called "lower" to store the lower cut-off marks for all the subjects
# Note: Set the "index" parameter of the series to the list of the column names of the data frame except the first column
# Note: The "data" parameter of the series needs to be a list of the relevant values
# Note: The values are computed by looping over that same list of column names, so each
#       lower quartile (0.25th quantile) is always paired with the subject it was computed from
lower = pd.Series(index = df.columns[1:],
                  data = [df[subject].quantile(0.25) for subject in df.columns[1:]])

In [None]:
# Print the "lower" series
lower

In [None]:
# Create a Pandas Series called "upper" to store the upper cut-off marks for all the subjects
# Note: Set the "index" parameter of the series to the list of the column names of the data frame except the first column
# Note: The "data" parameter of the series needs to be a list of the relevant values
# Note: The values are computed by looping over that same list of column names, so each
#       upper quartile (0.75th quantile) is always paired with the subject it was computed from
upper = pd.Series(index = df.columns[1:],
                  data = [df[subject].quantile(0.75) for subject in df.columns[1:]])

In [None]:
# Print the "upper" series
upper

In [None]:
# Combine the "lower" and "upper" series into a single data frame called "cutoff"
# The dictionary keys ("Lower" and "Upper") become the column names of the data frame
# The row labels are the column names of "df" except the first column, i.e. one row per subject
cutoff = pd.DataFrame({'Lower' : lower, 'Upper' : upper}, index = df.columns[1:])

In [None]:
# Print the "cutoff" data frame
cutoff

In [None]:
# Use the upper cut-off values to get the details of applicants who will directly go to the final stage of the interview
# Note: A candidate must have scored greater than or equal to the upper cut-off marks for each of the subjects
# Note: The cut-off marks are read from the "cutoff" data frame using ".loc[row, column]" rather than typed in by hand,
#       so the selection stays correct if the data (and hence the quartiles) change
# Store the resulting subsetted data frame as "final"
final = df[(df['English'] >= cutoff.loc['English', 'Upper']) &
           (df['Mathematics'] >= cutoff.loc['Mathematics', 'Upper']) &
           (df['Programming'] >= cutoff.loc['Programming', 'Upper'])]

In [None]:
# Print the "Name" feature of the "final" data frame
# These are the specifics of the candidates who will directly proceed to the final stage of the interview
final['Name']

In [None]:
# Use the lower and upper cut-off values to get the details of applicants who will go to the second stage of the interview
# Note: A candidate must have scored greater than or equal to the lower cut-off marks for each of the subjects
# Note: The candidate must have scored less than the upper cut-off marks for each of the subjects
# Note: The cut-off marks are read from the "cutoff" data frame using ".loc[row, column]" rather than typed in by hand,
#       so the selection stays correct if the data (and hence the quartiles) change
# Store the resulting subsetted data frame as "second"
second = df[(df['English'] >= cutoff.loc['English', 'Lower']) &
            (df['English'] < cutoff.loc['English', 'Upper']) &
            (df['Mathematics'] >= cutoff.loc['Mathematics', 'Lower']) &
            (df['Mathematics'] < cutoff.loc['Mathematics', 'Upper']) &
            (df['Programming'] >= cutoff.loc['Programming', 'Lower']) &
            (df['Programming'] < cutoff.loc['Programming', 'Upper'])]

In [None]:
# Print the "Name" feature of the "second" data frame
# These are the specifics of the candidates who will proceed to the second stage of the interview
second['Name']

In [None]:
# Use the lower cut-off values to get the details of applicants who will be rejected this time and need to apply again
# Note: A candidate must have scored less than the lower cut-off marks for each of the subjects
# Note: The cut-off marks are read from the "cutoff" data frame using ".loc[row, column]" rather than typed in by hand,
#       so the selection stays correct if the data (and hence the quartiles) change
# Store the resulting subsetted data frame as "reject"
reject = df[(df['English'] < cutoff.loc['English', 'Lower']) &
            (df['Mathematics'] < cutoff.loc['Mathematics', 'Lower']) &
            (df['Programming'] < cutoff.loc['Programming', 'Lower'])]

In [None]:
# Print the "Name" feature of the "reject" data frame
# These are the specifics of the candidates who will need to apply again another time
reject['Name']

In [None]:
# Observations

# We can derive quantile values from a given distribution
# The quantile values are wholly dependent on the given distribution
# The quartiles help us divide the distributions into four equal parts
# We can use the lower and upper quartiles as thresholds for the interview process

In [None]:
###########################################################################

In [None]:
# Conclusions

# From the given data, we can use simple visualisations to get a sense of how data are distributed
# We can use various quantile measures to break the distributions into equal parts
# We can use the first and the third quartiles as lower and upper cut-offs for the interview process

In [None]:
###########################################################################