In [None]:
###########################################################################

In [None]:
# Introduction

# This Jupyter notebook is part of your learning experience in the study of descriptive statistics
# You will work with a simple data set that contains employee specifics of a certain company

# In this exercise, you will perform the following tasks:
# 1 - Load and study the data
# 2 - Visualise the distributions of ratings and compensations
# 3 - Subset the data based on thresholds

In [None]:
###########################################################################

In [None]:
# Task 1 - Load and study the data

# Load the data and study its features such as:
# The number of employees
# The number of features
# The types of features

In [None]:
# Load "numpy" and "pandas" for manipulating numbers and data frames
# Load "matplotlib.pyplot" and "seaborn" for data visualisation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the employee data from "Employee_Specifics.csv" into a Pandas Data Frame
# The first column of the file becomes the row index of the data frame
# Note: Keep the data file in the same folder as this notebook, or give the full path instead
df = pd.read_csv(
    filepath_or_buffer='Employee_Specifics.csv',
    index_col=0,
)

In [None]:
# Preview the first five rows of the data using ".head()"
df.head(n=5)

In [None]:
# Read the feature description file and print its contents
# Note: Keep the description file in the same folder as this notebook, or give the full path instead
with open('Employee_Specifics_Feature_Description.txt') as f:
    feature_description = f.read()
print(feature_description)

In [None]:
# Get the dimensions of the data frame using ".shape"
# ".shape" is a tuple of the form (number of rows, number of columns)
# Note: The CSV column used as the row index (index_col = 0 when loading) is not counted as a column
df.shape

In [None]:
# Get the row names (row labels) of the data frame using ".index"
# Note: The row labels come from the first column of the CSV file, since the data were loaded with index_col = 0
df.index

In [None]:
# Get the column names of the data frame using ".columns"
# Note: The column that was used as the row index is not listed among the column names
df.columns

In [None]:
# Look at basic information about the data frame using ".info()"
# The printed summary includes the data type and the number of non-null values of each column
df.info()

In [None]:
# Observations

# There are 20 rows and 4 columns in the data
# Each row contains the specifics of one employee in the company

# The features in the data set are:
# The names of the employees
# Their respective departments
# Their compensations in rupees
# Their ratings on a scale of 1 to 5

In [None]:
###########################################################################

In [None]:
# Task 2 - Visualise the distributions of ratings and compensations

# We will now visualise the distributions of employee ratings and compensations

# We will create the following plots:
# A scatter plot of employee rating and employee compensation
# A count plot of employee rating
# A histogram of employee compensation

# We will use these plots to decide rating and compensation thresholds

In [None]:
# Create a scatter plot of the "Rating" and "Compensation" features
# The figure and its axes are created explicitly so that the title and labels are set on the axes being drawn
fig, ax = plt.subplots(figsize = (8, 4), dpi = 100)
sns.scatterplot(data = df, x = 'Rating', y = 'Compensation', alpha = 0.5, color = 'blue', edgecolor = 'linen', ax = ax)
ax.set_title('Scatter Plot of Employee Rating and Employee Compensation')
ax.set_xlabel('Employee Rating out of 5')
# The trailing semicolon suppresses the text representation of the last call in the cell output
ax.set_ylabel('Employee Compensation in rupees');
# We observe that in general, as employee ratings increase, their compensation increases as well
# But there are quite a few employees with higher ratings who are getting lower compensations

In [None]:
# Draw a count plot showing how many employees received each "Rating" value
fig, ax = plt.subplots(figsize=(6, 3), dpi=100)
sns.countplot(data=df, x='Rating', alpha=0.5, edgecolor='linen', ax=ax)
# Set the title and both axis labels in one call; the semicolon hides the returned objects
ax.set(title='Count Plot of Employee Rating', xlabel='Employee Rating', ylabel='Count');
# Most employees have a rating of either 4 or 5, with 4 being the most frequent (typical) rating
# It therefore makes sense to treat any rating above 4 as high; in this data, that means a rating of 5

In [None]:
# Draw a histogram of the "Compensation" feature using 10 bins
fig, ax = plt.subplots(figsize=(6, 3), dpi=100)
sns.histplot(data=df, x='Compensation', color='orange', edgecolor='linen', alpha=0.5, bins=10, ax=ax)
# Set the title and both axis labels in one call; the semicolon hides the returned objects
ax.set(title='Histogram of Employee Compensation', xlabel='Employee Compensation in rupees', ylabel='Count');
# The compensations are not uniform, which is not necessarily a discrepancy
# However, a few employees have compensations at either extreme of the histogram
# Based on the histogram, any compensation below 7,00,000 rupees could be considered a lower compensation

In [None]:
# Observations

# Generally, as the employee ratings increase, there is an increase in the employee compensations
# However, the employee compensation values are more spread out for higher employee ratings
# An employee rating of 5 (which is the only rating greater than 4) can be considered as a high rating
# An employee compensation less than 7,00,000 rupees can be considered as a lower compensation

In [None]:
###########################################################################

In [None]:
# Task 3 - Subset the data based on thresholds

# We will now subset the original data frame based on the following conditions:
# Employees with ratings greater than 4
# Employees with compensations less than 7,00,000 rupees
# Employees with ratings greater than 4 and with compensations less than 7,00,000 rupees

In [None]:
# Keep only the rows of the original data frame where:
# The employee rating is greater than 4
has_high_rating = df['Rating'] > 4
df.loc[has_high_rating]

In [None]:
# Keep only the rows of the original data frame where:
# The compensation is less than 7,00,000 rupees
has_low_compensation = df['Compensation'] < 700000
df.loc[has_low_compensation]

In [None]:
# Keep only the rows of the original data frame where both conditions hold:
# The employee rating is greater than 4
# The compensation is less than 7,00,000 rupees
has_high_rating = df['Rating'] > 4
has_low_compensation = df['Compensation'] < 700000
df.loc[has_high_rating & has_low_compensation]

In [None]:
# Observations

# The only employee seemingly facing a discrepancy in compensation as compared to rating is Aneesha
# She has a rating of 5 but only has a compensation of 6,11,783 rupees

In [None]:
###########################################################################

In [None]:
# Conclusions

# From the given data, we can use simple visualisations to get a sense of how data are distributed
# We can conduct preliminary analyses simply by subsetting data sets using well-thought-out thresholds and conditions

In [None]:
###########################################################################