In [35]:
# Part 1 - Loading and Initial Exploration
# 1- Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns', None)
sns.set_style("whitegrid")

print("-------------------------------------------------------------------------------------------------------------------------------------------")

# 2- Load the Dataset
# Load the CSV file
df = pd.read_csv('data.csv')

# Display the first 10 rows
print(df.head(10))

# Display the last 5 rows
print(df.tail(5))

print("-------------------------------------------------------------------------------------------------------------------------------------------")

# 3- Dataset Overview
# Check the shape of the dataset (number of rows and columns)
print(df.shape)

# Display all column names
print(df.columns.tolist())

# Check the data types of each column
print(df.dtypes)

#Use .info() to see a summary 
print(df.info())

# Generate descriptive statistics using .describe()
print(df.describe())
print(df.describe(include='object'))

# How many employees are in this dataset?
print(df.shape[0])

# How many columns do we have?
print(df.shape[1])

# How many numerical columns vs categorical columns?
print(df.select_dtypes(include=[np.number]).shape[1])
print(df.select_dtypes(include=['object']).shape[1])

# Are there any missing values?
print(df.isnull().sum())

print("-------------------------------------------------------------------------------------------------------------------------------------------")

# 4- Unique Values Exploration
# List all unique values in the Department column
print(df['Department'].unique())

# Count how many unique departments exist using .nunique()
print(df['Department'].nunique())

# Display the frequency count of each department using .value_counts()
print(df['Department'].value_counts())

# Display the frequency count for Attrition (Yes/No)
print(df['Attrition'].value_counts())

# Display the frequency count for JobRole
print(df['JobRole'].value_counts())

# Display the frequency count for Gender
print(df['Gender'].value_counts())

# Which department has the most employees?
print(df['Department'].value_counts().idxmax())

# How many employees left the company (Attrition = Yes)?
print(df['Attrition'].value_counts().get('Yes', 0))
print(df[df['Attrition'] == 'Yes'].shape[0])

# What is the gender distribution?
print(df['Gender'].value_counts(normalize=True))

# Which job role is most common?
print(df['JobRole'].value_counts().idxmax())

-------------------------------------------------------------------------------------------------------------------------------------------
   Age Attrition     BusinessTravel  DailyRate              Department  \
0   41       Yes      Travel_Rarely       1102                   Sales   
1   49        No  Travel_Frequently        279  Research & Development   
2   37       Yes      Travel_Rarely       1373  Research & Development   
3   33        No  Travel_Frequently       1392  Research & Development   
4   27        No      Travel_Rarely        591  Research & Development   
5   32        No  Travel_Frequently       1005  Research & Development   
6   59        No      Travel_Rarely       1324  Research & Development   
7   30        No      Travel_Rarely       1358  Research & Development   
8   38        No  Travel_Frequently        216  Research & Development   
9   36        No      Travel_Rarely       1299  Research & Development   

   DistanceFromHome  Education EducationField

In [36]:
# Part 2 - Descriptive Statistics
# 1- Central Tendency - Age Analysis
# Calculate the mean and median age. Find the mode of age
mean_age = df['Age'].mean()
median_age = df['Age'].median()
mode_age = df['Age'].mode()[0]

# Display all three values together
print(df['Age'].agg(['mean', 'median', lambda x: x.mode()[0]]).rename(index={'<lambda>': 'mode'}))

# What is the average age of employees?
print(mean_age)

# Is the mean higher or lower than the median age?
if mean_age > median_age:
    print("Mean age is higher than median age.")
elif mean_age < median_age:
    print("Mean age is lower than median age.")
else:
    print("Mean age is equal to median age.")

print("-------------------------------------------------------------------------------------------------------------------------------------------")

# 2- Central Tendency
# Calculate mean, median for MonthlyIncome, YearsAtCompany and TotalWorkingYears
print(df[['MonthlyIncome', 'YearsAtCompany', 'TotalWorkingYears']].agg(['mean', 'median']))

# Create a summary table showing these statistics
summary_table = df[['MonthlyIncome', 'YearsAtCompany', 'TotalWorkingYears']].agg(['mean', 'median'])
print(summary_table)

# What is the average monthly income?
print("Average Monthly Income:", summary_table.loc['mean', 'MonthlyIncome'])

# What is the median years at company?
print("Median Years at Company:", summary_table.loc['median', 'YearsAtCompany'])

# Compare mean vs median for income - what does the difference tell you?
mean_income = summary_table.loc['mean', 'MonthlyIncome']
median_income = summary_table.loc['median', 'MonthlyIncome']
if mean_income > median_income:
    print("Mean income is higher than median income.")
elif mean_income < median_income:
    print("Mean income is lower than median income.")
else:
    print("Mean income is equal to median income.")

print("-------------------------------------------------------------------------------------------------------------------------------------------")

# 3- Dispersion Measures - Measure how spread out monthly income values are.
# Calculate the minimum and maximum monthly income
min_income = df['MonthlyIncome'].min()
max_income = df['MonthlyIncome'].max()
print("Minimum Monthly Income:", min_income)
print("Maximum Monthly Income:", max_income)

# Calculate the range (max - min)
print("Income Range:", max_income - min_income)

# Calculate the standard deviation
std_income = df['MonthlyIncome'].std()
print("Standard Deviation of Monthly Income:", std_income)

# Calculate the variance
var_income = df['MonthlyIncome'].var()
print("Variance of Monthly Income:", var_income)

# Calculate Q1 (25th percentile), Q2 (50th percentile/median), Q3 (75th percentile)
Q1 = df['MonthlyIncome'].quantile(0.25)
Q2 = df['MonthlyIncome'].quantile(0.50)
Q3 = df['MonthlyIncome'].quantile(0.75)
print("Q1 (25th percentile):", Q1)
print("Q2 (50th percentile/median):", Q2)
print("Q3 (75th percentile):", Q3)

# Calculate the Interquartile Range (IQR = Q3 - Q1)
IQR = Q3 - Q1
print("Interquartile Range (IQR):", IQR)

# What is the income range?
# The income range is the difference between the maximum and minimum monthly income.

# Is there high variability in incomes? (look at std deviation relative to mean)
# If the standard deviation is large relative to the mean, it indicates high variability in incomes.
if std_income / mean_income > 0.5:
    print("There is high variability in incomes.")
else:
    print("There is low variability in incomes.")

# What income value separates the bottom 25% from the rest?
# The income value that separates the bottom 25% from the rest is Q1 (25th percentile).

# What does the IQR tell you about the middle 50% of incomes?
# The IQR indicates the range within which the middle 50% of incomes fall.

print("-------------------------------------------------------------------------------------------------------------------------------------------")

# 4- Comparing Groups - Compare statistics across different departments.
# Group data by Department
grouped_dept = df.groupby('Department')

# Calculate the mean MonthlyIncome for each department
mean_income_dept = grouped_dept['MonthlyIncome'].mean()

# Calculate the mean Age for each department
mean_age_dept = grouped_dept['Age'].mean()

# Calculate the mean YearsAtCompany for each department
mean_years_dept = grouped_dept['YearsAtCompany'].mean()

# Count the number of employees in each department
count_dept = grouped_dept.size()

# Display these results in a clear DataFrame
dept_summary = pd.DataFrame({
    'Mean Monthly Income': mean_income_dept,
    'Mean Age': mean_age_dept,
    'Mean Years at Company': mean_years_dept,
    'Employee Count': count_dept
})
# Which department has the highest average monthly income?
highest_income_dept = dept_summary['Mean Monthly Income'].idxmax()

# Which department has the youngest employees on average?
youngest_age_dept = dept_summary['Mean Age'].idxmin()

# Which department has employees who stay longest at the company?
longest_stay_dept = dept_summary['Mean Years at Company'].idxmax()

print("-------------------------------------------------------------------------------------------------------------------------------------------")

# 5- Comparing Groups - Compare employees who left vs stayed.
# Group data by Attrition (Yes/No)
grouped_attrition = df.groupby('Attrition')

# Calculate mean MonthlyIncome for each group
mean_income_attrition = grouped_attrition['MonthlyIncome'].mean()

# Calculate mean Age for each group
mean_age_attrition = grouped_attrition['Age'].mean()

# Calculate mean YearsAtCompany for each group
mean_years_attrition = grouped_attrition['YearsAtCompany'].mean()

# Calculate mean JobSatisfaction for each group
mean_job_satisfaction_attrition = grouped_attrition['JobSatisfaction'].mean()

# Calculate mean WorkLifeBalance for each group
mean_work_life_balance_attrition = grouped_attrition['WorkLifeBalance'].mean()

# Do employees who left have lower income on average?
if mean_income_attrition['Yes'] < mean_income_attrition['No']:
    print("Employees who left have lower income on average.")
else:
    print("Employees who left do not have lower income on average.")

# Are younger or older employees more likely to leave?
if mean_age_attrition['Yes'] < mean_age_attrition['No']:
    print("Younger employees are more likely to leave.")
else:
    print("Older employees are more likely to leave.")
    
# Is there a difference in job satisfaction between those who left and stayed?
# Employees who left have lower job satisfaction on average.
if mean_job_satisfaction_attrition['Yes'] < mean_job_satisfaction_attrition['No']:
    print("Employees who left have lower job satisfaction on average.")
else:
    print("Employees who left do not have lower job satisfaction on average.")

# What patterns do you notice?
# Employees who left tend to have lower income, are younger, and have lower job satisfaction on average.

mean      36.92381
median    36.00000
mode      35.00000
Name: Age, dtype: float64
36.923809523809524
Mean age is higher than median age.
-------------------------------------------------------------------------------------------------------------------------------------------
        MonthlyIncome  YearsAtCompany  TotalWorkingYears
mean      6502.931293        7.008163          11.279592
median    4919.000000        5.000000          10.000000
        MonthlyIncome  YearsAtCompany  TotalWorkingYears
mean      6502.931293        7.008163          11.279592
median    4919.000000        5.000000          10.000000
Average Monthly Income: 6502.931292517007
Median Years at Company: 5.0
Mean income is higher than median income.
-------------------------------------------------------------------------------------------------------------------------------------------
Minimum Monthly Income: 1009
Maximum Monthly Income: 19999
Income Range: 18990
Standard Deviation of Monthly Income: 4707.95678