In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub
import os

# Download latest version of the dataset
path = kagglehub.dataset_download("ravindrasinghrana/employeedataset")

# Assuming the dataset is a CSV file, find it in the downloaded path
# List files in the dataset path
files = os.listdir(path)
print("Files in the dataset:", files)

# Load the correct CSV file (adjust if necessary based on file name)
csv_file = os.path.join(path, 'EmployeeDataset.csv')  # Update this based on actual file name
df = pd.read_csv(csv_file)

# Display basic info about the dataset
print(df.info())
print(df.head())

# Example columns assumed in dataset
# ['Employee_ID', 'Gender', 'Race/Ethnicity', 'JobDescription', 'BasePay', 'DeploymentType', 'PayZone', 'StartDate']

# Step 1: Data Cleaning (Ensuring data is clean)
df['BasePay'] = pd.to_numeric(df['BasePay'], errors='coerce')
df = df.dropna(subset=['BasePay', 'Gender', 'Race/Ethnicity', 'JobDescription'])

# Step 2: Analyzing Pay Gaps (By Gender and Race/Ethnicity)

# Calculate average pay by gender
gender_pay_gap = df.groupby('Gender')['BasePay'].mean().reset_index()
print("Average Pay by Gender:")
print(gender_pay_gap)

# Calculate average pay by race/ethnicity
race_pay_gap = df.groupby('Race/Ethnicity')['BasePay'].mean().reset_index()
print("\nAverage Pay by Race/Ethnicity:")
print(race_pay_gap)

# Calculate average pay by job description and gender/race
job_gender_pay_gap = df.groupby(['JobDescription', 'Gender'])['BasePay'].mean().unstack().fillna(0)
job_race_pay_gap = df.groupby(['JobDescription', 'Race/Ethnicity'])['BasePay'].mean().unstack().fillna(0)

# Step 3: Visualization of Pay Gaps

# Gender Pay Gap Visualization
plt.figure(figsize=(8, 5))
sns.barplot(x='Gender', y='BasePay', data=gender_pay_gap)
plt.title('Average Pay by Gender')
plt.ylabel('Average Base Pay')
plt.xlabel('Gender')
plt.show()

# Race/Ethnicity Pay Gap Visualization
plt.figure(figsize=(10, 5))
sns.barplot(x='Race/Ethnicity', y='BasePay', data=race_pay_gap)
plt.title('Average Pay by Race/Ethnicity')
plt.ylabel('Average Base Pay')
plt.xlabel('Race/Ethnicity')
plt.xticks(rotation=45)
plt.show()

# Step 4: Identify Underpaid Employees Based on Job Category

# Calculate the median pay per job description to identify outliers
job_median_pay = df.groupby('JobDescription')['BasePay'].median().reset_index()
job_median_pay.columns = ['JobDescription', 'MedianPay']

# Merge median pay with the original dataframe
df = pd.merge(df, job_median_pay, on='JobDescription')

# Consider underpaid employees as those earning less than 80% of the median pay for their role
underpaid_employees = df[df['BasePay'] < 0.8 * df['MedianPay']]

# List of underpaid employees
underpaid_list = underpaid_employees[['Employee_ID', 'JobDescription', 'Gender', 'Race/Ethnicity', 'BasePay', 'MedianPay']]
print("\nList of Underpaid Employees:")
print(underpaid_list)

# Step 5: Optional Visualization of Underpaid Employees
plt.figure(figsize=(12, 6))
sns.scatterplot(data=underpaid_employees, x='JobDescription', y='BasePay', hue='Gender', size='BasePay', sizes=(40, 400))
plt.xticks(rotation=90)
plt.title('Underpaid Employees by Job Description and Gender')
plt.ylabel('Base Pay')
plt.show()

# Saving the underpaid employees list to a CSV (optional)
underpaid_list.to_csv('underpaid_employees.csv', index=False)



Files in the dataset: ['employee_data.csv', 'employee_engagement_survey_data.csv', 'recruitment_data.csv', 'training_and_development_data.csv']


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\prian\\.cache\\kagglehub\\datasets\\ravindrasinghrana\\employeedataset\\versions\\2\\EmployeeDataset.csv'

In [9]:
pip install kagglehub


Note: you may need to restart the kernel to use updated packages.


