# HR Onboarding Data Analysis – Python Notebook
This notebook performs data cleaning and exploratory analysis on the HR onboarding dataset.

In [None]:
# 1. Load Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# 2. Load Dataset
# Read the onboarding workbook into the DataFrame `df`; the relative path is
# resolved against the current working directory.
df = pd.read_excel(io='realistic_hr_onboarding_data.xlsx')

In [None]:
# 3. Preview Data
# Show the leading rows of the frame; n=5 spells out pandas' default row count.
df.head(n=5)

In [None]:
# 4. Data Info & Summary
# df.info() prints a structural overview of the frame (per the pandas docs:
# column names, dtypes, non-null counts and memory usage).
df.info()
# Summary statistics for the frame. The result is neither printed nor stored:
# it is a bare expression, so it is only displayed when this is the last
# statement of a notebook cell.
df.describe()

In [None]:
# 5. Check for Missing Values and Duplicates
# Count null cells per column and fully duplicated rows. Both are only
# reported here; nothing is dropped or filled.
missing_per_column = df.isnull().sum()
duplicate_row_count = df.duplicated().sum()

print("Missing values:\n", missing_per_column)
print("Duplicates:", duplicate_row_count)

In [None]:
# 6. Clean Column Names (if needed)
# Trim leading/trailing whitespace from every header, then swap each remaining
# space for an underscore so that later cells can address columns by names
# such as 'Processing_Time_(min)'.
df.columns = (
    df.columns
    .str.strip()
    .str.replace(' ', '_', regex=False)  # plain substring swap, no regex
)

In [None]:
# 7. Visualize Processing Time Distribution
# Histogram (30 bins) of the 'Processing_Time_(min)' column with a KDE curve
# overlaid, drawn on an explicitly created 10x5 inch figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['Processing_Time_(min)'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Processing Time')
ax.set_xlabel('Time (minutes)')
ax.set_ylabel('Frequency')  # replaces the y-label seaborn sets by default
plt.show()

In [None]:
# 8. Step Completion Rates
# Status columns, one per onboarding step (also reused by the delay heatmap).
step_cols = [
    'Offer_Letter_Status',
    'Documents_Status',
    'References_Status',
    'Policies_Status',
    'Vetting_Status',
    'Contract_Status',
    'Folder_Status',
    'Final_Email_Status',
]

# For each step, print the share of rows holding each distinct status value.
# normalize=True yields fractions rather than raw counts; note that this lists
# every status present in the column, not only a single "completed" figure.
for col in step_cols:
    status_shares = df[col].value_counts(normalize=True)
    print(f"\n{col} Completion Rate:", status_shares, sep="\n")

In [None]:
# 9. Average Processing Time by Office
# Mean of 'Processing_Time_(min)' within each 'Office' group, ordered from the
# smallest mean to the largest.
avg_time_office = (
    df.groupby(by='Office')['Processing_Time_(min)']
    .mean()
    .sort_values(ascending=True)
)
print("Average Processing Time by Office:\n", avg_time_office)

In [None]:
# 10. Correlation Between Delayed Steps (Heatmap)
# Encode every step column as 1 where its status equals 'Delayed' and 0
# otherwise, then plot the pairwise correlation of those indicator columns.
delay_df = df[step_cols].eq('Delayed').astype(int)
delay_corr = delay_df.corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(delay_corr, annot=True, cmap='Reds', ax=ax)
ax.set_title('Correlation Between Delays in Onboarding Steps')
plt.show()

In [None]:
# 11. Export Cleaned Data (Optional)
# Write the frame (with its normalised column names) to CSV in the working
# directory; index=False leaves the row index out of the file.
df.to_csv(path_or_buf='cleaned_hr_onboarding_data.csv', index=False)