# Import libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

# Load and describe data

In [4]:
# Load the data
# The workbook path and the sheet to read are kept in named variables so they
# are easy to change in one place.
file_path = '../../../data/2023-11-27HiBob Export_Share.xlsx'  # Replace with your file path
sheet_name = '2023-11-27HiBob Export'
df = pd.read_excel(io=file_path, sheet_name=sheet_name)

In [5]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,Employee ID,Office,Employment Status,Manager's ID,Division,Function,Department,Team,Job title,Original start date,Termination date,Date of Change,Salary OTE (USD),Dec 2023 Process,Future Process
0,1,IL,Full time,157,GandA,Finance,,,Finance Operations,2007-08-26,NaT,2023-05-01 08:35:00,77260.0,,
1,2,IL,Full time,345,RandD,R&D,,,Principal Architect,2011-08-21,NaT,2023-11-12 07:33:00,216986.0,,
2,3,IL,Full time,,,,,,,2012-07-01,2022-10-27,2023-03-13 16:52:00,213699.0,,
3,4,USA,Full time,235,Revenue,CS,CS,US,Principal CSM,2012-07-01,NaT,2023-07-31 05:46:00,250000.0,,
4,5,IL,Full time,A370,RandD,R&D,Linux & Core Acceleration,,Senior Linux Developer,2013-11-03,NaT,2023-11-12 07:30:00,131507.0,,


In [6]:
# Number of missing values in each column
nan_distribution = df.isna().sum()
nan_distribution

Employee ID              0
Office                   0
Employment Status        0
Manager's ID           131
Division               127
Function               127
Department             204
Team                   220
Job title              127
Original start date      0
Termination date       211
Date of Change           0
Salary OTE (USD)         1
Dec 2023 Process       328
Future Process         371
dtype: int64

# Cleaning

In [7]:
# Data Cleaning
# Parse both date columns and derive a calendar-year column from each.
for date_column, year_column in (
    ('Original start date', 'Year of Start'),
    ('Termination date', 'Year of Termination'),
):
    df[date_column] = pd.to_datetime(df[date_column])
    df[year_column] = df[date_column].dt.year

# Give missing organisational labels an explicit 'Missing' bucket so they can
# be matched by equality later on.
for label_column in ('Function', 'Division', 'Department'):
    df[label_column] = df[label_column].fillna('Missing')

In [8]:
# Preview the first five rows of the frame at this point in the notebook
df.head(n=5)

Unnamed: 0,Employee ID,Office,Employment Status,Manager's ID,Division,Function,Department,Team,Job title,Original start date,Termination date,Date of Change,Salary OTE (USD),Dec 2023 Process,Future Process,Year of Start,Year of Termination
0,1,IL,Full time,157,GandA,Finance,Missing,,Finance Operations,2007-08-26,NaT,2023-05-01 08:35:00,77260.0,,,2007,
1,2,IL,Full time,345,RandD,R&D,Missing,,Principal Architect,2011-08-21,NaT,2023-11-12 07:33:00,216986.0,,,2011,
2,3,IL,Full time,,Missing,Missing,Missing,,,2012-07-01,2022-10-27,2023-03-13 16:52:00,213699.0,,,2012,2022.0
3,4,USA,Full time,235,Revenue,CS,CS,US,Principal CSM,2012-07-01,NaT,2023-07-31 05:46:00,250000.0,,,2012,
4,5,IL,Full time,A370,RandD,R&D,Linux & Core Acceleration,,Senior Linux Developer,2013-11-03,NaT,2023-11-12 07:30:00,131507.0,,,2013,


# Headcount and Attrition analysis

In [9]:
# Headcount Analysis
# An employee counts towards a year when they started in or before that year
# and either have no termination year or were terminated in that year or later.
start_years = df['Year of Start']
end_years = df['Year of Termination']
years = range(start_years.min(), start_years.max() + 1)
headcount_over_years = {
    year: int(((start_years <= year) & (end_years.isna() | (end_years >= year))).sum())
    for year in years
}

# Plotting the Headcount Over Years with Plotly
headcount_bars = go.Bar(
    x=list(headcount_over_years),
    y=list(headcount_over_years.values()),
    marker_color='blue',
)
fig_headcount = go.Figure(headcount_bars)
fig_headcount.update_layout(
    template='plotly_white',
    title='Headcount Over Years',
    xaxis_title='Year',
    yaxis_title='Number of Employees',
)
fig_headcount.show()

In [10]:
# Attrition Analysis
# Rate for a year = terminations dated in that year / headcount recorded for
# that year in `headcount_over_years` (everyone counted as active in the year).
# Years with no headcount get a rate of 0 instead of dividing by zero.
termination_years = df['Year of Termination']
attrition_rates = {}
for year in years:
    yearly_headcount = headcount_over_years.get(year, 0)
    yearly_terminations = int((termination_years == year).sum())
    attrition_rates[year] = (
        yearly_terminations / yearly_headcount if yearly_headcount > 0 else 0
    )

# Plotting the Attrition Rates Over Years with Plotly
attrition_bars = go.Bar(
    x=list(attrition_rates),
    y=list(attrition_rates.values()),
    marker_color='red',
)
fig_attrition = go.Figure(attrition_bars)
fig_attrition.update_layout(
    template='plotly_white',
    title='Attrition Rates Over Years',
    xaxis_title='Year',
    yaxis_title='Attrition Rate',
)
fig_attrition.show()

# Salary Distribution

In [11]:
# Salary Analysis using Plotly
# Histogram of the 'Salary OTE (USD)' column.
fig_salary = px.histogram(
    data_frame=df,
    x='Salary OTE (USD)',
    nbins=30,  # Adjust the number of bins as needed
    title='Salary Distribution',
)
salary_layout = dict(
    xaxis_title='Salary OTE (USD)',
    yaxis_title='Frequency',
    template='plotly_white',
)
fig_salary.update_layout(**salary_layout)
fig_salary.show()

# Function Distribution analysis per year

In [13]:
# Function to create distribution data
def create_distribution_data(category, percentage_mode=False, data=None):
    """Build a per-year headcount table broken down by the values of ``category``.

    An employee is counted for a year when their 'Year of Start' is less than
    or equal to that year and their 'Year of Termination' is either missing or
    greater than or equal to that year. The years covered run from the
    smallest to the largest 'Year of Start' in the data, inclusive.

    Args:
        category: Name of the column whose distinct values become the columns
            of the result (e.g. 'Function', 'Division', 'Department').
        percentage_mode: When False (default) the cells hold employee counts.
            When True the 'Missing' bucket is left out and each cell is the
            category's share (0-100) of the remaining counted employees for
            that year; a year with no such employees yields 0.
        data: Optional DataFrame to analyse. It must contain ``category``,
            'Year of Start' and 'Year of Termination'. Defaults to the
            notebook-level ``df``.

    Returns:
        A DataFrame indexed by year with one column per distinct non-null
        value of ``category``, in order of first appearance.
    """
    frame = df if data is None else data
    start_years = frame['Year of Start']
    end_years = frame['Year of Termination']
    labels = frame[category]

    years = range(start_years.min(), start_years.max() + 1)

    # NaN never compares equal to anything, so it can never be matched as a
    # category; dropping it avoids emitting an all-zero column named ``nan``.
    items = [
        item for item in labels.dropna().unique()
        if not (percentage_mode and item == 'Missing')
    ]

    columns = {item: [] for item in items}
    for year in years:
        # The "active in this year" filter is identical for every category,
        # so it is evaluated once per year and tallied in a single pass.
        active = (start_years <= year) & (end_years.isna() | (end_years >= year))
        tally = labels[active].value_counts().to_dict()  # NaN labels are excluded
        if percentage_mode:
            tally.pop('Missing', None)
        total_count = sum(tally.values())

        for item in items:
            category_count = tally.get(item, 0)
            if percentage_mode and total_count > 0:
                columns[item].append((category_count / total_count) * 100)
            else:
                columns[item].append(category_count)

    return pd.DataFrame(columns, index=years)

# Function to create Plotly figure
def create_plotly_stacked_bar(distribution_df, title, percentage_mode=False):
    """Display a stacked bar chart with one trace per column of ``distribution_df``.

    Args:
        distribution_df: DataFrame whose index supplies the x values and whose
            columns each become one stacked bar series.
        title: Text used for both the chart title and the legend title.
        percentage_mode: Selects the y-axis label only: 'Percentage of Total
            Headcount' when True, 'Number of Employees' otherwise.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    if percentage_mode:
        y_axis_title = 'Percentage of Total Headcount'
    else:
        y_axis_title = 'Number of Employees'

    traces = [
        go.Bar(x=distribution_df.index, y=distribution_df[label], name=label)
        for label in distribution_df.columns
    ]
    fig = go.Figure(data=traces)

    fig.update_layout(
        title=title,
        barmode='stack',
        xaxis_title='Year',
        yaxis_title=y_axis_title,
        legend_title=title,
    )

    fig.show()

# Build the per-year distribution tables. Pass percentage_mode=True to
# create_distribution_data to get shares instead of raw counts.
function_distribution_df_percentage = create_distribution_data('Function', percentage_mode=True)
function_distribution_df = create_distribution_data('Function', percentage_mode=False)
division_distribution_df = create_distribution_data('Division', percentage_mode=False)
department_distribution_df = create_distribution_data('Department', percentage_mode=False)

# Render one stacked bar chart per table, in the order the tables were built.
for table, chart_title, as_percentage in (
    (function_distribution_df_percentage, 'Percentage of headcount per Function Distribution', True),
    (function_distribution_df, 'Headcount per Function Distribution', False),
    (division_distribution_df, 'Headcount per Division Distribution', False),
    (department_distribution_df, 'Headcount per Department Distribution', False),
):
    create_plotly_stacked_bar(table, chart_title, percentage_mode=as_percentage)