# Attrition analysis using LinkedIn ODP data

In [1]:
import pandas as pd
import plotly.graph_objects as go
import numpy as np

# Load the two semicolon-separated CSV extracts:
#   data           <- 001_INCREDIBUILD_ALL_PROFILES.csv
#   data_companies <- 001_ALL_PROFILES.csv
data = pd.read_csv('../../../data/Incredibuild/HRIS/001_INCREDIBUILD_ALL_PROFILES.csv', sep=';')
data_companies = pd.read_csv('../../../data/Incredibuild/HRIS/001_ALL_PROFILES.csv', sep=';')


In [41]:
# Data Preprocessing
# Parse the day/month/year strings into datetimes. 'start_date' is parsed
# strictly (a malformed value raises); 'end_date' uses errors='coerce', so a
# missing or unparseable value becomes NaT.
data['start_date'] = pd.to_datetime(data['start_date'], format='%d/%m/%Y')
data['end_date'] = pd.to_datetime(data['end_date'], format='%d/%m/%Y', errors='coerce')

# Replace missing end dates (ongoing employment) with a placeholder date.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a column selection is chained assignment, which recent pandas versions
# warn about and which does not update the DataFrame under Copy-on-Write.
# NOTE(review): the placeholder must stay later than every start date in the
# data; otherwise ongoing employees would be counted as leaving in 2025.
data['end_date'] = data['end_date'].fillna(pd.Timestamp('2025-01-01'))

# Display the updated dataframe
data[['profile_id', 'start_date', 'end_date']].head()


Unnamed: 0,profile_id,start_date,end_date
0,cf706fc6-0ad7-44a2-b075-46436902b541,2022-03-01,2025-01-01
1,714ec618-c843-4cc0-a3b6-bb3eba82b8df,2020-09-01,2022-02-01
2,14cf16f1-caf6-4c0d-b26f-ebb4879aab3f,2017-11-01,2021-01-01
3,d3aa4910-3740-42b2-9740-090bdff10c51,2023-01-01,2025-01-01
4,5558f1f2-91e6-4527-9f9b-a1bca25e429b,2015-09-01,2017-08-01


In [44]:
# Derive the calendar year plus monthly ('M') and quarterly ('Q') periods
# for both the start date and the end date of every profile.
for prefix, source_column in (('Start', 'start_date'), ('Termination', 'end_date')):
    date_accessor = data[source_column].dt
    data[f'{prefix} Year'] = date_accessor.year
    data[f'{prefix} Month'] = date_accessor.to_period('M')
    data[f'{prefix} Quarter'] = date_accessor.to_period('Q')

# Function to compute headcount and attrition rates
def compute_headcount_and_attrition(data, period):
    """Compute headcount and attrition rate for each period in which someone started.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``f'Start {period}'`` and
        ``f'Termination {period}'``.
    period : str
        Column-name suffix selecting the granularity; the notebook passes
        'Month', 'Quarter' or 'Year'.

    Returns
    -------
    tuple of (dict, dict)
        ``headcount`` maps each distinct, non-missing start period to the
        number of rows that started on or before it and whose termination
        period is missing or not earlier than it. ``attrition_rates`` maps the
        same keys to (rows terminating in that period) / headcount, or 0 when
        the headcount is 0.

    Notes
    -----
    Only periods that appear as a start period are evaluated; a period in
    which nobody started is absent from both dictionaries.
    """
    starts = data[f'Start {period}']
    ends = data[f'Termination {period}']

    # Missing start periods (NaN / NaT) are dropped: they cannot be ordered
    # reliably by sorted() and would only add a bogus period with headcount 0.
    time_periods = sorted(starts.dropna().unique())
    headcount = {}
    attrition_rates = {}

    for tp in time_periods:
        # Active in `tp`: started on or before it and not yet gone before it.
        active = (starts <= tp) & (ends.isna() | (ends >= tp))
        headcount[tp] = int(active.sum())
        terminations = int((active & (ends == tp)).sum())
        attrition_rates[tp] = terminations / headcount[tp] if headcount[tp] > 0 else 0

    return headcount, attrition_rates

# Run the headcount/attrition computation once per granularity, in the order
# month, quarter, year, and unpack each (headcount, attrition) pair.
(
    (monthly_headcount, monthly_attrition),
    (quarterly_headcount, quarterly_attrition),
    (yearly_headcount, yearly_attrition),
) = [
    compute_headcount_and_attrition(data, granularity)
    for granularity in ('Month', 'Quarter', 'Year')
]

# Function to plot headcount and attrition rates
def plot_data(headcount, attrition_rates, title, period):
    """Show headcount and attrition rate per period in one Plotly figure.

    Headcount is drawn as bars on the left y-axis and the attrition rate as a
    line on a separate right y-axis. The two metrics have very different
    magnitudes (a row count versus a 0-1 fraction), so plotting them against a
    single shared axis makes the rate unreadable.

    Parameters
    ----------
    headcount : dict
        Maps a period key to a headcount.
    attrition_rates : dict
        Maps a period key to an attrition rate expressed as a fraction.
    title : str
        Figure title.
    period : str
        Granularity label used in the x-axis title.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Convert Period objects to strings for Plotly compatibility
    headcount_keys = [str(key) for key in headcount.keys()]
    attrition_rate_keys = [str(key) for key in attrition_rates.keys()]

    fig = go.Figure()
    fig.add_trace(go.Bar(x=headcount_keys, y=list(headcount.values()), name='Headcount', marker_color='blue'))
    # yaxis='y2' binds this trace to the secondary axis configured below.
    fig.add_trace(go.Scatter(
        x=attrition_rate_keys,
        y=list(attrition_rates.values()),
        name='Attrition Rate',
        mode='lines+markers',
        line=dict(color='red'),
        marker=dict(color='red'),
        yaxis='y2',
    ))

    fig.update_layout(
        title=title,
        xaxis_title=f'{period} Period',
        yaxis_title='Headcount',
        # Secondary axis: overlaid on the primary one, drawn on the right,
        # with fractions rendered as percentages.
        yaxis2=dict(
            title='Attrition Rate',
            overlaying='y',
            side='right',
            tickformat='.0%',
            rangemode='tozero',
            showgrid=False,
        ),
        template='plotly_white',
    )
    fig.show()

# Draw one figure per granularity: (headcount, attrition, title, axis label).
plot_specs = [
    (monthly_headcount, monthly_attrition, 'Monthly Headcount and Attrition Rates', 'Month'),
    (quarterly_headcount, quarterly_attrition, 'Quarterly Headcount and Attrition Rates', 'Quarter'),
    (yearly_headcount, yearly_attrition, 'Yearly Headcount and Attrition Rates', 'Year'),
]
for plot_spec in plot_specs:
    plot_data(*plot_spec)


In [51]:
!python3 -m pip install xlsxwriter

# Convert the dictionaries to DataFrames (index = period, one column per metric)
monthly_data = pd.DataFrame({'Headcount': monthly_headcount, 'Attrition Rate': monthly_attrition})
quarterly_data = pd.DataFrame({'Headcount': quarterly_headcount, 'Attrition Rate': quarterly_attrition})
yearly_data = pd.DataFrame({'Headcount': yearly_headcount, 'Attrition Rate': yearly_attrition})

# Convert PeriodIndex to string for Excel compatibility
monthly_data.index = monthly_data.index.astype(str)
quarterly_data.index = quarterly_data.index.astype(str)
yearly_data.index = yearly_data.index.astype(str)

# Write each DataFrame to its own worksheet. The context manager saves and
# closes the workbook on exit, including when one of the writes raises.
excel_file_path = '../../../data/Incredibuild/HRIS/attrition_and_headcount_data.xlsx'
with pd.ExcelWriter(excel_file_path, engine='xlsxwriter') as writer:
    monthly_data.to_excel(writer, sheet_name='Monthly Data')
    quarterly_data.to_excel(writer, sheet_name='Quarterly Data')
    yearly_data.to_excel(writer, sheet_name='Yearly Data')

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://ariel_cohen_codar%40mckinsey.com:****@mckinsey.jfrog.io/artifactory/api/pypi/python/simple
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.


# For all companies

In [7]:
# Data Preprocessing
# Parse the day/month/year strings into datetimes. Both columns use
# errors='coerce', so a missing or unparseable value becomes NaT.
data_companies['start_date'] = pd.to_datetime(data_companies['start_date'], format='%d/%m/%Y', errors='coerce')
data_companies['end_date'] = pd.to_datetime(data_companies['end_date'], format='%d/%m/%Y', errors='coerce')

# Replace missing end dates (ongoing employment) with a placeholder date.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a column selection is chained assignment, which recent pandas versions
# warn about and which does not update the DataFrame under Copy-on-Write.
# NOTE(review): the placeholder must stay later than every start date in the
# data; otherwise ongoing employees would be counted as leaving in 2025.
data_companies['end_date'] = data_companies['end_date'].fillna(pd.Timestamp('2025-01-01'))

# Create a Pandas Excel writer; it stays open until it is closed after the
# per-company loop further down in this cell.
excel_file_path = '../../../data/Incredibuild/HRIS/attrition_and_headcount_data_all_companies.xlsx'
writer = pd.ExcelWriter(excel_file_path, engine='xlsxwriter')

# Extracting year, month, and quarter from dates
data_companies['Start Year'] = data_companies['start_date'].dt.year
data_companies['Start Month'] = data_companies['start_date'].dt.to_period('M')
data_companies['Start Quarter'] = data_companies['start_date'].dt.to_period('Q')
data_companies['Termination Year'] = data_companies['end_date'].dt.year
data_companies['Termination Month'] = data_companies['end_date'].dt.to_period('M')
data_companies['Termination Quarter'] = data_companies['end_date'].dt.to_period('Q')

# Function to compute headcount and attrition rates
def compute_headcount_and_attrition(data, period):
    """Compute headcount and attrition rate for each period in which someone started.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``f'Start {period}'`` and
        ``f'Termination {period}'``.
    period : str
        Column-name suffix selecting the granularity; the notebook passes
        'Month', 'Quarter' or 'Year'.

    Returns
    -------
    tuple of (dict, dict)
        ``headcount`` maps each distinct, non-missing start period to the
        number of rows that started on or before it and whose termination
        period is missing or not earlier than it. ``attrition_rates`` maps the
        same keys to (rows terminating in that period) / headcount, or 0 when
        the headcount is 0.

    Notes
    -----
    Only periods that appear as a start period are evaluated; a period in
    which nobody started is absent from both dictionaries.
    """
    starts = data[f'Start {period}']
    ends = data[f'Termination {period}']

    # Missing start periods (NaN / NaT) are dropped: they cannot be ordered
    # reliably by sorted() and would only add a bogus period with headcount 0.
    time_periods = sorted(starts.dropna().unique())
    headcount = {}
    attrition_rates = {}

    for tp in time_periods:
        # Active in `tp`: started on or before it and not yet gone before it.
        active = (starts <= tp) & (ends.isna() | (ends >= tp))
        headcount[tp] = int(active.sum())
        terminations = int((active & (ends == tp)).sum())
        attrition_rates[tp] = terminations / headcount[tp] if headcount[tp] > 0 else 0

    return headcount, attrition_rates

# Iterate over each company and write its yearly headcount/attrition table to
# its own worksheet. The writer is closed in `finally` so the workbook is
# released even if one company fails part-way through.
try:
    # Rows without a company value are skipped: they can never match the
    # equality filter below and would only produce an empty worksheet.
    for company in data_companies['company'].dropna().unique():
        company_data = data_companies[data_companies['company'] == company]

        # Only the yearly figures are exported, so only they are computed
        # (monthly/quarterly results were previously calculated and discarded).
        yearly_headcount, yearly_attrition = compute_headcount_and_attrition(company_data, 'Year')
        yearly_df = pd.DataFrame({'Headcount': yearly_headcount, 'Attrition Rate': yearly_attrition})

        # Excel worksheet names are limited to 31 characters; clamp the
        # company part so that the ' Yearly' suffix always fits. Names that
        # were already short enough are unchanged.
        sheet_name = f'{str(company)[:24]} Yearly'
        yearly_df.to_excel(writer, sheet_name=sheet_name)
finally:
    # Save and close the writer
    writer.close()

# Benchmark analysis

In [10]:
import pandas as pd
import plotly.graph_objects as go

# Load the Excel file
file_path = '../../../data/Incredibuild/HRIS/attrition_and_headcount_data_all_companies.xlsx'

# Read every worksheet once; sheet_name=None returns a dict of
# {sheet name: DataFrame} in workbook order.
all_sheets = pd.read_excel(file_path, sheet_name=None)
sheet_names = list(all_sheets.keys())
# NOTE(review): the first sheet is treated as Incredibuild and every other
# sheet as a peer company; this relies on the order in which the sheets were
# written - confirm against the export cell.
sheets = sheet_names[1:]  # Exclude the first sheet

# Collect one per-year frame per company and concatenate once, instead of
# growing a DataFrame with pd.concat inside the loop (quadratic copying, and
# concatenating onto an empty frame is deprecated behaviour in pandas).
company_frames = []
for sheet in sheets:
    df = all_sheets[sheet]
    df.columns = ['Year', 'Headcount', 'Attrition Rate']  # Assign column names
    # Aggregate data by year
    company_frames.append(df.groupby('Year')['Attrition Rate'].mean().reset_index())
all_companies_data = pd.concat(company_frames, ignore_index=True)

# Calculate average attrition rate per year for all companies
average_attrition = all_companies_data.groupby('Year')['Attrition Rate'].mean().reset_index()

# Incredibuild data (first sheet), taken from the sheets already loaded above
# rather than reading the workbook from disk a second time.
incredibuild_data = all_sheets[sheet_names[0]]
incredibuild_data.columns = ['Year', 'Headcount', 'Attrition Rate']  # Assign column names
# Aggregate data by year
incredibuild_yearly = incredibuild_data.groupby('Year')['Attrition Rate'].mean().reset_index()

# Plotting using Plotly
fig = go.Figure()
# Add bar for Incredibuild
fig.add_trace(go.Bar(x=incredibuild_yearly['Year'], y=incredibuild_yearly['Attrition Rate'],
                     name='Incredibuild', marker_color='blue'))
# Add bar for average of all companies
fig.add_trace(go.Bar(x=average_attrition['Year'], y=average_attrition['Attrition Rate'],
                     name='All Companies Average', marker_color='orange'))

# Update layout. The rates are stored as fractions, so the ticks are formatted
# as percentages to agree with the "(%)" in the axis title.
fig.update_layout(title='Attrition Rate Comparison: Incredibuild vs All Companies Average',
                  xaxis_title='Year', yaxis_title='Attrition Rate (%)',
                  yaxis_tickformat='.1%',
                  barmode='group')

# Show the plot
fig.show()


In [12]:
# Join the Incredibuild yearly rates with the all-company average on 'Year'
# (default inner join, so only years present in both frames are kept).
merged_data = pd.merge(
    incredibuild_yearly,
    average_attrition,
    on='Year',
    suffixes=('_Incredibuild', '_Average'),
)

# Give the two rate columns readable names
comparison_columns = ['Year', 'Attrition Rate Incredibuild', 'Attrition Rate Average']
merged_data.columns = comparison_columns

# Destination of the comparison table (an Excel workbook)
output_file_path = '../../../data/Incredibuild/HRIS/incredibuild_vs_average_attrition.xlsx'

# Write the merged data to the Excel file without the row index
merged_data.to_excel(output_file_path, index=False)

print(f"File saved as {output_file_path}")

File saved as ../../../data/Incredibuild/HRIS/incredibuild_vs_average_attrition.xlsx
