# Attrition using LinkedIn ODP data

In [9]:
import pandas as pd
import plotly.graph_objects as go
import numpy as np

# Read both semicolon-separated profile exports into DataFrames:
# `data` from the Incredibuild file, `data_companies` from the all-profiles file.
data = pd.read_csv('../../../data/Incredibuild/HRIS/001_INCREDIBUILD_ALL_PROFILES.csv', sep=';')
data_companies = pd.read_csv('../../../data/Incredibuild/HRIS/001_ALL_PROFILES.csv', sep=';')


In [10]:
data_companies.head()

Unnamed: 0,profile_id,sequence_nbr,start_date,end_date,company,title,country,region,location,is_title_d_and_a,tk_title_standardized_original_language,tk_title_original_language,tk_title_standardized_english,tk_title_standardized_group,tk_title_standardized_class,is_title_manager,odp_function,mck_function
0,d4333688-c76f-4ccf-822c-1e926295cc81,2,01/01/2011,,Global-e Ltd.,Supervisor,Argentina,,,False,Spanish,Consultor (Otros),Consultant (other),Other Professions,Other,False,Digital & Analytics,Analytics
1,0f8cba01-fdd6-4eb9-b1d4-376a73fb3de4,10,09/01/2021,,Deel,Marketing Director LATAM,Argentina,Buenos Aires,,False,,,,,,True,Marketing & Sales,Marketing
2,13697700-11c0-4567-ab56-03623236db1d,6,06/01/2022,,Deel,FinTech Customer Operations Specialist,Argentina,,,False,,,,,,False,Operations,Operations
3,d4333688-c76f-4ccf-822c-1e926295cc81,3,03/01/2012,,Global-e Ltd.,Encargado de turnos,Argentina,,,False,Spanish,Gerente de sucursal,Branch Manager,Sales Managers,Sales and Trading,False,Operations,Operations
4,897a8fa5-4fa1-41f8-bef4-bba5c7a69f2d,4,11/01/2014,,Global-e Ltd.,Analista en RRHH,Argentina,,,False,Spanish,Analista (otros),Analyst (other),Management Analysts and Consultants,"Management, Policy and Governance",False,Marketing & Sales,Communications


In [11]:
# Data Preprocessing

# Parse 'start_date' and 'end_date' as day/month/year strings.
# 'start_date' must parse completely (no errors='coerce'); unparseable or
# missing 'end_date' values become NaT.
# NOTE(review): the sample rows of the companion all-profiles file shown in this
# notebook (09/01/2021, 06/01/2022, 11/01/2014, ...) all carry "01" in the second
# position, which looks like month/day/year - confirm the export format. Only the
# year component is used below, and it is the same under either reading.
data['start_date'] = pd.to_datetime(data['start_date'], format='%d/%m/%Y')
data['end_date'] = pd.to_datetime(data['end_date'], format='%d/%m/%Y', errors='coerce')

# A missing end date is treated as ongoing employment and replaced by a placeholder.
# The result is assigned back to the column: calling fillna(inplace=True) on
# data['end_date'] is chained assignment, which pandas deprecates and which does
# not update `data` when Copy-on-Write is enabled.
# NOTE(review): the placeholder yields Termination Year 2025, so ongoing rows are
# counted as terminations if 2025 is ever one of the analysed years - confirm.
data['end_date'] = data['end_date'].fillna(pd.Timestamp('2025-01-01'))

# Calendar year of the start and of the (possibly placeholder) end of each row
data['Start Year'] = data['start_date'].dt.year
data['Termination Year'] = data['end_date'].dt.year

# Function to compute attrition rate
def compute_attrition(data, function):
    """Compute the yearly attrition rate for one ODP function.

    For every year in which at least one row of the function starts, the rate
    is ``rows terminated in that year / rows active in that year``. A row is
    active in a year when it started in or before that year and its
    termination year is that year or later.

    Args:
        data: DataFrame with 'odp_function', 'Start Year' and
            'Termination Year' columns.
        function: Value of 'odp_function' to select.

    Returns:
        dict mapping each start year (ascending) to its attrition rate;
        the rate is 0 when no row is active in that year.
    """
    function_data = data[data['odp_function'] == function]
    start_year = function_data['Start Year']
    term_year = function_data['Termination Year']
    attrition_rates = {}

    # Missing start years are dropped: NaN cannot be ordered reliably by
    # sorted() and would only add a meaningless NaN entry.
    for year in sorted(start_year.dropna().unique()):
        # Rows employed at some point during the year. Rows that ended in an
        # earlier year are excluded so they no longer inflate the denominator.
        active = (start_year <= year) & (term_year >= year)
        active_count = int(active.sum())
        # Active rows whose termination falls in this year
        term_count = int((active & (term_year == year)).sum())
        attrition_rates[year] = term_count / active_count if active_count > 0 else 0

    return attrition_rates

# Build {function: {year: rate}} for every distinct ODP function in `data`
functions = data['odp_function'].unique()
function_attrition = {}
for function in functions:
    function_attrition[function] = compute_attrition(data, function)

# Write one worksheet per function to a single workbook
with pd.ExcelWriter('attrition_rates_by_function.xlsx') as writer:
    for function, rates in function_attrition.items():
        # A missing function value gets a fixed label; anything else is stringified
        if pd.notnull(function):
            function_str = str(function)
        else:
            function_str = 'Unknown Function'
        # Excel limits sheet names to 31 characters
        function_str = function_str[:31]

        attrition_df = pd.DataFrame({
            'Year': list(rates.keys()),
            'Attrition Rate': list(rates.values()),
        })
        attrition_df.to_excel(writer, sheet_name=function_str, index=False)


# Single figure holding one line-and-marker series per function
fig = go.Figure()

for function, attrition in function_attrition.items():
    # x = years, y = attrition rates, in the order stored in the dict
    series = go.Scatter(
        x=[*attrition.keys()],
        y=[*attrition.values()],
        mode='lines+markers',
        name=function,
    )
    fig.add_trace(series)

# Titles for the figure, both axes and the legend
layout_options = {
    'title': 'Incredibuild Attrition Rate per Year per Function',
    'xaxis_title': 'Year',
    'yaxis_title': 'Attrition Rate',
    'legend_title': 'Function',
}
fig.update_layout(**layout_options)

# Render the figure
fig.show()


In [12]:
# Data Preprocessing

# Parse 'start_date' and 'end_date' into datetimes. No explicit format is passed,
# so pandas infers it; dayfirst=True makes ambiguous dates read as day/month.
# Values that cannot be parsed become NaT (errors='coerce').
data_companies['start_date'] = pd.to_datetime(data_companies['start_date'], errors='coerce', dayfirst=True)
data_companies['end_date'] = pd.to_datetime(data_companies['end_date'], errors='coerce', dayfirst=True)

# Replace missing end dates with a placeholder date.
# The result is assigned back to the column: calling fillna(inplace=True) on
# data_companies['end_date'] is chained assignment, which pandas deprecates and
# which does not update the frame when Copy-on-Write is enabled.
data_companies['end_date'] = data_companies['end_date'].fillna(pd.Timestamp('2025-01-01'))

# Calendar years of start and end. 'Start Year' holds NaN wherever 'start_date'
# could not be parsed, since only 'end_date' is filled above.
data_companies['Start Year'] = data_companies['start_date'].dt.year
data_companies['Termination Year'] = data_companies['end_date'].dt.year

# Function to compute attrition rate
def compute_attrition(data, company, function):
    """Compute the yearly attrition rate for one company and ODP function.

    For every year in which at least one matching row starts, the rate is
    ``rows terminated in that year / rows active in that year``. A row is
    active in a year when it started in or before that year and its
    termination year is that year or later.

    Args:
        data: DataFrame with 'company', 'odp_function', 'Start Year' and
            'Termination Year' columns.
        company: Value of 'company' to select.
        function: Value of 'odp_function' to select.

    Returns:
        dict mapping each start year (ascending) to its attrition rate;
        the rate is 0 when no row is active in that year. Empty when no row
        matches the company/function pair.
    """
    function_data = data[(data['company'] == company) & (data['odp_function'] == function)]
    start_year = function_data['Start Year']
    term_year = function_data['Termination Year']
    attrition_rates = {}

    # 'Start Year' can contain NaN (unparseable start dates). NaN breaks the
    # ordering produced by sorted() and would add a meaningless NaN entry,
    # so it is dropped before iterating.
    for year in sorted(start_year.dropna().unique()):
        # Rows employed at some point during the year. Rows that ended in an
        # earlier year are excluded so they no longer inflate the denominator.
        active = (start_year <= year) & (term_year >= year)
        active_count = int(active.sum())
        # Active rows whose termination falls in this year
        term_count = int((active & (term_year == year)).sum())
        attrition_rates[year] = term_count / active_count if active_count > 0 else 0

    return attrition_rates

# Distinct companies and ODP functions; every (company, function) pair gets a sheet
companies = data_companies['company'].unique()
functions = data_companies['odp_function'].unique()

# Excel rejects these characters in worksheet names; they are replaced by '_'
invalid_sheet_chars = str.maketrans({ch: '_' for ch in '[]:*?/\\'})
# Lower-cased names already written (Excel compares sheet names case-insensitively)
used_sheet_names = set()

# Save the attrition rates to an Excel file
with pd.ExcelWriter('attrition_rates_by_company_and_function.xlsx') as writer:
    for company in companies:
        for function in functions:
            attrition_rates = compute_attrition(data_companies, company, function)
            attrition_df = pd.DataFrame(list(attrition_rates.items()), columns=['Year', 'Attrition Rate'])

            # Sheet names are limited to 31 characters. Truncation alone can map
            # two pairs (e.g. a long company name with different functions) to
            # the same name, and writing to an existing sheet name overwrites
            # its cells - so colliding names get a '~N' suffix instead.
            base_name = f"{company}_{function}".translate(invalid_sheet_chars)[:31]
            sheet_name = base_name
            duplicate_index = 2
            while sheet_name.lower() in used_sheet_names:
                tag = f"~{duplicate_index}"
                sheet_name = base_name[:31 - len(tag)] + tag
                duplicate_index += 1
            used_sheet_names.add(sheet_name.lower())

            # Write to sheet
            attrition_df.to_excel(writer, sheet_name=sheet_name, index=False)

In [13]:
# Function to compute average attrition rate per year for each function
def compute_avg_attrition(data, function):
    """Compute the yearly attrition rate of one ODP function over all rows of `data`.

    No per-company averaging takes place: all rows of the function are pooled,
    and for every year in which at least one of them starts the rate is
    ``rows terminated in that year / rows active in that year``. A row is
    active in a year when it started in or before that year and its
    termination year is that year or later.

    Args:
        data: DataFrame with 'odp_function', 'Start Year' and
            'Termination Year' columns.
        function: Value of 'odp_function' to select.

    Returns:
        dict mapping each start year (ascending) to its attrition rate;
        the rate is 0 when no row is active in that year.
    """
    function_data = data[data['odp_function'] == function]
    start_year = function_data['Start Year']
    term_year = function_data['Termination Year']
    attrition_rates = {}

    # Missing start years are dropped; otherwise a NaN year would be emitted
    # with a rate of 0 because every comparison against NaN is False.
    for year in np.sort(start_year.dropna().unique()):
        # Rows employed at some point during the year. Rows that ended in an
        # earlier year are excluded so they no longer inflate the denominator.
        active = (start_year <= year) & (term_year >= year)
        active_count = int(active.sum())
        # Active rows whose termination falls in this year
        term_count = int((active & (term_year == year)).sum())
        attrition_rates[year] = term_count / active_count if active_count > 0 else 0

    return attrition_rates

# Build {function: {year: rate}} over the all-companies data
functions = data_companies['odp_function'].unique()
function_attrition = {}
for function in functions:
    function_attrition[function] = compute_avg_attrition(data_companies, function)

# One worksheet per function in a single workbook
with pd.ExcelWriter('average_attrition_rates_by_function.xlsx') as writer:
    for function, rates in function_attrition.items():
        attrition_df = pd.DataFrame({
            'Year': list(rates.keys()),
            'Average Attrition Rate': list(rates.values()),
        })
        # Missing function values get a fixed label; other names are cut to
        # Excel's 31-character sheet-name limit
        if pd.notnull(function):
            sheet_name = str(function)[:31]
        else:
            sheet_name = 'Unknown Function'
        # Write to sheet
        attrition_df.to_excel(writer, sheet_name=sheet_name, index=False)

# For all companies

In [13]:
data_companies.head()

Unnamed: 0,profile_id,sequence_nbr,start_date,end_date,company,title,country,region,location,is_title_d_and_a,...,tk_title_standardized_class,is_title_manager,odp_function,mck_function,Start Year,Start Month,Start Quarter,Termination Year,Termination Month,Termination Quarter
0,d4333688-c76f-4ccf-822c-1e926295cc81,2,2011-01-01,2025-01-01,Global-e Ltd.,Supervisor,Argentina,,,False,...,Other,False,Digital & Analytics,Analytics,2011.0,2011-01,2011Q1,2025,2025-01,2025Q1
1,0f8cba01-fdd6-4eb9-b1d4-376a73fb3de4,10,2021-01-09,2025-01-01,Deel,Marketing Director LATAM,Argentina,Buenos Aires,,False,...,,True,Marketing & Sales,Marketing,2021.0,2021-01,2021Q1,2025,2025-01,2025Q1
2,13697700-11c0-4567-ab56-03623236db1d,6,2022-01-06,2025-01-01,Deel,FinTech Customer Operations Specialist,Argentina,,,False,...,,False,Operations,Operations,2022.0,2022-01,2022Q1,2025,2025-01,2025Q1
3,d4333688-c76f-4ccf-822c-1e926295cc81,3,2012-01-03,2025-01-01,Global-e Ltd.,Encargado de turnos,Argentina,,,False,...,Sales and Trading,False,Operations,Operations,2012.0,2012-01,2012Q1,2025,2025-01,2025Q1
4,897a8fa5-4fa1-41f8-bef4-bba5c7a69f2d,4,2014-01-11,2025-01-01,Global-e Ltd.,Analista en RRHH,Argentina,,,False,...,"Management, Policy and Governance",False,Marketing & Sales,Communications,2014.0,2014-01,2014Q1,2025,2025-01,2025Q1


In [7]:
# Data Preprocessing
# Parse 'start_date' and 'end_date' as day/month/year; unparseable values become NaT.
# NOTE(review): the raw sample rows shown in this notebook (09/01/2021, 06/01/2022,
# 11/01/2014, ...) all carry "01" in the second position, which looks like
# month/day/year. If so, the month and quarter columns derived below are wrong
# (the year is not affected) - confirm the export format.
data_companies['start_date'] = pd.to_datetime(data_companies['start_date'], format='%d/%m/%Y', errors='coerce')
data_companies['end_date'] = pd.to_datetime(data_companies['end_date'], format='%d/%m/%Y', errors='coerce')

# Replace missing end dates with a placeholder date for ongoing employment.
# The result is assigned back to the column: calling fillna(inplace=True) on
# data_companies['end_date'] is chained assignment, which pandas deprecates and
# which does not update the frame when Copy-on-Write is enabled.
data_companies['end_date'] = data_companies['end_date'].fillna(pd.Timestamp('2025-01-01'))



# Create a Pandas Excel writer; it stays open until writer.close() is called
excel_file_path = '../../../data/Incredibuild/HRIS/attrition_and_headcount_data_all_companies.xlsx'
writer = pd.ExcelWriter(excel_file_path, engine='xlsxwriter')


# Year as a number, month and quarter as pandas Periods, for both start and end.
# The 'Start ...' columns are missing wherever 'start_date' is NaT.
data_companies['Start Year'] = data_companies['start_date'].dt.year
data_companies['Start Month'] = data_companies['start_date'].dt.to_period('M')
data_companies['Start Quarter'] = data_companies['start_date'].dt.to_period('Q')
data_companies['Termination Year'] = data_companies['end_date'].dt.year
data_companies['Termination Month'] = data_companies['end_date'].dt.to_period('M')
data_companies['Termination Quarter'] = data_companies['end_date'].dt.to_period('Q')

# Function to compute headcount and attrition rates
def compute_headcount_and_attrition(data, period):
    """Compute headcount and attrition rate per time period.

    Reads the columns ``'Start <period>'`` and ``'Termination <period>'``.
    For every distinct non-missing start period, headcount is the number of
    rows that started in or before the period and whose termination period
    is missing or not earlier than the period; the attrition rate is the
    share of those rows whose termination period equals the period.

    Args:
        data: DataFrame holding the two period columns.
        period: Column suffix, e.g. 'Year', 'Month' or 'Quarter'.

    Returns:
        Tuple ``(headcount, attrition_rates)`` of dicts keyed by period in
        ascending order; the rate is 0 when the headcount is 0.
    """
    starts = data[f'Start {period}']
    terminations_col = data[f'Termination {period}']
    headcount = {}
    attrition_rates = {}

    # Missing start periods (NaN / NaT) are dropped: they cannot be ordered
    # reliably by sorted() and would only add an entry with headcount 0.
    for tp in sorted(starts.dropna().unique()):
        # Rows employed during the period
        employed = (starts <= tp) & (terminations_col.isna() | (terminations_col >= tp))
        headcount[tp] = int(employed.sum())
        # Employed rows that end in this very period
        terminations = int((employed & (terminations_col == tp)).sum())
        attrition_rates[tp] = terminations / headcount[tp] if headcount[tp] > 0 else 0

    return headcount, attrition_rates

# Write one "<company> Yearly" sheet per company, in the order in which companies
# first appear in data_companies.
# Only the yearly table is written to the workbook, so the monthly and quarterly
# tables (previously computed and then discarded) are no longer calculated.

# Excel rejects these characters in worksheet names; they are replaced by '_'
invalid_sheet_chars = str.maketrans({ch: '_' for ch in '[]:*?/\\'})
sheet_suffix = ' Yearly'
# Lower-cased names already written (Excel compares sheet names case-insensitively)
used_sheet_names = set()

try:
    for company in data_companies['company'].unique():
        company_data = data_companies[data_companies['company'] == company]

        # Headcount and attrition rate per start year of this company
        yearly_headcount, yearly_attrition = compute_headcount_and_attrition(company_data, 'Year')

        # Both dicts share the same year keys, which become the index
        yearly_df = pd.DataFrame({'Headcount': yearly_headcount, 'Attrition Rate': yearly_attrition})

        # Sheet names may hold at most 31 characters and no invalid characters,
        # otherwise the xlsxwriter engine raises. The company part is shortened
        # so the suffix survives; colliding names get a '~N' tag.
        company_label = str(company).translate(invalid_sheet_chars)
        sheet_name = company_label[:31 - len(sheet_suffix)] + sheet_suffix
        duplicate_index = 2
        while sheet_name.lower() in used_sheet_names:
            tag = f'~{duplicate_index}'
            sheet_name = company_label[:31 - len(sheet_suffix) - len(tag)] + tag + sheet_suffix
            duplicate_index += 1
        used_sheet_names.add(sheet_name.lower())

        yearly_df.to_excel(writer, sheet_name=sheet_name)
finally:
    # Save and close the writer even if a company fails, so the file handle
    # is always released
    writer.close()

# Benchmark analysis

In [10]:
import pandas as pd
import plotly.graph_objects as go

# Load the Excel file
file_path = '../../../data/Incredibuild/HRIS/attrition_and_headcount_data_all_companies.xlsx'

# Read every sheet once: {sheet name: DataFrame}, in workbook order
all_sheets = pd.read_excel(file_path, sheet_name=None)
sheet_names = list(all_sheets.keys())

# Locate the Incredibuild sheet by name. If no sheet name mentions Incredibuild,
# fall back to the first sheet, which is what this analysis assumed before.
# NOTE(review): sheet order follows the order of companies in the source data, so
# the first sheet is not necessarily Incredibuild - confirm the fallback is never hit.
incredibuild_sheet = next(
    (name for name in sheet_names if 'incredibuild' in str(name).lower()),
    sheet_names[0],
)

# All remaining sheets form the benchmark group
sheets = [name for name in sheet_names if name != incredibuild_sheet]

# Per-company yearly attrition, collected first and concatenated once
yearly_frames = []
for sheet in sheets:
    df = all_sheets[sheet]
    df.columns = ['Year', 'Headcount', 'Attrition Rate']  # Assign column names
    # Aggregate data by year
    yearly_data = df.groupby('Year')['Attrition Rate'].mean().reset_index()
    yearly_frames.append(yearly_data)

if yearly_frames:
    all_companies_data = pd.concat(yearly_frames, ignore_index=True)
else:
    # No benchmark sheets: keep an empty, correctly typed frame so the
    # aggregation below still works
    all_companies_data = pd.DataFrame({
        'Year': pd.Series(dtype='float64'),
        'Attrition Rate': pd.Series(dtype='float64'),
    })

# Unweighted mean of the company rates per year (each company counts once,
# regardless of its headcount)
average_attrition = all_companies_data.groupby('Year')['Attrition Rate'].mean().reset_index()

# Incredibuild data, taken from the sheets already in memory instead of
# reading the file a second time
incredibuild_data = all_sheets[incredibuild_sheet].copy()
incredibuild_data.columns = ['Year', 'Headcount', 'Attrition Rate']  # Assign column names
# Aggregate data by year
incredibuild_yearly = incredibuild_data.groupby('Year')['Attrition Rate'].mean().reset_index()

# Grouped bar chart: Incredibuild next to the all-companies average, per year
fig = go.Figure()
# Add bar for Incredibuild
fig.add_trace(go.Bar(x=incredibuild_yearly['Year'], y=incredibuild_yearly['Attrition Rate'],
                     name='Incredibuild', marker_color='blue'))
# Add bar for average of all companies
fig.add_trace(go.Bar(x=average_attrition['Year'], y=average_attrition['Attrition Rate'],
                     name='All Companies Average', marker_color='orange'))

# Update layout.
# The plotted rates are fractions (terminations / headcount), while the axis is
# labelled in percent; the '.1%' tick format renders 0.05 as "5.0%" so that the
# ticks match the label.
fig.update_layout(title='Attrition Rate Comparison: Incredibuild vs All Companies Average',
                  xaxis_title='Year', yaxis_title='Attrition Rate (%)',
                  yaxis_tickformat='.1%',
                  barmode='group')

# Show the plot
fig.show()


In [12]:
# Join the Incredibuild rates with the all-companies average on 'Year';
# only years present in both tables are kept
merged_data = pd.merge(
    incredibuild_yearly,
    average_attrition,
    on='Year',
    suffixes=('_Incredibuild', '_Average'),
)

# Give the three columns readable names
merged_data = merged_data.set_axis(
    ['Year', 'Attrition Rate Incredibuild', 'Attrition Rate Average'],
    axis=1,
)

# Destination of the Excel workbook
output_file_path = '../../../data/Incredibuild/HRIS/incredibuild_vs_average_attrition.xlsx'

# Write the comparison table to the workbook, without the index column
merged_data.to_excel(output_file_path, index=False)

print(f"File saved as {output_file_path}")

File saved as ../../../data/Incredibuild/HRIS/incredibuild_vs_average_attrition.xlsx


In [18]:
def calculate_attrition_by_category(data, category):
    """Compute start-of-year headcount and yearly attrition per category value.

    For each non-missing value of ``data[category]`` and each year in which a
    row of that value starts:

    * start headcount = rows that started before the year and end in the
      year or later,
    * end headcount = rows that started in or before the year and end after
      the year,
    * attrition rate = rows ending in the year / mean of the two headcounts
      (0 when that mean is 0).

    Dates are parsed as day/month/year strings; unparseable values become
    missing. A missing end date is treated as ongoing employment (placeholder
    year 2100). The input frame is not modified.

    Args:
        data: DataFrame with 'start_date', 'end_date' and the category column.
        category: Name of the column to group by.

    Returns:
        Tuple ``(headcount_start, attrition)``:
        ``headcount_start`` maps category value -> Series of start-of-year
        headcount indexed by year; ``attrition`` maps category value ->
        {year: attrition rate}. Values without any valid start year have an
        empty Series and no entry in ``attrition``.
    """
    # Parse into a private copy so the caller's columns are neither re-parsed
    # nor extended with helper columns
    dates = data[['start_date', 'end_date']].copy()
    dates['start_date'] = pd.to_datetime(dates['start_date'], format='%d/%m/%Y', errors='coerce')
    dates['end_date'] = pd.to_datetime(dates['end_date'], format='%d/%m/%Y', errors='coerce')
    # Assign back instead of fillna(inplace=True) on a column selection
    # (chained assignment, deprecated by pandas)
    dates['end_date'] = dates['end_date'].fillna(pd.Timestamp('2100-01-01'))
    start_year = dates['start_date'].dt.year
    term_year = dates['end_date'].dt.year
    categories = data[category]

    headcount_start = {}
    attrition = {}

    # Missing category values never match an equality filter, so skip them
    for cat in categories.dropna().unique():
        in_cat = categories == cat
        cat_start = start_year[in_cat]
        cat_term = term_year[in_cat]

        opening_headcount = {}
        yearly_rates = {}
        # Rows with an unparseable start date contribute no year of their own
        for year in sorted(int(y) for y in cat_start.dropna().unique()):
            # Employed on the first day of the year
            start_count = int(((cat_start < year) & (cat_term >= year)).sum())
            # Still employed after the last day of the year
            end_count = int(((cat_start <= year) & (cat_term > year)).sum())
            avg_headcount = (start_count + end_count) / 2
            terminations = int((cat_term == year).sum())
            opening_headcount[year] = start_count
            yearly_rates[year] = terminations / avg_headcount if avg_headcount > 0 else 0

        headcount_start[cat] = pd.Series(opening_headcount, dtype='int64')
        if yearly_rates:
            attrition[cat] = yearly_rates

    return headcount_start, attrition

# Example run, grouping the Incredibuild data by the 'odp_function' column
category = 'odp_function'  # Replace with the actual column name
headcount, attrition = calculate_attrition_by_category(data, category)

# Grouped bar chart with one bar series per category value
fig = go.Figure()
for cat, yearly_rates in attrition.items():
    years = [*yearly_rates.keys()]
    rates = [*yearly_rates.values()]
    bars = go.Bar(x=years, y=rates, name=cat)
    fig.add_trace(bars)

layout_options = {
    'title': 'Yearly Attrition Rate per Function',
    'xaxis_title': 'Year',
    'yaxis_title': 'Attrition Rate',
    'barmode': 'group',
}
fig.update_layout(**layout_options)
fig.show()
