<a href="https://colab.research.google.com/github/cbonnin88/Compensation_Analysis/blob/main/compensation_analyst_paystub_UK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import re as r
import polars as pl
import plotly.express as px
from google.colab import auth
import gspread
from google.auth import default
from gspread_dataframe import set_with_dataframe

In [3]:
# Sample UK payslip kept inline as plain text. The regex patterns in the next cell
# search this string, so the "Label: value" layout and the £-prefixed amounts with
# thousands separators are what those patterns are written against.
uk_payslip = """
Pay Statement
Private and Confidential
Innovate Analytics Ltd.
123 Tech Avenue, London, EC1R 0AT

Employee Details
Name: John Smith
Address: 456 Data Drive, London, SW1A 0AA
NI Number: QQ 12 34 56 A
Payroll No: 78910

Pay Period Details
Pay Date: 28/07/2025
Tax Period: 04
Tax Code: 1257L
PAYE Ref: 123/AB45678

Payments
Basic Pay: £4,583.33
Gross Pay: £4,583.33

Deductions
Income Tax: £707.17
National Insurance: £282.87
Pension (Employee): £229.17
Total Deductions: £1,219.21

Net Pay: £3,364.12

Year to Date Summary
Gross Pay YTD: £18,333.32
Taxable Pay YTD: £14,145.32
Tax Paid YTD: £2,828.68
NI Paid YTD: £1,131.48
"""

In [4]:
# Regex patterns used to extract the relevant fields from the payslip text.
# Every pattern has exactly one capture group holding the field's value; money
# amounts are captured without the leading £ but still contain thousands commas.
patterns = {
    # [ \t]* (not \s*) keeps the match on the label's own line, and \S requires a
    # real value, so an empty "Name:" line yields None instead of the next line.
    'name':r'Name:[ \t]*(\S.*)',
    'pay_date':r'Pay Date:\s*([\d/]+)',
    'basic_pay':r'Basic Pay:\s*£([\d,]+\.\d{2})',
    'gross_pay':r'Gross Pay:\s*£([\d,]+\.\d{2})',
    'income_tax':r'Income Tax:\s*£([\d,]+\.\d{2})',
    'national_insurance':r'National Insurance:\s*£([\d,]+\.\d{2})',
    'pension_employee':r'Pension \(Employee\):\s*£([\d,]+\.\d{2})',
    'net_pay':r'Net Pay:\s*£([\d,]+\.\d{2})',
    'gross_pay_ytd':r'Gross Pay YTD:\s*£([\d,]+\.\d{2})',
    'tax_paid_ytd':r'Tax Paid YTD:\s*£([\d,]+\.\d{2})'
}


def _first_capture(pattern, text):
    """Return the first capture group of `pattern` found in `text`, stripped.

    Returns None when the pattern does not match, so a missing payslip field
    becomes a null value downstream instead of raising.
    """
    found = r.search(pattern, text)  # search once and reuse the match object
    return found.group(1).strip() if found else None


# One entry per field name: the extracted string, or None if the field is absent.
pay_data = {field: _first_capture(pattern, uk_payslip) for field, pattern in patterns.items()}

In [5]:
# Build a one-row Polars DataFrame from the extracted payslip fields.
df_uk_pay = pl.DataFrame([pay_data])

# Every column except the two text fields holds a money amount such as "4,583.33".
text_columns = ['name','pay_date']
money_columns = [col for col in df_uk_pay.columns if col not in text_columns]

df_uk_pay = df_uk_pay.with_columns([
    pl.col(col)
    # A field that was not found is None, which may leave the column without a
    # string dtype; cast first so the .str step still applies and yields null.
    .cast(pl.Utf8)
    # Drop thousands separators; the comma is matched literally, not as a regex.
    .str.replace_all(',','',literal=True)
    .cast(pl.Float64)
    for col in money_columns
])

display(df_uk_pay)

name,pay_date,basic_pay,gross_pay,income_tax,national_insurance,pension_employee,net_pay,gross_pay_ytd,tax_paid_ytd
str,str,f64,f64,f64,f64,f64,f64,f64,f64
"""John Smith""","""28/07/2025""",4583.33,4583.33,707.17,282.87,229.17,3364.12,18333.32,2828.68


# **Adding Extra Calculations**

In [6]:
# Each deduction as a percentage of gross pay, rounded to one decimal place.
# Maps the new rate column name to the deduction column it is derived from.
rate_sources = {
    'tax_rate_pct': 'income_tax',
    'ni_rate_pct': 'national_insurance',
    'pension_rate_pct': 'pension_employee',
}

df_uk_pay = df_uk_pay.with_columns([
    ((pl.col(source) / pl.col('gross_pay')) * 100).round(1).alias(rate_name)
    for rate_name, source in rate_sources.items()
])

df_uk_pay

name,pay_date,basic_pay,gross_pay,income_tax,national_insurance,pension_employee,net_pay,gross_pay_ytd,tax_paid_ytd,tax_rate_pct,ni_rate_pct,pension_rate_pct
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""John Smith""","""28/07/2025""",4583.33,4583.33,707.17,282.87,229.17,3364.12,18333.32,2828.68,15.4,6.2,5.0


# **Deductions Chart**

In [7]:
# The frame holds a single payslip, so read its only row as a {column: value} dict
# and pick out the three deduction amounts under display-friendly labels.
payslip_row = df_uk_pay.row(0, named=True)

deductions = {
    'Income Tax': payslip_row['income_tax'],
    'National Insurance': payslip_row['national_insurance'],
    'Pension': payslip_row['pension_employee'],
}

display(deductions)

{'Income Tax': 707.17, 'National Insurance': 282.87, 'Pension': 229.17}

In [8]:
# Pie chart showing how the deductions split between tax, NI and pension.
# names, values and color are all passed as plain lists so they are aligned,
# index-for-index, and none of them is a live dict view.
fig_deductions = px.pie(
    names=list(deductions.keys()),
    values=list(deductions.values()),
    title='Deductions Breakdown',
    color=list(deductions.keys()),
    color_discrete_sequence=px.colors.sequential.Viridis_r
)

fig_deductions.show()

# **Gross Pay vs Total Deductions**

In [9]:
# Total deductions = income tax + national insurance + employee pension.
total_deductions_expr = (
    pl.col('income_tax')
    + pl.col('national_insurance')
    + pl.col('pension_employee')
).alias('total_deductions')

# Plotly Express is fed a pandas frame here, so convert after adding the column.
df_with_total = df_uk_pay.with_columns(total_deductions_expr)
df_plot = df_with_total.to_pandas()

display(df_plot)

Unnamed: 0,name,pay_date,basic_pay,gross_pay,income_tax,national_insurance,pension_employee,net_pay,gross_pay_ytd,tax_paid_ytd,tax_rate_pct,ni_rate_pct,pension_rate_pct,total_deductions
0,John Smith,28/07/2025,4583.33,4583.33,707.17,282.87,229.17,3364.12,18333.32,2828.68,15.4,6.2,5.0,1219.21


In [10]:
# Give the two plotted columns display-friendly names; these names become the
# legend entries of the wide-form bar chart below.
display_names = {
    'gross_pay': 'Gross Pay',
    'total_deductions': 'Total Deductions',
}
df_plot = df_plot.rename(columns=display_names)

# Grouped bars: one bar per renamed column for each pay date.
axis_labels = {'value': 'Amount (£)', 'pay_date': 'Pay Date'}

fig_bar = px.bar(
    data_frame=df_plot,
    x='pay_date',
    y=list(display_names.values()),
    barmode='group',
    title='Gross Pay vs Total Deductions',
    labels=axis_labels,
    color_discrete_sequence=px.colors.qualitative.Pastel,
    text_auto=True,
)
fig_bar.show()

# **Breakdown of Deductions**

In [11]:
# Columns to unpivot: the three deductions plus gross pay for comparison.
amount_columns = [
    'income_tax',
    'national_insurance',
    'pension_employee',
    'gross_pay',
]

df_plot_2 = df_uk_pay.to_pandas()

# Long format: one row per (pay date, amount type), ready for a colour-grouped bar chart.
df_deductions = df_plot_2.melt(
    id_vars=['pay_date'],
    value_vars=amount_columns,
    var_name='Deduction Type',
    value_name='Amount',
)

display(df_deductions)

Unnamed: 0,pay_date,Deduction Type,Amount
0,28/07/2025,income_tax,707.17
1,28/07/2025,national_insurance,282.87
2,28/07/2025,pension_employee,229.17
3,28/07/2025,gross_pay,4583.33


In [12]:
# Swap the raw column names stored in 'Deduction Type' for display-friendly text;
# these values become the legend entries of the chart below.
df_deductions['Deduction Type'] = df_deductions['Deduction Type'].replace({
    'income_tax':'Income Tax',
    'national_insurance':'National Insurance',
    'pension_employee':'Pension Insurance',
    'gross_pay':'Gross Pay'
})

# Grouped bars: one bar per deduction type (plus gross pay) for each pay date.
fig_bar_2 = px.bar(
    df_deductions,
    x='pay_date',
    y='Amount',
    color='Deduction Type',
    barmode='group',
    # Titled like the notebook's other figures so the chart stands on its own.
    title='Breakdown of Deductions',
    # `labels` is keyed by column name, so only the three columns used above are
    # listed; the category values were already renamed in the frame itself.
    labels={
        'pay_date':'Pay Date',
        'Amount':'Amount (£)',
        'Deduction Type':'Type of Deduction'
    },
    color_discrete_sequence=px.colors.qualitative.Pastel,
    text_auto=True
)

fig_bar_2.show()

# **Breakdown of Deductions (Percentage)**

In [13]:
# Percentage of gross pay taken by each deduction (unrounded), built as reusable
# Polars expressions so the remainder can be derived from the same terms.
income_tax_share = pl.col('income_tax') / pl.col('gross_pay') * 100
national_insurance_share = pl.col('national_insurance') / pl.col('gross_pay') * 100
pension_share = pl.col('pension_employee') / pl.col('gross_pay') * 100

# Whatever is left of gross pay after the three deductions, as a percentage.
remaining_share = 100 - (income_tax_share + national_insurance_share + pension_share)

# Keep only the pay date and the four percentage columns, in plotting order.
df_percent = df_uk_pay.select([
    pl.col('pay_date'),
    remaining_share.alias('Gross Pay (Net %)'),
    income_tax_share.alias('income_tax_pct'),
    national_insurance_share.alias('national_insurance_pct'),
    pension_share.alias('pension_employee_pct'),
])

df_percent_pd = df_percent.to_pandas()

In [14]:
# Display label for each percentage column. The keys double as the list of
# columns to unpivot, so the two can never drift apart.
label_map = {
    'Gross Pay (Net %)':'Gross Pay (Net %)',
    'income_tax_pct':'Income Tax (%)',
    'national_insurance_pct':'National Insurance (%)',
    'pension_employee_pct':'Pension Insurance (%)'
}

# Reshape to long format for Plotly: one row per (pay date, component).
df_melted = df_percent_pd.melt(
    id_vars=['pay_date'],
    value_vars=list(label_map),
    var_name='Component',
    value_name='Percentage',
)

# Replace the raw column names with their display labels.
df_melted['Component'] = df_melted['Component'].map(label_map)

display(df_melted)

Unnamed: 0,pay_date,Component,Percentage
0,28/07/2025,Gross Pay (Net %),73.399035
1,28/07/2025,Income Tax (%),15.429175
2,28/07/2025,National Insurance (%),6.171714
3,28/07/2025,Pension Insurance (%),5.000076


In [15]:
# Axis and legend titles, keyed by the column names of df_melted.
pct_axis_labels = {
    'pay_date': 'Pay Date',
    'Percentage': 'Percentage (%)',
    'Component': 'Type of Deduction',
}

# Grouped bars: one bar per component for each pay date, annotated to one decimal place.
fig_pct_deduction = px.bar(
    data_frame=df_melted,
    x='pay_date',
    y='Percentage',
    color='Component',
    barmode='group',
    title='Monthly Deduction Breakdown (% of Gross Pay)',
    labels=pct_axis_labels,
    color_discrete_sequence=px.colors.qualitative.Pastel,
    text_auto='.1f',
)

fig_pct_deduction.show()