## 1. Installs, imports and configuration

In [1]:
# Install libraries
%pip install pdfplumber pandas

Collecting pdfplumber
  Obtaining dependency information for pdfplumber from https://files.pythonhosted.org/packages/12/28/3958ed81a9be317610ab73df32f1968076751d651c84dff1bcb45b7c6c0e/pdfplumber-0.11.8-py3-none-any.whl.metadata
  Downloading pdfplumber-0.11.8-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20251107 (from pdfplumber)
  Obtaining dependency information for pdfminer.six==20251107 from https://files.pythonhosted.org/packages/64/29/d1d9f6b900191288b77613ddefb73ed35b48fb35e44aaf8b01b0422b759d/pdfminer_six-20251107-py3-none-any.whl.metadata
  Downloading pdfminer_six-20251107-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Obtaining dependency information for pypdfium2>=4.18.0 from https://files.pythonhosted.org/packages/13/bf/4259b23a88b92bec8199e1a08a0821dbfbb465629c203bdbc49e2f993940/pypdfium2-5.0.0-py3-none-

In [2]:
# Import libraries
import pdfplumber
import pandas as pd
import re

In [None]:
# Render every float that pandas displays with exactly three decimal places
pd.set_option("display.float_format", "{:.3f}".format)

In [None]:
# Input invoices, given relative to the notebook's working directory.
# Two FedEx invoices and one Evri invoice are read by the cells below.
fedex_pdf_1, fedex_pdf_2 = "files/FedEX 1.pdf", "files/FedEX 2.pdf"
evri_pdf_1 = "files/Evri 1.pdf"

## 2. Helper function to extract text from PDF

In [3]:
def extract_text_from_pdf(pdf_path):
    """
    Return the text of every page in a PDF as a single string.

    Each page that yields text contributes that text followed by a newline;
    pages for which extraction returns nothing (None or "") are skipped.
    """
    with pdfplumber.open(pdf_path) as document:
        # Extract while the file is still open, one entry per page.
        page_texts = [page.extract_text() for page in document.pages]

    return "".join(chunk + "\n" for chunk in page_texts if chunk)

## 4. FedEx Parser

In [6]:
def parse_fedex(text):
    r"""
    Extract shipment rows from the text of a FedEx invoice PDF.

    Logic:
      - Read the text line by line.
      - Keep lines that look like a shipment row.
        Example structure:
            <shipment_number> <date_dd/mm/yyyy> FedEx Priority ... <values>
      - Extract the shipment number, the shipment date, and the final decimal
        value on the line, which is treated as the total charge for that
        despatch.

    Regex patterns:
      - ^\s*(\d{9,}) matches a long shipment number at the start of the line
        (leading whitespace is tolerated).
      - (\d{2}/\d{2}/\d{4}) captures dates like 13/10/2025.
      - The amount pattern finds decimal values such as 2.99 or 17.10, and also
        thousands-separated values (1,234.56) and values with a leading minus
        sign (-4.20), so large charges and credits keep their true value.
        The last decimal value on the line is treated as the total charge.

    Args:
        text: Invoice text as one string, lines separated by newlines.

    Returns:
        pandas.DataFrame with columns shipment_number, shipment_date, charge,
        raw_line and shipment_date_parsed. When no line matches, the frame has
        zero rows but still carries all five columns, so column lookups and
        concatenation with other FedEx frames keep working.
    """
    columns = ["shipment_number", "shipment_date", "charge", "raw_line"]

    # Compiled once, outside the loop.
    row_pattern = re.compile(r"^\s*(\d{9,})\s+(\d{2}/\d{2}/\d{4})")
    # A minus sign only counts when it is not glued to a preceding token
    # (e.g. "AB-12.50" or "2.99-4.20" do not produce negative amounts).
    amount_pattern = re.compile(
        r"(?:(?<![\w.,-])-)?(?:\d{1,3}(?:,\d{3})+|\d+)\.\d+"
    )

    rows = []

    for line in text.splitlines():
        # A shipment line starts with shipment_number and date
        m = row_pattern.match(line)
        if not m:
            continue

        # All decimal amounts on the line; lines without any are skipped
        amounts = amount_pattern.findall(line)
        if not amounts:
            continue

        rows.append({
            "shipment_number": m.group(1),
            "shipment_date": m.group(2),
            # The last decimal number is the total charge for that shipment
            "charge": float(amounts[-1].replace(",", "")),
            "raw_line": line
        })

    # Explicit columns keep the schema stable even when no rows were found
    df = pd.DataFrame(rows, columns=columns)
    df["charge"] = df["charge"].astype(float)

    # Parse date for later use; unparseable dates become NaT
    df["shipment_date_parsed"] = pd.to_datetime(
        df["shipment_date"],
        format="%d/%m/%Y",
        errors="coerce"
    )

    return df

## 6. Evri Parser

In [None]:
def parse_evri(text):
    r"""
    Extract despatch service rows from the text of an Evri invoice PDF.

    Logic:
      - Read the text line by line.
      - Keep lines that follow the Evri numeric pattern:
            <service text> <quantity> <unit_price> <VAT_code> <line_value>
        Example:
            Scottish Highlands & Islands Parcel 36 5.28 S 190.08
      - Extract:
          * service name (all text before the quantity column)
          * quantity (number of despatches)
          * unit price
          * line value

    Regex pattern explanation:
      - ^\s* matches any leading spaces at the start of the line.
      - (.+?) captures the full service name as any characters, non greedily,
        stopping just before the quantity column.
      - (\d[\d,]*) captures the quantity column, which may contain commas but
        must start with a digit (a stray "," is not a quantity).
      - (\d[\d,]*\.\d+) captures the unit price as a decimal number, with
        optional thousands separators.
      - [A-Z] matches the VAT code column, for example S or O.
      - (\d[\d,]*\.\d+) captures the line total value.

    Note:
      - The word Despatch is deliberately not hard coded, so that lines like
        Scottish Highlands & Islands Parcel 36 5.28 S 190.08
        are captured as valid service rows.

    Args:
        text: Invoice text as one string, lines separated by newlines.

    Returns:
        pandas.DataFrame with columns service, quantity, price, value and
        raw_line. When no line matches, the frame has zero rows but still
        carries these columns, so column lookups on the result keep working.
    """
    columns = ["service", "quantity", "price", "value", "raw_line"]

    pattern = re.compile(
        r"^\s*(.+?)\s+(\d[\d,]*)\s+(\d[\d,]*\.\d+)\s+[A-Z]\s+(\d[\d,]*\.\d+)"
    )

    rows = []

    for line in text.splitlines():
        match = pattern.match(line)
        if not match:
            continue

        service, quantity, price, value = match.groups()

        rows.append({
            "service": service.strip(),
            # Thousands separators are removed before numeric conversion
            "quantity": int(quantity.replace(",", "")),
            "price": float(price.replace(",", "")),
            "value": float(value.replace(",", "")),
            "raw_line": line
        })

    # Explicit columns and dtypes keep the schema stable for an empty result
    return pd.DataFrame(rows, columns=columns).astype(
        {"quantity": "int64", "price": "float64", "value": "float64"}
    )

## 7. Extract text from PDFs

In [25]:
# Pull the raw text out of each invoice, in the same order as the paths:
# FedEx 1, FedEx 2, then Evri 1.
fedex_text_1, fedex_text_2, evri_text_1 = (
    extract_text_from_pdf(pdf_path)
    for pdf_path in (fedex_pdf_1, fedex_pdf_2, evri_pdf_1)
)

## 8. Parse PDFs into dataframes

In [None]:
# Parse each FedEx invoice on its own, then stack both into a single frame
# with a fresh 0..n-1 index.
fedex_df1 = parse_fedex(fedex_text_1)
fedex_df2 = parse_fedex(fedex_text_2)
fedex_df = pd.concat([fedex_df1, fedex_df2], ignore_index=True)

evri_df = parse_evri(evri_text_1)

print("FedEx rows:", len(fedex_df))
print("Evri rows:", len(evri_df))

# display() renders each preview as its own rich table; a bare tuple as the
# cell's last expression would only show the plain-text repr of both frames.
display(fedex_df.head(), evri_df.head())

## 9. Clean Evri data

In [26]:
# Charge lines with a positive value
is_charge_line = evri_df["value"] > 0
evri_core = evri_df[is_charge_line].copy()

# Excluded rows: headers, rows with 0 value. Taken as the exact complement of
# the charge lines so that every parsed row ends up in one of the two frames
# (a row with a missing or negative value is excluded, never silently lost).
evri_excluded = evri_df[~is_charge_line].copy()

print("Evri core rows (used in calculations):", len(evri_core))
print("Evri excluded rows (meta or zero value):", len(evri_excluded))

evri_core.head()

Evri rows: 32

Missing values per column:
service     0
quantity    0
price       0
value       0
raw_line    0
dtype: int64


Unnamed: 0,service,quantity,price,value,raw_line
0,WK35 Std Inv SUPERGROUP INTERNET LIMITED,1,0.0,0.0,WK35 Std Inv SUPERGROUP INTERNET LIMITED 1 0.0...
1,Charges between 26/10/25 to 01/11/25,1,0.0,0.0,Charges between 26/10/25 to 01/11/25 1 0.00 S ...
2,Password for backup & summary: Uew4hw5z,1,0.0,0.0,Password for backup & summary: Uew4hw5z 1 0.00...


## 10. Basic checks

In [28]:
# Count of missing values per column, for each carrier's frame
print("FedEx missing values:", fedex_df.isna().sum(), sep="\n")
print("\nEvri core missing values:", evri_core.isna().sum(), sep="\n")

# FedEx rows whose charge is zero or negative are listed for manual review
fedex_anomalies_basic = fedex_df.loc[fedex_df["charge"].le(0)]
print("\nFedEx rows with zero or negative charge:")
display(fedex_anomalies_basic)

Unnamed: 0,service,quantity,price,value,raw_line
0,WK35 Std Inv SUPERGROUP INTERNET LIMITED,1,0.0,0.0,WK35 Std Inv SUPERGROUP INTERNET LIMITED 1 0.0...
1,Charges between 26/10/25 to 01/11/25,1,0.0,0.0,Charges between 26/10/25 to 01/11/25 1 0.00 S ...
2,Password for backup & summary: Uew4hw5z,1,0.0,0.0,Password for backup & summary: Uew4hw5z 1 0.00...


## 11. Fixed cost rates

In [None]:
# Fixed cost rates, in pounds per despatch: FedEx first, then Evri
fixed_rate_fedex, fixed_rate_evri = 3.10, 2.44

## 12. FedEx calculations

In [31]:
# One FedEx row corresponds to one despatch, so the row count is the volume
fedex_despatches = len(fedex_df)
fedex_spend = round(fedex_df["charge"].sum(), 3)

# Average cost per despatch; 0.000 when there are no despatches to divide by
if fedex_despatches > 0:
    fedex_actual_avg = round(fedex_spend / fedex_despatches, 3)
else:
    fedex_actual_avg = 0.000

# Positive variance means the actual average is above the fixed rate
fedex_variance = round(fedex_actual_avg - fixed_rate_fedex, 3)

fedex_status = (
    "Over the fixed rate" if fedex_variance > 0
    else "Under the fixed rate" if fedex_variance < 0
    else "On the fixed rate"
)

print("FedEx despatches:", fedex_despatches)
print("FedEx spend:", fedex_spend)
print("FedEx actual avg cost:", fedex_actual_avg)
print("FedEx variance:", fedex_variance)
print("FedEx status:", fedex_status)

FedEx rows: 803
FedEx total spend: 2475.5500000000006
Evri rows: 32
Evri total despatches: 54746
Evri total spend: 43591.130000000005


## 13. Evri calculations

In [33]:
# Evri rows carry a quantity, so the volume is the sum of that column
evri_despatches = evri_core["quantity"].sum()
evri_spend = round(evri_core["value"].sum(), 3)

# Average cost per despatch; 0.000 when there are no despatches to divide by
if evri_despatches > 0:
    evri_actual_avg = round(evri_spend / evri_despatches, 3)
else:
    evri_actual_avg = 0.000

# Positive variance means the actual average is above the fixed rate
evri_variance = round(evri_actual_avg - fixed_rate_evri, 3)

evri_status = (
    "Over the fixed rate" if evri_variance > 0
    else "Under the fixed rate" if evri_variance < 0
    else "On the fixed rate"
)

print("Evri despatches:", evri_despatches)
print("Evri spend:", evri_spend)
print("Evri actual avg cost:", evri_actual_avg)
print("Evri variance:", evri_variance)
print("Evri status:", evri_status)

## 14. Summary table

In [34]:
# One row per carrier; every column lists FedEx first, then Evri
summary = pd.DataFrame({
    "carrier": ["FedEx", "Evri"],
    "despatches": [fedex_despatches, evri_despatches],
    "spend": [fedex_spend, evri_spend],
    "avg_cost_per_despatch": [fedex_actual_avg, evri_actual_avg],
    "fixed_rate": [fixed_rate_fedex, fixed_rate_evri],
    "variance": [fedex_variance, evri_variance],
    "status": [fedex_status, evri_status],
})

summary

(803, 2475.55, 3.083, -0.017, 'Under the fixed rate')

## 15. Export CSVs

In [37]:
# Write the dashboard summary plus the cleaned and excluded row sets as CSV
# files in the working directory; index=False leaves the row index out.
summary.to_csv("summary_for_dashboard.csv", index=False)
fedex_df.to_csv("fedex_cleaned.csv", index=False)
evri_core.to_csv("evri_cleaned.csv", index=False)
evri_excluded.to_csv("evri_excluded.csv", index=False)

# Confirmation listing, one file name per line
print(
    "Files created:",
    "summary_for_dashboard.csv",
    "fedex_cleaned.csv",
    "evri_cleaned.csv",
    "evri_excluded.csv",
    sep="\n",
)

Files created:
summary_for_dashboard.csv
fedex_cleaned.csv
evri_cleaned.csv
evri_excluded.csv
