In [1]:
import os
import requests
from bs4 import BeautifulSoup

# Page whose paginated listing (?page=N) is scraped for PDF links.
BASE_URL = "https://www.kplc.co.ke/customer-support"
# Download directory, relative to the current working directory.
SAVE_DIR = "kplc_pdfs"

# Create the download directory up front; exist_ok makes re-running this cell safe.
os.makedirs(SAVE_DIR, exist_ok=True)

def get_pdf_links(page, timeout=30):
    """Return the PDF links found on one page of the customer-support listing.

    Args:
        page: Page number placed in the ``?page=`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of absolute URLs, in document order and without duplicates, for
        every ``<a href>`` whose target ends in ``.pdf`` (case-insensitive)
        and contains ``"storage"``.

    Raises:
        requests.HTTPError: If the listing page answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    from urllib.parse import urljoin

    url = f"{BASE_URL}?page={page}#powerschedule"
    r = requests.get(url, timeout=timeout)
    # Fail loudly rather than scraping an error page and silently returning [].
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    pdf_links = []
    seen = set()
    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        if not (href.lower().endswith(".pdf") and "storage" in href):
            continue
        # Resolve relative hrefs so every returned link can be fetched directly.
        absolute = urljoin(url, href)
        if absolute not in seen:
            seen.add(absolute)
            pdf_links.append(absolute)
    return pdf_links

def download_pdf(url, save_dir=SAVE_DIR, timeout=60):
    """Download ``url`` into ``save_dir`` unless that file is already present.

    The file is named after the last path segment of the URL. The response is
    written to a temporary ``.part`` file and renamed only once it is complete,
    so an interrupted or failed download never leaves behind a file that a
    later run would mistake for a finished one.

    Args:
        url: Address of the PDF to fetch.
        save_dir: Target directory; created if it does not exist.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        ValueError: If no file name can be derived from ``url``.
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    # Ignore any ?query or #fragment when deriving the file name.
    filename = url.split("#")[0].split("?")[0].rstrip("/").split("/")[-1]
    if not filename:
        raise ValueError(f"Cannot derive a file name from URL: {url!r}")

    filepath = os.path.join(save_dir, filename)
    if os.path.exists(filepath):  # avoid duplicates
        return

    print(f"Downloading: {filename}")
    r = requests.get(url, timeout=timeout)
    # Without this check an HTML error page would be stored under a .pdf name.
    r.raise_for_status()

    os.makedirs(save_dir, exist_ok=True)
    tmp_path = filepath + ".part"
    try:
        with open(tmp_path, "wb") as f:
            f.write(r.content)
        os.replace(tmp_path, filepath)
    finally:
        # Only still present when writing or renaming failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

# Loop through the 7 pages
all_pdfs = []
for page in range(1, 8):
    links = get_pdf_links(page)
    all_pdfs.extend(links)

# Remove duplicates just in case; dict.fromkeys keeps first-seen order, so the
# download order is the same on every run (a set would shuffle it).
all_pdfs = list(dict.fromkeys(all_pdfs))

# Download PDFs; one broken link or network hiccup must not stop the others.
failed_pdfs = []
for pdf in all_pdfs:
    try:
        download_pdf(pdf)
    except OSError as exc:  # requests' exceptions derive from OSError
        print(f"Failed: {pdf} ({exc})")
        failed_pdfs.append(pdf)


Downloading: 01K09726RVCV6HJN2ZCCSMP4EQ.pdf
Downloading: 01JZ7KGAR4GPMNPMRCM5P1M9ST.pdf
Downloading: 01JP7XEH05QF9NJM4TGRY2RMTY.pdf
Downloading: 01JYA1DANZCTPZA6ZC6HMHVKDD.pdf
Downloading: Kenya_Power_Sustainability_Strategy_2024_18th_Nov.pdf
Downloading: 01J1FA2JXK3QNX0PM6YAP0P3T2.pdf
Downloading: Final_E_MOBILITY_CONFERENCE_REPORT_(DRAFT_10).pdf
Downloading: 01JS1XT0CJA8PCTTCWZ8FS8H8M.pdf
Downloading: 01J8PJD2N8FWYSXJ3AD11TQH8C.pdf
Downloading: 01JWE3DAJTH2DBTJK6BZGEZDK9.pdf
Downloading: Customer_Complaint_Policy_and_Process_7_in_x_9.5_in_final.pdf
Downloading: 01JTR3EEJMZGWYZXDV0TXP9XH0.pdf
Downloading: 01JF74GM6PZ4W0V431G0CW2J28.pdf
Downloading: 01JDSFBFT07Q0ARPMF06P19WF7.pdf
Downloading: 01J7GBQNFHE57VDQ3VE5P79VM5.pdf
Downloading: 01JVCQ8RJYH48W20C99ZRZNVCN.pdf
Downloading: 01K1JKNRVT8MN93RGC2KPH6248.pdf
Downloading: 01JCPTZ0W41NNE0Q0M14HE7P5Q.pdf
Downloading: 01K136H6CNFRF7PMN1S0DY1S6S.pdf
Downloading: 01JM1N87MTDJ5242CE8GGQBCRP.pdf
Downloading: 01JPSEJ4JPEP01G1Q4VXF4NKAH.pdf
Dow

In [2]:
# Number of unique PDF links gathered from the listing pages.
len(all_pdfs)

61

# Extract PDF text

In [3]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [4]:
import PyPDF2

# Open one downloaded PDF in binary mode and inspect it with PyPDF2.
with open("/kaggle/working/kplc_pdfs/01J1FA2JXK3QNX0PM6YAP0P3T2.pdf", "rb") as f:
    reader = PyPDF2.PdfReader(f)
    print(f"Number of pages: {len(reader.pages)}")
    
    # Read first page only. `text` stays defined at notebook level and is the
    # input that the parsing cell below passes to parse_schedule.
    first_page = reader.pages[0]
    text = first_page.extract_text()
    print(text)


Number of pages: 2
                                                                                                         Interruption of  
Electricity Supply   
Notice is hereby given under R ule 27 of the Electric Power Rules  
That the electricity supply will be interrupted as here under:  
(It is necessary to interrupt supply periodically in order to facilitate 
maintenance and upgrade of power lines to the network; to connect new 
customers or to replace power lines during road construction, etc.)  
 
NAIROBI REGION  
AREA: PART OF EASTLEIGH  
DATE: Sunday 30.06.2024                                             TIME: 9.00 A.M. – 5.00 P.M.  
Mohammed Yusuf Haji Ave, I smail Hassan, Moyale Mall, Sirare, Taran, Pumwani 
Hosp, California, Care Hosp, Mabruk & adjacent customers.  
 
AREA: KYANGOMBE, SYOKIMAU  
DATE: Sunday 30.06.2024                                             TIME: 9.00 A.M.  – 5.00 P.M.  
Inland Container Deport (I CD), Kyang’ombe, Heavy Engineering, Tile & 
Carpet,

In [5]:
import re
import json
import csv

def parse_schedule(raw_text):
    """Parse the text of an interruption notice into one record per ``AREA:`` entry.

    Args:
        raw_text: Text extracted from a notice PDF. ``None`` or an empty
            string yields an empty list.

    Returns:
        A list of dicts with the keys ``region``, ``area``, ``date``, ``time``
        and ``locations``:

        * ``region`` - the closest preceding all-caps header ending in
          ``REGION`` or ``COUNTY``, or ``None`` if there is none.
        * ``area`` - the text between ``AREA:`` and ``DATE:``.
        * ``date`` - e.g. ``"Sunday 30.06.2024"`` (one- or two-digit day/month).
        * ``time`` - the full range, e.g. ``"9.00 A.M. – 5.00 P.M."``.
        * ``locations`` - comma-separated names that follow the time, with a
          trailing region/county header removed.

        Fields that cannot be found are ``None`` (``locations`` is ``[]``).
    """
    if not raw_text:
        return []

    # Collapse every whitespace run so the patterns can assume single spaces.
    text = re.sub(r"\s+", " ", raw_text)

    header_re = re.compile(r"[A-Z\s]+(?:REGION|COUNTY)")
    # ".+?" rather than "[^D]+?": area names may contain the letter D.
    area_re = re.compile(r"\s*(.+?)\s+DATE:")
    date_re = re.compile(r"DATE:\s*([A-Za-z]+\s+\d{1,2}\.\d{1,2}\.\d{4})")
    # One clock reading such as "9.00 A.M."; \u2013/\u2014 are the en/em dash
    # used between the start and end time.
    clock = r"\d{1,2}(?:[.:]\d{2})?\s*[AP]\.?M\.?"
    time_re = re.compile(
        r"TIME[: ]+\s*(" + clock + r"(?:\s*[\u2013\u2014-]\s*" + clock + r")?)"
    )

    # Work from the offsets of the AREA: markers instead of searching for each
    # block's text, which picks the wrong place when a block is repeated.
    marks = list(re.finditer(r"AREA:", text))
    if not marks:
        return []

    # Region in force for the first notice: last header before the first AREA:.
    preamble_headers = header_re.findall(text[:marks[0].start()])
    region = preamble_headers[-1].strip() if preamble_headers else None

    results = []
    for idx, mark in enumerate(marks):
        block_end = marks[idx + 1].start() if idx + 1 < len(marks) else len(text)
        block = text[mark.end():block_end]

        # Extract area name
        area_match = area_re.match(block)
        area = area_match.group(1).strip() if area_match else None

        # Extract date
        date_match = date_re.search(block)
        date = date_match.group(1) if date_match else None

        # Extract time; fall back to the bare marker when the format is unknown
        time_match = time_re.search(block)
        time_marker = time_match or re.search(r"TIME[: ]+", block)
        time = time_match.group(1).strip() if time_match else None

        # Everything after the last recognised field: locations, possibly
        # followed by the header that introduces the next region.
        if time_marker:
            rest_start = time_marker.end()
        elif date_match:
            rest_start = date_match.end()
        elif area_match:
            rest_start = area_match.end()
        else:
            rest_start = 0
        rest = block[rest_start:]

        headers = list(header_re.finditer(rest))
        locations_raw = rest[:headers[0].start()] if headers else rest
        if not time_marker:
            # As before, locations are only read from text that follows TIME.
            locations_raw = ""
        locations = [loc.strip() for loc in locations_raw.split(",") if loc.strip()]

        results.append({
            "region": region,
            "area": area,
            "date": date,
            "time": time,
            "locations": locations
        })

        # A header at the end of this block applies to the notices that follow.
        if headers:
            region = headers[-1].group().strip()

    return results


# Example usage: `text` is the page text extracted in the PyPDF2 cell above
raw_text = text
parsed = parse_schedule(raw_text)

# Save JSON (explicit UTF-8 so the result does not depend on the platform locale)
with open("kplc_schedule.json", "w", encoding="utf-8") as f:
    json.dump(parsed, f, indent=4)

# Save CSV; the notices contain non-ASCII characters (en dashes, curly
# apostrophes), so the encoding is stated explicitly here as well.
with open("kplc_schedule.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["region", "area", "date", "time", "locations"])
    writer.writeheader()
    for row in parsed:
        writer.writerow({
            "region": row["region"],
            "area": row["area"],
            "date": row["date"],
            "time": row["time"],
            # A CSV cell cannot hold a list, so flatten it to one string.
            "locations": "; ".join(row["locations"])
        })


In [6]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m73.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdfi

In [7]:
import pdfplumber
import re
import pandas as pd

def clean_text(text):
    """Normalise text extracted from a PDF page.

    Repairs two mis-decoded character sequences, collapses every run of
    whitespace (including newlines) into a single space and trims the ends.

    Args:
        text: Extracted page text; ``None`` or ``""`` (a page with no
            extractable text) is accepted.

    Returns:
        The cleaned string, or ``""`` when there is no text.
    """
    if not text:
        return ""
    text = text.replace("â€“", "–").replace("â€™", "'")
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def parse_pdf(pdf_path):
    """Split a notice PDF into chunks at every ``DATE:`` and tabulate them.

    Each page's text is cleaned with ``clean_text`` and cut in front of every
    ``DATE:``; one row is produced per non-empty chunk, including the chunk
    that precedes the first ``DATE:`` on a page.

    NOTE: a later cell of this notebook defines another ``parse_pdf`` that
    replaces this one once it has run.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        A DataFrame with the string columns ``region``, ``date``,
        ``start_time``, ``end_time`` and ``locations``. A field that is not
        found in a chunk is ``""``; when no time range is found,
        ``locations`` holds the whole chunk.
    """
    date_re = re.compile(
        r"(Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday)\s+\d{2}\.\d{2}\.\d{4}"
    )
    # \u2013/\u2014 are the en/em dash; a plain hyphen is accepted as well.
    time_re = re.compile(
        r"(\d{1,2}\.\d{2}\s*(?:A\.M\.|P\.M\.))\s*[\u2013\u2014-]\s*(\d{1,2}\.\d{2}\s*(?:A\.M\.|P\.M\.))"
    )
    region_re = re.compile(r"(COUNTY AREA:|REGION AREA:|PARTS OF [A-Z]+|COUNTY [A-Za-z ]+)")

    rows = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            raw = page.extract_text()
            if not raw:
                # Page without extractable text (e.g. a scanned image).
                continue
            text = clean_text(raw)

            # Split on 'DATE:' to chunk schedule entries
            for entry in re.split(r"(?=DATE:)", text):
                if not entry.strip():
                    continue

                # Date
                date_match = date_re.search(entry)
                date = date_match.group(0) if date_match else ""

                # Start & End times
                time_match = time_re.search(entry)
                start_time, end_time = time_match.groups() if time_match else ("", "")

                # Region / Area
                region_match = region_re.search(entry)
                region = region_match.group(0) if region_match else ""

                # Locations (everything after the end time)
                if time_match:
                    locations = entry[time_match.end():].strip()
                else:
                    locations = entry.strip()

                rows.append([region, date, start_time, end_time, locations])

    return pd.DataFrame(rows, columns=["region", "date", "start_time", "end_time", "locations"])

# Parse one downloaded notice, save the table as CSV and preview the first rows.
df = parse_pdf("/kaggle/working/kplc_pdfs/01JA7Y3RMRZVR9NNYMC4VRH4ZN.pdf")
df.to_csv("power_schedule_clean.csv", index=False)  # index=False: leave out the row-index column
print(df.head(10))


             region                  date  start_time   end_time  \
0   PARTS OF KISUMU                                                
1      REGION AREA:     Sunday 13.10.2024   9.00 A.M.  3.00 P.M.   
2                       Monday 14.10.2024  10.00 A.M.  4.00 P.M.   
3                      Tuesday 15.10.2024   9.00 A.M.  5.00 P.M.   
4                      Tuesday 15.10.2024                          
5  PARTS OF BUNGOMA  Wednesday 16.10.2024   9.00 A.M.  5.00 P.M.   
6                      Tuesday 15.10.2024   9.00 A.M.  5.00 P.M.   
7                      Tuesday 15.10.2024   8.00 A.M.  4.00 P.M.   
8   PARTS OF MIGORI  Wednesday 16.10.2024                          
9                    Wednesday 16.10.2024                          

                                           locations  
0  Power Maintenance Notice WESTERN REGION Notice...  
1  customers or to replace power lines during roa...  
2  AREA: PART OF MATHARE Kibos Prison, Cibol Indu...  
3  Kajulu Est, Kindo, Kianja, O

In [8]:
# Column labels of the DataFrame returned by parse_pdf.
df.columns

Index(['region', 'date', 'start_time', 'end_time', 'locations'], dtype='object')

In [9]:
import pdfplumber

# Print the raw pdfplumber text of every page to inspect the line layout.
with pdfplumber.open("/kaggle/working/kplc_pdfs/01JA7Y3RMRZVR9NNYMC4VRH4ZN.pdf") as pdf:
    print(f"Number of pages: {len(pdf.pages)}")
    
    # Extract text from all pages
    # NOTE: this rebinds the notebook-level `text` assigned in the PyPDF2 cell.
    for i, page in enumerate(pdf.pages):
        text = page.extract_text()
        # i is zero-based; i+1 is the human-readable page number.
        print(f"\n--- Page {i+1} ---\n{text}")


Number of pages: 2

--- Page 1 ---
Power Maintenance Notice WESTERN REGION
Notice is hereby given under Rule 27 of the Electric Power Rules
PARTS OF KISUMU COUNTY
That the electricity supply will be interrupted as here under:
(It is necessary to interrupt supply periodically in order to facilitate AREA: KEDA CERAMICS
maintenance and upgrade of power lines to the network; to connect new DATE: Sunday 13.10.2024 TIME: 9.00 A.M. – 3.00 P.M.
customers or to replace power lines during road construction, etc.) Chepsweta, Kibigori Junction, KEDA Ceramics, Miwani, Jagir Sing & adjacent
customers.
NAIROBI REGION
AREA: CHIGA, KIBOS SUGAR RESEARCH
DATE: Monday 14.10.2024 TIME: 10.00 A.M. – 4.00 P.M.
AREA: PART OF MATHARE Kibos Prison, Cibol Industries, White Coal, Kibos Sugar Research, Properties
DATE: Tuesday 15.10.2024 TIME: 9.00 A.M. – 5.00 P.M. Kajulu Est, Kindo, Kianja, Okoko, Kajulu Waters, Bungu, Pifo Boys, Kasram, Wildo,
Mathare Area 1, Drive Inn Pri Sch, Shell Petrol Stn, Naivas S/Mkt, Al

In [10]:
import pdfplumber
import re
import json
import csv
from pathlib import Path

# -------- CONFIG --------
PDF_FILE = "/kaggle/working/kplc_pdfs/01J1FA2JXK3QNX0PM6YAP0P3T2.pdf"   # change to your PDF filename
# The output names follow the PDF's base name, so only PDF_FILE needs editing.
JSON_FILE = f"{Path(PDF_FILE).stem}.json"
CSV_FILE = f"{Path(PDF_FILE).stem}.csv"

# -------- REGEX PATTERNS --------
# All-caps region header such as "NAIROBI REGION" or "MT. KENYA REGION"; only
# the header itself is captured, not the rest of the line it appears on.
region_pattern = re.compile(r"\b([A-Z][A-Z .'&-]*REGION)\b")
county_pattern = re.compile(r"PARTS OF ([A-Z ]+) COUNTY")
area_pattern = re.compile(r"AREA:\s*(.*)")
# One clock reading such as "9.00 A.M."; \u2013 is the en dash the notices use.
_CLOCK = r"\d{1,2}[.:]\d{2}\s*[AP]\.?M\.?"
_TIME_RANGE = re.compile(_CLOCK + r"\s*[\u2013-]\s*" + _CLOCK)
# Group 2 prefers an exact "start - end" range. The loose character class is
# kept as a fallback for unusual formats; on its own it would also swallow a
# leading A, P or M of a customer name that follows the time on the same line.
date_time_pattern = re.compile(
    r"DATE:\s*([A-Za-z]+\s*\d{1,2}\.\d{1,2}\.\d{4})\s*TIME:\s*("
    + _CLOCK + r"\s*[\u2013-]\s*" + _CLOCK
    + r"|[\d.:APM\s\u2013-]+)"
)


def _finalise_notice(notice):
    """Strip the collected customer fragments and drop the empty ones."""
    notice["customers"] = [c.strip() for c in notice["customers"] if c.strip()]
    return notice


def parse_pdf(pdf_file):
    """Extract the interruption notices of a PDF, reading it line by line.

    A line mentioning an all-caps ``... REGION`` header sets the current
    region (and clears the county), a ``PARTS OF ... COUNTY`` line sets the
    county, an ``AREA:`` line opens a new notice, a ``DATE: ... TIME: ...``
    line fills in its date and time, and every other line is split on commas
    and added to the open notice's customers.

    Args:
        pdf_file: Path of the PDF to read.

    Returns:
        A list of dicts with the keys ``region``, ``county``, ``area``,
        ``date``, ``time`` and ``customers`` (a list of strings). ``region``
        and ``county`` are the headers in force when the notice's ``AREA:``
        line was read; missing values are ``None``.
    """
    records = []
    current_region = None
    current_county = None
    notice = None  # notice being collected; None until an AREA: line is seen

    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                continue

            for line in text.split("\n"):
                line = line.strip()

                # Detect region
                region_match = region_pattern.search(line)
                if region_match:
                    current_region = region_match.group(1).strip()
                    current_county = None  # reset
                    continue

                # Detect county
                county_match = county_pattern.search(line)
                if county_match:
                    current_county = county_match.group(1).title().strip()
                    continue

                # Detect area
                area_match = area_pattern.search(line)
                if area_match:
                    # Save previous notice if exists
                    if notice is not None:
                        records.append(_finalise_notice(notice))
                    # Start new notice. Region and county are captured now: a
                    # header met later belongs to the notices that follow it.
                    area = area_match.group(1).strip()
                    if area:
                        notice = {
                            "region": current_region,
                            "county": current_county,
                            "area": area,
                            "date": None,
                            "time": None,
                            "customers": [],
                        }
                    else:
                        # As before, an AREA: line without a name opens no notice.
                        notice = None
                    continue

                # Detect date & time
                dt_match = date_time_pattern.search(line)
                if dt_match:
                    if notice is not None:
                        time_text = dt_match.group(2)
                        notice["date"] = dt_match.group(1).strip()
                        notice["time"] = time_text.replace("\u2013", "-").strip()
                        # Customers may continue on the DATE/TIME line; keep
                        # them only when the end of the time is unambiguous.
                        if _TIME_RANGE.fullmatch(time_text):
                            notice["customers"].extend(line[dt_match.end():].split(","))
                    continue

                # Customers (long lists); a DATE: line whose time could not be
                # parsed is not a customer list.
                if notice is not None and not line.startswith("DATE:"):
                    notice["customers"].extend(line.split(","))

    # Save last block
    if notice is not None:
        records.append(_finalise_notice(notice))

    return records

def save_json(data, filename):
    """Write ``data`` to ``filename`` as UTF-8 JSON indented by two spaces.

    Non-ASCII characters are written as-is rather than as ``\\uXXXX`` escapes.
    """
    serialised = json.dumps(data, indent=2, ensure_ascii=False)
    Path(filename).write_text(serialised, encoding="utf-8")

def save_csv(data, filename):
    """Write the notices in ``data`` to ``filename`` as a UTF-8 CSV file.

    One row is written per notice under the header region, county, area,
    date, time, customers; the customers list is joined with ``"; "`` so that
    it fits into a single cell.
    """
    columns = ["region", "county", "area", "date", "time", "customers"]

    def flatten(notice):
        # Pick exactly the CSV columns, then collapse the list-valued one.
        flat = {name: notice[name] for name in columns}
        flat["customers"] = "; ".join(notice["customers"])  # flatten list
        return flat

    with open(filename, "w", newline="", encoding="utf-8") as handle:
        out = csv.DictWriter(handle, fieldnames=columns)
        out.writeheader()
        out.writerows(flatten(notice) for notice in data)

# -------- RUN --------
# Parse the configured PDF, then write the notices in both output formats.
records = parse_pdf(PDF_FILE)
print(f"Extracted {len(records)} notices")

save_json(records, JSON_FILE)
save_csv(records, CSV_FILE)
print(f"Saved {JSON_FILE} and {CSV_FILE}")


Extracted 26 notices
Saved 01J1FA2JXK3QNX0PM6YAP0P3T2.json and 01J1FA2JXK3QNX0PM6YAP0P3T2.csv
