# Extract Laptop Specifications from Local PDFs

This notebook extracts laptop specifications from locally available PDF datasheets for Lenovo ThinkPad and HP ProBook models.

It uses `pdfplumber` for PDF parsing and regular expressions for information extraction.

In [1]:
# If running in a new environment, install the packages this notebook imports:
# %pip install pdfplumber pandas

import os
import re
from dataclasses import dataclass
from typing import Dict, List, Optional, Any
import pdfplumber

## Data Class for Specifications

In [2]:
@dataclass
class LaptopSpecification:
    """Structured record of the specification fields taken from one laptop datasheet.

    Every field is annotated as a plain string except ``ports`` (a list of
    strings) and ``additional_features`` (an optional free-form mapping).
    """
    brand: str
    model: str
    processor: str
    memory: str
    storage: str
    display: str
    graphics: str
    operating_system: str
    dimensions: str
    weight: str
    ports: List[str]
    wireless: str
    battery: str
    warranty: str
    # Optional extra key/value data; None means nothing was supplied.
    additional_features: Optional[Dict[str, Any]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the public fields as a plain dictionary.

        List and dict values are shallow-copied so that mutating the returned
        dictionary (for example appending to ``ports``) does not alter this
        instance. An ``additional_features`` of ``None`` is reported as an
        empty dict.

        Returns:
            A new dict mapping field name to value, in field order.
        """
        result: Dict[str, Any] = {}
        for name, value in self.__dict__.items():
            if name.startswith('_'):
                continue
            # Copy mutable containers so the caller never shares them with us.
            if isinstance(value, list):
                value = list(value)
            elif isinstance(value, dict):
                value = dict(value)
            result[name] = value
        if self.additional_features is None:
            result['additional_features'] = {}
        return result

## PDF Extractor Class (Local PDFs)

In [3]:
class PDFExtractor:
    """Extract laptop specifications from local PDF documents.

    Each supported product line has a table mapping a specification field to a
    ``(label regex, stop regex)`` pair. A field's value is the text captured
    lazily between a label match and the first following stop keyword.

    NOTE(review): labels are searched anywhere in the text, not only at section
    headings, so an earlier mention of the same word (e.g. in a table header)
    can be captured instead of the intended section. Verify the output against
    the source datasheet.
    """

    # Value stored for any text field whose label/stop pair is not found.
    _MISSING = "Not specified"

    # field name -> (label regex, alternation of stop keywords) for ThinkPad datasheets.
    _LENOVO_SECTIONS = {
        "processor": (r'Processor', r'WWAN|Graphics|Memory'),
        "memory": (r'Memory', r'Storage|Display'),
        "storage": (r'Storage', r'Display|Optical'),
        "display": (r'Display', r'Graphics|Camera'),
        "graphics": (r'Graphics', r'Camera|Audio'),
        "operating_system": (r'Operating System', r'Pre-installed|Dimensions'),
        "dimensions": (r'Dimensions \(H x W x D\)', r'Weight|Battery'),
        "weight": (r'Weight', r'Battery|WLAN'),
        "wireless": (r'WLAN', r'WWAN|Bluetooth'),
        "battery": (r'Battery', r'Power Adapter|AC Adapter'),
        "warranty": (r'Warranty', r'Note:|Accessories'),
    }
    _LENOVO_PORTS = (r'Ports / Slots', r'Security|Keyboard')

    # field name -> (label regex, alternation of stop keywords) for ProBook datasheets.
    _HP_SECTIONS = {
        "processor": (r'Processor(?:s)?', r'Chipset|Graphics'),
        "memory": (r'Memory', r'Storage|Internal'),
        "storage": (r'Storage', r'Display|Audio'),
        "display": (r'Display', r'Graphics|Audio'),
        "graphics": (r'Graphics', r'Audio|Camera'),
        "operating_system": (r'Operating system', r'Management|Power'),
        "dimensions": (r'Dimensions \(W x D x H\)', r'Weight|Power'),
        "weight": (r'Weight', r'Power|Battery'),
        "wireless": (r'Wireless technologies', r'Ports|Camera'),
        "battery": (r'Battery type', r'Power|Camera'),
        "warranty": (r'Warranty', r'Footnotes|Additional'),
    }
    _HP_PORTS = (r'Ports and Connectors', r'Webcam|Camera|Input')

    @staticmethod
    def _section(text: str, label: str, stop: str, default: str) -> str:
        """Return the stripped text between ``label`` and the first ``stop`` keyword, or ``default``."""
        pattern = label + r'\s+(.*?)(?:' + stop + r')'
        match = re.search(pattern, text, re.DOTALL)
        return match.group(1).strip() if match else default

    @staticmethod
    def _normalize_model(match, fallback: str) -> str:
        """Return the matched model name with whitespace runs collapsed, or ``fallback``."""
        if not match:
            return fallback
        # The model regexes use \s+, so a name wrapped across PDF lines would
        # otherwise keep an embedded newline (e.g. "Notebook\nPC").
        return " ".join(match.group(1).split())

    def _build_specification(self, text: str, brand: str, model: str,
                             sections, ports_section) -> LaptopSpecification:
        """Apply a brand's section table to ``text`` and assemble the result record."""
        values = {
            name: self._section(text, label, stop, self._MISSING)
            for name, (label, stop) in sections.items()
        }
        ports_label, ports_stop = ports_section
        ports_text = self._section(text, ports_label, ports_stop, "")
        # One port entry per non-blank line of the captured section.
        ports = [line.strip() for line in ports_text.split('\n') if line.strip()]
        return LaptopSpecification(
            brand=brand,
            model=f"{brand} {model}",
            ports=ports,
            additional_features={},
            **values
        )

    def extract_text_from_pdf(self, pdf_path: str) -> List[str]:
        """Return the extracted text of each page of ``pdf_path`` that yields any text.

        Pages for which ``extract_text()`` returns nothing are skipped, so the
        list may be shorter than the page count (or empty).
        """
        text_content = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    text_content.append(text)
        return text_content

    def _extract_lenovo_thinkpad_specs(self, text_content: List[str]) -> LaptopSpecification:
        """Build a specification record from the page texts of a ThinkPad datasheet.

        Only model names of the form ``E<digits> Gen <n> (<variant>)`` are
        recognised; otherwise the model is "Unknown ThinkPad Model".
        """
        text = "\n".join(text_content)
        model_match = re.search(r'ThinkPad\s+(E\d+\s+Gen\s+\d+\s+\([^)]+\))', text)
        model = self._normalize_model(model_match, "Unknown ThinkPad Model")
        return self._build_specification(
            text, "Lenovo", model, self._LENOVO_SECTIONS, self._LENOVO_PORTS
        )

    def _extract_hp_probook_specs(self, text_content: List[str]) -> LaptopSpecification:
        """Build a specification record from the page texts of an HP ProBook datasheet.

        The model is the first ``ProBook ... Notebook PC`` phrase found
        (case-insensitive); otherwise it is "Unknown HP ProBook".
        """
        text = "\n".join(text_content)
        model_match = re.search(r'(ProBook\s+.*?Notebook\s+PC)', text, re.IGNORECASE)
        model = self._normalize_model(model_match, "Unknown HP ProBook")
        return self._build_specification(
            text, "HP", model, self._HP_SECTIONS, self._HP_PORTS
        )

    def extract_specs(self, pdf_path: str) -> LaptopSpecification:
        """Extract a specification record from the datasheet at ``pdf_path``.

        The product line is chosen by looking for "ThinkPad" and then "ProBook"
        in the extracted text.

        Raises:
            ValueError: if no text could be extracted from the PDF, or if the
                text mentions neither supported product line.
        """
        text_content = self.extract_text_from_pdf(pdf_path)
        if not text_content:
            # Distinguish "nothing readable" from "readable but unsupported".
            raise ValueError(f"No extractable text found in PDF: {pdf_path}")
        full_text = "\n".join(text_content)
        if "ThinkPad" in full_text:
            return self._extract_lenovo_thinkpad_specs(text_content)
        elif "ProBook" in full_text:
            return self._extract_hp_probook_specs(text_content)
        else:
            raise ValueError(f"Unsupported laptop brand/model in PDF: {pdf_path}")

## Example Usage: Extract Specs from Local PDFs

In [4]:
# Update this list with the paths to your local PDF files:
local_pdf_paths = [
    "./ThinkPad_E14_Gen_5_Intel_Spec.pdf",
    "./ThinkPad_E14_Gen_5_AMD_Spec.pdf",
    "./HP ProBook 440.pdf",
    "./HP ProBook 450.pdf",
]

extractor = PDFExtractor()
results = []  # one LaptopSpecification per successfully parsed PDF, in input order

for pdf_path in local_pdf_paths:
    try:
        specs = extractor.extract_specs(pdf_path)
        results.append(specs)
        # Short per-file summary; the full record stays in `results`.
        print(
            f"Successfully extracted specs for {specs.model}",
            f"Processor: {specs.processor}",
            f"Memory: {specs.memory}",
            f"Storage: {specs.storage}",
            "-" * 50,
            sep="\n",
        )
    except Exception as exc:
        # Best-effort batch: report the failing file and carry on with the rest.
        print(f"Error processing {pdf_path}: {exc}")

Successfully extracted specs for Lenovo E14 Gen 5 (Intel)
Processor: Processor Family
13th Generation Intel® U, P or H Series Core i3 / i5 / i7 Processor
Processor**
Processor
Cores Threads Base Frequency Max Frequency Cache Processor
Memory: TGP Key Features
Intel® UHD Graphics Integrated Shared Share CPU TDP DirectX® 12.1
Intel® Iris® Xe Graphics[1] Integrated Shared Share CPU TDP DirectX® 12.1
NVIDIA® GeForce MX550 Discrete 2GB GDDR6 - DirectX® 12
ThinkPad E14 Gen 5 (Intel) - August 06 2025 2 of 10
PSREF
Product Specifications
ThinkPad E14 Gen 5 (Intel)
Reference
Notes:
[1] Intel® Iris® Xe Graphics capability requires system to be configured with dual-channel memory. On the system with
single-channel memory, Intel® Iris® Xe Graphics will function as Intel® UHD Graphics.
Monitor Support
Monitor Support
Supports up to 4 independent displays (native display and 3 external monitors via HDMI®, USB-C® and Thunderbolt™)
• HDMI® supports up to 4096x2160@60Hz
• USB-C® supports up to 4096x216

## Display All Extracted Specifications as a DataFrame

In [5]:
import pandas as pd

# Turn each extracted record into a plain dict: one DataFrame row per PDF,
# one column per specification field.
specs_dicts = [extracted.to_dict() for extracted in results]
df = pd.DataFrame(specs_dicts)
df

Unnamed: 0,brand,model,processor,memory,storage,display,graphics,operating_system,dimensions,weight,ports,wireless,battery,warranty,additional_features
0,Lenovo,Lenovo E14 Gen 5 (Intel),"Processor Family\n13th Generation Intel® U, P ...",TGP Key Features\nIntel® UHD Graphics Integrat...,"Max Storage Support[1]\nUp to two drives, 2x M...",Display**[1]\nViewing\nAspect Contrast Color R...,Name\n6 (2 P-core + 4 E- P-core 1.2GHz / E-cor...,Operating System**\n• Windows® 11 Pro\n• Windo...,Not specified,"Aluminium (top), PC-ABS (bottom) models Starti...",[],+,Battery**[1]\n• 47Wh Rechargeable Li-ion Batte...,Base Warranty**\n• 1-year mail-in service\n• 1...,{}
1,Lenovo,Lenovo E14 Gen 5 (AMD),Processor Family\nAMD Ryzen™ 3 / 5 / 7 Process...,TGP Key Features\nAMD Radeon™ Graphics Integra...,"Max Storage Support[1]\nUp to two drives, 2x M...",Display**[1]\nViewing\nAspect Contrast Color R...,AMD Ryzen™ 3 7330U 4 8 2.3GHz 4.3GHz 2MB L2 / ...,Operating System**\n• Windows® 11 Pro\n• Windo...,Not specified,"Aluminium (top), PC-ABS (bottom) models Starti...",[],+,Battery**[1]\n• 47Wh Rechargeable Li-ion Batte...,Base Warranty**\n• 1-year mail-in service\n• 1...,{}
2,HP,HP ProBook 440 14 inch G11 Notebook\nPC,family5 Intel® Core™ Ultra 5 processor\nIntel®...,slots 2 SODIMM\n512 GB up to 1 TB PCIe® Gen4x4...,Not specified,"size (diagonal,\n35.6 cm (14"")\nmetric)\n14"" d...",Discrete: NVIDIA® GeForce RTX™ 2050 Laptop GPU...,Not specified,Not specified,(Weight will vary by configuration. Does not i...,[],Not specified,Battery is internal and not replaceable by cus...,coverage is also available. HP Care Pack Servi...,{}
3,HP,HP ProBook 450 15.6 inch G10 Notebook PC,family6 13th Generation Intel® Core™ i7 proces...,slots 2 SODIMM\nup to 1 TB PCIe® NVMe™ M.2 SSD...,Not specified,"size (diagonal, metric) 39.6 cm (15.6"")\n39.6 ...","(0.9 GHz E-core base frequency, 1.2 GHz P-core...",Not specified,Not specified,(Weight will vary by configuration. Does not i...,[],Not specified,Battery is internal and not replaceable by cus...,and extended coverage is also available. HP Ca...,{}


In [None]:
# Lift pandas' display limits so every row, column and cell is shown untruncated.
# These settings stay in effect for the rest of the session.
for display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

display(df)

In [None]:
# Optional: uncomment to save the extracted specifications to a CSV file.
#df.to_csv("laptop_specifications.csv", index=False)

In [6]:
# Print a summary of the frame: column names, non-null counts and dtypes.
# Text fields that could not be extracted hold the string "Not specified"
# rather than a null, so the non-null counts do not reveal missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   brand                4 non-null      object
 1   model                4 non-null      object
 2   processor            4 non-null      object
 3   memory               4 non-null      object
 4   storage              4 non-null      object
 5   display              4 non-null      object
 6   graphics             4 non-null      object
 7   operating_system     4 non-null      object
 8   dimensions           4 non-null      object
 9   weight               4 non-null      object
 10  ports                4 non-null      object
 11  wireless             4 non-null      object
 12  battery              4 non-null      object
 13  warranty             4 non-null      object
 14  additional_features  4 non-null      object
dtypes: object(15)
memory usage: 612.0+ bytes


In [7]:
# Show the column labels of the specifications DataFrame.
df.columns

Index(['brand', 'model', 'processor', 'memory', 'storage', 'display',
       'graphics', 'operating_system', 'dimensions', 'weight', 'ports',
       'wireless', 'battery', 'warranty', 'additional_features'],
      dtype='object')