## Data Extraction

Using the UK immigration APIs to get the rules relevant for the project

In [32]:
import requests
import pandas as pd

# Base URL for GOV.UK Content API
BASE_URL = "https://www.gov.uk/api/content"

# Immigration Rules entry page
IMMIGRATION_RULES_PATH = "/guidance/immigration-rules"

def fetch_content(path):
    """Fetch content JSON from GOV.UK Content API."""
    url = BASE_URL + path
    response = requests.get(url)
    response.raise_for_status()
    return response.json()

# Step 1: Get Immigration Rules page
immigration_data = fetch_content(IMMIGRATION_RULES_PATH)

# Step 2: Explore top-level keys
print("Top-level keys:", immigration_data.keys())

# Step 3: Get page details
title = immigration_data.get("title")
description = immigration_data.get("description")
document_type = immigration_data.get("document_type")
print(f"Title: {title}\nType: {document_type}\nDescription: {description}")

# Step 4: Get links to sub-pages
links = immigration_data.get("links", {})
subpages = links.get("ordered_related_items", [])  # Many rules fall under this category

print(f"\nFound {len(subpages)} related sub-pages.")
for sp in subpages[:10]:  # print first 10
    print(f"- {sp['title']} -> {sp['api_path']}")

# Step 5: Convert sub-pages into a DataFrame for exploration
subpage_data = [
    {
        "title": sp.get("title"),
        "description": sp.get("description"),
        "path": sp.get("base_path"),
        "api_url": "https://www.gov.uk" + sp.get("api_path", "")
    }
    for sp in subpages
]

df = pd.DataFrame(subpage_data)
df.head()


Top-level keys: dict_keys(['analytics_identifier', 'base_path', 'content_id', 'description', 'details', 'document_type', 'first_published_at', 'links', 'locale', 'phase', 'public_updated_at', 'publishing_app', 'publishing_request_id', 'publishing_scheduled_at', 'rendering_app', 'scheduled_publishing_delay_seconds', 'schema_name', 'title', 'updated_at', 'withdrawn_notice'])
Title: Immigration Rules
Type: manual
Description: A collection of the current Immigration Rules.

Found 0 related sub-pages.


In [33]:
# Inspect all available link types
print("Available link types:", immigration_data.get("links", {}).keys())

# Look for sections or documents
sections = immigration_data["links"].get("sections", [])
documents = immigration_data["links"].get("documents", [])

print(f"\nFound {len(sections)} sections and {len(documents)} documents.")

# Preview sections
for sec in sections[:5]:
    print(f"SECTION: {sec['title']} -> {sec['api_path']}")

# Preview documents (actual rule pages)
for doc in documents[:5]:
    print(f"DOCUMENT: {doc['title']} -> {doc['api_path']}")


Available link types: dict_keys(['available_translations', 'document_collections', 'organisations', 'primary_publishing_organisation', 'sections', 'suggested_ordered_related_items', 'taxons'])

Found 105 sections and 0 documents.
SECTION: Immigration Rules: Index -> /api/content/guidance/immigration-rules/immigration-rules-index
SECTION: Immigration Rules: introduction -> /api/content/guidance/immigration-rules/immigration-rules-introduction
SECTION: Immigration Rules part 1: leave to enter or stay in the UK -> /api/content/guidance/immigration-rules/immigration-rules-part-1-leave-to-enter-or-stay-in-the-uk
SECTION: Immigration Rules part 2: transitional provisions  -> /api/content/guidance/immigration-rules/immigration-rules-part-2-transitional-provisions
SECTION: Immigration Rules part 3: students -> /api/content/guidance/immigration-rules/immigration-rules-part-3-students


In [25]:
for key in data['links'].keys():
    print(key, ":", type(data['links'][key]))

available_translations : <class 'list'>
document_collections : <class 'list'>
organisations : <class 'list'>
primary_publishing_organisation : <class 'list'>
sections : <class 'list'>
suggested_ordered_related_items : <class 'list'>
taxons : <class 'list'>
