## Data Extraction

Using the UK immigration APIs to get the rules relevant for the project

In [36]:
import requests
import pandas as pd

# Base URL for GOV.UK Content API
BASE_URL = "https://www.gov.uk/api/content"

# Immigration Rules entry page
IMMIGRATION_RULES_PATH = "/guidance/immigration-rules"

def fetch_content(path):
    """Fetch content JSON from GOV.UK Content API."""
    url = BASE_URL + path
    response = requests.get(url)
    response.raise_for_status()
    return response.json()

# Step 1: Get Immigration Rules page
immigration_data = fetch_content(IMMIGRATION_RULES_PATH)

# Step 2: Explore top-level keys
print("Top-level keys:", immigration_data.keys())

# Step 3: Get page details
title = immigration_data.get("title")
description = immigration_data.get("description")
document_type = immigration_data.get("document_type")
print(f"Title: {title}\nType: {document_type}\nDescription: {description}")

# Step 4: Get links to sub-pages
links = immigration_data.get("links", {})
subpages = links.get("ordered_related_items", [])  # Many rules fall under this category

print(f"\nFound {len(subpages)} related sub-pages.")
for sp in subpages[:10]:  # print first 10
    print(f"- {sp['title']} -> {sp['api_path']}")

# Step 5: Convert sub-pages into a DataFrame for exploration
subpage_data = [
    {
        "title": sp.get("title"),
        "description": sp.get("description"),
        "path": sp.get("base_path"),
        "api_url": "https://www.gov.uk" + sp.get("api_path", "")
    }
    for sp in subpages
]

df = pd.DataFrame(subpage_data)
df.head()


Top-level keys: dict_keys(['analytics_identifier', 'base_path', 'content_id', 'description', 'details', 'document_type', 'first_published_at', 'links', 'locale', 'phase', 'public_updated_at', 'publishing_app', 'publishing_request_id', 'publishing_scheduled_at', 'rendering_app', 'scheduled_publishing_delay_seconds', 'schema_name', 'title', 'updated_at', 'withdrawn_notice'])
Title: Immigration Rules
Type: manual
Description: A collection of the current Immigration Rules.

Found 0 related sub-pages.


In [37]:
# Inspect all available link types
print("Available link types:", immigration_data.get("links", {}).keys())

# Look for sections or documents
sections = immigration_data["links"].get("sections", [])
documents = immigration_data["links"].get("documents", [])

print(f"\nFound {len(sections)} sections and {len(documents)} documents.")

# Preview sections
for sec in sections[:5]:
    print(f"SECTION: {sec['title']} -> {sec['api_path']}")

# Preview documents (actual rule pages)
for doc in documents[:5]:
    print(f"DOCUMENT: {doc['title']} -> {doc['api_path']}")


Available link types: dict_keys(['available_translations', 'document_collections', 'organisations', 'primary_publishing_organisation', 'sections', 'suggested_ordered_related_items', 'taxons'])

Found 105 sections and 0 documents.
SECTION: Immigration Rules: Index -> /api/content/guidance/immigration-rules/immigration-rules-index
SECTION: Immigration Rules: introduction -> /api/content/guidance/immigration-rules/immigration-rules-introduction
SECTION: Immigration Rules part 1: leave to enter or stay in the UK -> /api/content/guidance/immigration-rules/immigration-rules-part-1-leave-to-enter-or-stay-in-the-uk
SECTION: Immigration Rules part 2: transitional provisions  -> /api/content/guidance/immigration-rules/immigration-rules-part-2-transitional-provisions
SECTION: Immigration Rules part 3: students -> /api/content/guidance/immigration-rules/immigration-rules-part-3-students


In [38]:
for key in data['links'].keys():
    print(key, ":", type(data['links'][key]))

TypeError: list indices must be integers or slices, not str

In [46]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

BASE_URL = "https://www.gov.uk"

def fetch_content(path):
    url = BASE_URL + path
    response = requests.get(url)
    response.raise_for_status()
    return response.json()

def clean_html(html_content):
    """Convert HTML body to plain text."""
    soup = BeautifulSoup(html_content, "html.parser")
    return soup.get_text(separator="\n").strip()

# Get sections from Immigration Rules base page
sections = immigration_data["links"].get("sections", [])
print(f"Found {len(sections)} sections.")

# Collect data for each section
data = []
for sec in sections:  # limit to first 10 for testing, remove [:10] later
    section_title = sec.get("title")
    section_path = sec.get("api_path")

    try:
        sec_data = fetch_content(section_path)
        body_html = sec_data.get("details", {}).get("body", "")
        body_text = clean_html(body_html) if body_html else ""
        
        data.append({
            "title": section_title,
            "path": section_path,
            "text": body_text
        })
        print(f"✅ Fetched: {section_title}")
    except Exception as e:
        print(f"❌ Failed: {section_title} ({e})")

# Convert to DataFrame
df = pd.DataFrame(data)
# Save to CSV and JSON
df.to_csv(data_dir/"immigration_rules.csv", index=False)
df.to_json(data_dir/"immigration_rules.json", orient="records", indent=2)

print("\n✅ All sections saved to immigration_rules.csv and immigration_rules.json")


Found 105 sections.
✅ Fetched: Immigration Rules: Index
✅ Fetched: Immigration Rules: introduction
✅ Fetched: Immigration Rules part 1: leave to enter or stay in the UK
✅ Fetched: Immigration Rules part 2: transitional provisions 
✅ Fetched: Immigration Rules part 3: students
✅ Fetched: Immigration Rules part 4: work experience
✅ Fetched: Immigration Rules part 5: working in the UK
✅ Fetched: Immigration Rules part 6: self-employment and business people
✅ Fetched: Immigration Rules part 6A: the points-based system
✅ Fetched: Immigration Rules part 7: other categories
✅ Fetched: Immigration Rules part 8: family members
✅ Fetched: Immigration Rules part 9: grounds for refusal
✅ Fetched: Immigration Rules part 11: asylum
✅ Fetched: Immigration Rules part 11A: temporary protection
✅ Fetched: Immigration Rules part 11B
✅ Fetched: Immigration Rules part 12: Procedure and rights of appeal
✅ Fetched: Immigration Rules part 13: deportation
✅ Fetched: Immigration Rules part 14: stateless persons

In [52]:
pprint.pprint(df.iloc[2].text)

('Leave to enter the United Kingdom\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '\n'
 '7.A person who is neither a British citizen nor a Commonwealth citizen with '
 'the right of abode nor a person who is entitled to enter or remain in the '
 'United Kingdom by virtue of section 3ZA of the Immigration Act 1971  '
 'requires leave to enter the United Kingdom.\n'
 '\n'
 '\n'
 '8.Under Sections 3 and 4 of the Immigration Act 1971 an Immigration Officer '
 'when admitting  to the United Kingdom a person subject to immigration '
 'control under that Act may give leave to enter for a limited period and, if '
 'he does, may impose all or any of the following conditions:\n'
 '      \n'
 '\n'
 '\n'
 '(i) a condition restricting employment or occupation in the United Kingdom;\n'
 '\n'
 '\n'
 '(ii) a condition requiring the person to maintain and accommodate himself, '
 'and any dependants of his, without recourse to public funds;\n'
 '\n'
 '\n'
 '(iii) a condition requiring the person to register with th