## Data Extraction

Use the GOV.UK Content API to fetch the Immigration Rules pages that are relevant to this project.

In [1]:
import requests
import pandas as pd

# Base URL for GOV.UK Content API; fetch_content() appends a page path to it.
BASE_URL = "https://www.gov.uk/api/content"

# Path of the Immigration Rules entry page, relative to BASE_URL.
IMMIGRATION_RULES_PATH = "/guidance/immigration-rules"

def fetch_content(path, timeout=30):
    """Fetch content JSON from GOV.UK Content API.

    Args:
        path: Path appended to BASE_URL, e.g. "/guidance/immigration-rules".
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = BASE_URL + path
    # requests has no default timeout; without one a stalled connection
    # would block the notebook kernel indefinitely.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()

# --- 1. Download the Immigration Rules landing page ---
immigration_data = fetch_content(IMMIGRATION_RULES_PATH)

# --- 2. See which top-level fields the payload exposes ---
print("Top-level keys:", immigration_data.keys())

# --- 3. Headline metadata of the page ---
title, description, document_type = (
    immigration_data.get(field)
    for field in ("title", "description", "document_type")
)
print(f"Title: {title}\nType: {document_type}\nDescription: {description}")

# --- 4. Related sub-pages (the key may be missing, hence the [] default) ---
links = immigration_data.get("links", {})
subpages = links.get("ordered_related_items", [])

print(f"\nFound {len(subpages)} related sub-pages.")
for sp in subpages[:10]:  # preview at most ten entries
    print("- {} -> {}".format(sp["title"], sp["api_path"]))

# --- 5. Tabulate the sub-pages for exploration ---
subpage_data = [
    dict(
        title=sp.get("title"),
        description=sp.get("description"),
        path=sp.get("base_path"),
        api_url="https://www.gov.uk" + sp.get("api_path", ""),
    )
    for sp in subpages
]

df = pd.DataFrame(subpage_data)
df.head()


Top-level keys: dict_keys(['analytics_identifier', 'base_path', 'content_id', 'description', 'details', 'document_type', 'first_published_at', 'links', 'locale', 'phase', 'public_updated_at', 'publishing_app', 'publishing_request_id', 'publishing_scheduled_at', 'rendering_app', 'scheduled_publishing_delay_seconds', 'schema_name', 'title', 'updated_at', 'withdrawn_notice'])
Title: Immigration Rules
Type: manual
Description: A collection of the current Immigration Rules.

Found 0 related sub-pages.


In [2]:
# Which kinds of links does the landing page expose?
print("Available link types:", immigration_data.get("links", {}).keys())

# Pull out the two link groups that could hold the rule pages
rule_links = immigration_data["links"]
sections = rule_links.get("sections", [])
documents = rule_links.get("documents", [])

print(f"\nFound {len(sections)} sections and {len(documents)} documents.")

# Show the first few sections
for sec in sections[:5]:
    print("SECTION: {} -> {}".format(sec["title"], sec["api_path"]))

# Show the first few documents (actual rule pages)
for doc in documents[:5]:
    print("DOCUMENT: {} -> {}".format(doc["title"], doc["api_path"]))


Available link types: dict_keys(['available_translations', 'document_collections', 'organisations', 'primary_publishing_organisation', 'sections', 'suggested_ordered_related_items', 'taxons'])

Found 105 sections and 0 documents.
SECTION: Immigration Rules: Index -> /api/content/guidance/immigration-rules/immigration-rules-index
SECTION: Immigration Rules: introduction -> /api/content/guidance/immigration-rules/immigration-rules-introduction
SECTION: Immigration Rules part 1: leave to enter or stay in the UK -> /api/content/guidance/immigration-rules/immigration-rules-part-1-leave-to-enter-or-stay-in-the-uk
SECTION: Immigration Rules part 2: transitional provisions  -> /api/content/guidance/immigration-rules/immigration-rules-part-2-transitional-provisions
SECTION: Immigration Rules part 3: students -> /api/content/guidance/immigration-rules/immigration-rules-part-3-students


In [3]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path

# Site origin only: the section "api_path" values fetched in this cell already
# start with "/api/content", so this rebinds the BASE_URL set in the first cell.
BASE_URL = "https://www.gov.uk"

# Output directory for the CSV/JSON exports written at the end of this cell.
data_dir = Path("../data")

def fetch_content(path, timeout=30):
    """Fetch and decode a JSON document from GOV.UK.

    Args:
        path: Path appended to BASE_URL. In this cell BASE_URL is the bare
            site origin, so ``path`` is expected to be a full ``api_path``
            such as "/api/content/guidance/immigration-rules/...".
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = BASE_URL + path
    # requests has no default timeout; one stalled request would otherwise
    # hang the whole section crawl.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()

def clean_html(html_content):
    """Return the text of an HTML fragment, one text node per line, trimmed at both ends."""
    parsed = BeautifulSoup(html_content, "html.parser")
    plain_text = parsed.get_text(separator="\n")
    return plain_text.strip()

# Get sections from Immigration Rules base page
sections = immigration_data["links"].get("sections", [])
print(f"Found {len(sections)} sections.")

# Collect data for each section
data = []
for sec in sections:  # all sections; slice (e.g. sections[:10]) for a quick test run
    section_title = sec.get("title")
    section_path = sec.get("api_path")

    try:
        # Fetch section details
        sec_data = fetch_content(section_path)
        # Extract and clean body text
        body_html = sec_data.get("details", {}).get("body", "")
        body_text = clean_html(body_html) if body_html else ""
        # Append to data list
        data.append({
            "title": section_title,
            "path": section_path,
            "text": body_text
        })
        print(f"\u2705 Fetched: {section_title}")
    except Exception as e:
        # Best-effort crawl: report the failure and continue with the next section.
        print(f"\u274C Failed: {section_title} ({e})")

# Convert to DataFrame
df = pd.DataFrame(data)
# The export directory may not exist yet (e.g. on a fresh checkout). Create it
# first so the writes below cannot fail after the whole crawl has finished.
data_dir.mkdir(parents=True, exist_ok=True)
# Save to CSV and JSON
df.to_csv(data_dir/"immigration_rules.csv", index=False)
df.to_json(data_dir/"immigration_rules.json", orient="records", indent=2)

print("\n\u2705 All sections saved to immigration_rules.csv and immigration_rules.json")


Found 105 sections.
✅ Fetched: Immigration Rules: Index
✅ Fetched: Immigration Rules: introduction
✅ Fetched: Immigration Rules part 1: leave to enter or stay in the UK
✅ Fetched: Immigration Rules part 2: transitional provisions 
✅ Fetched: Immigration Rules part 3: students
✅ Fetched: Immigration Rules part 4: work experience
✅ Fetched: Immigration Rules part 5: working in the UK
✅ Fetched: Immigration Rules part 6: self-employment and business people
✅ Fetched: Immigration Rules part 6A: the points-based system
✅ Fetched: Immigration Rules part 7: other categories
✅ Fetched: Immigration Rules part 8: family members
✅ Fetched: Immigration Rules part 9: grounds for refusal
✅ Fetched: Immigration Rules part 11: asylum
✅ Fetched: Immigration Rules part 11A: temporary protection
✅ Fetched: Immigration Rules part 11B
✅ Fetched: Immigration Rules part 12: Procedure and rights of appeal
✅ Fetched: Immigration Rules part 13: deportation
✅ Fetched: Immigration Rules part 14: stateless persons

In [4]:
# Preview the first scraped sections (title, API path, extracted text).
df.head()

Unnamed: 0,title,path,text
0,Immigration Rules: Index,/api/content/guidance/immigration-rules/immigr...,Paragraph number\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
1,Immigration Rules: introduction,/api/content/guidance/immigration-rules/immigr...,The Home Secretary has made changes in the Rul...
2,Immigration Rules part 1: leave to enter or st...,/api/content/guidance/immigration-rules/immigr...,Leave to enter the United Kingdom\n\n\n\n\n\n\...
3,Immigration Rules part 2: transitional provisi...,/api/content/guidance/immigration-rules/immigr...,Transitional provisions Part 2 and Appendix V:...
4,Immigration Rules part 3: students,/api/content/guidance/immigration-rules/immigr...,Persons seeking to enter the UK for short-term...


### Creating Corpus

In [5]:
# Optional: reload the scraped sections from disk instead of re-fetching them.
# (The scraping cell writes this file under data_dir, so adjust the path accordingly.)
# df = pd.read_json("immigration_rules.json")

# Prefer a pre-cleaned 'clean_text' column when present; otherwise fall back to 'text'.
text_col = "clean_text" if "clean_text" in df.columns else "text"

# Derive human-viewable web_url from API path
def api_to_web_url(api_path: str) -> str:
    """Turn a Content API path into the public GOV.UK page URL.

    Example:
        "/api/content/guidance/immigration-rules/x"
        -> "https://www.gov.uk/guidance/immigration-rules/x"

    Only a leading "/api/content" is stripped. A later occurrence of that
    text inside the path belongs to the page address and is kept.

    Args:
        api_path: Path as returned in a link's "api_path" field.

    Returns:
        Absolute https://www.gov.uk URL for the same page.
    """
    api_prefix = "/api/content"
    if api_path.startswith(api_prefix):
        base_path = api_path[len(api_prefix):]
    else:
        base_path = api_path
    return "https://www.gov.uk" + base_path

# Add the public page URL for each section, keep only the columns the corpus
# needs, and expose the chosen text column under the uniform name "text".
df["web_url"] = df["path"].apply(api_to_web_url)
df = df[[ "title", "path", "web_url", text_col ]].rename(columns={text_col: "text"})
df.head(3)


Unnamed: 0,title,path,web_url,text
0,Immigration Rules: Index,/api/content/guidance/immigration-rules/immigr...,https://www.gov.uk/guidance/immigration-rules/...,Paragraph number\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
1,Immigration Rules: introduction,/api/content/guidance/immigration-rules/immigr...,https://www.gov.uk/guidance/immigration-rules/...,The Home Secretary has made changes in the Rul...
2,Immigration Rules part 1: leave to enter or st...,/api/content/guidance/immigration-rules/immigr...,https://www.gov.uk/guidance/immigration-rules/...,Leave to enter the United Kingdom\n\n\n\n\n\n\...


In [6]:
import re
from typing import List, Dict
import numpy as np

# Normalize whitespace
def normalize_whitespace(s: str) -> str:
    """Collapse every run of whitespace in ``s`` to a single space and trim both ends."""
    return re.sub(r"\s+", " ", s).strip()

# Simple chunking function
def chunk_text(text: str, max_tokens: int = 500, overlap: int = 80) -> List[str]:
    """
    Split text into overlapping, word-based chunks (words approximate tokens).

    Args:
        text: Text to split; whitespace is normalized first.
        max_tokens: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks.
            Must satisfy 0 <= overlap < max_tokens.

    Returns:
        Chunks in document order, each a single space-joined string.
        Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If max_tokens or overlap is out of range. Without this
            check, overlap >= max_tokens gives a step of zero or less and
            the chunking loop never terminates.

    Adjust sizes if you see context getting cut oddly.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if not 0 <= overlap < max_tokens:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_tokens, got overlap={overlap}, max_tokens={max_tokens}"
        )

    words = normalize_whitespace(text).split()
    # Each chunk starts `step` words after the previous one, so consecutive
    # chunks share exactly `overlap` words.
    step = max_tokens - overlap
    return [
        " ".join(words[start:start + max_tokens])
        for start in range(0, len(words), step)
    ]

# Build a chunk table that maps every chunk back to its source page.
# Sizes are counted in whitespace-separated words (see chunk_text), not model tokens.
CHUNK_MAX_WORDS = 480
CHUNK_OVERLAP_WORDS = 60
# NOTE(review): the embedding cell uses sentence-transformers/all-MiniLM-L6-v2.
# Its model card reportedly truncates input beyond 256 word pieces, so chunks of
# this size may only be partly represented in their embeddings. TODO confirm
# and consider smaller chunks.

# One record per chunk: doc_title, web_url, chunk_id (position within the page), chunk_text.
CORPUS_COLUMNS = ["doc_title", "web_url", "chunk_id", "chunk_text"]

records: List[Dict] = []
# Iterate over the columns directly instead of `for _, row in df.iterrows()`,
# which rebinds `_`, the name IPython uses for the last cell output.
for doc_title, doc_url, doc_text in zip(df["title"], df["web_url"], df["text"]):
    # chunk text
    chunks = chunk_text(doc_text, max_tokens=CHUNK_MAX_WORDS, overlap=CHUNK_OVERLAP_WORDS)

    # append to records
    for j, ch in enumerate(chunks):
        records.append({
            "doc_title": doc_title,
            "web_url": doc_url,
            "chunk_id": j,
            "chunk_text": ch
        })

# Passing the column list keeps the expected columns even when there are no records.
corpus = pd.DataFrame(records, columns=CORPUS_COLUMNS)


In [7]:
# Display the chunk table (one row per chunk, with its source page title and URL).
corpus

Unnamed: 0,doc_title,web_url,chunk_id,chunk_text
0,Immigration Rules: Index,https://www.gov.uk/guidance/immigration-rules/...,0,Paragraph number Introduction (Paragraphs 1 to...
1,Immigration Rules: Index,https://www.gov.uk/guidance/immigration-rules/...,1,or have had leave to enter or remain under par...
2,Immigration Rules: Index,https://www.gov.uk/guidance/immigration-rules/...,2,Victims of domestic violence 289A to 289D Fian...
3,Immigration Rules: Index,https://www.gov.uk/guidance/immigration-rules/...,3,DELETED Appendix SN: Service of notices Append...
4,Immigration Rules: introduction,https://www.gov.uk/guidance/immigration-rules/...,0,The Home Secretary has made changes in the Rul...
...,...,...,...,...
1095,Immigration Rules Appendix Children,https://www.gov.uk/guidance/immigration-rules/...,1,life. Care Requirement CHI 2.1. If the applica...
1096,Immigration Rules Appendix Children,https://www.gov.uk/guidance/immigration-rules/...,2,entry clearance or permission to stay with the...
1097,Immigration Rules Appendix Children,https://www.gov.uk/guidance/immigration-rules/...,3,must be being granted settlement at the same t...
1098,Immigration Rules Appendix Tuberculosis (TB),https://www.gov.uk/guidance/immigration-rules/...,0,TB1. A person must provide a valid TB certific...


### Embedding

In [8]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Put the corpus into the shape LangChain expects: a list of texts plus a
# parallel list of metadata dicts (one per chunk).
texts = corpus["chunk_text"].tolist()
# Pair the columns positionally. Looking rows up with corpus.loc[i, ...] for
# i in range(len(corpus)) only works while the index is exactly 0..n-1 and
# breaks as soon as the frame has been filtered or re-indexed.
metadatas = [
    {"title": doc_title, "url": web_url}
    for doc_title, web_url in zip(corpus["doc_title"], corpus["web_url"])
]

# LangChain embedding model (small, CPU-friendly)
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create FAISS vector store
faiss_index = FAISS.from_texts(texts=texts, embedding=embedding_model, metadatas=metadatas)


### Retrieve

In [9]:
from langchain_ollama import OllamaLLM
from langchain.chains import RetrievalQA

# LLM served through Ollama that writes the final answer
llm = OllamaLLM(model="mistral:instruct")

# Retriever over the FAISS index returning the top 5 chunks per query
retriever = faiss_index.as_retriever(search_kwargs={"k": 5})

# Question-answering chain; the retrieved chunks are returned with the answer
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

query = "What are the requirements for a Skilled Worker visa?"
result = qa_chain.invoke({"query": query})

# Generated answer
print(result["result"])

# Optional: list each retrieved source chunk with a 300-character preview
for doc in result["source_documents"]:
    source_meta = doc.metadata
    print(source_meta["title"], source_meta["url"])
    print(doc.page_content[:300], "...\n")




 A Skilled Worker visa application requires the following:

1. Apply online on the specified form, either "Skilled Worker visa" (for applicants outside the UK) or "Skilled Worker" (for applicants inside the UK).
2. Meet all validity requirements:
   - Payment of any fee and Immigration Health Charge.
   - Providing biometrics when required.
   - Submitting a passport or other travel document that establishes identity and nationality.
   - Having a certificate of sponsorship issued by the sponsor no more than 3 months (90 days if linked to a Sponsorship Reference Number) before the date of application.
   - The applicant must be aged 18 or over on the date of application.
   - If applying for permission to stay, the applicant must be in the UK on the date of application and not have certain types of permission (as a Visitor, Short-term student, Parent of a Child Student, Seasonal Worker, Domestic Worker in a Private Household, or outside the Immigration Rules).
   - If applying for entr