This notebook downloads the 2011 MSOA-level centroid data and commuting data for England and Wales from online sources.
The data is then preprocessed and stored locally in Parquet format.

In [None]:
import requests
import geopandas as gpd
import pandas as pd
from io import StringIO
import json # For parsing JSON responses

# Feature server that provides the MSOA centroids for the 2011 Census
feature_server_url = "https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/MSOA_Dec_2011_PWC_in_England_and_Wales_2022/FeatureServer/0"
query_url = f"{feature_server_url}/query"
metadata_url = feature_server_url  # The layer root is also queried for the server metadata

# --- 1. Get metadata (total feature count and max record count) ---
# Any failure here is fatal for the rest of the notebook, so each handler
# prints its message and then re-raises. The interactive `exit()` helper is
# not used: it is not meant for programs and, in a plain script, it ends the
# process with a success status even though the step failed.
print("Fetching server metadata and total feature count...")
try:
    # Get server metadata (including maxRecordCount)
    meta_params = {'f': 'json'}
    # A timeout stops a stalled connection from hanging the run indefinitely.
    meta_response = requests.get(metadata_url, params=meta_params, timeout=60)
    meta_response.raise_for_status()
    server_metadata = meta_response.json()
    # NOTE(review): ArcGIS services can report a failure as an "error" object
    # inside an HTTP 200 response, which raise_for_status() does not catch.
    if 'error' in server_metadata:
        raise ValueError(f"Server returned an error: {server_metadata['error']}")
    # Fall back to 1000 when the value is missing, null or zero.
    max_records_per_request = server_metadata.get('maxRecordCount') or 1000
    print(f"Server's maximum records per request: {max_records_per_request}")

    # Get total feature count that matches the query (1=1 means all)
    count_params = {'where': '1=1', 'returnCountOnly': 'true', 'f': 'json'}
    count_response = requests.get(query_url, params=count_params, timeout=60)
    count_response.raise_for_status()
    count_data = count_response.json()
    total_features = count_data.get('count')
    if total_features is None:
        raise ValueError("Could not determine total feature count from server.")
    print(f"Total features to download: {total_features}")

except requests.exceptions.RequestException as e:
    print(f"FATAL ERROR: Could not fetch metadata or feature count: {e}")
    raise  # Stop execution if we can't get vital info
except (json.JSONDecodeError, ValueError, KeyError) as e:
    print(f"FATAL ERROR: Could not parse metadata or count response: {e}")
    raise
except Exception as e:
    print(f"FATAL ERROR during metadata fetch: {e}")
    raise

# --- 2. Paginate and fetch data ---
# Pages are requested with resultOffset/resultRecordCount until either the
# reported total has been reached or the server returns a page with no
# features. A page shorter than requested is NOT treated as the end of the
# data: the next request simply continues from the new offset, so a server
# that pages in smaller steps than its advertised limit is still read fully.
all_features_gdfs = []  # One GeoDataFrame per successfully downloaded page
offset = 0

print("\nStarting paginated download...")
while offset < total_features:
    current_batch_size = min(max_records_per_request, total_features - offset)
    print(f"  Fetching features {offset + 1} to {offset + current_batch_size} (of {total_features})...")

    params = {
        'where': '1=1',
        'outFields': '*',
        'f': 'geojson',
        'returnGeometry': 'true',
        'resultOffset': offset,
        # Ask for up to the max limit; the server may return fewer records.
        'resultRecordCount': max_records_per_request,
    }

    try:
        # A timeout stops a stalled connection from hanging the run indefinitely.
        response = requests.get(query_url, params=params, timeout=60)
        response.raise_for_status()

        geojson_text = response.text
        if not geojson_text or geojson_text.isspace():
            print(f"Warning: Received empty response text at offset {offset}. Stopping.")
            break

        # Parse once up front so a non-JSON body is reported clearly.
        try:
            geojson_data_check = response.json()
        except json.JSONDecodeError:
            print(f"Error: Received non-JSON response at offset {offset}. Content starts with: {geojson_text[:200]}...")
            break  # Stop if response is not valid JSON

        # NOTE(review): ArcGIS services can report a failure as an "error"
        # object inside an HTTP 200 response; report it as an error rather
        # than as an empty page.
        if 'error' in geojson_data_check:
            print(f"ERROR: Server returned an error at offset {offset}: {geojson_data_check['error']}")
            break

        if 'features' not in geojson_data_check or not geojson_data_check['features']:
            print(f"  -> No features found in response at offset {offset}. Server might have finished early or issue exists.")
            # An empty page means there is nothing more to read.
            break

        # Read the GeoJSON chunk directly using GeoPandas
        gdf_chunk = gpd.read_file(StringIO(geojson_text))

        if gdf_chunk.empty:
            print(f"Warning: GeoDataFrame created from response at offset {offset} is empty, though response wasn't empty. Stopping.")
            break

        all_features_gdfs.append(gdf_chunk)
        num_returned_in_chunk = len(gdf_chunk)
        print(f"  -> Received {num_returned_in_chunk} features in this batch.")

        # Important: advance by the number actually returned, not the number
        # requested, so no record is skipped after a short page. The chunk is
        # known to be non-empty here, so the offset always moves forward.
        offset += num_returned_in_chunk

        if num_returned_in_chunk < max_records_per_request and offset < total_features:
            print("  -> Received fewer records than requested; continuing from the new offset.")

    except requests.exceptions.RequestException as e:
        print(f"ERROR downloading data chunk at offset {offset}: {e}")
        print("Stopping further downloads due to error.")
        break  # Stop on download error
    except Exception as e:
        print(f"ERROR processing data chunk at offset {offset}: {e}")
        print("Stopping further downloads due to error.")
        break  # Stop on processing error (like bad GeoJSON)

# Make a partial download visible: later steps filter on these centroids.
if offset < total_features:
    print(f"WARNING: Download incomplete: retrieved {offset} of {total_features} features.")


# --- 3. Combine Results ---
# Start from an empty GeoDataFrame so `centroid_data` is always defined,
# even when no batch was downloaded or the concatenation fails.
centroid_data = gpd.GeoDataFrame()

if not all_features_gdfs:
    print("\nNo features were successfully downloaded or combined.")
else:
    print("\nCombining all downloaded feature batches...")
    try:
        # Stack every batch into one table with a fresh 0..n-1 index.
        stacked_batches = pd.concat(all_features_gdfs, ignore_index=True)
        # Take the CRS from the first batch (assumes all batches share it).
        first_batch_crs = all_features_gdfs[0].crs
        centroid_data = gpd.GeoDataFrame(stacked_batches, crs=first_batch_crs)
        print("All features combined successfully.")
        print(f"Total features in final GeoDataFrame: {len(centroid_data)}")
    except Exception as e:
        print(f"ERROR combining downloaded data: {e}")
        print("The resulting 'centroid_data' may be incomplete or empty.")



In [None]:
import pandas as pd
import requests
import zipfile
import io

# URL of the zip file containing MSOA level 2011 Commuting Data for England and Wales, in particular the table called "WU03EW_V2"
zip_file_url = "https://s3-eu-west-1.amazonaws.com/statistics.digitalresources.jisc.ac.uk/dkan/files/FLOW/wu03ew_v2/wu03ew_v2.zip"

# Specific CSV filename inside the archive
target_csv_filename = "wu03ew_v2.csv"

# Download the archive and read the target CSV into `commuting_data`.
# Every handler prints a specific message and then re-raises: the code that
# follows needs `commuting_data`, so continuing after a failure would only
# produce a NameError (or silently reuse a stale value from an earlier run).
try:
    # Download the file content using requests; the timeout stops a stalled
    # connection from hanging the run indefinitely.
    response = requests.get(zip_file_url, timeout=60)
    response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

    # Open the zip file from the downloaded content in memory
    zip_content_bytes = io.BytesIO(response.content)

    # Use zipfile to open the archive and access the specific CSV
    with zipfile.ZipFile(zip_content_bytes, 'r') as z:
        available_files = z.namelist()
        if target_csv_filename not in available_files:
            # File not found in archive
            raise FileNotFoundError(f"The specified file '{target_csv_filename}' was not found in the zip archive.")
        # Read the specific CSV file content
        with z.open(target_csv_filename) as csv_file:
            commuting_data = pd.read_csv(csv_file)

    print(f"Successfully loaded '{target_csv_filename}' ")

except requests.exceptions.RequestException as req_err:
    print(f"Error during download: {req_err}")
    raise
except zipfile.BadZipFile as zip_err:
    print(f"Error: Downloaded file is not a valid zip file or is corrupted. {zip_err}")
    raise
except FileNotFoundError as fnf_err:
    print(f"Error: {fnf_err}")  # Specific error for missing file
    raise
except pd.errors.ParserError as parse_err:
    print(f"Error parsing CSV file: {parse_err}. Check delimiter or file format.")
    raise
except UnicodeDecodeError as decode_err:
    print(f"Error decoding CSV file: {decode_err}. Try specifying the encoding, e.g., pd.read_csv(csv_file, encoding='latin1')")
    raise
except Exception as e_manual:
    print(f"An unexpected error occurred while loading the commuting data: {e_manual}")
    raise

# Give the all-methods total column a short name
commuting_data = commuting_data.rename(
    columns={"All categories: Method of travel to work": "Commuters"}
)

# Ignore commuting outside of England and Wales: keep only the flows whose
# workplace AND residence codes both appear in the centroid data
known_msoa_codes = centroid_data["msoa11cd"]
workplace_is_known = commuting_data["Area of workplace"].isin(known_msoa_codes)
residence_is_known = commuting_data["Area of residence"].isin(known_msoa_codes)
commuting_data = commuting_data.loc[workplace_is_known & residence_is_known]

# Only keep main columns
main_columns = ['Area of residence', 'Area of workplace', 'Commuters']
commuting_data = commuting_data[main_columns]
commuting_data.head()

In [None]:
# Total commuters per MSOA, counted once by workplace and once by residence
workplace_population = commuting_data.groupby('Area of workplace')[['Commuters']].sum()
residential_population = commuting_data.groupby('Area of residence')[['Commuters']].sum()

# Attach the workplace totals to the centroids; the left join keeps every centroid row
centroid_data = pd.merge(
    centroid_data,
    workplace_population,
    left_on="msoa11cd",
    right_on='Area of workplace',
    how='left',
).rename(columns={"Commuters": "Workplace population"})

# Attach the residential totals the same way
centroid_data = pd.merge(
    centroid_data,
    residential_population,
    left_on="msoa11cd",
    right_on='Area of residence',
    how='left',
).rename(columns={"Commuters": "Residential population"})

# Keep only the code, the two totals and the geometry
kept_columns = ['msoa11cd', 'Workplace population', 'Residential population', 'geometry']
centroid_data = centroid_data[kept_columns]
centroid_data.head()

In [None]:
import os
import pandas as pd
from shapely.geometry import Point
import geopandas as gpd

# Define the data folder name
data_folder = 'data'

# Create the data folder if it doesn't exist. exist_ok=True does this in one
# call, without a separate existence check that could go stale before the
# folder is created.
os.makedirs(data_folder, exist_ok=True)

# Write both tables to Parquet files inside the data folder
centroid_file = os.path.join(data_folder, 'centroid_data.parquet')
commuting_file = os.path.join(data_folder, 'commuting_data.parquet')
centroid_data.to_parquet(centroid_file)
commuting_data.to_parquet(commuting_file)

