In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# The folder paths configured further down in this notebook live under that mount point,
# so this cell has to run before the processing cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import os

def filter_spatial_data(file_path, area_column, areas):
    """Read a CSV file and keep only the rows whose area name mentions a wanted area.

    A row is kept when the value in ``area_column`` is a string that contains
    at least one entry of ``areas`` as a (case-sensitive) substring.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.
    area_column : str
        Name of the column holding the area names.
    areas : iterable of str
        Area names to look for inside each value of ``area_column``.

    Returns
    -------
    pandas.DataFrame
        The matching rows with all original columns. An empty DataFrame is
        returned when the file contains no data at all.

    Raises
    ------
    KeyError
        If ``area_column`` is not a column of the file.
    """
    # Only the read itself can raise EmptyDataError, so keep the guarded region small.
    try:
        data = pd.read_csv(file_path)
    except pd.errors.EmptyDataError:
        print(f"EmptyDataError: No data in file {file_path}")
        return pd.DataFrame()  # Return an empty DataFrame

    # Materialise once so a generator argument is not exhausted after the first row.
    wanted_areas = tuple(areas)

    def _mentions_wanted_area(value):
        # Blank cells are read as NaN (a float) and numeric cells as numbers;
        # `area in value` would raise TypeError on those, so treat them as non-matching.
        return isinstance(value, str) and any(area in value for area in wanted_areas)

    # Force a real boolean mask: on a header-only file the mapped Series is empty
    # with object dtype, which pandas would not interpret as a row filter.
    row_mask = data[area_column].map(_mentions_wanted_area).astype(bool)
    return data[row_mask]

def process_and_save_filtered_data(base_folder, numb_list, save_folder,
                                   area_column=None, areas=None):
    """Filter every census CSV found under the numbered dataset subfolders and save the result.

    For each number ``i`` in ``numb_list`` the subfolder ``census2021-ts{i:03d}``
    of ``base_folder`` is scanned for its ``-lsoa``, ``-msoa``, ``-utla`` and
    ``-ltla`` CSV files. Each file that exists is filtered with
    ``filter_spatial_data`` and, when any rows remain, written to the matching
    subfolder of ``save_folder`` as ``<subfolder>-<type>-filtered.csv``.

    Parameters
    ----------
    base_folder : str
        Folder containing the ``census2021-tsNNN`` dataset subfolders.
    numb_list : iterable of int
        Dataset numbers to process.
    save_folder : str
        Folder under which the filtered files are written.
    area_column : str, optional
        Column holding the area names. When omitted, the notebook-level
        ``area_column_name`` is used.
    areas : list of str, optional
        Area names to keep. When omitted, the notebook-level
        ``greater_manchester_areas + west_midlands_areas`` is used.

    Returns
    -------
    None
    """
    # Fall back to the notebook-level configuration so existing three-argument
    # calls keep working; passing the values explicitly removes that hidden dependency.
    if area_column is None:
        area_column = area_column_name
    if areas is None:
        areas = greater_manchester_areas + west_midlands_areas

    # Loop through the requested dataset subfolders
    for i in numb_list:
        subfolder = f'census2021-ts{i:03d}'
        subfolder_path = f'{base_folder}/{subfolder}/'
        save_subfolder_path = f'{save_folder}/{subfolder}/'
        print(f"Processing subfolder: {subfolder_path}")  # Log current subfolder

        # One CSV per geography level is expected in each subfolder
        for file_type in ('lsoa', 'msoa', 'utla', 'ltla'):
            file_path = f'{subfolder_path}{subfolder}-{file_type}.csv'
            print(f"Checking file: {file_path}")  # Log current file
            if not os.path.exists(file_path):
                continue

            print(f"Filtering data from: {file_path}")  # Log filtering action
            filtered_data = filter_spatial_data(file_path, area_column, areas)

            # exist_ok avoids the separate existence check and its check-then-create race
            os.makedirs(save_subfolder_path, exist_ok=True)

            if filtered_data.empty:
                continue

            output_path = f'{save_subfolder_path}{subfolder}-{file_type}-filtered.csv'
            print(f"Saving filtered data to: {output_path}")  # Log saving action
            filtered_data.to_csv(output_path, index=False)



In [None]:

# --- Filter settings ---------------------------------------------------------
# Column that holds the area name in every census CSV.
area_column_name = 'geography'

# Boroughs making up Greater Manchester.
greater_manchester_areas = [
    'Wigan',
    'Bolton',
    'Salford',
    'Bury',
    'Rochdale',
    'Oldham',
    'Manchester',
    'Trafford',
    'Stockport',
    'Tameside',
]

# Boroughs making up the West Midlands.
west_midlands_areas = [
    'Wolverhampton',
    'Dudley',
    'Sandwell',
    'Birmingham',
    'Solihull',
    'Coventry',
    'Walsall',
]

# --- Input / output locations ------------------------------------------------
# NOTE: the space after "TheMill" is part of the folder name used by this notebook.
base_folder = '/content/drive/MyDrive/TheMill /Collective_Census_2021_folder'
save_folder = '/content/drive/MyDrive/TheMill /Collective_Census_2021_folder_filtered'

# Dataset numbers to process: 1-11, 15-21 and 41.
numb_list = [*range(1, 12), *range(15, 22), 41]

# --- Run ----------------------------------------------------------------------
process_and_save_filtered_data(
    base_folder=base_folder,
    numb_list=numb_list,
    save_folder=save_folder,
)


Processing subfolder: /content/drive/MyDrive/TheMill /Collective_Census_2021_folder/census2021-ts001/
Checking file: /content/drive/MyDrive/TheMill /Collective_Census_2021_folder/census2021-ts001/census2021-ts001-lsoa.csv
Filtering data from: /content/drive/MyDrive/TheMill /Collective_Census_2021_folder/census2021-ts001/census2021-ts001-lsoa.csv
Saving filtered data to: /content/drive/MyDrive/TheMill /Collective_Census_2021_folder_filtered/census2021-ts001/census2021-ts001-lsoa-filtered.csv
Checking file: /content/drive/MyDrive/TheMill /Collective_Census_2021_folder/census2021-ts001/census2021-ts001-msoa.csv
Filtering data from: /content/drive/MyDrive/TheMill /Collective_Census_2021_folder/census2021-ts001/census2021-ts001-msoa.csv
Saving filtered data to: /content/drive/MyDrive/TheMill /Collective_Census_2021_folder_filtered/census2021-ts001/census2021-ts001-msoa-filtered.csv
Checking file: /content/drive/MyDrive/TheMill /Collective_Census_2021_folder/census2021-ts001/census2021-ts001