In [None]:
import os

import pandas as pd

# Show the working directory: the relative data path below is resolved against it.
print(os.getcwd())

# Read the public-transport-access table from the raw-data folder.
parquet_path = "data/01-raw_data/public_transport_access.parquet"
transit_data = pd.read_parquet(parquet_path)

/Users/arusansurendiran/Documents/GitHub/CanadianAccessibilityEmployment


### **clean_data.py**

In [None]:
# List the column labels of transit_data (rendered as the cell's output).
transit_data.columns

Index(['REF_DATE', 'GEO', 'DGUID',
       'Distance-capacity public transit service area', 'Location', 'Gender',
       'Demographic and socio-economic',
       'Sustainable Development Goals (SDGs) 11.2.1 indicator', 'UOM',
       'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE',
       'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS'],
      dtype='object')

In [None]:
import pandas as pd

# Per-column overview of transit_data: print the distinct values of every column.
# Columns with 50 or more distinct values are abbreviated to their first 10,
# followed by a count of how many values were left out.
# (Variable names are kept as-is because cell variables persist in the kernel.)
for col in transit_data.columns:
    unique_vals = transit_data[col].unique()
    num_unique = len(unique_vals)
    abbreviate = num_unique >= 50

    print(f"--- {col} ({num_unique} unique values) ---")
    print(unique_vals[:10] if abbreviate else unique_vals)
    if abbreviate:
        print(f"... and {num_unique - 10} more")
    print("\n")

--- REF_DATE (1 unique values) ---
[2024]


--- GEO (42 unique values) ---
["St. John's, Census metropolitan area (CMA)"
 'Halifax, Census metropolitan area (CMA)'
 'Fredericton, Census metropolitan area (CMA)'
 'Moncton, Census metropolitan area (CMA)'
 'Saint John, Census metropolitan area (CMA)'
 'Drummondville, Census metropolitan area (CMA)'
 'Montréal, Census metropolitan area (CMA)'
 'Ottawa - Gatineau (Quebec part), Census metropolitan area (CMA)'
 'Québec, Census metropolitan area (CMA)'
 'Saguenay, Census metropolitan area (CMA)'
 'Sherbrooke, Census metropolitan area (CMA)'
 'Trois-Rivières, Census metropolitan area (CMA)'
 'Barrie, Census metropolitan area (CMA)'
 'Belleville - Quinte West, Census metropolitan area (CMA)'
 'Brantford, Census metropolitan area (CMA)'
 'Greater Sudbury, Census metropolitan area (CMA)'
 'Guelph, Census metropolitan area (CMA)'
 'Hamilton, Census metropolitan area (CMA)'
 'Kingston, Census metropolitan area (CMA)'
 'Kitchener - Cambridge - Wate

In [None]:
# 1. Flag the rows whose VALUE is missing
missing_mask = transit_data['VALUE'].isna()

# Names of the CMAs (GEO) that own at least one of the flagged rows
cmas_to_drop = transit_data['GEO'][missing_mask].unique()
print("CMAs with missing values in VALUE:")
print(cmas_to_drop)

# 2. Remove every row of those CMAs, not only the rows that are incomplete
rows_to_keep = ~transit_data['GEO'].isin(cmas_to_drop)
transit_data = transit_data[rows_to_keep].copy()

CMAs with missing values in VALUE:
['Saguenay, Census metropolitan area (CMA)']


In [None]:
# Columns retained for the analysis; every other column is discarded.
cols_to_keep = [
    "GEO",
    "DGUID",
    "Distance-capacity public transit service area",
    "Demographic and socio-economic",
    "Sustainable Development Goals (SDGs) 11.2.1 indicator",
    "UOM",
    "VALUE",
]

# Label-based selection of all rows and only the listed columns, as an independent copy.
transit_data = transit_data.loc[:, cols_to_keep].copy()

In [None]:
# Give the retained columns shorter, analysis-friendly names.
transit_data = transit_data.rename(columns={
    "GEO": "CMA",
    "DGUID": "CMA_ID",
    "Distance-capacity public transit service area": "Transit_Distance_Category",
    "Demographic and socio-economic": "Profile_Characteristic",
    "Sustainable Development Goals (SDGs) 11.2.1 indicator": "Population_Measure",
    "UOM": "Population_Measure_Unit",
    "VALUE": "Population_Value"
})

# Strip the ", Census metropolitan area (CMA)" suffix so only the place name remains.
# regex=False makes this a literal match, so the parentheses are not read as a regex group.
transit_data['CMA'] = transit_data['CMA'].str.replace(", Census metropolitan area (CMA)", "", regex=False)

# Preview the result. This must be the cell's last expression to be rendered;
# in the middle of the cell its value was silently discarded.
transit_data.head()

## **download_data.py** for unemployment

In [None]:
import pandas as pd


# Read the labour-rates table from the raw-data folder
# (the path is relative, so it is resolved against the current working directory).
parquet_path = "data/01-raw_data/labour_rates.parquet"
labour_data = pd.read_parquet(parquet_path)

In [None]:
# Distinct geography identifiers present in each DataFrame
transit_ids = set(transit_data["CMA_ID"].unique())
labour_ids = set(labour_data["DGUID"].unique())

# Partition the identifiers: shared by both, transit-only, labour-only
common_ids = transit_ids.intersection(labour_ids)
only_in_transit = transit_ids.difference(labour_ids)
only_in_labour = labour_ids.difference(transit_ids)

# Report only the size of each group
print(f"Common CMA_IDs: {len(common_ids)}")
print(f"Only in transit_data: {len(only_in_transit)}")
print(f"Only in labour_data: {len(only_in_labour)}")

Common CMA_IDs: 40
Only in transit_data: 1
Only in labour_data: 3


In [None]:
# Identifiers that appear only in transit_data, with their CMA names
transit_only_rows = transit_data.loc[transit_data["CMA_ID"].isin(only_in_transit), ["CMA_ID", "CMA"]]
print("CMA_IDs only in transit_data (not in labour_data):")
print(transit_only_rows.drop_duplicates())

# Identifiers that appear only in labour_data, with their GEO names
labour_only_rows = labour_data.loc[labour_data["DGUID"].isin(only_in_labour), ["DGUID", "GEO"]]
print("\nDGUIDs only in labour_data (not in transit_data):")
print(labour_only_rows.drop_duplicates())

# Identifiers shared by both datasets (names taken from transit_data)
shared_rows = transit_data.loc[transit_data["CMA_ID"].isin(common_ids), ["CMA_ID", "CMA"]]
print("\nCMA_IDs/DGUIDs present in both datasets:")
print(shared_rows.drop_duplicates())

CMA_IDs only in transit_data (not in labour_data):
         CMA_ID         CMA
0  2021S0503001  St. John's

DGUIDs only in labour_data (not in transit_data):
           DGUID                                    GEO
0     2021S05031  St. John's, Newfoundland and Labrador
35  2021S0503408                       Saguenay, Quebec
77  2021S0503505        Ottawa-Gatineau, Ontario/Quebec

CMA_IDs/DGUIDs present in both datasets:
              CMA_ID                               CMA
88      2021S0503205                           Halifax
176     2021S0503320                       Fredericton
264     2021S0503305                           Moncton
352     2021S0503310                        Saint John
440     2021S0503447                     Drummondville
528     2021S0503462                          Montréal
616   2021S050524505   Ottawa - Gatineau (Quebec part)
704     2021S0503421                            Québec
880     2021S0503433                        Sherbrooke
968     2021S0503442      

While reconciling the two datasets, we found that the DGUID ‘2021S05031’ in the labour data refers to St. John’s, which carries the CMA_ID ‘2021S0503001’ in the transit data. Because the city names match, we manually replace this DGUID in the labour data so that the two datasets can be merged on a common identifier.

In [None]:
# Re-key St. John's: map its labour_data DGUID onto the identifier used in transit_data
labour_data['DGUID'] = labour_data['DGUID'].replace({'2021S05031': '2021S0503001'})

Saguenay, Quebec (DGUID ‘2021S0503408’) is no longer present in the transit dataset, because it was removed earlier for having missing entries in VALUE; this city is therefore excluded from the analysis.

For Ottawa-Gatineau, Ontario/Quebec (DGUID ‘2021S0503505’), the city is split into Ontario and Quebec parts in the transit data, which are already represented as separate entries; therefore, the combined DGUID was not used.

We drop both of these DGUIDs from the labour data to maintain consistency across datasets.


In [None]:
# labour_data DGUIDs to remove
to_drop = ['2021S0503408', '2021S0503505']

# Keep only the rows whose DGUID is not in the removal list
is_dropped = labour_data['DGUID'].isin(to_drop)
labour_data = labour_data.loc[~is_dropped].copy()

In [None]:
# Recompute the identifier sets from the current state of both DataFrames
transit_ids = set(transit_data["CMA_ID"].unique())
labour_ids = set(labour_data["DGUID"].unique())

# Shared, transit-only and labour-only identifiers
common_ids = transit_ids & labour_ids
only_in_transit = transit_ids - labour_ids
only_in_labour = labour_ids - transit_ids

# One entry per report: heading, source frame, id column, name column, ids to show
comparison_report = [
    ("CMA_IDs only in transit_data (not in labour_data):", transit_data, "CMA_ID", "CMA", only_in_transit),
    ("\nDGUIDs only in labour_data (not in transit_data):", labour_data, "DGUID", "GEO", only_in_labour),
    ("\nCMA_IDs/DGUIDs present in both datasets:", transit_data, "CMA_ID", "CMA", common_ids),
]

for heading, frame, id_col, name_col, wanted_ids in comparison_report:
    print(heading)
    print(frame[frame[id_col].isin(wanted_ids)][[id_col, name_col]].drop_duplicates())

CMA_IDs only in transit_data (not in labour_data):
Empty DataFrame
Columns: [CMA_ID, CMA]
Index: []

DGUIDs only in labour_data (not in transit_data):
Empty DataFrame
Columns: [DGUID, GEO]
Index: []

CMA_IDs/DGUIDs present in both datasets:
              CMA_ID                               CMA
0       2021S0503001                        St. John's
88      2021S0503205                           Halifax
176     2021S0503320                       Fredericton
264     2021S0503305                           Moncton
352     2021S0503310                        Saint John
440     2021S0503447                     Drummondville
528     2021S0503462                          Montréal
616   2021S050524505   Ottawa - Gatineau (Quebec part)
704     2021S0503421                            Québec
880     2021S0503433                        Sherbrooke
968     2021S0503442                    Trois-Rivières
1056    2021S0503568                            Barrie
1144    2021S0503522          Belleville - Q