Data Processing & Feature Engineering

In [16]:
import pandas as pd
import numpy as np
from google.colab import drive

In [17]:
# Mount Google Drive
drive.mount('/content/drive')

# --- Configuration ---
ASTHMA_DATA_PATH = "/content/drive/My Drive/Big Data Project/Data/Raw/asthma_dataset/Asthma Prevalance_Data_2020_2023.csv"
POLLUTANT_DATA_PATH = "/content/drive/My Drive/Big Data Project/Data/Processed/all_pollutants_merged_inner.csv"
OUTPUT_PATH = "/content/drive/My Drive/Big Data Project/Data/Processed/Semi_Supervised_Learning_Model.csv" # New output path

# Define pollutants to process
POLLUTANTS = ['PM25', 'O3', 'NO2', 'SO2', 'CO']

# Meaningful thresholds values relevant to health guidelines
THRESHOLDS = {
    'PM25': 9.0,  # EPA AQI "Moderate" 24-hour PM2.5 (ug/m3)
    'O3': 0.070,   # EPA AQI "Moderate" 8-hour Ozone (ppm)
    'NO2': 53,    # EPA 1-hour NO2 standard (ppb)
    'SO2': 0.1,     # EPA 1-hour SO2 standard (ppb)
    'CO': 0.35       # EPA 8-hour CO standard (ppm)
}

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
# --- 1. Load Data ---
print("--- 1. Loading Data ---")
try:
    df_asthma = pd.read_csv(ASTHMA_DATA_PATH)
    print(f"Asthma data loaded: {df_asthma.shape}")
except FileNotFoundError:
    print(f"ERROR: Asthma data file not found at {ASTHMA_DATA_PATH}")
    df_asthma = pd.DataFrame()

try:
    df_pollutant_raw = pd.read_csv(POLLUTANT_DATA_PATH, low_memory=False)
    print(f"Pollutant data loaded: {df_pollutant_raw.shape}")
except FileNotFoundError:
    print(f"ERROR: Pollutant data file not found at {POLLUTANT_DATA_PATH}")
    df_pollutant_raw = pd.DataFrame()

--- 1. Loading Data ---
Asthma data loaded: (224, 4)
Pollutant data loaded: (98351, 11)


In [19]:
# --- 2. Prepare Pollutant Data ---
if not df_pollutant_raw.empty:
    print("\n--- 2. Preparing Pollutant Data ---")
    df_pollutant = df_pollutant_raw.copy()

    # Convert 'Date Local' to datetime and extract Year/Month
    if 'Date Local' in df_pollutant.columns:
        df_pollutant['Date Local'] = pd.to_datetime(df_pollutant['Date Local'], errors='coerce')
        df_pollutant.dropna(subset=['Date Local'], inplace=True)
        df_pollutant['Year'] = df_pollutant['Date Local'].dt.year
        df_pollutant['Month'] = df_pollutant['Date Local'].dt.month
    else:
        print("ERROR: 'Date Local' column not found.")
        df_pollutant = pd.DataFrame()

    # Standardize 'County Name' and 'State Name'
    if 'County Name' in df_pollutant.columns:
        df_pollutant['County Name'] = df_pollutant['County Name'].astype(str).str.strip().str.lower()
    else:
        print("ERROR: 'County Name' column not found.")
        df_pollutant = pd.DataFrame()

    if 'State Name' in df_pollutant.columns:
        df_pollutant['State Name'] = df_pollutant['State Name'].astype(str).str.strip().str.lower()
    else:
        print("ERROR: 'State Name' column not found.")
        df_pollutant = pd.DataFrame()

    # Ensure pollutants are numeric and floor to 0
    POLLUTANTS_TO_FLOOR = ['PM25', 'O3', 'NO2', 'SO2', 'CO']
    print("--- Flooring Pollutant Levels to 0 ---")
    for p in POLLUTANTS_TO_FLOOR:
        if p in df_pollutant.columns:
            df_pollutant[p] = pd.to_numeric(df_pollutant[p], errors='coerce')
            neg_count_before = (df_pollutant[p] < 0).sum()
            df_pollutant[p] = df_pollutant[p].clip(lower=0)
            print(f"Processed '{p}': Found and floored {neg_count_before} negative value(s).")
        else:
            print(f"Warning: Pollutant column '{p}' not found.")
else:
     df_pollutant = pd.DataFrame() # Ensure it's empty if loading failed


--- 2. Preparing Pollutant Data ---
--- Flooring Pollutant Levels to 0 ---
Processed 'PM25': Found and floored 287 negative value(s).
Processed 'O3': Found and floored 1 negative value(s).
Processed 'NO2': Found and floored 61 negative value(s).
Processed 'SO2': Found and floored 6056 negative value(s).
Processed 'CO': Found and floored 1348 negative value(s).


In [20]:
# --- 3. Define Seasons ---
if not df_pollutant.empty and 'Month' in df_pollutant.columns:
    def get_season(month):
        if month in [12, 1, 2]: return 'Winter'
        elif month in [3, 4, 5]: return 'Spring'
        elif month in [6, 7, 8]: return 'Summer'
        elif month in [9, 10, 11]: return 'Fall'
        return None
    df_pollutant['Season'] = df_pollutant['Month'].apply(get_season)

In [21]:
# --- 4. Feature Engineering (For ALL counties in pollutant data) ---
df_engineered_features = pd.DataFrame()
if not df_pollutant.empty and 'County Name' in df_pollutant.columns and 'State Name' in df_pollutant.columns and 'Year' in df_pollutant.columns:
    print("\n--- 4. Engineering Features ---")
    # Include State Name in groupby
    grouped_annual = df_pollutant.groupby(['County Name', 'State Name', 'Year'])

    aggs = {}
    if 'Latitude' in df_pollutant.columns: aggs['Latitude'] = 'mean'
    if 'Longitude' in df_pollutant.columns: aggs['Longitude'] = 'mean'
    for p in POLLUTANTS:
        if p in df_pollutant.columns:
            aggs[p] = ['mean', 'max', 'std', lambda x: x.quantile(0.75) - x.quantile(0.25)]

    annual_stats = grouped_annual.agg(aggs)
    annual_stats.columns = ['_'.join(col).strip('_') for col in annual_stats.columns.values]
    rename_dict = {'Latitude_mean': 'Latitude', 'Longitude_mean': 'Longitude'}
    for p in POLLUTANTS:
        if p in df_pollutant.columns:
            rename_dict[f'{p}_mean'] = f'{p}_Annual_Mean'
            rename_dict[f'{p}_max'] = f'{p}_Annual_Max'
            rename_dict[f'{p}_std'] = f'{p}_Annual_StdDev'
            lambda_col_name = next((col for col in annual_stats.columns if f'{p}_<lambda' in col), None)
            if lambda_col_name: rename_dict[lambda_col_name] = f'{p}_Annual_IQR'
    annual_stats = annual_stats.rename(columns=rename_dict)

    days_threshold_list = []
    for p in POLLUTANTS:
        if p in THRESHOLDS and p in df_pollutant.columns:
            if p == 'O3': days = grouped_annual[p].apply(lambda x: (x < THRESHOLDS[p]).sum()).rename(f'{p}_Days_Below_Threshold')
            else: days = grouped_annual[p].apply(lambda x: (x > THRESHOLDS[p]).sum()).rename(f'{p}_Days_Above_Threshold')
            days_threshold_list.append(days)
    if days_threshold_list:
        days_threshold_df = pd.concat(days_threshold_list, axis=1)
        annual_stats = annual_stats.merge(days_threshold_df, on=['County Name', 'State Name', 'Year'], how='left')

    seasonal_stats_pivot = pd.DataFrame()
    if 'Season' in df_pollutant.columns:
        grouped_seasonal = df_pollutant.groupby(['County Name', 'State Name', 'Year', 'Season'])
        seasonal_means_list = []
        for p in POLLUTANTS:
            if p in df_pollutant.columns:
                seasonal_means_list.append(grouped_seasonal[p].mean().rename(f'{p}_Seasonal_Avg'))
        if seasonal_means_list:
            seasonal_stats_raw = pd.concat(seasonal_means_list, axis=1)
            seasonal_stats_pivot = seasonal_stats_raw.unstack(level='Season')
            seasonal_stats_pivot.columns = ['_'.join(col).strip() for col in seasonal_stats_pivot.columns.values]

    # Combine all engineered features
    df_engineered_features = annual_stats.reset_index()
    if not seasonal_stats_pivot.empty:
         df_engineered_features = pd.merge(df_engineered_features, seasonal_stats_pivot.reset_index(),
                                          on=['County Name', 'State Name', 'Year'], how='outer')
    print(f"Combined engineered features shape: {df_engineered_features.shape}")
else:
    print("Could not perform feature engineering - check pollutant data and columns.")


--- 4. Engineering Features ---
Combined engineered features shape: (341, 50)


In [22]:
# --- 5. Prepare Asthma Data & Create Hotspot Label ---
if not df_asthma.empty:
    print("\n--- 5. Preparing Asthma Data & Hotspot Labels ---")
    # Standardize 'County Name' and 'State Name'
    df_asthma['County Name'] = df_asthma['County Name'].astype(str).str.strip().str.lower()

    # Calculate Hotspot label *before* merge
    target_col = 'Age-adjusted rate per 10,000'
    if target_col in df_asthma.columns:
        # Create Hotspot only where target exists
        df_asthma['Hotspot'] = df_asthma.groupby('Year')[target_col]\
                                  .transform(lambda x: (x > x.median()).astype(float)) # Use float for NaN
        # Set Hotspot to NaN where target is NaN
        df_asthma.loc[df_asthma[target_col].isna(), 'Hotspot'] = np.nan
        print("Hotspot labels created.")
    else:
        print(f"Warning: Target column '{target_col}' not found in asthma data. Cannot create Hotspot labels.")
        df_asthma['Hotspot'] = np.nan # Ensure column exists but is empty

    # Select only relevant columns for merging
    asthma_cols_to_merge = [
        'County Name', 'Year',
        'Age-adjusted rate per 10,000', 'Number of cases', 'Hotspot'
    ]
    # Ensure all selected columns actually exist
    asthma_cols_to_merge = [col for col in asthma_cols_to_merge if col in df_asthma.columns]
    df_asthma_subset = df_asthma[asthma_cols_to_merge].copy()
    # Drop duplicates in asthma data before merge to avoid issues
    df_asthma_subset.drop_duplicates(subset=['County Name', 'Year'], inplace=True)

else:
    df_asthma_subset = pd.DataFrame()


--- 5. Preparing Asthma Data & Hotspot Labels ---
Hotspot labels created.


In [23]:
import pandas as pd
import numpy as np

# --- 6. Merge ALL Features with Asthma Data (Left on Features) ---
if not df_engineered_features.empty:
    print("\n--- 6. Merging All Data ---")
    if not df_asthma_subset.empty:
        # Perform a LEFT merge to keep ALL counties and add asthma data where it matches
        df_final = pd.merge(df_engineered_features, df_asthma_subset,
                            on=['County Name', 'Year'],
                            how='left')
        # print("Merged engineered features with asthma data.") # Removed
    else:
        print("Asthma data is empty, final data will only contain engineered features.")
        df_final = df_engineered_features.copy()

    # print(f"Final merged data shape (before removing columns): {df_final.shape}") # Removed

    # --- 7. Modify Columns, Reorder, and Save ---
    print(f"\n--- 7. Modifying Columns, Reordering, and Preparing for Save ---")

    if not df_final.empty:
        # --- Remove Specified Columns ---
        cols_to_remove = ['Number of cases', 'Age-adjusted rate per 10,000']
        for col in cols_to_remove:
            if col in df_final.columns:
                df_final = df_final.drop(columns=[col])
                # print(f"Removed '{col}' column. New shape: {df_final.shape}") # Removed
            else:
                print(f"Warning: Column '{col}' not found, nothing to remove.")

        # --- Move 'Hotspot' column before 'PM25_Annual_Mean' ---
        col_to_move = 'Hotspot'
        insertion_point_col = 'PM25_Annual_Mean'

        if col_to_move in df_final.columns and insertion_point_col in df_final.columns:
            current_cols = df_final.columns.tolist()
            current_cols.remove(col_to_move)
            idx_insertion = current_cols.index(insertion_point_col)
            current_cols.insert(idx_insertion, col_to_move)
            df_final = df_final[current_cols]
            # print(f"Moved '{col_to_move}' column before '{insertion_point_col}'.") # Removed
        elif col_to_move not in df_final.columns:
            print(f"Warning: Column '{col_to_move}' not found. Cannot move it.")
        elif insertion_point_col not in df_final.columns:
            print(f"Warning: Insertion point column '{insertion_point_col}' not found. Cannot move '{col_to_move}'.")

        # --- Print Head of Final DataFrame ---
        print("\n--- Head of Final Processed DataFrame ---")
        print(df_final.head())
        print(f"\nShape of final DataFrame: {df_final.shape}")
        print(f"Columns in final DataFrame: {df_final.columns.tolist()}")


        # Save the modified DataFrame
        # Ensure OUTPUT_PATH is defined from your main script configuration
        df_final.to_csv(OUTPUT_PATH, index=False)
        print(f"\nSaved final data to: {OUTPUT_PATH}")
    else:
        print("df_final is empty (or became empty after removals). Nothing to save or reorder.")

else: # This 'else' corresponds to 'if not df_engineered_features.empty:'
    print("\nFinal dataset could not be created as no engineered features were generated.")


--- 6. Merging All Data ---

--- 7. Modifying Columns, Reordering, and Preparing for Save ---

--- Head of Final Processed DataFrame ---
  County Name State Name  Year   Latitude   Longitude  Hotspot  \
0         ada      idaho  2020  43.600699 -116.347853      NaN   
1         ada      idaho  2021  43.600699 -116.347853      NaN   
2         ada      idaho  2022  43.600699 -116.347853      NaN   
3         ada      idaho  2023  43.600699 -116.347853      NaN   
4         ada      idaho  2024  43.600699 -116.347853      NaN   

   PM25_Annual_Mean  PM25_Annual_Max  PM25_Annual_StdDev  PM25_Annual_IQR  \
0          7.774561             49.2            8.302082            4.975   
1          8.714545             82.6           10.878881            4.950   
2          7.499138             42.5            6.156418            6.450   
3          5.888073             25.9            4.447445            4.700   
4          4.489286             41.9            5.764994            2.975   

  

### Label Prediction using Graph Neural Network

In [24]:
!pip install torch torchvision torchaudio
!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-$(python -c 'import torch; print(torch.__version__)').html
!pip install pandas scikit-learn

Looking in links: https://data.pyg.org/whl/torch-2.6.0+cu124.html


In [25]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import kneighbors_graph
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn as nn
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score

In [26]:
# --- Configuration ---
FILE_PATH = "/content/drive/My Drive/Big Data Project/Data/Processed/Semi_Supervised_Learning_Model_Prediction.csv"
N_NEIGHBORS = 4
HIDDEN_DIM_GNN = 64
OUTPUT_DIM_GNN = 2    # Hotspot (1) / Not Hotspot (0)
EPOCHS_GNN = 500      # Epochs per year; adjust as needed
LEARNING_RATE_GNN = 0.01
WEIGHT_DECAY_GNN = 5e-4
YEARS_TO_PROCESS = [2020, 2021, 2022, 2023] # Years you want to model

In [27]:
# --- Main Loop for Processing Each Year ---
all_yearly_results_df = pd.DataFrame()

print(f"--- Starting GNN Processing for Years: {YEARS_TO_PROCESS} ---")

try:
    df_full_dataset = pd.read_csv(FILE_PATH)
    print(f"Full dataset loaded. Shape: {df_full_dataset.shape}")
except FileNotFoundError:
    print(f"ERROR: File not found at '{FILE_PATH}'. Please ensure it's uploaded or the path is correct.")
    # exit() # Consider exiting or handling as appropriate for your environment
except Exception as e:
    print(f"An error occurred loading the full dataset: {e}")
    # exit()

# Check if df_full_dataset was loaded
if 'df_full_dataset' not in locals() or df_full_dataset.empty:
    print("Exiting script as full dataset could not be loaded.")
    exit()


for current_year in YEARS_TO_PROCESS:
    print(f"\n\n{'='*15} Processing Year: {current_year} {'='*15}")

    try:
        # --- 1. Data Loading and Preprocessing for the Current Year ---
        print(f"--- 1.1. Preprocessing Data for {current_year} ---")
        df_year = df_full_dataset[df_full_dataset['Year'] == current_year].copy()

        if df_year.empty:
            print(f"No data found for {current_year}. Skipping.")
            continue

        original_rows_for_year = len(df_year)

        # Ensure 'State Name' exists
        if 'State Name' not in df_year.columns:
            print(f"Warning: 'State Name' column missing for {current_year}. Skipping year.")
            continue
        df_year.dropna(subset=['State Name'], inplace=True)
        df_year = df_year[df_year['State Name'].astype(str).str.strip() != ''].copy()

        # Use engineered coordinates, fallback if necessary, then drop NaNs
        lat_col, lon_col = 'Eng_Latitude', 'Eng_Longitude'
        if not (lat_col in df_year.columns and lon_col in df_year.columns):
            print(f"Warning: Engineered coordinates not found for {current_year}, trying 'Latitude', 'Longitude'.")
            lat_col, lon_col = 'Latitude', 'Longitude'
        if not (lat_col in df_year.columns and lon_col in df_year.columns):
            print(f"ERROR: Essential Latitude/Longitude columns missing for {current_year}. Skipping year.")
            continue
        df_year.dropna(subset=[lat_col, lon_col], inplace=True)

        # Sort and create unique node IDs for this year's graph
        df_year.sort_values(by=['State Name', 'County Name'], inplace=True)
        df_year = df_year.drop_duplicates(subset=['State Name', 'County Name'], keep='first').reset_index(drop=True)
        df_year['GlobalNodeID_Year'] = df_year.index # Node ID specific to this year's graph
        n_total_nodes_year = len(df_year)

        if n_total_nodes_year == 0:
            print(f"No data remaining after preprocessing for {current_year}. Skipping.")
            continue
        print(f"Processing {n_total_nodes_year} unique counties for {current_year} (Original rows for year: {original_rows_for_year}).")

        # Define features
        start_feature_col = 'PM25_Annual_Mean'
        all_cols = df_year.columns.tolist()
        try:
            start_index = all_cols.index(start_feature_col)
            potential_features = all_cols[start_index:]
            exclude_cols = [
                'GlobalNodeID_Year', 'Hotspot', 'Age-adjusted rate per 10,000', 'Number of cases',
                'Year', # Year column itself is not a feature for the snapshot
                'County Name', 'State Name', 'Latitude', 'Longitude',
                'Eng_Latitude', 'Eng_Longitude', 'CountyID' # Old ID if it exists
            ]
            features_cols = [col for col in potential_features if col not in exclude_cols]
            if not features_cols: raise ValueError("No feature columns selected.")
        except ValueError as e:
            print(f"Error selecting features for {current_year}: {e}. Skipping year.")
            continue

        # Impute and Scale features
        node_features_raw = df_year[features_cols].copy()
        imputer = SimpleImputer(strategy='median')
        node_features_imputed = imputer.fit_transform(node_features_raw)
        scaler = StandardScaler()
        node_features_scaled = scaler.fit_transform(node_features_imputed)
        node_features_tensor = torch.tensor(node_features_scaled, dtype=torch.float)

        # Prepare labels and train_mask for this year
        labels_array = np.full(n_total_nodes_year, -1, dtype=np.int64)
        df_year['State Name'] = df_year['State Name'].astype(str).str.strip().str.lower() # Standardize

        # Use 'Hotspot' column directly (it should be 0/1 for CA, NaN for others from previous script)
        # If 'Hotspot' column is missing, this will fail. Ensure it's in your CSV.
        if 'Hotspot' not in df_year.columns:
            print(f"ERROR: 'Hotspot' column missing for {current_year}. Skipping year.")
            continue

        california_mask_df = (df_year['State Name'] == 'california')
        # Only use rows where 'Hotspot' is not NaN for labeling
        labeled_ca_indices = df_year.loc[california_mask_df & df_year['Hotspot'].notna(), 'GlobalNodeID_Year'].values
        labeled_ca_hotspot_values = df_year.loc[california_mask_df & df_year['Hotspot'].notna(), 'Hotspot'].values.astype(np.int64)

        if len(labeled_ca_indices) > 0:
            labels_array[labeled_ca_indices] = labeled_ca_hotspot_values

        labels_tensor = torch.tensor(labels_array, dtype=torch.long)
        train_mask_tensor = (labels_tensor != -1) # Train on all nodes with a valid label (0 or 1)

        print(f"Node features shape for {current_year}: {node_features_tensor.shape}")
        print(f"Number of labeled (training) nodes for {current_year}: {train_mask_tensor.sum().item()}")

        # --- 2. Graph Construction for Current Year ---
        print(f"--- 2.1. Building K-NN Graph for {current_year} ---")
        coordinates = df_year[[lat_col, lon_col]].values
        adj_matrix = kneighbors_graph(coordinates, N_NEIGHBORS, mode='connectivity', include_self=False)
        edge_index_sparse = adj_matrix.tocoo()
        edge_index_tensor = torch.tensor([edge_index_sparse.row, edge_index_sparse.col], dtype=torch.long)
        print(f"Graph for {current_year} built with {edge_index_tensor.shape[1]} edges.")

        pyg_data = Data(x=node_features_tensor, edge_index=edge_index_tensor, y=labels_tensor, train_mask=train_mask_tensor)

        # --- 4. GNN Model Definition (Instantiate new model for each year) ---
        class GCN(nn.Module):
            def __init__(self, num_node_features, hidden_dim, num_classes):
                super(GCN, self).__init__()
                self.conv1 = GCNConv(num_node_features, hidden_dim)
                self.conv2 = GCNConv(hidden_dim, num_classes)
                self.dropout = nn.Dropout(p=0.5)

            def forward(self, data_obj):
                x, edge_index = data_obj.x, data_obj.edge_index
                x = self.conv1(x, edge_index)
                x = F.relu(x)
                x = self.dropout(x)
                x = self.conv2(x, edge_index)
                return F.log_softmax(x, dim=1)

        gnn_model = GCN(num_node_features=pyg_data.x.shape[1],
                        hidden_dim=HIDDEN_DIM_GNN,
                        num_classes=OUTPUT_DIM_GNN)
        optimizer = torch.optim.Adam(gnn_model.parameters(), lr=LEARNING_RATE_GNN, weight_decay=WEIGHT_DECAY_GNN)
        criterion = nn.NLLLoss()

        # --- 5. Training Loop for Current Year ---
        if pyg_data.train_mask.sum().item() > 0: # Only train if there are labeled nodes
            print(f"--- 5.1. Starting Training for {current_year} ---")
            gnn_model.train()
            for epoch in range(EPOCHS_GNN):
                optimizer.zero_grad()
                out = gnn_model(pyg_data)
                loss = criterion(out[pyg_data.train_mask], pyg_data.y[pyg_data.train_mask])
                loss.backward()
                optimizer.step()

                if (epoch + 1) % 50 == 0:
                    gnn_model.eval()
                    pred_eval = gnn_model(pyg_data).argmax(dim=1)
                    correct_train_eval = (pred_eval[pyg_data.train_mask] == pyg_data.y[pyg_data.train_mask]).sum()
                    acc_train_eval = int(correct_train_eval) / int(pyg_data.train_mask.sum())
                    print(f"Year {current_year}, Epoch {epoch+1:03d}/{EPOCHS_GNN}, Loss: {loss.item():.4f}, Train Acc: {acc_train_eval:.4f}")
                    gnn_model.train()
            print(f"Training finished for {current_year}.")
        else:
            print(f"Skipping training for {current_year} - no labeled data found.")

        # --- 6. Prediction & Store Results for Current Year ---
        print(f"--- 6.1. Generating Predictions for {current_year} ---")
        gnn_model.eval()
        with torch.no_grad():
            all_predictions_log_softmax = gnn_model(pyg_data)
            all_predictions = all_predictions_log_softmax.argmax(dim=1)

        df_year['Predicted_Hotspot_GNN'] = all_predictions.cpu().numpy()

        # Evaluate on California (Labeled Data) for this year
        ca_df_eval_year = df_year[df_year['State Name'] == 'california'].copy()
        ca_df_eval_year = ca_df_eval_year[ca_df_eval_year['Hotspot'].notna()]
        if not ca_df_eval_year.empty and pyg_data.train_mask.sum().item() > 0: # Only eval if trained
            true_ca_labels_eval = ca_df_eval_year['Hotspot'].astype(int)
            pred_ca_labels_eval = ca_df_eval_year['Predicted_Hotspot_GNN']
            print(f"\n--- Evaluation on Labeled CA Counties for {current_year} ---")
            print(f"Accuracy: {accuracy_score(true_ca_labels_eval, pred_ca_labels_eval):.4f}")
            print(classification_report(true_ca_labels_eval, pred_ca_labels_eval, zero_division=0, labels=[0,1]))

        # Append current year's relevant results to the main collection DataFrame
        # Ensure to select columns that exist in df_year
        cols_to_append = ['Year', 'State Name', 'County Name', 'Hotspot', 'Predicted_Hotspot_GNN']
        cols_present_in_df_year = [col for col in cols_to_append if col in df_year.columns]
        all_yearly_results_df = pd.concat([all_yearly_results_df, df_year[cols_present_in_df_year]], ignore_index=True)

    except Exception as e:
        print(f"An error occurred while processing year {current_year}: {e}")
        import traceback
        traceback.print_exc()
        continue # Move to the next year if an error occurs

--- Starting GNN Processing for Years: [2020, 2021, 2022, 2023] ---
ERROR: File not found at '/content/drive/My Drive/Big Data Project/Data/Processed/Semi_Supervised_Learning_Model_Prediction.csv'. Please ensure it's uploaded or the path is correct.


--- 1.1. Preprocessing Data for 2020 ---
Processing 65 unique counties for 2020 (Original rows for year: 65).
Node features shape for 2020: torch.Size([65, 45])
Number of labeled (training) nodes for 2020: 12
--- 2.1. Building K-NN Graph for 2020 ---
Graph for 2020 built with 260 edges.
--- 5.1. Starting Training for 2020 ---
Year 2020, Epoch 050/500, Loss: 0.4202, Train Acc: 0.7500
Year 2020, Epoch 100/500, Loss: 0.2148, Train Acc: 0.9167
Year 2020, Epoch 150/500, Loss: 0.2110, Train Acc: 1.0000
Year 2020, Epoch 200/500, Loss: 0.2273, Train Acc: 0.9167
Year 2020, Epoch 250/500, Loss: 0.1960, Train Acc: 1.0000
Year 2020, Epoch 300/500, Loss: 0.2115, Train Acc: 0.9167
Year 2020, Epoch 350/500, Loss: 0.1880, Train Acc: 1.0000
Year 2020, Epo

In [29]:
# --- 7. Final Output ---
print("\n\n--- All Yearly Predictions Summary ---")
if not all_yearly_results_df.empty:
    print(all_yearly_results_df.head())
    print(f"\nShape of combined predictions: {all_yearly_results_df.shape}")
    print("\nValue counts for GNN predictions across all years:")
    # Check if 'Predicted_Hotspot_GNN' exists before value_counts
    if 'Predicted_Hotspot_GNN' in all_yearly_results_df.columns:
        print(all_yearly_results_df['Predicted_Hotspot_GNN'].value_counts(dropna=False))
    else:
        print("'Predicted_Hotspot_GNN' column not found in the final results.")

    # Save the combined predictions
    output_all_years_path = "/content/drive/My Drive/Big Data Project/Data/Processed/all_years_gnn_predictions_semi_supervised.csv"
    all_yearly_results_df.to_csv(output_all_years_path, index=False)
    print(f"Saved all yearly predictions to {output_all_years_path}")
else:
    print("No predictions were generated for any year.")



--- All Yearly Predictions Summary ---
   Year  State Name County Name  Hotspot  Predicted_Hotspot_GNN
0  2020     alabama   jefferson      NaN                      0
1  2020     arizona    maricopa      NaN                      1
2  2020     arizona        pima      NaN                      1
3  2020    arkansas     pulaski      NaN                      1
4  2020  california     alameda      1.0                      1

Shape of combined predictions: (277, 5)

Value counts for GNN predictions across all years:
Predicted_Hotspot_GNN
1    163
0    114
Name: count, dtype: int64
Saved all yearly predictions to /content/drive/My Drive/Big Data Project/Data/Processed/all_years_gnn_predictions_semi_supervised.csv
