# Set up Autogluon and get data

In [1]:
from autogluon.tabular import TabularPredictor, TabularDataset
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
import gc
import os
import glob

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_csv("../Data/datasets/Train.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Unique,Id,Day,Lat,Lon,Elevation,Emissions,Forest,Roads,...,Plumes_Med,Plumes_Low,Max_Temp,Max_Wind,Precip,Rel_Humidity,Wind_Dir,BLH,AOD,PM
0,1,1131_1,1131,1,36.840574,-121.366314,122.986002,0.0,0,0.0,...,0,0,53.4,5.3,0.0,67.0,251.0,758.111816,,3.7
1,2,1131_100,1131,100,36.840574,-121.366314,122.986002,0.0,0,0.0,...,0,0,64.1,3.6,0.0,48.0,308.0,1155.824219,,2.6
2,3,1131_101,1131,101,36.840574,-121.366314,122.986002,0.0,0,0.0,...,0,0,62.7,5.1,0.0,64.0,200.0,1162.292725,,2.9
3,4,1131_102,1131,102,36.840574,-121.366314,122.986002,0.0,0,0.0,...,0,0,67.7,7.9,0.0,68.0,221.0,263.273407,,0.5
4,5,1131_104,1131,104,36.840574,-121.366314,122.986002,0.0,0,0.0,...,0,0,60.1,6.5,0.0,44.0,314.0,1268.919434,0.097667,2.4


# Crossvalidate
Make tables of crossvalidated predictions by station using Autogluon \\

In [6]:
def cv(df):
    n_fold = len(set(df['Id']))
    kf = GroupKFold(n_fold)
    split = kf.split(df, groups=df['Id'])

    metric_df = pd.DataFrame(columns=['station', 'Day', 'PM', 'PM_pred', 'rmse', 'bias'])

    for i, (train_idx, test_idx) in enumerate(split):
        print(f'Starting training fold {i}.')
        _ = gc.collect()

        # Calculate the daily mean PM for each day, excluding the current validation set
        validation_station = df.loc[test_idx, 'Id'].iloc[0]
        train_df = df[~df['Id'].isin([validation_station])]
        daily_mean_PM_excl_validation = train_df.groupby('Day')['PM'].mean()

        # Add the daily mean PM (excluding validation station) as a feature to the original df
        df['Daily_Mean_PM_Excl_Val'] = df['Day'].map(daily_mean_PM_excl_validation)

        features = ['Day', 'Lat', 'Lon', 'Elevation', 'Emissions', 'Forest',
                    'Roads', 'Streets', 'Plumes_High', 'Plumes_Med', 'Plumes_Low',
                    'Max_Temp', 'Max_Wind', 'Precip', 'Rel_Humidity', 'Wind_Dir', 'BLH',
                    'AOD', 'Daily_Mean_PM_Excl_Val']  # Include the new feature

        # Adjust y by subtracting the mean PM for each day (excluding validation station)
        df['Adjusted_PM'] = df['PM'] - df['Daily_Mean_PM_Excl_Val']
        label_column = 'Adjusted_PM'

        X = df[features]
        y = df[label_column]

        # Ensure the index alignment between X and y when slicing
        train_data = TabularDataset(pd.concat([X.loc[train_idx], y.loc[train_idx]], axis=1))
        test_data = TabularDataset(pd.concat([X.loc[test_idx], y.loc[test_idx]], axis=1))

        predictor = TabularPredictor(label=label_column, eval_metric='r2').fit(train_data=train_data)
        
        # task.fit(train_data=train_data, 
        #                     label=label_column, 
        #                     hyperparameters=hyperparameters, 
        #                     # time_limits = 60*60, # un-comment these for the tuned neural network. Train for 1 hour. 
        #                     # hyperparameter_tune=True, 
        #                     eval_metric='r2')

        y_test = test_data[label_column]
        test_data_nolab = test_data.drop(labels=[label_column], axis=1)
        y_pred = predictor.predict(test_data_nolab)

        # Calculate metrics
        bias = y_pred - y_test
        rmse = np.abs(bias)

        station = np.repeat(df.loc[test_idx]['Id'].iloc[0], len(test_data['Day']))
        df_to_append = pd.DataFrame({'station': station, 'Day': test_data['Day'], 'PM': y_test + df.loc[test_idx, 'Daily_Mean_PM_Excl_Val'], 'PM_pred': y_pred + df.loc[test_idx, 'Daily_Mean_PM_Excl_Val'], 'rmse': rmse, 'bias': bias})

        metric_df = metric_df.append(df_to_append, ignore_index=True)

    return metric_df

In [8]:
AutoGluon = cv(df)
AutoGluon.to_csv("Data/output/CV/AutoGluon_spatial_var.csv", index = False)

No path specified. Models will be saved in: "AutogluonModels/ag-20240208_230446"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240208_230446"
AutoGluon Version:  1.0.0
Python Version:     3.8.18
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 20.3.0: Thu Jan 21 00:07:06 PST 2021; root:xnu-7195.81.3~1/RELEASE_X86_64
CPU Count

Starting training fold 0.


	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 14 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 14 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.2s = Fit runtime
	19 features in original data used to generate 19 features in processed data.
	Train Data (Processed) Memory Usage: 1.13 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.24s ...
AutoGluon will gauge predictive performance using evaluation metric: 'r2'
	To change this, specify the eval_metric parameter of Predictor()
Automatic

AttributeError: 'DataFrame' object has no attribute 'append'

# Consolidate dataframes

In [None]:
small_ids = pd.read_csv("Data/datasets/Small_Ids.csv")['x']
small_ids = [float(i) for i in small_ids]

full_df = pd.DataFrame(columns = ['Model', 'test_area', 'station', 'Day', 'PM', 'PM_pred'])

paths = glob.glob("Data/output/CV/*.csv")

for path in paths:
  f = os.path.split(path)[1]
  Model = os.path.splitext(f)[0]
  Table = pd.read_csv(path)
  Table['test_area'] = Table['station'].isin(small_ids)
  Table['Model'] = Model

  full_df = full_df.append(Table, ignore_index = True)

full_df.to_csv("Data/output/all_cv_preds.csv", index = False)