<a href="https://colab.research.google.com/github/anna-boser/AutoGluon_PM_paper/blob/main/ML_model_random_CV2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up Autogluon and get data

In [1]:
# Uninstall mkl for faster neural-network training time
!pip uninstall -y mkl
# Upgrade pip to ensure the latest package versions are available
!pip install -U pip
# Upgrade setuptools to be compatible with namespace packages
!pip install -U setuptools wheel
!pip install -U "mxnet<2.0.0"
# Install autogluon (Tutorial based on autogluon==0.1.0)
!pip install autogluon
# Upgrade ipykernel (Necessary for Colab)
!pip install -U ipykernel

Found existing installation: mkl 2019.0
Uninstalling mkl-2019.0:
  Successfully uninstalled mkl-2019.0
Collecting pip
  Downloading pip-22.0.4-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 5.4 MB/s 
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.1.3
    Uninstalling pip-21.1.3:
      Successfully uninstalled pip-21.1.3
Successfully installed pip-22.0.4
Collecting setuptools
  Downloading setuptools-62.1.0-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: setuptools
  Attempting uninstall: setuptools
    Found existing installation: setuptools 57.4.0
    Uninstalling setuptools-57.4.0:
      Successfully uninstalled setuptools-57.4.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source 

IMPORTANT: PLEASE READ BELOW INSTRUCTIONS

MANUAL STEP: Restart Colab Runtime, then execute remaining cells

In [1]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
import gc
import math
import os
import glob

In [3]:
# Mount the user's Google Drive into the Colab filesystem so project files are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Work from the project folder so the relative "Data/..." paths used below resolve.
os.chdir('/content/drive/MyDrive/NASA PM_estimation/')

Mounted at /content/drive


In [4]:
# Load the training table (path is relative to the project folder chosen above).
# NOTE(review): the preview shows leftover 'Unnamed: 0' / 'Unnamed: 0.1' columns,
# presumably from the CSV being saved with its index; cv() ignores them because it
# selects its feature columns by name.
df = pd.read_csv("Data/datasets/Train.csv")
# Show the first rows as a sanity check on the columns.
df.head()

Unnamed: 0.1,Unnamed: 0,Unique,Id,Day,Lat,Lon,Elevation,Emissions,Forest,Roads,...,Plumes_Med,Plumes_Low,Max_Temp,Max_Wind,Precip,Rel_Humidity,Wind_Dir,BLH,AOD,PM
0,1,1131_1,1131,1,36.840574,-121.366314,122.986002,0.0,0,0.0,...,0,0,53.4,5.3,0.0,67.0,251.0,758.111816,,3.7
1,2,1131_100,1131,100,36.840574,-121.366314,122.986002,0.0,0,0.0,...,0,0,64.1,3.6,0.0,48.0,308.0,1155.824219,,2.6
2,3,1131_101,1131,101,36.840574,-121.366314,122.986002,0.0,0,0.0,...,0,0,62.7,5.1,0.0,64.0,200.0,1162.292725,,2.9
3,4,1131_102,1131,102,36.840574,-121.366314,122.986002,0.0,0,0.0,...,0,0,67.7,7.9,0.0,68.0,221.0,263.273407,,0.5
4,5,1131_104,1131,104,36.840574,-121.366314,122.986002,0.0,0,0.0,...,0,0,60.1,6.5,0.0,44.0,314.0,1268.919434,0.097667,2.4


# Crossvalidate 
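Make tables of cross-validated predictions for various models, using 10 random folds. Each table has one row per held-out observation, labelled with its station and day.

The models evaluated are taken from AutoGluon's default hyperparameter configuration:
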
```python
hyperparameters = {
    'NN': {},
    'GBM': [
        {},
        {'extra_trees': True, 'AG_args': {'name_suffix': 'XT'}},
    ],
    'CAT': {},
    'RF': [
        {'criterion': 'gini', 'AG_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}},
        {'criterion': 'entropy', 'AG_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}},
        {'criterion': 'mse', 'AG_args': {'name_suffix': 'MSE', 'problem_types': ['regression']}},
    ],
    'XT': [
        {'criterion': 'gini', 'AG_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}},
        {'criterion': 'entropy', 'AG_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}},
        {'criterion': 'mse', 'AG_args': {'name_suffix': 'MSE', 'problem_types': ['regression']}},
    ],
    'KNN': [
        {'weights': 'uniform', 'AG_args': {'name_suffix': 'Unif'}},
        {'weights': 'distance', 'AG_args': {'name_suffix': 'Dist'}},
    ],
    'custom': ['GBM'],
}
```

In [5]:
def cv(hyperparameters, n_folds=10, seed=None):
  """Run random k-fold cross-validation of an AutoGluon model on the global ``df``.

  Every row of ``df`` is randomly assigned to one of ``n_folds`` folds. For each
  fold, a ``TabularPredictor`` is trained on the remaining rows and used to
  predict the held-out rows.

  Parameters
  ----------
  hyperparameters : dict
      Passed straight through to ``TabularPredictor.fit(hyperparameters=...)``,
      e.g. ``{'CAT': {}}``.
  n_folds : int, default 10
      Number of random folds to draw fold labels from.
  seed : int or None, default None
      Seed for the fold assignment. ``None`` keeps the original behaviour of
      drawing from NumPy's global random state.

  Returns
  -------
  pandas.DataFrame
      One row per held-out observation with columns ``station`` (the row's own
      ``Id``), ``Day``, ``PM`` (observed), ``PM_pred`` (predicted), ``rmse`` and
      ``bias`` (``PM_pred - PM``). Despite its name, ``rmse`` holds the per-row
      absolute error ``|bias|``; the column name is kept so existing consumers
      of the output CSVs keep working.

  Side effects
  ------------
  Adds or overwrites the ``fold`` column of the global ``df``.
  """
  features = ['Day', 'Lat', 'Lon', 'Elevation', 'Emissions', 'Forest',
      'Roads', 'Streets', 'Plumes_High', 'Plumes_Med', 'Plumes_Low',
      'Max_Temp', 'Max_Wind', 'Precip', 'Rel_Humidity', 'Wind_Dir', 'BLH',
      'AOD']
  label_column = 'PM'

  # Random fold labels in 1..n_folds; a dedicated RandomState is used only when
  # the caller asks for a reproducible split.
  rng = np.random if seed is None else np.random.RandomState(seed)
  df['fold'] = rng.randint(1, n_folds + 1, df.shape[0])
  n_fold = df['fold'].nunique()
  print(n_fold) # should equal n_folds unless some fold label was never drawn
  # Grouping on the fold label makes GroupKFold hold out exactly one fold per split.
  kf = GroupKFold(n_fold)
  split = kf.split(df, groups = df['fold'])

  # Only the model inputs and the label are handed to AutoGluon.
  model_data = df[features + [label_column]]
  fold_frames = []

  for i, (train_idx, test_idx) in enumerate(split):
    print(f'Starting training fold {i}.')
    _ = gc.collect()

    # GroupKFold yields positional indices, so select with .iloc; this stays
    # correct even when df does not carry a default 0..n-1 index.
    train_data = TabularDataset(model_data.iloc[train_idx])
    test_data = TabularDataset(model_data.iloc[test_idx])

    predictor = TabularPredictor(label=label_column, eval_metric='r2').fit(train_data=train_data, hyperparameters=hyperparameters)

    y_test = test_data[label_column]
    test_data_nolab = test_data.drop(labels = [label_column], axis = 1)
    y_pred = predictor.predict(test_data_nolab)

    bias = y_pred - y_test
    abs_error = np.abs(bias)

    # A random fold mixes many stations, so each row keeps its own station Id
    # rather than the Id of the first row in the fold.
    station = df['Id'].iloc[test_idx]

    fold_frames.append(pd.DataFrame({'station': station, 'Day': test_data['Day'], 'PM': y_test, 'PM_pred': y_pred, 'rmse': abs_error, 'bias': bias}))

  # Concatenate once at the end instead of growing a DataFrame inside the loop.
  metric_df = pd.concat(fold_frames, ignore_index = True)

  return metric_df

In [6]:
# Cross-validate the 'CAT' model (logged by AutoGluon as CatBoost) with default
# settings, then save the per-observation predictions and errors to CSV.
CAT = cv({'CAT': {}})
CAT.to_csv("Data/output/CV_random/CAT.csv", index = False)

10
Starting training fold 0.


No path specified. Models will be saved in: "AutogluonModels/ag-20220424_231842/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220424_231842/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8851
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.79009, 7.75768)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    12238.44 MB
	Train Data (Original)  Memory Usage: 1.27 MB (0.0% of available memory)
	Inferring data type o

Starting training fold 1.


	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.1s = Fit runtime
	18 features in original data used to generate 18 features in processed data.
	Train Data (Processed) Memory Usage: 1.03 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.1s ...
AutoGluon will gauge predictive performance using evaluation metric: 'r2'
	To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 7993, Val Rows: 889
Fitting 1 L1 models ...
Fitting m

Starting training fold 2.


	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.1s = Fit runtime
	18 features in original data used to generate 18 features in processed data.
	Train Data (Processed) Memory Usage: 1.03 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.1s ...
AutoGluon will gauge predictive performance using evaluation metric: 'r2'
	To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 8004, Val Rows: 890
Fitting 1 L1 models ...
Fitting m

Starting training fold 3.


Fitting 1 L1 models ...
Fitting model: CatBoost ...
	0.7727	 = Validation score   (r2)
	123.27s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.7727	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 123.57s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220424_232453/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220424_232657/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220424_232657/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8904
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stdd

Starting training fold 4.


	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.1s = Fit runtime
	18 features in original data used to generate 18 features in processed data.
	Train Data (Processed) Memory Usage: 1.03 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.1s ...
AutoGluon will gauge predictive performance using evaluation metric: 'r2'
	To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 8013, Val Rows: 891
Fitting 1 L1 models ...
Fitting m

Starting training fold 5.


	0.8251	 = Validation score   (r2)
	123.59s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.8251	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 123.89s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220424_232900/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220424_233104/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220424_233104/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8924
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.8158, 7.79539)
	If 'regression' 

Starting training fold 6.


Fitting 1 L1 models ...
Fitting model: CatBoost ...
	0.8173	 = Validation score   (r2)
	122.61s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.8173	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 122.93s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220424_233104/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220424_233308/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220424_233308/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8940
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stdd

Starting training fold 7.


	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.1s = Fit runtime
	18 features in original data used to generate 18 features in processed data.
	Train Data (Processed) Memory Usage: 1.04 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.11s ...
AutoGluon will gauge predictive performance using evaluation metric: 'r2'
	To change this, specify

Starting training fold 8.


		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.1s = Fit runtime
	18 features in original data used to generate 18 features in processed data.
	Train Data (Processed) Memory Usage: 1.04 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.11s ...
AutoGluon will gauge predictive performance using evaluation metric: 'r2'
	To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 8048, Val Ro

Starting training fold 9.


	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.1s = Fit runtime
	18 features in original data used to generate 18 features in processed data.
	Train Data (Processed) Memory Usage: 1.04 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.11s ...
AutoGluon will gauge predictive performance using evaluation metric: 'r2'
	To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 8049, Val Rows: 895
Fitting 1 L1 models ...
Fitting 

In [7]:
# Cross-validate the 'NN' model (logged by AutoGluon as NeuralNetMXNet) with default
# settings, then save the per-observation predictions and errors to CSV.
NN = cv({'NN': {}})
NN.to_csv("Data/output/CV_random/NN.csv", index = False)

No path specified. Models will be saved in: "AutogluonModels/ag-20220424_233847/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220424_233847/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8849
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.80299, 7.72722)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...


10
Starting training fold 0.


Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    12094.79 MB
	Train Data (Original)  Memory Usage: 1.27 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 4 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		(

Starting training fold 1.


	0.6572	 = Validation score   (r2)
	79.28s	 = Training   runtime
	0.11s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.6572	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 79.76s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220424_234043/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220424_234203/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220424_234203/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8872
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.84812, 7.78869)
	If 'regression' i

Starting training fold 2.


Fitting 1 L1 models ...
Fitting model: NeuralNetMXNet ...
	0.6126	 = Validation score   (r2)
	45.62s	 = Training   runtime
	0.12s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.6126	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 46.1s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220424_234203/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220424_234249/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220424_234249/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8888
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, s

Starting training fold 3.


			Note: Converting 4 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.1s = Fit runtime
	18 features in original data used to generate 18 features in processed data.
	Train Data (Processed) Memory Usage: 1.03 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.11s ...
AutoGluon wi

Starting training fold 4.


Fitting 1 L1 models ...
Fitting model: NeuralNetMXNet ...
	0.5221	 = Validation score   (r2)
	28.86s	 = Training   runtime
	0.12s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.5221	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 29.35s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220424_234341/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220424_234411/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220424_234411/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8936
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, 

Starting training fold 5.


Fitting model: NeuralNetMXNet ...
	0.6844	 = Validation score   (r2)
	64.57s	 = Training   runtime
	0.11s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.6844	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 65.08s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220424_234411/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220424_234516/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220424_234516/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8940
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.

Starting training fold 6.


	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.1s = Fit runtime
	18 features in original data used to generate 18 features in processed data.
	Train Data (Processed) Memory Usage: 1.04 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.1s ...
AutoGluon will gauge predictive performance using evaluation metric: 'r2'
	To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 8046, Val Rows: 894
Fitting 1 L1 models ...
Fitting m

Starting training fold 7.


	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.1s = Fit runtime
	18 features in original data used to generate 18 features in processed data.
	Train Data (Processed) Memory Usage: 1.04 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.12s ...
AutoGluon will gauge predictive performance using evaluation metric: 'r2'
	To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Tra

Starting training fold 8.


	0.6455	 = Validation score   (r2)
	37.52s	 = Training   runtime
	0.11s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.6455	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 37.96s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220424_234725/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220424_234803/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220424_234803/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8959
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (170.6, 0.1, 8.77485, 7.54417)
	If 'regression' i

Starting training fold 9.


	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.1s = Fit runtime
	18 features in original data used to generate 18 features in processed data.
	Train Data (Processed) Memory Usage: 1.04 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.1s ...
AutoGluon will gauge predictive performance using evaluation metric: 'r2'
	To change this, specify 

In [8]:
# Cross-validate the 'GBM' model (gradient boosting; presumably LightGBM -- confirm)
# with default settings, then save the per-observation predictions and errors to CSV.
GBM = cv({'GBM': {}})
GBM.to_csv("Data/output/CV_random/GBM.csv", index = False)

No path specified. Models will be saved in: "AutogluonModels/ag-20220424_235404/"


10
Starting training fold 0.


Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220424_235404/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8848
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.82538, 7.71494)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    12465.86 MB
	Train Data (Original)  Memory Usage: 1.27 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify

Starting training fold 1.


	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 4 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.2s = Fit runtime
	18 features in original data used to generate 18 features in processed data.
	Train Data (Processed) Memory Usage: 1.03 MB (0.0% of available memory)
Data preprocessing

[1000]	valid_set's l2: 8.50861	valid_set's r2: 0.800117


	0.8021	 = Validation score   (r2)
	8.84s	 = Training   runtime
	0.17s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.8021	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 10.24s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220424_235413/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220424_235424/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220424_235424/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8877
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.83753, 7.81792)
	If 'regression' is

Starting training fold 2.


			Note: Converting 4 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.2s = Fit runtime
	18 features in original data used to generate 18 features in processed data.
	Train Data (Processed) Memory Usage: 1.03 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.31s ...
AutoGluon wi

[1000]	valid_set's l2: 7.12128	valid_set's r2: 0.860418


	0.8642	 = Validation score   (r2)
	11.26s	 = Training   runtime
	0.13s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.8642	 = Validation score   (r2)
	0.01s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 12.49s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220424_235424/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220424_235437/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220424_235437/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8904
Train Data Columns: 18
Label Column: PM
Preprocessing data ...


Starting training fold 3.


AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.78231, 7.70126)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    11667.26 MB
	Train Data (Original)  Memory Usage: 1.28 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 4 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting Iden

[1000]	valid_set's l2: 9.52059	valid_set's r2: 0.818894


	0.8228	 = Validation score   (r2)
	4.56s	 = Training   runtime
	0.09s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.8228	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 5.29s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220424_235437/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220424_235443/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220424_235443/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8904
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (108.6, 0.1, 8.78777, 7.32937)
	If 'regression' is 

Starting training fold 4.


	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 4 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.1s = Fit runtime
	18 features in original data used to generate 18 features in processed data.
	Train Data (Processed) Memory Usage: 1.03 MB (0.0% of available memory)
Data preprocessing

[1000]	valid_set's l2: 8.95693	valid_set's r2: 0.860643


	0.86	 = Validation score   (r2)
	2.26s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.86	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 2.76s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220424_235443/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220424_235446/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220424_235446/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8908
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.80929, 7.75725)
	If 'regression' is not 

Starting training fold 5.


	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.1s = Fit runtime
	18 features in original data used to generate 18 features in processed data.
	Train Data (Processed) Memory Usage: 1.03 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.11s ...
AutoGluon will gauge predictive performance using evaluation metric: 'r2'
	To change this, specify

[1000]	valid_set's l2: 11.0342	valid_set's r2: 0.77015
[2000]	valid_set's l2: 10.6772	valid_set's r2: 0.776486
[3000]	valid_set's l2: 10.5022	valid_set's r2: 0.781991


	0.7835	 = Validation score   (r2)
	6.74s	 = Training   runtime
	0.19s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.7835	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 7.87s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220424_235446/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220424_235454/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220424_235454/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8920
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.85898, 7.83486)
	If 'regression' is 

Starting training fold 6.
[1000]	valid_set's l2: 25.1518	valid_set's r2: 0.737579
[2000]	valid_set's l2: 24.0805	valid_set's r2: 0.747618
[3000]	valid_set's l2: 24.0305	valid_set's r2: 0.75033


	0.7496	 = Validation score   (r2)
	7.28s	 = Training   runtime
	0.19s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.7496	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 8.44s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220424_235454/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220424_235503/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220424_235503/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8924
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.80899, 7.78257)
	If 'regression' is 

Starting training fold 7.


	Train Data (Original)  Memory Usage: 1.29 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 4 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes

Starting training fold 8.
[1000]	valid_set's l2: 6.02935	valid_set's r2: 0.852862
[2000]	valid_set's l2: 5.73312	valid_set's r2: 0.859973
[3000]	valid_set's l2: 5.62252	valid_set's r2: 0.862773
[4000]	valid_set's l2: 5.57903	valid_set's r2: 0.865696
[5000]	valid_set's l2: 5.60014	valid_set's r2: 0.864838


	0.866	 = Validation score   (r2)
	10.04s	 = Training   runtime
	0.29s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.866	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 11.66s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220424_235505/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220424_235518/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220424_235518/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8988
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.83128, 7.8337)
	If 'regression' is n

Starting training fold 9.


Fitting 1 L1 models ...
Fitting model: LightGBM ...


[1000]	valid_set's l2: 8.21356	valid_set's r2: 0.80831


	0.8102	 = Validation score   (r2)
	2.52s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.8102	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 3.05s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220424_235518/")


In [9]:
# Cross-validate with only the 'RF' entry enabled; the captured output below
# shows AutoGluon fitting a RandomForest model once per fold.
# NOTE(review): `cv` is defined earlier in the notebook; the dict is presumably
# forwarded to AutoGluon as its `hyperparameters` argument -- confirm there.
rf_model_config = {'RF': {}}
rf_results_path = "Data/output/CV_random/RF.csv"

RF = cv(rf_model_config)

# Persist the returned table without its index column.
# NOTE(review): assumes the output directory already exists -- confirm.
RF.to_csv(rf_results_path, index=False)

No path specified. Models will be saved in: "AutogluonModels/ag-20220425_000340/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220425_000340/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8865
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.82962, 7.78289)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    11766.61 MB
	Train Data (Original)  Memory Usage: 1.28 MB (0.0% of available memory)
	Inferring data type o

10
Starting training fold 0.


	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.1s = Fit runtime
	18 features in original data used to generate 18 features in processed data.
	Train Data (Processed) Memory Usage: 1.03 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.1s ...
AutoGluon will gauge predictive performance using evaluation metric: 'r2'
	To change this, specify 

Starting training fold 1.


	0.7074	 = Validation score   (r2)
	12.93s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.7074	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 14.66s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220425_000356/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220425_000411/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220425_000411/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8876
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.80362, 7.64351)
	If 'regression' is

Starting training fold 2.


	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.1s = Fit runtime
	18 features in original data used to generate 18 features in processed data.
	Train Data (Processed) Memory Usage: 1.03 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.11s ...
AutoGluon will gauge predictive performance using evaluation metric: 'r2'
	To change this, specify

Starting training fold 3.


			Note: Converting 4 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.1s = Fit runtime
	18 features in original data used to generate 18 features in processed data.
	Train Data (Processed) Memory Usage: 1.03 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.11s ...
AutoGluon wi

Starting training fold 4.


	0.7582	 = Validation score   (r2)
	12.86s	 = Training   runtime
	0.21s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.7582	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 15.01s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220425_000442/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220425_000458/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220425_000458/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8929
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.81688, 7.79731)
	If 'regression' i

Starting training fold 5.


	0.711	 = Validation score   (r2)
	13.03s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.711	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 14.74s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220425_000458/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220425_000513/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220425_000513/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8939
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.85959, 7.85278)
	If 'regression' is n

Starting training fold 6.


	Train Data (Processed) Memory Usage: 1.04 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.11s ...
AutoGluon will gauge predictive performance using evaluation metric: 'r2'
	To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 8045, Val Rows: 894
Fitting 1 L1 models ...
Fitting model: RandomForest ...
	0.6942	 = Validation score   (r2)
	12.94s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.6942	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 14.83s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220425_000513/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220425_000529/"
Beginning AutoGluon training ...
AutoGluon will save models

Starting training fold 7.


Fitting 1 L1 models ...
Fitting model: RandomForest ...
	0.6866	 = Validation score   (r2)
	12.83s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.6866	 = Validation score   (r2)
	0.01s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 14.97s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220425_000529/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220425_000544/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220425_000544/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8945
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, st

Starting training fold 8.


	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.1s = Fit runtime
	18 features in original data used to generate 18 features in processed data.
	Train Data (Processed) Memory Usage: 1.04 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.11s ...
AutoGluon will gauge predictive performance using evaluation metric: 'r2'
	To change this, specify the eval_metric parameter of Predictor()
Automatically g

Starting training fold 9.


Fitting 1 L1 models ...
Fitting model: RandomForest ...
	0.8563	 = Validation score   (r2)
	13.05s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.8563	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 14.86s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220425_000600/")


In [10]:
# Cross-validate with only the 'XT' entry enabled; the captured output below
# shows AutoGluon fitting an ExtraTrees model once per fold.
# NOTE(review): `cv` is defined earlier in the notebook; the dict is presumably
# forwarded to AutoGluon as its `hyperparameters` argument -- confirm there.
xt_model_config = {'XT': {}}
xt_results_path = "Data/output/CV_random/XT.csv"

XT = cv(xt_model_config)

# Persist the returned table without its index column.
# NOTE(review): assumes the output directory already exists -- confirm.
XT.to_csv(xt_results_path, index=False)

No path specified. Models will be saved in: "AutogluonModels/ag-20220425_001218/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220425_001218/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8866
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.79383, 7.7627)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    11745.8 MB
	Train Data (Original)  Memory Usage: 1.28 MB (0.0% of available memory)
	Inferring data type of 

10
Starting training fold 0.


			Note: Converting 4 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.1s = Fit runtime
	18 features in original data used to generate 18 features in processed data.
	Train Data (Processed) Memory Usage: 1.03 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.12s ...
AutoGluon wi

Starting training fold 1.


Fitting 1 L1 models ...
Fitting model: ExtraTrees ...
	0.8255	 = Validation score   (r2)
	4.04s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.8255	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 5.76s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220425_001225/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220425_001231/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220425_001231/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8891
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev)

Starting training fold 2.


Fitting 1 L1 models ...
Fitting model: ExtraTrees ...
	0.7407	 = Validation score   (r2)
	4.34s	 = Training   runtime
	0.21s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.7407	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 6.19s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220425_001231/")


Starting training fold 3.


No path specified. Models will be saved in: "AutogluonModels/ag-20220425_001238/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220425_001238/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8893
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.85688, 7.87431)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    11743.96 MB
	Train Data (Original)  Memory Usage: 1.28 MB (0.0% of available memory)
	Inferring data type o

Starting training fold 4.


	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.1s = Fit runtime
	18 features in original data used to generate 18 features in processed data.
	Train Data (Processed) Memory Usage: 1.03 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.12s ...
AutoGluon will gauge predictive performance using evaluation metric: 'r2'
	To change this, specify

Starting training fold 5.


Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 8019, Val Rows: 892
Fitting 1 L1 models ...
Fitting model: ExtraTrees ...
	0.7748	 = Validation score   (r2)
	4.26s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.7748	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 6.28s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220425_001253/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220425_001300/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220425_001300/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8923
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtyp

Starting training fold 6.


Fitting model: ExtraTrees ...
	0.8167	 = Validation score   (r2)
	4.15s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.8167	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 6.1s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220425_001300/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220425_001307/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220425_001307/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8926
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.84271, 7

Starting training fold 7.


Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 8033, Val Rows: 893
Fitting 1 L1 models ...
Fitting model: ExtraTrees ...
	0.6347	 = Validation score   (r2)
	4.56s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.6347	 = Validation score   (r2)
	0.01s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 6.72s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220425_001307/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220425_001314/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220425_001314/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux


Starting training fold 8.


Train Data Rows:    8942
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.82815, 7.77626)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    11745.18 MB
	Train Data (Original)  Memory Usage: 1.29 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 4 features to boolean dtype as they only contain 2 unique values.
	Sta

Starting training fold 9.


	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 4 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.1s = Fit runtime
	18 features in original data used to generate 18 features in processed data.
	Train Data (Processed) Memory Usage: 1.04 MB (0.0% of available memory)
Data preprocessing

In [11]:
# Run the notebook's cv() helper with a hyperparameter dict that names only
# 'KNN' (with an empty settings dict). In the recorded output this produces ten
# folds, each fitting a single KNeighbors model plus WeightedEnsemble_L2.
KNN = cv(dict(KNN={}))
# Write the result to CSV without the index column.
# NOTE(review): cv() presumably returns a pandas DataFrame -- confirm against its definition.
KNN.to_csv("Data/output/CV_random/KNN.csv", index=False)

No path specified. Models will be saved in: "AutogluonModels/ag-20220425_002239/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220425_002239/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8873
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.82078, 7.78391)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    11747.93 MB
	Train Data (Original)  Memory Usage: 1.28 MB (0.0% of available memory)
	Inferring data type o

10
Starting training fold 0.


	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.1s = Fit runtime
	18 features in original data used to generate 18 features in processed data.
	Train Data (Processed) Memory Usage: 1.03 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.11s ...
AutoGluon will gauge predictive performance using evaluation metric: 'r2'
	To change this, specify

Starting training fold 1.


	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.1s = Fit runtime
	18 features in original data used to generate 18 features in processed data.
	Train Data (Processed) Memory Usage: 1.03 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.1s ...
AutoGluon will gauge predictive performance using evaluation metric: 'r2'
	To change this, specify 

Starting training fold 2.


Fitting 1 L1 models ...
Fitting model: KNeighbors ...
	0.1538	 = Validation score   (r2)
	0.02s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.1538	 = Validation score   (r2)
	0.01s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 0.5s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220425_002240-001/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220425_002241/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220425_002241/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8897
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, std

Starting training fold 3.


Fitting 1 L1 models ...
Fitting model: KNeighbors ...
	0.1788	 = Validation score   (r2)
	0.02s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.1788	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 0.48s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220425_002241/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220425_002242/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220425_002242/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8902
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev)

Starting training fold 4.


Fitting 1 L1 models ...
Fitting model: KNeighbors ...
	0.0881	 = Validation score   (r2)
	0.02s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.0881	 = Validation score   (r2)
	0.01s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 0.47s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220425_002242/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220425_002243/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220425_002243/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8912
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev

Starting training fold 5.


Fitting 1 L1 models ...
Fitting model: KNeighbors ...
	0.1821	 = Validation score   (r2)
	0.03s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.1821	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 0.47s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220425_002243/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220425_002243-001/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220425_002243-001/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8914
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean,

Starting training fold 6.


Fitting 1 L1 models ...
Fitting model: KNeighbors ...
	0.1545	 = Validation score   (r2)
	0.03s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.1545	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 0.47s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220425_002243-001/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220425_002244/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220425_002244/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8917
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, std

Starting training fold 7.


Fitting 1 L1 models ...
Fitting model: KNeighbors ...
	0.1071	 = Validation score   (r2)
	0.02s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.1071	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 0.47s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220425_002244/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220425_002245/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220425_002245/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8946
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev)

Starting training fold 8.


	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 4 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.1s = Fit runtime
	18 features in original data used to generate 18 features in processed data.
	Train Data (Processed) Memory Usage: 1.04 MB (0.0% of available memory)
Data preprocessing

Starting training fold 9.


Fitting 1 L1 models ...
Fitting model: KNeighbors ...
	0.113	 = Validation score   (r2)
	0.02s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.113	 = Validation score   (r2)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 0.51s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220425_002246/")


In [12]:
# Run the notebook's cv() helper with the string 'default'. In the recorded
# output each fold fits 11 L1 models (KNeighbors, LightGBM, RandomForest,
# CatBoost, ExtraTrees, FastAI/Torch neural nets, XGBoost, ...) plus
# WeightedEnsemble_L2, taking roughly 220-245 s per fold where a total is shown.
AutoGluon = cv("default")
# Write the result to CSV without the index column.
# NOTE(review): cv() presumably returns a pandas DataFrame -- confirm against its definition.
AutoGluon.to_csv("Data/output/CV_random/AutoGluon.csv", index=False)

No path specified. Models will be saved in: "AutogluonModels/ag-20220425_002306/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220425_002306/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8852
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.868, 7.89958)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    11744.15 MB
	Train Data (Original)  Memory Usage: 1.27 MB (0.0% of available memory)
	Inferring data type of 

10
Starting training fold 0.


			Note: Converting 4 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.1s = Fit runtime
	18 features in original data used to generate 18 features in processed data.
	Train Data (Processed) Memory Usage: 1.03 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.1s ...
AutoGluon wil

[1000]	valid_set's l2: 20.811	valid_set's r2: 0.791245
[2000]	valid_set's l2: 20.3251	valid_set's r2: 0.795398
[3000]	valid_set's l2: 20.1436	valid_set's r2: 0.798475


	0.7988	 = Validation score   (r2)
	6.18s	 = Training   runtime
	0.17s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's l2: 17.4957	valid_set's r2: 0.823328


	0.829	 = Validation score   (r2)
	2.23s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: RandomForestMSE ...
	0.738	 = Validation score   (r2)
	12.23s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: CatBoost ...
	0.8123	 = Validation score   (r2)
	124.74s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	0.7364	 = Validation score   (r2)
	3.82s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	0.7372	 = Validation score   (r2)
	14.52s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: XGBoost ...
	0.7077	 = Validation score   (r2)
	6.27s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	0.7506	 = Validation score   (r2)
	37.32s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: LightGBMLarge ...
	0.7881	 = Validation score   (r2)
	4.56s	 = Training   runtime
	0.07s	 = Validation runtime
Fitting model: WeightedEnsembl

Starting training fold 1.


Fitting 11 L1 models ...
Fitting model: KNeighborsUnif ...
	0.0697	 = Validation score   (r2)
	0.02s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: KNeighborsDist ...
	0.0809	 = Validation score   (r2)
	0.02s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: LightGBMXT ...


[1000]	valid_set's l2: 20.5463	valid_set's r2: 0.761123


	0.7652	 = Validation score   (r2)
	3.51s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's l2: 19.2265	valid_set's r2: 0.776198


	0.7774	 = Validation score   (r2)
	3.41s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: RandomForestMSE ...
	0.6973	 = Validation score   (r2)
	12.23s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: CatBoost ...
	0.7843	 = Validation score   (r2)
	125.54s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	0.6912	 = Validation score   (r2)
	3.94s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	0.7212	 = Validation score   (r2)
	7.33s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: XGBoost ...
	0.7438	 = Validation score   (r2)
	8.05s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	0.7246	 = Validation score   (r2)
	42.93s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: LightGBMLarge ...


[1000]	valid_set's l2: 22.7508	valid_set's r2: 0.735984


	0.7371	 = Validation score   (r2)
	4.52s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.7992	 = Validation score   (r2)
	0.34s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 218.23s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220425_002645/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220425_003023/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220425_003023/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8902
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.81366, 7.76163)
	If 'regression' 

Starting training fold 2.


Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 8011, Val Rows: 891
Fitting 11 L1 models ...
Fitting model: KNeighborsUnif ...
	0.2341	 = Validation score   (r2)
	0.02s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: KNeighborsDist ...
	0.2296	 = Validation score   (r2)
	0.02s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: LightGBMXT ...


[1000]	valid_set's l2: 11.6303	valid_set's r2: 0.797507
[2000]	valid_set's l2: 10.6805	valid_set's r2: 0.812364
[3000]	valid_set's l2: 10.5582	valid_set's r2: 0.815799
[4000]	valid_set's l2: 10.5515	valid_set's r2: 0.816482
[5000]	valid_set's l2: 10.5503	valid_set's r2: 0.814307


	0.817	 = Validation score   (r2)
	9.94s	 = Training   runtime
	0.27s	 = Validation runtime
Fitting model: LightGBM ...
	0.8218	 = Validation score   (r2)
	1.63s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: RandomForestMSE ...
	0.7445	 = Validation score   (r2)
	12.43s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: CatBoost ...
	0.8683	 = Validation score   (r2)
	126.34s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	0.7654	 = Validation score   (r2)
	3.92s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	0.6578	 = Validation score   (r2)
	7.52s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: XGBoost ...
	0.8354	 = Validation score   (r2)
	22.03s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	0.7121	 = Validation score   (r2)
	41.78s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: LightGBMLarge ...


[1000]	valid_set's l2: 11.4363	valid_set's r2: 0.801225


	0.8014	 = Validation score   (r2)
	6.34s	 = Training   runtime
	0.11s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.8683	 = Validation score   (r2)
	0.35s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 239.24s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220425_003023/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220425_003423/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220425_003423/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8903
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.83501, 7.70698)
	If 'regression' 

Starting training fold 3.


Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 8012, Val Rows: 891
Fitting 11 L1 models ...
Fitting model: KNeighborsUnif ...
	0.1441	 = Validation score   (r2)
	0.02s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: KNeighborsDist ...
	0.1547	 = Validation score   (r2)
	0.03s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: LightGBMXT ...


[1000]	valid_set's l2: 10.4716	valid_set's r2: 0.769599
[2000]	valid_set's l2: 9.6924	valid_set's r2: 0.787498
[3000]	valid_set's l2: 9.43439	valid_set's r2: 0.793881
[4000]	valid_set's l2: 9.34752	valid_set's r2: 0.797143
[5000]	valid_set's l2: 9.2591	valid_set's r2: 0.796149
[6000]	valid_set's l2: 9.23123	valid_set's r2: 0.797904
[7000]	valid_set's l2: 9.20435	valid_set's r2: 0.798851
[8000]	valid_set's l2: 9.18818	valid_set's r2: 0.796986


	0.8	 = Validation score   (r2)
	15.07s	 = Training   runtime
	0.42s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's l2: 7.8946	valid_set's r2: 0.828297
[2000]	valid_set's l2: 7.72447	valid_set's r2: 0.830807


	0.8326	 = Validation score   (r2)
	5.49s	 = Training   runtime
	0.15s	 = Validation runtime
Fitting model: RandomForestMSE ...
	0.7265	 = Validation score   (r2)
	12.43s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: CatBoost ...
	0.842	 = Validation score   (r2)
	124.91s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	0.7601	 = Validation score   (r2)
	3.81s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	0.6446	 = Validation score   (r2)
	7.29s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: XGBoost ...
	0.8384	 = Validation score   (r2)
	13.38s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	0.649	 = Validation score   (r2)
	31.79s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: LightGBMLarge ...


[1000]	valid_set's l2: 14.2503	valid_set's r2: 0.686461


	0.6904	 = Validation score   (r2)
	5.37s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.8508	 = Validation score   (r2)
	0.36s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 227.92s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220425_003423/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220425_003812/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220425_003812/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8907
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.781, 7.75658)
	If 'regression' is 

Starting training fold 4.


Fitting 11 L1 models ...
Fitting model: KNeighborsUnif ...
	0.0719	 = Validation score   (r2)
	0.03s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: KNeighborsDist ...
	0.0813	 = Validation score   (r2)
	0.02s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: LightGBMXT ...
	0.8199	 = Validation score   (r2)
	1.73s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's l2: 8.9411	valid_set's r2: 0.836868


	0.8418	 = Validation score   (r2)
	2.29s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: RandomForestMSE ...
	0.8056	 = Validation score   (r2)
	12.25s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: CatBoost ...
	0.7601	 = Validation score   (r2)
	2.19s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	0.8199	 = Validation score   (r2)
	4.12s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	0.7638	 = Validation score   (r2)
	7.21s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: XGBoost ...
	0.7462	 = Validation score   (r2)
	2.11s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	0.747	 = Validation score   (r2)
	41.42s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: LightGBMLarge ...
	0.8508	 = Validation score   (r2)
	2.33s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: WeightedEnsemble_L

Starting training fold 5.


Fitting 11 L1 models ...
Fitting model: KNeighborsUnif ...
	0.1736	 = Validation score   (r2)
	0.03s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: KNeighborsDist ...
	0.1851	 = Validation score   (r2)
	0.02s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: LightGBMXT ...


[1000]	valid_set's l2: 12.2004	valid_set's r2: 0.742543
[2000]	valid_set's l2: 11.4764	valid_set's r2: 0.756466
[3000]	valid_set's l2: 11.3374	valid_set's r2: 0.760231
[4000]	valid_set's l2: 11.3056	valid_set's r2: 0.761966
[5000]	valid_set's l2: 11.2677	valid_set's r2: 0.760954


	0.7636	 = Validation score   (r2)
	10.11s	 = Training   runtime
	0.27s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's l2: 9.10498	valid_set's r2: 0.807711
[2000]	valid_set's l2: 8.68401	valid_set's r2: 0.814965
[3000]	valid_set's l2: 8.75849	valid_set's r2: 0.8159


	0.8166	 = Validation score   (r2)
	7.02s	 = Training   runtime
	0.18s	 = Validation runtime
Fitting model: RandomForestMSE ...
	0.7573	 = Validation score   (r2)
	12.33s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: CatBoost ...
	0.841	 = Validation score   (r2)
	126.59s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	0.7334	 = Validation score   (r2)
	4.02s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	0.7314	 = Validation score   (r2)
	7.55s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: XGBoost ...
	0.8198	 = Validation score   (r2)
	7.54s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	0.6675	 = Validation score   (r2)
	55.46s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: LightGBMLarge ...


[1000]	valid_set's l2: 8.0522	valid_set's r2: 0.829563


	0.8315	 = Validation score   (r2)
	6.64s	 = Training   runtime
	0.11s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.848	 = Validation score   (r2)
	0.37s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 245.69s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220425_003934/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220425_004340/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220425_004340/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8916
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.80922, 7.70471)
	If 'regression' i

Starting training fold 6.


	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 4 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])   :  5 | ['Day', 'Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 13 | ['Lat', 'Lon', 'Elevation', 'Emissions', 'Roads', ...]
		('int', [])       :  1 | ['Day']
		('int', ['bool']) :  4 | ['Forest', 'Plumes_High', 'Plumes_Med', 'Plumes_Low']
	0.1s = Fit runtime
	18 features in original data used to generate 18 features in processed data.
	Train Data (Processed) Memory Usage: 1.03 MB (0.0% of available memory)
Data preprocessing

[1000]	valid_set's l2: 10.8303	valid_set's r2: 0.694413
[2000]	valid_set's l2: 10.6258	valid_set's r2: 0.701128


	0.7039	 = Validation score   (r2)
	3.8s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: LightGBM ...
	0.7226	 = Validation score   (r2)
	1.42s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: RandomForestMSE ...
	0.6647	 = Validation score   (r2)
	12.43s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: CatBoost ...
	0.7989	 = Validation score   (r2)
	125.69s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	0.6762	 = Validation score   (r2)
	3.92s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	0.6444	 = Validation score   (r2)
	7.25s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: XGBoost ...
	0.7656	 = Validation score   (r2)
	8.79s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	0.5521	 = Validation score   (r2)
	27.85s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: LightGBMLarge ...
	0.

Starting training fold 7.


	0.1048	 = Validation score   (r2)
	0.03s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: KNeighborsDist ...
	0.1226	 = Validation score   (r2)
	0.02s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: LightGBMXT ...


[1000]	valid_set's l2: 26.8928	valid_set's r2: 0.673083
[2000]	valid_set's l2: 25.5296	valid_set's r2: 0.68895
[3000]	valid_set's l2: 25.0982	valid_set's r2: 0.694048
[4000]	valid_set's l2: 24.867	valid_set's r2: 0.696782
[5000]	valid_set's l2: 24.7213	valid_set's r2: 0.698945
[6000]	valid_set's l2: 24.5957	valid_set's r2: 0.698765
[7000]	valid_set's l2: 24.5591	valid_set's r2: 0.699902
[8000]	valid_set's l2: 24.5453	valid_set's r2: 0.70053
[9000]	valid_set's l2: 24.5181	valid_set's r2: 0.701328
[10000]	valid_set's l2: 24.5039	valid_set's r2: 0.700418


	0.7021	 = Validation score   (r2)
	17.66s	 = Training   runtime
	0.57s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's l2: 20.4796	valid_set's r2: 0.749425


	0.7516	 = Validation score   (r2)
	2.39s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: RandomForestMSE ...
	0.6104	 = Validation score   (r2)
	12.43s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: CatBoost ...
	0.7515	 = Validation score   (r2)
	125.21s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	0.6349	 = Validation score   (r2)
	3.92s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	0.5997	 = Validation score   (r2)
	7.62s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: XGBoost ...
	0.7473	 = Validation score   (r2)
	6.68s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	0.6418	 = Validation score   (r2)
	33.75s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: LightGBMLarge ...


[1000]	valid_set's l2: 22.1706	valid_set's r2: 0.728123


	0.7304	 = Validation score   (r2)
	5.3s	 = Training   runtime
	0.09s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.7614	 = Validation score   (r2)
	0.32s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 224.14s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220425_004701/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220425_005045/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220425_005045/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8949
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.83523, 7.64313)
	If 'regression' i

Starting training fold 8.


	0.0794	 = Validation score   (r2)
	0.03s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: KNeighborsDist ...
	0.0928	 = Validation score   (r2)
	0.02s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: LightGBMXT ...


[1000]	valid_set's l2: 23.0924	valid_set's r2: 0.728313
[2000]	valid_set's l2: 21.4207	valid_set's r2: 0.748375
[3000]	valid_set's l2: 20.8853	valid_set's r2: 0.752665
[4000]	valid_set's l2: 20.6945	valid_set's r2: 0.754253
[5000]	valid_set's l2: 20.6366	valid_set's r2: 0.757115
[6000]	valid_set's l2: 20.6134	valid_set's r2: 0.756899


	0.7571	 = Validation score   (r2)
	11.97s	 = Training   runtime
	0.32s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's l2: 21.6704	valid_set's r2: 0.741906
[2000]	valid_set's l2: 21.1252	valid_set's r2: 0.750576


	0.7513	 = Validation score   (r2)
	4.36s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: RandomForestMSE ...
	0.6671	 = Validation score   (r2)
	12.33s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: CatBoost ...
	0.7698	 = Validation score   (r2)
	124.63s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	0.6746	 = Validation score   (r2)
	3.91s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	0.7273	 = Validation score   (r2)
	7.16s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: XGBoost ...
	0.774	 = Validation score   (r2)
	7.68s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	0.6274	 = Validation score   (r2)
	32.44s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: LightGBMLarge ...


[1000]	valid_set's l2: 19.6729	valid_set's r2: 0.767428


	0.7682	 = Validation score   (r2)
	5.61s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.7791	 = Validation score   (r2)
	0.34s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 217.86s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220425_005045/")
No path specified. Models will be saved in: "AutogluonModels/ag-20220425_005424/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220425_005424/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    8965
Train Data Columns: 18
Label Column: PM
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (199.1, 0.1, 8.79708, 7.73206)
	If 'regression' i

Starting training fold 9.


Fitting 11 L1 models ...
Fitting model: KNeighborsUnif ...
	0.1788	 = Validation score   (r2)
	0.02s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: KNeighborsDist ...
	0.1709	 = Validation score   (r2)
	0.02s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: LightGBMXT ...


[1000]	valid_set's l2: 9.26583	valid_set's r2: 0.790092
[2000]	valid_set's l2: 8.52056	valid_set's r2: 0.808072
[3000]	valid_set's l2: 8.2473	valid_set's r2: 0.815069


	0.8162	 = Validation score   (r2)
	6.97s	 = Training   runtime
	0.19s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's l2: 6.84598	valid_set's r2: 0.845693


	0.8476	 = Validation score   (r2)
	2.85s	 = Training   runtime
	0.07s	 = Validation runtime
Fitting model: RandomForestMSE ...
	0.7881	 = Validation score   (r2)
	12.23s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: CatBoost ...
	0.8637	 = Validation score   (r2)
	124.34s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	0.7663	 = Validation score   (r2)
	3.92s	 = Training   runtime
	0.21s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	0.6329	 = Validation score   (r2)
	7.34s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: XGBoost ...
	0.8491	 = Validation score   (r2)
	14.01s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	0.598	 = Validation score   (r2)
	33.6s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: LightGBMLarge ...


[1000]	valid_set's l2: 6.7624	valid_set's r2: 0.845632


	0.8492	 = Validation score   (r2)
	5.67s	 = Training   runtime
	0.11s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.8708	 = Validation score   (r2)
	0.34s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 217.85s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220425_005424/")


# Consolidate dataframes

In [13]:
# Merge every per-model prediction file in Data/output/CV_random/ into a single
# long table and write it to Data/output/all_cv_preds_random.csv.
# Each input file is tagged with:
#   * Model     - the file name without its extension
#   * test_area - True when the row's station id appears in Small_Ids.csv

# Station ids are converted to plain Python floats before the membership test.
# NOTE(review): presumably `station` is parsed as float in the prediction
# files, which is why the ids are cast here - confirm against the inputs.
small_ids = pd.read_csv("Data/datasets/Small_Ids.csv")['x'].astype(float).tolist()

# Columns that always come first in the output, in this order.
leading_cols = ['Model', 'test_area', 'station', 'Day', 'PM', 'PM_pred']

# glob returns paths in arbitrary order; sort so the output row order is
# reproducible across runs and machines.
paths = sorted(glob.glob("Data/output/CV_random/*.csv"))

# Collect the tagged tables and concatenate once at the end. Appending to a
# DataFrame inside the loop re-copies all accumulated rows on every iteration,
# and DataFrame.append no longer exists in pandas >= 2.0.
tables = []
for path in paths:
  f = os.path.split(path)[1]
  Model = os.path.splitext(f)[0]
  Table = pd.read_csv(path)
  Table['test_area'] = Table['station'].isin(small_ids)
  Table['Model'] = Model
  tables.append(Table)

if tables:
  full_df = pd.concat(tables, ignore_index = True)
  # Keep the fixed leading columns first, followed by any additional columns
  # the input files carry; a leading column missing from every file is kept
  # as an all-NaN column.
  extra_cols = [col for col in full_df.columns if col not in leading_cols]
  full_df = full_df.reindex(columns = leading_cols + extra_cols)
else:
  # No prediction files found: still produce a header-only table.
  full_df = pd.DataFrame(columns = leading_cols)

full_df.to_csv("Data/output/all_cv_preds_random.csv", index = False)