## Goal
As a first pass at developing a machine learning model to predict California wildfires I will evaluate and tune several different gradient boosting algorithms. The procedure will be as follows:

1. Determine the best gradient boosting for the data
2. Determine the best scoring function/metric for optimization
3. Tune model hyperparameters
4. Investigate feature importance and possibly trim/apply dimensionality reduction techniques to the data

There are two anticipated issues which will need to be dealt with first:

1. Large dataset size - current working dataset has 7.3 million observations of 25 variables and this is likely to grow as the project progresses
2. Highly imbalanced data (~20 times more observations without fire than with)

Future goals are to add several more factors from various data sources including: elevation, population density, time since last fire and total fires.

**Imports and notebook setup**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import math

from random import randint
from time import time
from statistics import mean
from scipy import stats
from sklearn.metrics import make_scorer
from sklearn.metrics import average_precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.fixes import loguniform
from IPython.display import display_markdown
from scipy.interpolate import griddata

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

%matplotlib inline
sns.set_style("ticks")
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
sns.despine()

<Figure size 432x288 with 0 Axes>

**Variable definitions**

In [14]:
raw_data_file = '../data/training_data/1992-1997_training_data.csv'
rolling_window_data_file = '../data/training_data/1992-1997_training_data_rolling_window.csv'
daily_mean_data_file = '../data/training_data/1992-1997_training_data_daily_mean.csv'
rand_seed = 123

data_sample_size = int(7300000 * 0.01)

test_train_split_ratio = 0.3 
classifier_jobs = 15
optimization_jobs = 1
max_jobs = classifier_jobs * optimization_jobs
num_trials = 3
search_iterations = 500
search_scoring_func = make_scorer(average_precision_score)
plot_grid_resolution = 500
contourf_levels = 500

**Functions**

In [12]:
def k_random_sample(data, k):
    n = len(data)
    
    indices = random.sample(range(0, n), k)
    
    return data.iloc[indices]

def stratified_train_test_split(data, targets, rand_seed):
    sss = StratifiedShuffleSplit(
        n_splits=1,
        test_size=test_train_split_ratio, 
        random_state=rand_seed
    )

    for train_index, test_index in sss.split(data, targets):
        x_train, x_test = data.iloc[train_index], data.iloc[test_index]
        y_train, y_test = targets.iloc[train_index], targets.iloc[test_index]
        
        return np.array(x_train), np.array(y_train), np.array(x_test), np.array(y_test)

def sample_data(data, data_sample_size):

    # split positive and negative datsets up
    ignitions = data[data['ignition'] > 0]
    no_ignitions = data[data['ignition'] == 0]
    
    # due to the moving average we will have some positive observations with
    # fractional ignition values
    ignitions = ignitions.assign(ignition=1)
    
    # sample data
    no_ignitions_sample = k_random_sample(no_ignitions, data_sample_size)
    ignitions_sample = k_random_sample(ignitions, data_sample_size)

    # combine
    sampled_data = no_ignitions_sample.append(ignitions_sample)
    
    return sampled_data
    
def calc_false_neg_pos_rate(model, x_test, y_test):
    cm = confusion_matrix(y_test, model.predict(x_test))

    TN = cm[0][0]
    FN = cm[1][0]
    FP = cm[0][1]

    false_neg_rate = FN / (FN + TN)
    false_pos_rate = FP / (FP + TN)
    
    return false_neg_rate, false_pos_rate

def train_model(classifier, x_train, y_train):
    model = classifier
    model.fit(x_train, y_train)
    return model
    

def compare_algorithms(
    classifiers, 
    model_descriptions,
    num_trials,
    data_moving_avg,
    data_sample_size
):
    
    # Set up empty dataframe to hold resutlts
    model_scores_columns = [
        'Classifier',
        'n trials',
        'Sample size',
        'Time (min.)',
        'Peak memory (GB)',
        'Training score +/- SD',
        'Test score +/- SD',
        'False positive rate +/- SD',
        'False negative rate +/- SD'
    ]
    
    model_scores = pd.DataFrame(columns=model_scores_columns)    
    
    # Loop over the diffrent classifiers
    for classifier, description in zip(classifiers, model_descriptions):
        train_scores = []
        test_scores = []
        false_pos_rates = []
        false_neg_rates = []
        mem_usages = []
        
        start = time()
        
        for i in range(num_trials):
            # Resample and train-test split data
            sampled_data = sample_data(data_moving_avg, data_sample_size)
            targets = sampled_data['ignition']
            data = sampled_data.drop(['ignition'], axis=1)
            x_train, x_test, y_train, y_test = train_test_split(data.values, targets.values)
        
            # Initalize and train model on sampled data
            mem_usage, model = memory_usage((train_model, (classifier, x_train, y_train)), retval=True)
            max_mem = max(mem_usage)
            mem_usages.append(max_mem)

            train_scores.append(average_precision_score(model.predict(x_train), y_train))
            test_scores.append(average_precision_score(model.predict(x_test), y_test))
            false_neg_rate, false_pos_rate = calc_false_neg_pos_rate(model, x_test, y_test)
            false_neg_rates.append(false_neg_rate)
            false_pos_rates.append(false_pos_rate)
            
        stop = time()
        dT = np.round(((stop - start)/60), 2)
        
        peak_memory = np.round((max(mem_usages)/1000), 5)
        
        avg_train_score = mean(train_scores)
        avg_test_score = mean(test_scores)
        avg_false_neg_rate = mean(false_neg_rates)
        avg_false_pos_rate = mean(false_pos_rates)
        
        std_train_score = np.std(train_scores)
        std_test_score = np.std(test_scores)
        std_false_neg_rate = np.std(false_neg_rates)
        std_false_pos_rate = np.std(false_pos_rates)
        
        print('{} trials with {} took {} min. on {} observations'.format(
            num_trials, 
            description, 
            dT,
            data_sample_size
        ))
        
        model_scores = model_scores.append(pd.Series([
            description,
            num_trials,
            data_sample_size,
            dT,
            peak_memory,
            '{:.2f} +/- {:.3f}'.format(np.round(avg_train_score,2), np.round(std_train_score,3)),
            '{:.2f} +/- {:.3f}'.format(np.round(avg_test_score,2), np.round(std_test_score,3)),
            '{:.2f} +/- {:.3f}'.format(np.round(avg_false_pos_rate,2), np.round(std_false_pos_rate,3)),
            '{:.2f} +/- {:.3f}'.format(np.round(avg_false_neg_rate,2), np.round(std_false_neg_rate,3)),
        ], index=model_scores.columns), ignore_index=True)
        
        i += 1
        
    return model_scores
    
def test_scoring_functions(
    num_trials,
    scoring_functions, 
    data_moving_avg,
    data_sample_size,
    max_jobs
):
    
    # Set up empty dataframe to hold resutlts
    model_scores_columns = [
        'Scoring function',
        'n',
        'Training score +/- STD',
        'Test score +/- STD',
        'False positive rate +/- STD',
        'False negative rate +/- STD'
    ]
    
    model_scores = pd.DataFrame(columns=model_scores_columns)

    for scoring_function in scoring_functions:
        train_scores = []
        test_scores = []
        false_pos_rates = []
        false_neg_rates = []
        
        for i in range(num_trials):
            # Resample and train-test split data
            sampled_data = sample_data(data_moving_avg, data_sample_size)
            targets = sampled_data['ignition']
            data = sampled_data.drop(['ignition'], axis=1)
            x_train, x_test, y_train, y_test = train_test_split(data.values, targets.values)
            
            catboost_mod = CatBoostClassifier(
                thread_count = max_jobs,
                score_function = scoring_function
            )

            catboost_mod.fit(x_train, y_train, silent=True)

            train_scores.append(average_precision_score(catboost_mod.predict(x_train), y_train))
            test_scores.append(average_precision_score(catboost_mod.predict(x_test), y_test))
            false_neg_rate, false_pos_rate = calc_false_neg_pos_rate(catboost_mod, x_test, y_test)
            false_neg_rates.append(false_neg_rate)
            false_pos_rates.append(false_pos_rate)
            
        avg_train_score = mean(train_scores)
        avg_test_score = mean(test_scores)
        avg_false_neg_rate = mean(false_neg_rates)
        avg_false_pos_rate = mean(false_pos_rates)
        
        std_train_score = np.std(train_scores)
        std_test_score = np.std(test_scores)
        std_false_neg_rate = np.std(false_neg_rates)
        std_false_pos_rate = np.std(false_pos_rates)
        
        model_scores = model_scores.append(pd.Series([
            scoring_function,
            num_trials,
            '{} +/- {}'.format(np.round(avg_train_score,2), np.round(std_train_score,3)),
            '{} +/- {}'.format(np.round(avg_test_score,2), np.round(std_test_score,3)),
            '{} +/- {}'.format(np.round(avg_false_pos_rate,2), np.round(std_false_pos_rate,3)),
            '{} +/- {}'.format(np.round(avg_false_neg_rate,2), np.round(std_false_neg_rate,3)),
        ], index=model_scores.columns), ignore_index=True)
        
        i += 1

    return model_scores

def plot_relative_feature_importance(model, data, x_test, x_tick_size):
    importances = catboost_model.feature_importances_
    indices = np.argsort(importances)[::-1]
    feature_names = np.array(list(data))

    plt.figure(figsize=(20,10))
    plt.rc('axes', titlesize=30)     # fontsize of the axes title
    plt.rc('axes', labelsize=30)    # fontsize of the x and y labels
    plt.rc('xtick', labelsize=x_tick_size)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=25)    # fontsize of the tick labels
    plt.title("Feature importance")
    plt.bar(range(x_test.shape[1]), importances[indices],
           color="darkblue", align="center")
    plt.xticks(np.arange(len(indices)), feature_names[indices], rotation='vertical')
    plt.xlim([-1, x_test.shape[1]])
    plt.xlabel("Feature")
    plt.ylabel("Relative importance")
    
    plt.show()
    
def tune_class_weight(
    class_weights, 
    x_train, 
    y_train, 
    x_test, 
    y_test
):
    model_scores_columns = [
        'Class weight',
        'Training score',
        'Test score',
        'False positive rate',
        'False negative rate'
    ]
    
    model_scores = pd.DataFrame(columns=model_scores_columns)

    for class_weight in class_weights:
        catboost_mod = CatBoostClassifier(
            thread_count = (classifier_jobs * optimization_jobs),
            score_function = 'Cosine',
            scale_pos_weight = class_weight
        )

        catboost_mod.fit(x_train, y_train, silent=True)

        training_score = average_precision_score(catboost_mod.predict(x_train), y_train)
        test_score = average_precision_score(catboost_mod.predict(x_test), y_test)
        false_neg_rate, false_pos_rate = calc_false_neg_pos_rate(catboost_mod, x_test, y_test)
        model_scores = model_scores.append(pd.Series([class_weight, np.round(training_score,2), np.round(test_score,2), np.round(false_pos_rate,2), np.round(false_neg_rate,2)], index=model_scores.columns), ignore_index=True)

    return model_scores

def plot_class_weight_tuning_results(model_scores):
    sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})

    plt.subplots(1,2,figsize=(12,5))
    
    plt.subplot(1, 2, 1)
    
    plt.scatter(
        np.log10(model_scores['Class weight']), 
        model_scores['False positive rate'], 
        s=20, 
        c='darkblue', 
        label='False positive'
    )
    
    plt.scatter(
        np.log10(model_scores['Class weight']), 
        model_scores['False negative rate'], 
        s=20, 
        c='darkred', 
        label='False negative'
    )
    
    plt.legend(loc='upper left')
    plt.xlabel("Log 10 class weight")
    plt.ylabel("Rate")
    plt.title("Class weight and false prediction rates")

    plt.subplot(1, 2, 2)
    
    plt.scatter(
        np.log10(model_scores['Class weight']), 
        model_scores['Training score'], 
        s=20, 
        c='darkblue', 
        label='Training'
    )
    
    plt.scatter(
        np.log10(model_scores['Class weight']),
        model_scores['Test score'],
        s=20, 
        c='darkred', 
        label='Test'
    )
    
    plt.legend(loc='lower right');
    plt.xlabel("Log 10 class weight")
    plt.ylabel("Score")
    plt.title("Class weight and precision-recall score")

    plt.tight_layout()
    plt.show()
    

def train_catboost_with_sampling(
    num_trials,
    known_params,
    data_moving_avg,
    data_sample_size,
    max_jobs
):
    
    # Set up empty dataframe to hold resutlts
    model_scores_columns = [
        'Scoring function',
        'n',
        'Training score +/- STD',
        'Test score +/- STD',
        'False positive rate +/- STD',
        'False negative rate +/- STD'
    ]
    
    model_scores = pd.DataFrame(columns=model_scores_columns)

    train_scores = []
    test_scores = []
    false_pos_rates = []
    false_neg_rates = []
        
    for i in range(num_trials):
        # Resample and train-test split data
        sampled_data = sample_data(data_moving_avg, data_sample_size)
        targets = sampled_data['ignition']
        data = sampled_data.drop(['ignition'], axis=1)
        x_train, x_test, y_train, y_test = train_test_split(data.values, targets.values)

        catboost_mod = CatBoostClassifier(**known_params)
        catboost_mod.fit(x_train, y_train, silent=True)

        train_scores.append(average_precision_score(catboost_mod.predict(x_train), y_train))
        test_scores.append(average_precision_score(catboost_mod.predict(x_test), y_test))
        false_neg_rate, false_pos_rate = calc_false_neg_pos_rate(catboost_mod, x_test, y_test)
        false_neg_rates.append(false_neg_rate)
        false_pos_rates.append(false_pos_rate)
        
        i += 1
            
    avg_train_score = mean(train_scores)
    avg_test_score = mean(test_scores)
    avg_false_neg_rate = mean(false_neg_rates)
    avg_false_pos_rate = mean(false_pos_rates)

    sem_train_score = stats.sem(train_scores)
    sem_test_score = stats.sem(test_scores)
    sem_false_neg_rate = stats.sem(false_neg_rates)
    sem_false_pos_rate = stats.sem(false_pos_rates)

    model_scores = model_scores.append(pd.Series([
        'Catboost with scoring function & class weight',
        num_trials,
        '{} +/- {}'.format(np.round(avg_train_score,2), np.round(sem_train_score,3)),
        '{} +/- {}'.format(np.round(avg_test_score,2), np.round(sem_test_score,3)),
        '{} +/- {}'.format(np.round(avg_false_pos_rate,2), np.round(sem_false_pos_rate,3)),
        '{} +/- {}'.format(np.round(avg_false_neg_rate,2), np.round(sem_false_neg_rate,3)),
    ], index=model_scores.columns), ignore_index=True)

    return model_scores

def tune_hyperparameters(
    known_params,
    param_dist, 
    x_train, 
    y_train, 
    num_jobs, 
    search_iterations, 
    search_scoring_func
):

    # initalize catboost classifier
    model = CatBoostClassifier(**known_params)

    # set up random search
    random_search = RandomizedSearchCV(
        model, 
        param_distributions=param_dist,
        scoring=search_scoring_func,
        n_iter=search_iterations,
        n_jobs=num_jobs
    )

    # run and time search
    start = time()
    best_model = random_search.fit(x_train, y_train)
    print("RandomizedSearchCV took %.f min. for %d candidate"
          " parameter settings." % (((time() - start)/60), search_iterations))
    
    return best_model, random_search

def regularize_grid(x, y, z, resolution):

    # target grid to interpolate to
    xi = np.arange(min(x), max(x), ((max(x) - min(x)) / resolution))
    yi = np.arange(min(y), max(y), ((max(y) - min(y)) / resolution))
    xi, yi = np.meshgrid(xi, yi)

    # interpolate
    zi = griddata((x, y), z, (xi, yi), method='linear')
    
    return xi, yi, zi

## Data preparation

In [16]:
raw_data.head()

Unnamed: 0,weather_bin_time,lat,lon,air.sfc,air.2m,apcp,crain,rhum.2m,dpt.2m,pres.sfc,...,hcdc,mcdc,hpbl,prate,vis,ulwrf.sfc,fire_discovery_time,size,size_class,ignition
0,1992-01-01 00:00:00,40.29749,-124.3408,284.35388,284.81824,0.0,0.0,88.3125,282.8904,100173.56,...,11.0,0.0,533.4578,-3e-06,20007.979,359.9375,,,,
1,1992-01-01 00:00:00,38.96153,-123.5579,285.66638,286.25574,0.0,0.0,67.9375,280.4373,99073.56,...,6.0,0.0,671.8578,-3e-06,20007.979,353.875,,,,
2,1992-01-01 00:00:00,39.2415,-123.6393,286.04138,286.81824,0.0,0.0,69.8125,281.35916,99573.56,...,6.0,0.0,786.9578,-3e-06,20007.979,349.4375,,,,
3,1992-01-01 00:00:00,39.52163,-123.7215,286.04138,286.81824,0.0,0.0,69.8125,281.35916,99473.56,...,6.0,0.0,917.4578,-3e-06,20007.979,349.4375,,,,
4,1992-01-01 00:00:00,39.80193,-123.8045,283.35388,284.06824,0.0,0.0,68.8125,278.55447,96573.56,...,6.0,0.0,833.7578,-3e-06,20007.979,341.625,,,,


In [18]:
raw_data = pd.read_csv(raw_data_file, low_memory=False)
raw_data['ignition'].fillna(0, inplace=True)
raw_data['weather_bin_time'] = pd.to_datetime(raw_data['weather_bin_time'])
raw_data['weather_bin_day'] = raw_data['weather_bin_time'].dt.day
raw_data['weather_bin_month'] = raw_data['weather_bin_time'].dt.month
raw_data['weather_bin_year'] = raw_data['weather_bin_time'].dt.year
raw_data.drop(['fire_discovery_time', 'size', 'size_class', 'weather_bin_time'], axis=1, inplace=True)
raw_data.dropna(inplace=True)

data_rolling_window = pd.read_csv(rolling_window_data_file, low_memory=False)

data_daily_mean = pd.read_csv(daily_mean_data_file, low_memory=False)

## Baseline: default and oracle models

Goal here is to establish upper and lower bounds on what is possible with our dataset. We will do this by creating two models:

1. 'Oracle' - best possible performance, train directly on the test set with no regularization
2. Default - predict major class for all test set observations

These two models will give us a context in which to evaluate how well we are doing. At the same time we will also evaluate three diffrent versions of our dataset.

1. Raw data
2. Data averaged with a 24 hr rolling mean
3. Data averaged by day

## Classifier selection: kitchen sink approach

First step is to throw a bunch of different classifiers with default settings at the problem and see how they do. See list below for contenders.

In [8]:
# define classifiers to test
model_descriptions = [
    'XGBoost',
    'CatBoost',
    'LightGBM',
    'Rand. forest',
    "Linear SVM",
    "RBF SVM",
    "Gaussian Proc.",
    "Decision Tree",
    "AdaBoost",
    "Naive Bayes",
    "QDA"
]

classifiers = (
    XGBClassifier(n_jobs = max_jobs), 
    CatBoostClassifier(thread_count = max_jobs, silent = True), 
    LGBMClassifier(n_jobs = max_jobs),
    RandomForestClassifier(n_jobs = max_jobs),
    SVC(kernel = "linear"),
    SVC(),
    GaussianProcessClassifier(n_jobs = max_jobs),
    DecisionTreeClassifier(),
    AdaBoostClassifier(),
    GaussianNB(), #failed on 73000 observations: Unable to allocate array with shape (109500, 109500) and data type float64
    QuadraticDiscriminantAnalysis()
)

Start with a small sample of the data to quickly get a sense of how well each classifier works. Hopefully we can discard some to make test run times shorter in the next round.

In [9]:
from memory_profiler import memory_usage

# start with a smaller sample to quickly get an inital sense of how well 
# each classifier works
data_sample_size = 5000
num_trials = 50

# repeatedly test each classifier on new samples of the data
model_score_comparision = compare_algorithms(
    classifiers, 
    model_descriptions,
    num_trials,
    data_moving_avg,
    data_sample_size
)

model_score_comparision = model_score_comparision.sort_values(
    ['False negative rate +/- SD', 'False positive rate +/- SD'], 
    ascending=[1, 1]
)

model_score_comparision

50 trials with XGBoost took 0.95 min. on 5000 observations
50 trials with CatBoost took 5.64 min. on 5000 observations
50 trials with LightGBM took 0.98 min. on 5000 observations
50 trials with Rand. forest took 1.52 min. on 5000 observations
50 trials with Linear SVM took 34.97 min. on 5000 observations
50 trials with RBF SVM took 7.52 min. on 5000 observations
50 trials with Gaussian Proc. took 42.57 min. on 5000 observations
50 trials with Decision Tree took 0.97 min. on 5000 observations
50 trials with AdaBoost took 1.68 min. on 5000 observations
50 trials with Naive Bayes took 0.8 min. on 5000 observations
50 trials with QDA took 0.81 min. on 5000 observations


Unnamed: 0,Classifier,n trials,Sample size,Time (min.),Peak memory (GB),Training score +/- SD,Test score +/- SD,False positive rate +/- SD,False negative rate +/- SD
1,CatBoost,50,5000,5.64,2.88154,0.91 +/- 0.005,0.78 +/- 0.013,0.22 +/- 0.014,0.17 +/- 0.014
2,LightGBM,50,5000,0.98,2.88854,0.90 +/- 0.005,0.77 +/- 0.010,0.22 +/- 0.014,0.18 +/- 0.011
0,XGBoost,50,5000,0.95,2.83508,0.79 +/- 0.007,0.76 +/- 0.008,0.26 +/- 0.012,0.19 +/- 0.011
3,Rand. forest,50,5000,1.52,2.92854,1.00 +/- 0.000,0.76 +/- 0.013,0.23 +/- 0.015,0.19 +/- 0.013
10,QDA,50,5000,0.81,3.39432,0.80 +/- 0.007,0.80 +/- 0.009,0.45 +/- 0.023,0.19 +/- 0.013
5,RBF SVM,50,5000,7.52,2.92833,0.92 +/- 0.004,0.92 +/- 0.005,0.80 +/- 0.011,0.19 +/- 0.022
8,AdaBoost,50,5000,1.68,3.3915,0.76 +/- 0.008,0.75 +/- 0.012,0.28 +/- 0.013,0.20 +/- 0.014
9,Naive Bayes,50,5000,0.8,3.3915,0.79 +/- 0.005,0.79 +/- 0.010,0.49 +/- 0.016,0.23 +/- 0.017
4,Linear SVM,50,5000,34.97,2.92833,0.71 +/- 0.010,0.70 +/- 0.013,0.32 +/- 0.014,0.25 +/- 0.015
7,Decision Tree,50,5000,0.97,3.39159,1.00 +/- 0.000,0.66 +/- 0.012,0.28 +/- 0.016,0.28 +/- 0.015


Looks like we definitely do not want Gaussian process. We should also probably exclude the SVM based algorithms on the grounds of performance (RBF) and compute time (linear). It's tempting to drop decision trees as well due to likely overfitting, but it might be possible to ameliorate that at a later phase. 

In [10]:
# exclude algorithms
model_descriptions.remove('Gaussian Process')
model_descriptions.remove('Linear SVM')
model_descriptions.remove('RBF SVM')

classifiers = (
    XGBClassifier(n_jobs = max_jobs), 
    CatBoostClassifier(thread_count = max_jobs, silent = True), 
    LGBMClassifier(n_jobs = max_jobs),
    RandomForestClassifier(n_jobs = max_jobs),
    #SVC(kernel = "linear"),
    #SVC(),
    #GaussianProcessClassifier(n_jobs = max_jobs),
    DecisionTreeClassifier(),
    AdaBoostClassifier(),
    GaussianNB(), #failed on 73000 observations: Unable to allocate array with shape (109500, 109500) and data type float64
    QuadraticDiscriminantAnalysis()
)

ValueError: list.remove(x): x not in list

Before we make a choice, let's see how well the classifiers scale in terms of compute time and memory usage as the dataset gets larger. We don't want to devote time and energy to optimizing a classifier which won't be able to handle the whole dataset.

In [None]:
# set sample size and number of trials per round
data_sample_size = 1000
num_trials = 1

# Set up empty dataframe to hold results
increasing_n_results_columns = [
    'Classifier',
    'n trials',
    'Sample size',
    'Time (min.)',
    'Peak memory (GB)',
    'Training score +/- SD',
    'Test score +/- SD',
    'False positive rate +/- SD',
    'False negative rate +/- SD'
]

increasing_n_results = pd.DataFrame(columns=increasing_n_results_columns)

for i in range(11):
    
    # repeatedly test each classifier on new samples of the data
    model_score_comparision = compare_algorithms(
        classifiers, 
        model_descriptions,
        num_trials,
        data_moving_avg,
        data_sample_size
    )
    
    # double sample size
    data_sample_size = data_sample_size * 2
    
    increasing_n_results = increasing_n_results.append(model_score_comparision)

In [None]:
ax = sns.lineplot(x='Sample size', y='Time (min.)', hue='Classifier', data=increasing_n_results)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()

In [None]:
ax = sns.lineplot(x='Sample size', y='Peak memory (GB)', hue='Classifier', data=increasing_n_results)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()

In [None]:
model_score_history_columns = [
    'Description',
    'n',
    'Training score +/- STD',
    'Test score +/- STD',
    'False positive rate +/- STD',
    'False negative rate +/- STD'
]

model_score_history = pd.DataFrame(columns=model_score_history_columns)

model_score_history = model_score_history.append(pd.Series([
    'Default CatBoost model',
    num_trials,
    model_scores.iloc[1]['Training score +/- STD'], 
    model_scores.iloc[1]['Test score +/- STD'], 
    model_scores.iloc[1]['False positive rate +/- STD'], 
    model_scores.iloc[1]['False negative rate +/- STD'],
], index=model_score_history.columns), ignore_index=True)

model_score_history

## CatBoost scoring function selection

In [None]:
# note: CatBoost only has cosine and L2 avalible for non GPU computation
scoring_functions = [
#    'SolarL2',
     'Cosine', 
#     'NewtonL2', 
#     'NewtonCosine', 
#     'LOOL2', 
#     'SatL2', 
     'L2'
]

model_scores = test_scoring_functions(
    num_trials,
    scoring_functions, 
    data_moving_avg,
    data_sample_size,
    max_jobs
)

model_scores

Ok, looks similar, the Cosine false negative rate is slightly lower on some trials, so let's go with that. Before moving on to hyperparameter optimization, let's take a look at our relative feature importances and see if we can simplify the model any.

Before we move on, update out hyperparameter dictionary with our new finding and save the score results to a dataframe so we can track our progress.

In [None]:
known_params = {
    'random_state': rand_seed,
    'thread_count': max_jobs,
    'score_function': 'Cosine',
    'silent': True
}

model_score_history = model_score_history.append(pd.Series([
    'CatBoost model with Cosine scoring function',
    num_trials,
    model_scores.iloc[0]['Training score +/- STD'], 
    model_scores.iloc[0]['Test score +/- STD'], 
    model_scores.iloc[0]['False positive rate +/- STD'], 
    model_scores.iloc[0]['False negative rate +/- STD'],
], index=model_score_history.columns), ignore_index=True)

model_score_history

## Feature importance

In [None]:
# Retrain catboost model on new train-test split with new hyperparameters
sampled_data = sample_data(data_moving_avg, data_sample_size)
targets = sampled_data['ignition']
data = sampled_data.drop(['ignition'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(data.values, targets.values)
          
catboost_model = CatBoostClassifier(**known_params)
catboost_model.fit(x_train, y_train)

plot_relative_feature_importance(catboost_model, data, x_test, 20)

Interesting, some features which were expected to be important are fairly far down on the list. For example: air temperature at 2 meters and u/v components of wind speed. Before we go throwing variables away or trying to reduce dimensionality let's do some hyperparameter optimization and see if the relative feature importances change.

## Hyperparameter optimization: class weight
We sampled our data so we have the same number of positive and negative results so we may not need a class weight. Let's try a few values and see how if effects our false positive and false negative rates.

In [None]:
class_weights = np.logspace(-2.5, 2.5, num=25, base=10)

model_scores = tune_class_weight(
    class_weights, 
    x_train, 
    y_train, 
    x_test, 
    y_test
)

plot_class_weight_tuning_results(model_scores)

Interesting. We can use larger class weights to drive down the false negative rate at the expense of false positives. Looking at the clearly sigmoidal precision recall curve, the optimum value looks like it's around 1. Before just using no weight, let's try this again with a narrower range.

In [None]:
class_weights = np.logspace(-0.2, 0.2, num=25, base=10)

model_scores = tune_class_weight(
    class_weights, 
    x_train, 
    y_train, 
    x_test, 
    y_test
)

plot_class_weight_tuning_results(model_scores)

Wow, this one is actually pretty subjective... I'm calling it somewhere between one and three. Let's use 1.8 for now. This should give us ~90/85 precision-recall and put our false positive and false negative rates at 0.25 and ~0.1 respectively.

In [None]:
known_params = {
    'random_state': rand_seed,
    'thread_count': max_jobs,
    'score_function': 'Cosine',
    'silent': True,
    'scale_pos_weight': 1.78
}

**Note:** Important to keep in mind here that we can tune our false positive/false negative rates easily with this hyperparameter.

In [None]:
# Train model with new class weight
model_scores = train_catboost_with_sampling(
    num_trials,
    known_params,
    data_moving_avg,
    data_sample_size,
    max_jobs
)

In [None]:
model_score_history = model_score_history.append(pd.Series([
    'CatBoost with scoring function & class weight',
    num_trials,
    model_scores.iloc[0]['Training score +/- STD'], 
    model_scores.iloc[0]['Test score +/- STD'], 
    model_scores.iloc[0]['False positive rate +/- STD'], 
    model_scores.iloc[0]['False negative rate +/- STD'],
], index=model_score_history.columns), ignore_index=True)

model_score_history

## Hyperparameter tuning: learning rate and tree count

First, let's try taking one sample of the full dataset and then using RandomizedSearchCV to try and find the best values for learning rate and tree count.

In [None]:
param_dist = {
    'learning_rate': loguniform(0.0001, 1),
    'n_estimators': range(1,200)
}

# Resample and train-test split data
sampled_data = sample_data(data_moving_avg, data_sample_size)
targets = sampled_data['ignition']
data = sampled_data.drop(['ignition'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(data.values, targets.values) 

best_model, random_search = tune_hyperparameters(
    known_params,
    param_dist, 
    x_train, 
    y_train, 
    optimization_jobs, 
    search_iterations, 
    search_scoring_func
)

In [None]:
rand_search_results = pd.DataFrame(random_search.cv_results_).dropna()

x = rand_search_results['param_n_estimators']
y = rand_search_results['param_learning_rate']
z = rand_search_results['mean_test_score']
xi, yi, zi = regularize_grid(x, y, z, plot_grid_resolution)

sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})

plt.contourf(xi, yi, zi, contourf_levels, cmap=plt.cm.Blues)
plt.xlabel("N estimators")
plt.ylabel("Learning rate")
plt.title("Effect of estimator count and \nlearning rate on score")
plt.colorbar()
plt.show()

Surprised by how 'rough' hyperparameter space is. Let's keep the winning numbers and save the scores to our log.

In [None]:
best_learning_rate = rand_search_results.iloc[0]['param_learning_rate']
best_n_estimators = rand_search_results.iloc[0]['param_n_estimators']

known_params = {
    'random_state': rand_seed,
    'thread_count': classifier_jobs,
    'score_function': 'Cosine',
    'silent': True,
    'scale_pos_weight': 1.78,
    'learning_rate': best_learning_rate,
    'n_estimators': best_n_estimators
}

# Resample and train-test split data
sampled_data = sample_data(data_moving_avg, data_sample_size)
targets = sampled_data['ignition']
data = sampled_data.drop(['ignition'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(data.values, targets.values) 

# Train model with new hyperparameters
model_scores = train_catboost_with_sampling(
    num_trials,
    known_params,
    data_moving_avg,
    data_sample_size,
    max_jobs
)

# Add results to score history dataframe
model_score_history = model_score_history.append(pd.Series([
    'CatBoost with n estimators and learning rate',
    num_trials,
    model_scores.iloc[0]['Training score +/- STD'], 
    model_scores.iloc[0]['Test score +/- STD'], 
    model_scores.iloc[0]['False positive rate +/- STD'], 
    model_scores.iloc[0]['False negative rate +/- STD'],
], index=model_score_history.columns), ignore_index=True)

model_score_history

Now, just for fun, let's see how our winning values for number of estimators and learning rate vary across repeated trials with different sample of the data.

In [None]:
param_dist = {
    'learning_rate': loguniform(0.0001, 1),
    'n_estimators': range(1,200)
}

scores = []
learning_rates = []
n_estimators = []

itterations = 100

for i in range(itterations):
    # Resample and train-test split data
    sampled_data = sample_data(data_moving_avg, data_sample_size)
    targets = sampled_data['ignition']
    data = sampled_data.drop(['ignition'], axis=1)
    x_train, x_test, y_train, y_test = train_test_split(data.values, targets.values) 
    
    best_model, random_search = tune_hyperparameters(
        known_params,
        param_dist, 
        x_train, 
        y_train, 
        optimization_jobs, 
        search_iterations, 
        search_scoring_func
    )
    
    rand_search_results = pd.DataFrame(random_search.cv_results_).dropna()
    winner = rand_search_results[rand_search_results['rank_test_score'] == 1]
    avg_score = winner.iloc[0]['mean_test_score']
    learning_rate = winner.iloc[0]['param_learning_rate']
    n_estimator = winner.iloc[0]['param_n_estimators']
    
    scores.append(avg_score)
    learning_rates.append(learning_rate)
    n_estimators.append(n_estimator)
    
    i += 1

In [None]:
x = n_estimators
y = learning_rates
z = scores
xi, yi, zi = regularize_grid(x, y, z, plot_grid_resolution)

sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})

plt.contourf(xi, yi, zi, contourf_levels, cmap=plt.cm.Blues)
plt.xlabel("N estimators")
plt.ylabel("Learning rate")
plt.title("Winning estimator count and\n learning rate vs score")
plt.colorbar()
plt.show()

## Hyperparameter tuning: tree depth and L2 coefficient

In [None]:
param_dist = {
    'depth': range(1, 21, 1),
    'l2_leaf_reg': np.linspace(0, 10, 101)
}

best_model, random_search = tune_hyperparameters(
    known_params,
    param_dist, 
    x_train, 
    y_train, 
    optimization_jobs, 
    search_iterations, 
    search_scoring_func
)

In [None]:
rand_search_results = pd.DataFrame(random_search.cv_results_).dropna()

x = rand_search_results['param_depth']
y = rand_search_results['param_l2_leaf_reg']
z = rand_search_results['mean_test_score']
xi, yi, zi = regularize_grid(x, y, z, plot_grid_resolution)

sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})

plt.contourf(xi, yi, zi, contourf_levels, cmap=plt.cm.Blues)
plt.xlabel("Tree depth")
plt.ylabel("L2 coefficient")
plt.title("Effect of tree depth and \nL2 coefficient on score")
plt.colorbar()
plt.show()

Again, store winning parameters in dictionary and add score results to log.

In [None]:
best_depth = rand_search_results.iloc[0]['param_depth']
best_l2_leaf_reg = rand_search_results.iloc[0]['param_l2_leaf_reg']

known_params = {
    'random_state': rand_seed,
    'thread_count': classifier_jobs,
    'score_function': 'Cosine',
    'silent': True,
    'scale_pos_weight': 1.78,
    'learning_rate': best_learning_rate,
    'n_estimators': best_n_estimators,
    'depth': best_depth,
    'l2_leaf_reg': best_l2_leaf_reg
}

# Resample and train-test split data
sampled_data = sample_data(data_moving_avg, data_sample_size)
targets = sampled_data['ignition']
data = sampled_data.drop(['ignition'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(data.values, targets.values) 

# Train model with new hyperparameters
model_scores = train_catboost_with_sampling(
    num_trials,
    known_params,
    data_moving_avg,
    data_sample_size,
    max_jobs
)

# Add results to score history dataframe
model_score_history = model_score_history.append(pd.Series([
    'CatBoost with depth and L2 coefficient',
    num_trials,
    model_scores.iloc[0]['Training score +/- STD'], 
    model_scores.iloc[0]['Test score +/- STD'], 
    model_scores.iloc[0]['False positive rate +/- STD'], 
    model_scores.iloc[0]['False negative rate +/- STD'],
], index=model_score_history.columns), ignore_index=True)

model_score_history

## Final model: feature importance

In [None]:
best_model = CatBoostClassifier(**known_params)
best_model.fit(x_train, y_train)

plot_relative_feature_importance(catboost_model, data, x_test, 20)

## Final model: effect of feature count

In [None]:
train_scores = list()
test_scores = list()

feature_names = np.array(list(data))
importances = catboost_model.feature_importances_
indices = np.argsort(importances)[::-1]

for n in range(1,(len(feature_names) + 1)):
    # grab top n feature names
    top_n_features = feature_names[indices[0:n]]

    # rebuild training and test sets with feature subset
    sampled_data = sample_data(data_moving_avg, data_sample_size)
    targets = sampled_data['ignition']
    data = sampled_data.drop(['ignition'], axis=1)
    data_subset = data[top_n_features]
    x_train_subset, x_test_subset, y_train_subset, y_test_subset  = train_test_split(data_subset, targets)

    # instantiate and train classifier
    catboost_model = CatBoostClassifier(**known_params)
    catboost_model.fit(x_train_subset, y_train_subset)

    # report score for feature subset
    training_score = average_precision_score(catboost_model.predict(x_train_subset), y_train_subset)
    test_score = average_precision_score(catboost_model.predict(x_test_subset), y_test_subset)
    
    train_scores.append(training_score)
    test_scores.append(test_score)
    
    #print('Top {} features, precision-recall score train/test: {}/{}'.format(n,np.round(training_score,2),np.round(test_score,2)))

In [None]:
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})

fig = plt.figure()
ax = fig.add_subplot(111)

ax.scatter(range(1,(len(feature_names) + 1)), train_scores, s=20, c='darkblue', label='Training data')
ax.scatter(range(1,(len(feature_names) + 1)), test_scores, s=20, c='darkred', label='Test data')
plt.legend(loc='upper right')
plt.title("Model performance and feature count")
plt.xlabel("N features")
plt.ylabel("Avg. precision-recall score")
plt.show()

## Final model: robustness

In [None]:
train_scores = list()
test_scores = list()
false_neg_rates = list()
false_pos_rates = list()

# grab top n feature names
indices = np.argsort(importances)[::-1]
top_n_features = feature_names[indices[0:7]]

for i in range(0, 1000):
    # rebuild training and test sets with feature subset
    sampled_data = sample_data(data_moving_avg, data_sample_size)
    targets = sampled_data['ignition']
    data = sampled_data.drop(['ignition'], axis=1)
    data_subset = data[top_n_features]
    x_train, x_test, y_train, y_test = train_test_split(data_subset.values, targets.values)

    # instantiate and train classifier
    catboost_model = CatBoostClassifier(**known_params)
    catboost_model.fit(x_train, y_train)

    predicted_y = catboost_model.predict(x_test)
    
    training_score = average_precision_score(catboost_model.predict(x_train), y_train)
    test_score = average_precision_score(predicted_y, y_test)

    train_scores.append(training_score)
    test_scores.append(test_score)
    
    false_neg_rate, false_pos_rate = calc_false_neg_pos_rate(catboost_model, x_test, y_test)

    false_neg_rates.append(false_neg_rate)
    false_pos_rates.append(false_pos_rate)

In [None]:
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})

ax = sns.kdeplot(train_scores, label="Training data", shade=True, color="darkblue")
ax = sns.kdeplot(test_scores, label="Test data", shade=True, color="darkred")
ax.set_title("Precision-recall score distributions")
ax.set(xlabel='Avg. precision-recall score', ylabel='Density')
plt.legend()
plt.legend(loc='upper left')

plt.show()

In [None]:
ax = sns.kdeplot(false_neg_rates, label="False negative", shade=True, color="darkblue")
ax = sns.kdeplot(false_pos_rates, label="False positive", shade=True, color="darkred")
ax.set_title("False positive and negative rates")
ax.set(xlabel='Rate', ylabel='Density')
plt.legend()

plt.show()

In [None]:
plt.scatter(false_neg_rates, false_pos_rates, s=5)
plt.xlabel("False negative rate")
plt.ylabel("False positive rate")
plt.title("False positive vs false negative rate")

plt.show()

## Final results

In [None]:
model_score_comparisions

In [None]:
model_score_history