In [1]:
# load libraries to train a random forest classifier
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from keras.utils import to_categorical
# accuracy score
from sklearn.metrics import accuracy_score
import joblib

In [2]:
path = 'C:/Users/promm/Documents/S2-classification/Data/'

# load train, validation and test data
train = pd.read_csv(os.path.join(path, 'train.csv'))
val = pd.read_csv(os.path.join(path, 'val.csv'))
test = pd.read_csv(os.path.join(path, 'test.csv'))

# filter columns
train = train.filter(regex='b0|ID|specie')
val = val.filter(regex='b0|ID|specie')
test = test.filter(regex='b0|ID|specie')

# concatenate train and validation data
train = pd.concat([train, val], ignore_index=True)

# create a dictionary to convert labels to integers
labels = train['specie'].unique()
labels_dict = dict(zip(labels, range(len(labels))))

# convert labels to integers
train['specie'] = train['specie'].map(labels_dict)
test['specie'] = test['specie'].map(labels_dict)
train_labels = to_categorical(train['specie'])
test_labels_expanded = to_categorical(test['specie'])
test_labels = test['specie']
train = train.drop(['ID','specie'], axis=1)
ids = test['ID']
test = test.drop(['ID','specie'], axis=1)

mpath = 'C:/Users/promm/Documents/S2-classification/rfmodels'

plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300
plt.rcParams['figure.figsize'] = 10,10


In [3]:
# create a grid
grid = {'n_estimators': [100, 200, 300],
        'max_depth': [15, 20, 25],
        'min_samples_leaf': [1, 2, 3, 4]}

grid = [[x,y,z] for x in grid['n_estimators'] for y in grid['max_depth'] for z in grid['min_samples_leaf']]


In [4]:
kind = 'band'

for i in range(len(grid)):
    classifier = RandomForestClassifier(n_estimators=grid[i][0], max_depth=grid[i][1], min_samples_leaf=grid[i][2], random_state=123)
    classifier.fit(train, train_labels)
    # predict on validation data
    test_pred = classifier.predict(test)
    # get back the original labels
    test_pred = np.argmax(test_pred, axis=1)
    ac = accuracy_score(test_labels, test_pred)
    print(f'n_stimators: {grid[i][0]}, max_depth: {grid[i][1]}, min_samples_leaf: {grid[i][2]}')
    print(f'Model: {i}.Accuracy: {ac}')
    df = pd.DataFrame({'FID':ids, 'y_pred': test_pred, 'y_obs': test_labels})
    df.to_csv(os.path.join(mpath, f'pred_{kind}_{i}.csv'), index=False)
    # export model
    joblib.dump(classifier, os.path.join(mpath, f'rf_{kind}_{i}.joblib'))

n_stimators: 100, max_depth: 15, min_samples_leaf: 1
Model: 0.Accuracy: 0.8520833333333333
n_stimators: 100, max_depth: 15, min_samples_leaf: 2
Model: 1.Accuracy: 0.8395833333333333
n_stimators: 100, max_depth: 15, min_samples_leaf: 3
Model: 2.Accuracy: 0.825
n_stimators: 100, max_depth: 15, min_samples_leaf: 4
Model: 3.Accuracy: 0.8083333333333333
n_stimators: 100, max_depth: 20, min_samples_leaf: 1
Model: 4.Accuracy: 0.8583333333333333
n_stimators: 100, max_depth: 20, min_samples_leaf: 2
Model: 5.Accuracy: 0.8458333333333333
n_stimators: 100, max_depth: 20, min_samples_leaf: 3
Model: 6.Accuracy: 0.8270833333333333
n_stimators: 100, max_depth: 20, min_samples_leaf: 4
Model: 7.Accuracy: 0.8104166666666667
n_stimators: 100, max_depth: 25, min_samples_leaf: 1
Model: 8.Accuracy: 0.85625
n_stimators: 100, max_depth: 25, min_samples_leaf: 2
Model: 9.Accuracy: 0.8458333333333333
n_stimators: 100, max_depth: 25, min_samples_leaf: 3
Model: 10.Accuracy: 0.8270833333333333
n_stimators: 100, max_