 In this notebook I train a Random Forest model to predict which class (quarry blast, earthquake, or MIS) an event belongs to, using between 1 and 12 features computed by Keith and Relu. I did not do any hyperparameter tuning beyond exploring the `class_weight` option. I also take a quick look at how few features an event can have before performance drops.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix, ConfusionMatrixDisplay

# Load in the data

In [2]:
# The file is whitespace-delimited. `\s+` is a regular expression, so it is written
# as a raw string: in a normal string literal "\s" is an invalid escape sequence
# and Python emits a warning for it.
original_data = pd.read_csv("../data/delivery.2023.03.15.dat", delimiter=r'\s+')

  original_data = pd.read_csv("../data/delivery.2023.03.15.dat", delimiter='\s+')


In [3]:
# Preview the first five rows of the raw table
original_data.head(n=5)

Unnamed: 0,Date,Time,Elat,Elon,Dep,Tag,F01,F02,F03,F04,F05,F06,F07,F08,F09,F10,F11,F12
0,2012-10-01,16:51:31.20,39.141,-111.655,19.5,eq,0.22,X,X,X,X,X,X,X,X,X,X,X
1,2012-10-01,17:02:17.22,40.5,-112.155,-2.0,ex,0.10,X,X,X,X,X,X,X,X,X,X,X
2,2012-10-01,22:38:43.13,40.511,-112.175,-2.0,ex,-0.05,0.52,0.23,-65.4,-88.5,12.87,13.30,X,X,X,0.32,0.62
3,2012-10-02,00:21:13.94,39.709,-113.274,-2.0,ex,X,X,X,X,X,X,X,X,X,X,X,X
4,2012-10-02,18:49:25.22,40.514,-112.161,-2.0,ex,0.00,0.24,0.25,-83.6,-116.6,14.62,12.40,-0.37,0.01,-0.01,0.18,0.44


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
original_data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Elat,Elon,Dep
count,8457.0,8457.0,8457.0
mean,39.877596,-111.821029,5.346222
std,1.24663,0.954187,6.406693
min,36.773,-114.093,-3.5
25%,39.41,-112.169,-1.5
50%,40.502,-112.051,6.8
75%,40.751,-111.577,9.4
max,42.499,-108.556,55.8


In [5]:
# Number of events per class tag in the raw data
original_data.loc[:, 'Tag'].value_counts()

Tag
eq     5234
ex     1540
mis    1494
gis     189
Name: count, dtype: int64

### Replace "X" with nan for missing features 

In [6]:
# Missing feature values are stored as the string "X"; turn them into NaN.
# `replace` already returns a new DataFrame, so `original_data` is left untouched
# without making an extra full copy first.
data = original_data.replace(to_replace='X', value=np.nan)

### Drop the "gis" tag because I don't want to deal with 4 classes

In [7]:
# Keep every row whose Tag is not "gis".
# A boolean mask is used instead of `data.drop(np.where(...)[0])`: `np.where`
# yields row *positions* while `DataFrame.drop` expects index *labels*, and the
# two only coincide while the frame still has its default 0..n-1 index.
data = data[data['Tag'] != 'gis']

In [8]:
# Class counts after removing the "gis" events
data.loc[:, 'Tag'].value_counts()

Tag
eq     5234
ex     1540
mis    1494
Name: count, dtype: int64

### Drop rows without any features because that is totally useless

In [9]:
# Feature columns are named F01 ... F12 (two digits, zero-padded)
feature_col_names = [f"F{feature_num:02d}" for feature_num in range(1, 13)]

In [10]:
# Drop rows missing all features
print(data.shape)
data = data[~np.all(np.isnan(data[feature_col_names].astype(float)), axis=1)]
print(data.shape)

(8268, 18)
(5323, 18)


### Get the features

In [11]:
# Feature matrix: the feature columns only, converted from strings to float64
X_df = data.loc[:, feature_col_names].astype(np.float64)

In [12]:
# Preview the first five rows of the feature matrix
X_df.head(n=5)

Unnamed: 0,F01,F02,F03,F04,F05,F06,F07,F08,F09,F10,F11,F12
0,0.22,,,,,,,,,,,
1,0.1,,,,,,,,,,,
2,-0.05,0.52,0.23,-65.4,-88.5,12.87,13.3,,,,0.32,0.62
4,0.0,0.24,0.25,-83.6,-116.6,14.62,12.4,-0.37,0.01,-0.01,0.18,0.44
5,0.22,0.65,0.12,-77.6,-73.6,12.11,14.46,,,,,


In [13]:
# Per-feature summary statistics; `count` is the number of non-missing values
X_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,F01,F02,F03,F04,F05,F06,F07,F08,F09,F10,F11,F12
count,3735.0,2401.0,4354.0,2409.0,4377.0,2402.0,4284.0,1565.0,1565.0,1565.0,1803.0,3505.0
mean,-0.101274,0.407926,0.447072,-99.048651,-109.178798,14.896132,14.184463,-0.476863,-0.366773,-0.315297,0.178813,0.262599
std,0.298579,0.487115,0.434165,22.315263,20.60637,1.214644,1.474393,0.175548,0.181528,0.20547,0.22159,0.232294
min,-1.52,-1.52,-1.31,-181.3,-184.0,10.21,10.06,-0.99,-0.99,-0.99,-0.5,-0.41
25%,-0.29,0.13,0.18,-114.2,-123.3,14.05,13.2075,-0.6,-0.47,-0.44,0.01,0.1
50%,-0.07,0.39,0.42,-100.2,-109.5,14.91,14.55,-0.48,-0.37,-0.32,0.16,0.22
75%,0.08,0.65,0.68,-84.5,-95.4,15.87,15.22,-0.38,-0.27,-0.21,0.34,0.4
max,2.55,4.2,4.41,-24.1,-18.1,17.76,17.98,0.47,0.67,0.7,1.03,1.22


### Quickly look at how often each feature is missing

In [14]:
# Number of missing (NaN) values per feature column, in the column order of `X_df`.
# Summing the NaN mask column-wise always yields one entry per column, including
# a 0 for a column with no missing values. The earlier
# `np.unique(np.where(np.isnan(X_df))[1], return_counts=True)[1]` form omitted
# such columns, which would shift every later count onto the wrong feature name.
missing_feat_cnts = np.asarray(np.isnan(X_df).sum(axis=0))

In [15]:
# For each feature print: name, number of missing values, fraction of rows missing it
for feat_idx, feat_name in enumerate(feature_col_names):
    n_missing_for_feat = missing_feat_cnts[feat_idx]
    print(feat_name, n_missing_for_feat, n_missing_for_feat/X_df.shape[0])

F01 1588 0.29832801052038327
F02 2922 0.5489385684764231
F03 969 0.18204020289310538
F04 2914 0.5474356565846328
F05 946 0.17771933120420816
F06 2921 0.5487507044899492
F07 1039 0.1951906819462709
F08 3758 0.705992861168514
F09 3758 0.705992861168514
F10 3758 0.705992861168514
F11 3520 0.6612812323877513
F12 1818 0.34153672740935564


In [16]:
# Distribution of the number of missing features per event (row)
X_df.isna().sum(axis=1).describe()

count    5323.000000
mean        5.619200
std         4.037384
min         0.000000
25%         1.000000
50%         7.000000
75%         9.000000
max        11.000000
dtype: float64

### Get the labels

In [17]:
# LabelEncoder will transform the class names to numeric values
le = LabelEncoder()
le.fit(data['Tag'])
print("Original Classes", le.classes_)
print("Mapping", le.transform(le.classes_))
le.classes_ = np.array(['ex', 'eq', 'mis'])
print("Updated Classes", le.classes_)
print("Mapping", le.transform(le.classes_))
y = le.transform(data['Tag'])
y_cnts = np.unique(y, return_counts=True)[1]
print(y_cnts)

Original Classes ['eq' 'ex' 'mis']
Mapping [0 1 2]
Updated Classes ['ex' 'eq' 'mis']
Mapping [0 1 2]
[1267 2956 1100]


# Do a simple 80/20 train/test split

In [18]:
# Split row positions rather than the rows themselves, so the same index arrays
# can be applied to both the feature matrix and the label vector.
# No `stratify` argument is passed, so the split is a plain shuffled 80/20 split.
train_inds, test_inds = train_test_split(
    np.arange(len(X_df)),
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [19]:
# Convert the feature table to a plain 2-D float array so it can be indexed by
# row position. `np.asarray` gives the same array as `DataFrame.to_numpy()` here,
# but also accepts an ndarray, so this cell can be re-run after `X_df` has
# already been converted (calling `.to_numpy()` on an ndarray would fail).
X_df = np.asarray(X_df, dtype=float)
# Select the train and test rows using the index arrays from the split above
X_train = X_df[train_inds]
X_test = X_df[test_inds]
y_train = y[train_inds]
y_test = y[test_inds]

In [20]:
# Shapes of the two splits
print('Training:', X_train.shape, y_train.shape)
print('Testing:', X_test.shape, y_test.shape)

# Per-class example counts within each split
train_class_cnts = np.unique(y_train, return_counts=True)[1]
test_class_cnts = np.unique(y_test, return_counts=True)[1]

# Class fractions: full data set first, then each split, for comparison
print("Actual class percentage:", y_cnts/len(y))
print('Training class percentage:', train_class_cnts/len(y_train))
print('Testing class percentage:', test_class_cnts/len(y_test))

Training: (4258, 12) (4258,)
Testing: (1065, 12) (1065,)
Actual class percentage: [0.23802367 0.55532594 0.20665039]
Training class percentage: [0.23485204 0.56012212 0.20502583]
Testing class percentage: [0.25070423 0.53615023 0.21314554]


# Set up a stratified 10-fold cross-validation with 3 repeats 
I am setting this up in case I need to do hyperparameter tuning later.

In [21]:
# 10 stratified folds, repeated 3 times (30 fits per evaluation); seeded for reproducibility
rskf = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=3,
    random_state=36851234,
)

### Quickly look at the effect of using `class_weight` = `balanced_subsample`, `balanced`, and `None`
This parameter influences the weight of each class. In `balanced`, the weights are inversely proportional to the class frequency in the training set. In `balanced_subsample` the weights are determined from the bootstrap sample of every tree. None means all classes have the same weight. 

This parameter did not seem to matter much, and using `None` actually performed marginally better. 

I am using the F1 score, which is the harmonic mean of precision and recall, to estimate model performance here. `average="macro"` (selected here via `scoring='f1_macro'`) means that the F1 scores of the individual classes are simply averaged. I chose this because I care about all the classes equally and don't want to give more weight to eq (the majority class).

In [23]:
# Random forest with class weights recomputed from each tree's bootstrap sample,
# scored with macro-averaged F1 over the repeated stratified folds.
rf = RandomForestClassifier(
    class_weight='balanced_subsample',
    n_estimators=1000,
    random_state=0,
)
cv_results_balanced_subsample = cross_validate(
    rf, X_train, y_train, scoring='f1_macro', cv=rskf
)

In [24]:
# Mean and standard deviation of the macro-F1 score across all CV folds
balanced_subsample_scores = cv_results_balanced_subsample['test_score']
print(balanced_subsample_scores.mean(), balanced_subsample_scores.std())

0.8433090992733525 0.01697686096096417


In [25]:
# Same evaluation with class weights computed once from the whole training set
rf = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=1000,
    random_state=0,
)
cv_results_balanced = cross_validate(
    rf, X_train, y_train, scoring='f1_macro', cv=rskf
)
# Mean and standard deviation of the macro-F1 score across all CV folds
balanced_scores = cv_results_balanced['test_score']
print(balanced_scores.mean(), balanced_scores.std())

0.8416381164529234 0.01714659530123263


### For `class_weight`=`None` try limiting the number of missing features from 0 to 11 (max 12 features)

Each run keeps only the training examples with at most (<=) `Missing Features` missing feature values.

In [32]:
# Number of missing (NaN) features for every training example. It does not depend
# on the threshold, so it is computed once here rather than twice per iteration.
n_missing_per_example = np.isnan(X_train).sum(axis=1)

# For each threshold, keep the examples with at most `n_missing` missing features
# and cross-validate an unweighted random forest on that subset.
for n_missing in range(0, len(feature_col_names)):
    # One mask, applied to both features and labels so they stay aligned
    keep_mask = n_missing_per_example <= n_missing
    X_train_sub = X_train[keep_mask]
    y_train_sub = y_train[keep_mask]
    rf = RandomForestClassifier(n_estimators=1000, random_state=0, class_weight=None)
    cv_results_sub = cross_validate(rf, X_train_sub, y_train_sub, cv=rskf, scoring='f1_macro')
    print(f"Missing Features: {n_missing}, N training examples: {X_train_sub.shape[0]}, CV Results: {cv_results_sub['test_score'].mean()}, std-{cv_results_sub['test_score'].std()}")

Missing Features: 0, N training examples: 1069, CV Results: 0.9693398214084149, std-0.025260600137525485
Missing Features: 1, N training examples: 1246, CV Results: 0.9751504985256637, std-0.016645400796831866
Missing Features: 2, N training examples: 1246, CV Results: 0.9751504985256637, std-0.016645400796831866
Missing Features: 3, N training examples: 1397, CV Results: 0.9713061357866059, std-0.016902622038120412
Missing Features: 4, N training examples: 1695, CV Results: 0.9682201736894263, std-0.015971423634079555
Missing Features: 5, N training examples: 1904, CV Results: 0.9629429166212581, std-0.013423360170765036
Missing Features: 6, N training examples: 1930, CV Results: 0.9634164858233466, std-0.012914693674353559
Missing Features: 7, N training examples: 2385, CV Results: 0.9505095738705711, std-0.013774538601664423
Missing Features: 8, N training examples: 3171, CV Results: 0.9255317959576057, std-0.01775088423442337
Missing Features: 9, N training examples: 3543, CV Resul

Try == `Missing Features`?