In [33]:
import argparse
import torch
import pandas as pd
import numpy as np
from torch import nn, optim
from skorch import NeuralNetRegressor
from sklearn import preprocessing
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance

In [34]:
# Load data

# load
df = pd.read_csv("../data/satellite_10000.csv")
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df=df.dropna()

# get input variables
X = df[df.columns[0:-4]]

# get output variable
density = df['density_log10'] # log based, use 'density' if want density value instead of logged.
perturbation = df['perturbation']
norm_perturbation = df['perturbation_norm']

# transform
xscaler = preprocessing.MinMaxScaler()
names = X.columns
d = xscaler.fit_transform(X)
X = pd.DataFrame(d, columns=names)

yscaler = preprocessing.MinMaxScaler()
d = yscaler.fit_transform(perturbation.values.reshape(-1, 1))
y = pd.DataFrame(d, columns=['norm_perturbation'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 628)
# Convert to 2D PyTorch tensors
X_train = torch.tensor(X_train.values, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.float32).reshape(-1, 1)
X_test = torch.tensor(X_test.values, dtype=torch.float32)
y_test = torch.tensor(y_test.values, dtype=torch.float32).reshape(-1, 1)

In [38]:
X_train.shape

torch.Size([1089, 126])

In [14]:
# Check Device

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cuda device


In [39]:
# define neural network model
model = nn.Sequential(
    nn.Linear(126, 10),
    nn.ReLU(),
    nn.Linear(10, 20),
    nn.ReLU(),
    nn.Linear(20, 1)
)

# create skorch wrapper for a regressor.
netRegressor = NeuralNetRegressor(
    module=model,
    criterion=nn.MSELoss,
    optimizer=optim.Adam,
    max_epochs=32,
    batch_size=128,
    device=device
)

# train
netRegressor.fit(X_train, y_train)

# get importance
r = permutation_importance(netRegressor, X_test, y_test,
                            n_repeats=30,
                            random_state=0)

feature_importances = r.importances_mean

  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1        [36m0.0166[0m        [32m0.0114[0m  0.5059
      2        [36m0.0116[0m        [32m0.0094[0m  0.0176
      3        [36m0.0108[0m        0.0095  0.0174
      4        [36m0.0103[0m        0.0095  0.0172
      5        0.0105        [32m0.0093[0m  0.0168
      6        0.0104        0.0093  0.0168
      7        0.0103        0.0093  0.0167
      8        0.0104        [32m0.0093[0m  0.0167
      9        0.0104        [32m0.0093[0m  0.0167
     10        0.0104        0.0093  0.0167
     11        0.0104        0.0093  0.0167
     12        0.0103        0.0093  0.0167
     13        0.0104        [32m0.0093[0m  0.0168
     14        0.0104        0.0093  0.0169
     15        0.0104        0.0093  0.0168
     16        0.0104        0.0093  0.0168
     17        0.0104        0.0093  0.0168
     18        0.0104        0.0093  0.0168
     19        0.0104        0

In [47]:
for i in r.importances_mean.argsort()[::-1]:
     # if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
     print(f"{names[i]:<8}"
           f"{r.importances_mean[i]:.6f}"
           f" +/- {r.importances_std[i]:.3f}")

ae\_index-lagged-by-195-minutes0.000000 +/- 0.000
sym\_h-lagged-by-300-minutes0.000000 +/- 0.000
ae\_index-lagged-by-210-minutes0.000000 +/- 0.000
ae\_index-lagged-by-140-minutes0.000000 +/- 0.000
ae\_index-lagged-by-145-minutes0.000000 +/- 0.000
ae\_index-lagged-by-150-minutes0.000000 +/- 0.000
ae\_index-lagged-by-160-minutes0.000000 +/- 0.000
ae\_index-lagged-by-165-minutes0.000000 +/- 0.000
ae\_index-lagged-by-170-minutes0.000000 +/- 0.000
ae\_index-lagged-by-175-minutes0.000000 +/- 0.000
ae\_index-lagged-by-180-minutes0.000000 +/- 0.000
ae\_index-lagged-by-185-minutes0.000000 +/- 0.000
ae\_index-lagged-by-190-minutes0.000000 +/- 0.000
ae\_index-lagged-by-200-minutes0.000000 +/- 0.000
ae\_index-lagged-by-205-minutes0.000000 +/- 0.000
ae\_index-lagged-by-215-minutes0.000000 +/- 0.000
sym\_h-lagged-by-295-minutes0.000000 +/- 0.000
ae\_index-lagged-by-220-minutes0.000000 +/- 0.000
ae\_index-lagged-by-225-minutes0.000000 +/- 0.000
ae\_index-lagged-by-230-minutes0.000000 +/- 0.000
ae\_in

In [23]:
# define random forest model
rfRegressor = RandomForestRegressor()

# train
rfRegressor.fit(X_train, y_train.ravel())

# score
y_out = rfRegressor.predict(X_test)
# print(f'''r2 score for random forest is: {r2_score(y_test, y_out)}''')

# get importances
feature_importances = rfRegressor.feature_importances_

[6.86773974e-03 1.58096322e-02 1.63227066e-02 6.35724386e-01
 5.22448644e-03 5.17335427e-03 3.18368225e-03 4.38315800e-03
 2.18343205e-03 2.55743147e-03 2.74374563e-03 2.42364234e-03
 2.09539057e-03 1.27841300e-03 1.89379215e-03 1.64411768e-03
 1.12922586e-03 2.51947997e-03 2.30795635e-03 1.84593094e-03
 1.42686977e-03 2.19202638e-03 1.42492493e-03 1.59480394e-03
 1.32805788e-03 5.38372989e-03 7.56772097e-03 3.29549926e-03
 9.31213392e-03 6.48065948e-03 4.00013944e-03 2.22060441e-03
 4.50784157e-03 3.25509928e-03 2.65766774e-03 2.09155316e-03
 2.67499146e-03 3.87496868e-03 3.28464859e-03 4.35360400e-03
 1.06438257e-03 3.18097927e-03 2.42577508e-03 2.80418745e-03
 2.30456302e-03 4.78136321e-03 4.36412804e-03 3.91318715e-03
 4.37585018e-03 3.20006287e-03 2.34762080e-03 8.57914231e-03
 5.38472325e-03 8.16800523e-03 2.89713763e-03 2.62467051e-03
 8.15401293e-03 4.00448362e-03 4.16084787e-03 3.47495654e-03
 2.59999285e-03 5.24627175e-03 4.27407124e-03 2.92218304e-03
 4.44783944e-03 2.247702

In [43]:
feature_importances = rfRegressor.feature_importances_

In [44]:
# save to file
pd.DataFrame({'features': names, 'importances': feature_importances}).to_csv('../data/featureRank.csv', index=False)