## Get all imports

In [1]:
import math
import numpy as np
import pandas as pd
from collections import Counter
from PyImpetus import PPIMBR
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge
from lightgbm import LGBMRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import time

## Load data

In [2]:
# Read the CSV and discard the patient identifier column before any features are formed
df = pd.read_csv("slice_localization_data.csv").drop(columns=["patientId"])
# Preview the first few rows
display(df.head())
# Separate the target column ("reference") from the input features
Y = df["reference"].values
data = df.drop(columns=["reference"])
# Report the dimensions of the inputs and of the target
print("Data shape: ", data.shape, "Target Variable shape: ", Y.shape)

Unnamed: 0,value0,value1,value2,value3,value4,value5,value6,value7,value8,value9,...,value375,value376,value377,value378,value379,value380,value381,value382,value383,reference
0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,-0.25,...,-0.25,0.980381,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.803851
1,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,-0.25,...,-0.25,0.977008,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.745726
2,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,-0.25,...,-0.25,0.977008,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.6876
3,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,-0.25,...,-0.25,0.977008,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.629474
4,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,-0.25,...,-0.25,0.976833,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,21.571348


Data shape:  (53500, 384) Target Variable shape:  (53500,)


## Modelling with Decision Tree without PyImpetus

In [3]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    data,
    Y,
    test_size=0.2,
    shuffle=True,
    random_state=27,
)
# Start the clock once the split is done
start = time.time()

# Turn the dataframes into plain numpy arrays before training
x_train = x_train.values
x_test = x_test.values

# Fit a decision tree on every available feature and score it on the held-out rows
model = DecisionTreeRegressor(random_state=27).fit(x_train, y_train)
preds = model.predict(x_test)
score = mean_squared_error(y_test, preds)
print("MSE: ", score)
end = time.time()
print("Time Required (in seconds): ", end - start)

MSE:  6.5462149646453085
Time Required (in seconds):  5.878587245941162


## Modelling with Decision Tree using PyImpetus (fast but less robust feature subset selection)

In [4]:
# Recreate the train-test split with the same seed and test size as the baseline cell
x_train, x_test, y_train, y_test = train_test_split(data, Y, test_size=0.2, random_state=27, shuffle=True)
# Start the timer here so the reported time covers feature selection as well as model training
start = time.time()

# Create a PyImpetus feature selector wrapping a decision tree regressor (this is a regression task)
# NOTE: the selector gets its own name so the fitted object (and selector.MB) is still available after this cell
selector = PPIMBR(DecisionTreeRegressor(random_state=27), cv=0, p_val_thresh=0.001, num_simul=10, simul_size=0.2, sig_test_type="non-parametric", random_state=27, verbose=2)
# Fit the selector on the train part and reduce the train dataset to the selected feature subset
# NOTE: x_train has to be a dataframe and y_train has to be a numpy array
x_train = selector.fit_transform(x_train, y_train)
# Reduce the test set to the same feature subset
# NOTE: x_test has to be a dataframe
x_test = selector.transform(x_test)
# Check out the number of features selected
print("Number of features selected: ", len(selector.MB))
# Convert the data into numpy arrays
x_train, x_test = x_train.values, x_test.values

# Train and evaluate the same kind of regressor as the baseline, now on the selected features only
model = DecisionTreeRegressor(random_state=27)
model.fit(x_train, y_train)
preds = model.predict(x_test)
# The score printed below is the mean squared error, so lower is better
score = mean_squared_error(y_test, preds)
print("Score: ", score)
end = time.time()
print("Time Required (in seconds): ", end-start)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:   14.5s
[Parallel(n_jobs=-1)]: Done 384 out of 384 | elapsed:   15.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    8.7s remaining:    8.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    8.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    9.7s remaining:    9.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    9.4s remaining:    9.4s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elaps

[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   11.6s remaining:   11.6s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   11.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   11.5s remaining:   11.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   11.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   12.2s remaining:   12.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   12.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   11.5s remaining:   11.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   11.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   12.1s remaining:   12.1s
[Pa

[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   11.5s remaining:   11.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   11.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   11.1s remaining:   11.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   11.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   11.3s remaining:   11.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   11.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   11.0s remaining:   11.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   11.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   11.0s remaining:   11.0s
[Pa

[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.4s remaining:   10.4s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.4s remaining:   10.4s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.4s remaining:   10.4s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.4s remaining:   10.4s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.4s remaining:   10.4s
[Pa

[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.1s remaining:   10.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.1s remaining:   10.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.1s remaining:   10.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.0s remaining:   10.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.1s remaining:   10.1s
[Pa

[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.3s remaining:   10.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.7s remaining:   10.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   11.1s remaining:   11.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   11.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.6s remaining:   10.6s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.2s remaining:   10.2s
[Pa

[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   11.7s remaining:   11.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   11.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   11.3s remaining:   11.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   11.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.3s remaining:   10.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.5s remaining:   10.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.5s remaining:   10.5s
[Pa

[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.8s remaining:   10.8s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   11.2s remaining:   11.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   11.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.7s remaining:   10.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.5s remaining:   10.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.8s remaining:   10.8s
[Pa

[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.4s remaining:   10.4s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.4s remaining:   10.4s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.5s remaining:   10.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.5s remaining:   10.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.4s remaining:   10.4s
[Pa

[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    9.2s remaining:    9.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    9.2s remaining:    9.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    9.2s remaining:    9.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    9.2s remaining:    9.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    9.2s remaining:    9.2s
[Pa

[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    9.0s remaining:    9.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    8.9s remaining:    8.9s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    9.1s remaining:    9.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    8.9s remaining:    8.9s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    8.9s remaining:    8.9s
[Pa

Number of features selected:  259
Score:  5.695077871092331
Time Required (in seconds):  4113.647856712341


# Final Results (MSE is used, so lower is better)

### Final MSE using all features: 6.55 (5.88 seconds)
### Final MSE using PyImpetus recommended features: 5.70 (4113.65 seconds)