In [1]:
from mt_eligibility.load import load_and_combine_csvs
from mt_eligibility.model import train_xgboost, predict_xgboost, get_feature_importances

# Load all data

In [2]:
# Load the "master" and "data_split" sources through the project loader and
# show the combined result.
csv_sources = ["master", "data_split"]
csv = load_and_combine_csvs(csv_sources)
csv

Unnamed: 0,ID,Age,Sex,NIHSS,LKWT,qER Infarct Decision,qER Infarct Volume,qER ASPECTS,LVO,MT,Dataset
0,36aa9c237f40b8a3c78b0fc2d0efabc0,64,Male,14,0.900000,1,148.081945,1,False,False,train
1,0846f0aa92510261fc9a43753c9200e9,78,Female,5,4.550000,0,0.000000,10,True,False,train
2,a7d5017833ac10e3ce5e0c4200a10bb8,36,Female,6,3.033333,0,0.000000,10,True,True,train
3,87466cdef2e6832055c41559bcb059c4,105,Female,18,13.633333,0,0.000000,10,True,False,train
4,d8ee8882764de37287536521e6861577,93,Male,19,6.666667,0,0.000000,10,True,True,train
...,...,...,...,...,...,...,...,...,...,...,...
255,d8177fd274e417a8521bb79dd949366d,78,Female,2,3.283333,0,0.000000,10,True,False,test
256,a591d9ae886c57e2fb2c5f1d578dd2d4,55,Male,7,11.866667,1,0.296204,10,True,True,test
257,0dad470600919408733d8e577b4a48bd,66,Male,8,5.150000,1,0.000000,10,True,True,test
258,026e6cd6a8c8812ad83e8a5ffb8cdbb1,74,Female,15,2.600000,0,0.000000,10,True,True,test


# Split into train and test

In [3]:
# Partition the combined frame on its "Dataset" label. Each subset is copied
# so that columns added to it later do not touch `csv`.
dataset_label = csv["Dataset"]
train = csv.loc[dataset_label == "train"].copy()
test = csv.loc[dataset_label == "test"].copy()

# Train and evaluate model1

In [4]:
# model1 uses five feature columns; the target is the "MT" column.
model1_columns = ["qER Infarct Volume", "qER ASPECTS", "Age", "LKWT", "NIHSS"]
train_x = train.loc[:, model1_columns]
train_y = train.loc[:, "MT"]

In [5]:
# Display the five-column feature frame selected for model1.
train_x

Unnamed: 0,qER Infarct Volume,qER ASPECTS,Age,LKWT,NIHSS
0,148.081945,1,64,0.900000,14
1,0.000000,10,78,4.550000,5
2,0.000000,10,36,3.033333,6
3,0.000000,10,105,13.633333,18
4,0.000000,10,93,6.666667,19
...,...,...,...,...,...
125,0.000000,10,88,1.700000,8
126,0.000000,10,68,2.666667,14
127,0.000000,10,69,4.033333,16
128,0.000000,10,86,7.116667,6


In [6]:
# Fit model1 on the training features/target with its own hyper-parameters,
# then show the fitted object.
model1_params = {"learning_rate": 0.15, "n_estimators": 10}
model1 = train_xgboost(train_x, train_y, **model1_params)
model1

In [7]:
# Same five feature columns as the model1 training frame, taken from the test rows.
model1_test_columns = ["qER Infarct Volume", "qER ASPECTS", "Age", "LKWT", "NIHSS"]
test_x = test.loc[:, model1_test_columns]
test_y = test.loc[:, "MT"]

In [8]:
# predict_xgboost returns two values; store them as the score and the
# thresholded-label columns of `test`.
mt_scores, mt_labels = predict_xgboost(model1, test_x, threshold=0.5)
test["MT Prediction Score"] = mt_scores
test["MT Prediction"] = mt_labels

In [9]:
# Label every test row as TP / FN / FP / TN and count the labels.
#
# `test` was already restricted to the "test" rows when it was created, so the
# masks only need the ground truth ("MT") and the thresholded prediction.
actual_yes = test["MT"] == True
actual_no = test["MT"] == False
predicted_yes = test["MT Prediction"] == 1
predicted_no = test["MT Prediction"] == 0

# Start from an empty column so that re-running this cell can never leave
# labels behind from an earlier run; rows matching no mask stay unlabelled
# and are ignored by value_counts().
test["Prediction Category"] = None
category_masks = {
    "TP": actual_yes & predicted_yes,
    "FN": actual_yes & predicted_no,
    "FP": actual_no & predicted_yes,
    "TN": actual_no & predicted_no,
}
for category, mask in category_masks.items():
    test.loc[mask, "Prediction Category"] = category

confusion_matrix = test["Prediction Category"].value_counts()
confusion_matrix

Prediction Category
TP    68
TN    30
FP    20
FN    12
Name: count, dtype: int64

In [10]:
# value_counts() only lists categories that actually occurred, so read each
# count with a default of 0 instead of indexing (indexing raises KeyError when,
# for example, the model produces no false negatives).
tp = confusion_matrix.get("TP", 0)
fn = confusion_matrix.get("FN", 0)
tn = confusion_matrix.get("TN", 0)
fp = confusion_matrix.get("FP", 0)

# sensitivity = TP / (TP + FN), specificity = TN / (TN + FP).
# A class with no rows at all has an undefined rate, reported as NaN.
sensitivity = tp / (tp + fn) if (tp + fn) else float("nan")
specificity = tn / (tn + fp) if (tn + fp) else float("nan")
sensitivity, specificity

(np.float64(0.85), np.float64(0.6))

In [11]:
# Feature importances for model1. `train_x` must still be the five-column
# frame built for model1: the model2 section later rebinds `train_x` with an
# extra "LVO" column, so re-running this cell after that point passes the
# six-column frame instead.
get_feature_importances(model1, train_x)

Unnamed: 0,Feature,Importance
0,qER Infarct Volume,0.214091
1,qER ASPECTS,0.143464
2,Age,0.104135
3,LKWT,0.108094
4,NIHSS,0.430216


# Train and evaluate model2

In [12]:
# model2 uses the model1 feature columns plus "LVO"; the target is still "MT".
# Note: this rebinds `train_x` / `train_y`, replacing the model1 versions.
model2_columns = ["qER Infarct Volume", "qER ASPECTS", "Age", "LKWT", "NIHSS", "LVO"]
train_x = train.loc[:, model2_columns]
train_y = train.loc[:, "MT"]

In [13]:
# Display the feature frame selected for model2 (now including the "LVO" column).
train_x

Unnamed: 0,qER Infarct Volume,qER ASPECTS,Age,LKWT,NIHSS,LVO
0,148.081945,1,64,0.900000,14,False
1,0.000000,10,78,4.550000,5,True
2,0.000000,10,36,3.033333,6,True
3,0.000000,10,105,13.633333,18,True
4,0.000000,10,93,6.666667,19,True
...,...,...,...,...,...,...
125,0.000000,10,88,1.700000,8,True
126,0.000000,10,68,2.666667,14,True
127,0.000000,10,69,4.033333,16,True
128,0.000000,10,86,7.116667,6,True


In [14]:
# Fit model2 on the six-column training frame with its own hyper-parameters,
# then show the fitted object.
model2_params = {"learning_rate": 0.10, "n_estimators": 12}
model2 = train_xgboost(train_x, train_y, **model2_params)
model2

In [15]:
# Same six feature columns as the model2 training frame, taken from the test rows.
model2_test_columns = ["qER Infarct Volume", "qER ASPECTS", "Age", "LKWT", "NIHSS", "LVO"]
test_x = test.loc[:, model2_test_columns]
test_y = test.loc[:, "MT"]

In [16]:
# Score the test rows with model2. This overwrites the "MT Prediction Score"
# and "MT Prediction" columns that were written for model1.
# NOTE(review): the 0.6033 cut-off differs from the 0.5 used for model1 and its
# derivation is not shown in this notebook; confirm it was not tuned on the
# test split.
mt_scores, mt_labels = predict_xgboost(model2, test_x, threshold=0.6033)
test["MT Prediction Score"] = mt_scores
test["MT Prediction"] = mt_labels

In [17]:
# Label every test row as TP / FN / FP / TN for model2 and count the labels.
#
# `test` was already restricted to the "test" rows when it was created, so the
# masks only need the ground truth ("MT") and the thresholded prediction.
actual_yes = test["MT"] == True
actual_no = test["MT"] == False
predicted_yes = test["MT Prediction"] == 1
predicted_no = test["MT Prediction"] == 0

# "Prediction Category" still holds the labels written for model1. Clear it
# first so a row that matches none of the masks cannot keep a stale model1
# label; unlabelled rows are ignored by value_counts().
test["Prediction Category"] = None
category_masks = {
    "TP": actual_yes & predicted_yes,
    "FN": actual_yes & predicted_no,
    "FP": actual_no & predicted_yes,
    "TN": actual_no & predicted_no,
}
for category, mask in category_masks.items():
    test.loc[mask, "Prediction Category"] = category

confusion_matrix = test["Prediction Category"].value_counts()
confusion_matrix

Prediction Category
TP    66
TN    41
FN    14
FP     9
Name: count, dtype: int64

In [18]:
# value_counts() only lists categories that actually occurred, so read each
# count with a default of 0 instead of indexing (indexing raises KeyError when,
# for example, the model produces no false positives).
tp = confusion_matrix.get("TP", 0)
fn = confusion_matrix.get("FN", 0)
tn = confusion_matrix.get("TN", 0)
fp = confusion_matrix.get("FP", 0)

# sensitivity = TP / (TP + FN), specificity = TN / (TN + FP).
# A class with no rows at all has an undefined rate, reported as NaN.
sensitivity = tp / (tp + fn) if (tp + fn) else float("nan")
specificity = tn / (tn + fp) if (tn + fp) else float("nan")
sensitivity, specificity

(np.float64(0.825), np.float64(0.82))

In [19]:
# Feature importances for model2, using the six-column `train_x` bound in the
# model2 section.
get_feature_importances(model2, train_x)

Unnamed: 0,Feature,Importance
0,qER Infarct Volume,0.084736
1,qER ASPECTS,0.04799
2,Age,0.018743
3,LKWT,0.038195
4,NIHSS,0.116534
5,LVO,0.693802
