## CAT Scan Location Prediction

Given *data about various CAT scans*, let's try to predict the **location on the body** from where a given scan was taken.

We will use a TensorFlow/Keras neural network to make our predictions.

Data source: https://www.kaggle.com/datasets/uciml/ct-slice-localization

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf

In [2]:
data = pd.read_csv('archive/slice_localization_data.csv')
data

Unnamed: 0,patientId,value0,value1,value2,value3,value4,value5,value6,value7,value8,...,value375,value376,value377,value378,value379,value380,value381,value382,value383,reference
0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.250000,-0.250000,-0.25000,...,-0.25,0.980381,0.0,0.000000,0.000000,0.0,0.0,-0.25,-0.25,21.803851
1,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.250000,-0.250000,-0.25000,...,-0.25,0.977008,0.0,0.000000,0.000000,0.0,0.0,-0.25,-0.25,21.745726
2,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.250000,-0.250000,-0.25000,...,-0.25,0.977008,0.0,0.000000,0.000000,0.0,0.0,-0.25,-0.25,21.687600
3,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.250000,-0.250000,-0.25000,...,-0.25,0.977008,0.0,0.000000,0.000000,0.0,0.0,-0.25,-0.25,21.629474
4,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.250000,-0.250000,-0.25000,...,-0.25,0.976833,0.0,0.000000,0.000000,0.0,0.0,-0.25,-0.25,21.571348
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53495,96,0.591906,0.357764,0.000000,0.000000,0.552321,0.795304,0.946697,0.952227,0.84395,...,0.00,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.00,0.00,29.290398
53496,96,0.612313,0.000000,0.000000,0.000000,0.864160,0.820531,0.000000,0.938813,0.94374,...,0.00,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.00,0.00,27.945721
53497,96,0.612313,0.000000,0.000000,0.000000,0.864160,0.820531,0.000000,0.938813,0.94374,...,0.00,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.00,0.00,27.945721
53498,96,0.634921,0.904555,0.956087,0.980208,0.157664,0.000000,-0.250000,-0.250000,-0.25000,...,-0.25,0.000000,0.0,0.994967,0.806688,0.0,0.0,-0.25,-0.25,14.582997


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53500 entries, 0 to 53499
Columns: 386 entries, patientId to reference
dtypes: float64(385), int64(1)
memory usage: 157.6 MB


### Preprocessing

In [4]:
df = data.copy()

In [5]:
# Drop patientId column
df = df.drop('patientId', axis=1)

In [6]:
df

Unnamed: 0,value0,value1,value2,value3,value4,value5,value6,value7,value8,value9,...,value375,value376,value377,value378,value379,value380,value381,value382,value383,reference
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.250000,-0.250000,-0.25000,-0.250000,...,-0.25,0.980381,0.0,0.000000,0.000000,0.0,0.0,-0.25,-0.25,21.803851
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.250000,-0.250000,-0.25000,-0.250000,...,-0.25,0.977008,0.0,0.000000,0.000000,0.0,0.0,-0.25,-0.25,21.745726
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.250000,-0.250000,-0.25000,-0.250000,...,-0.25,0.977008,0.0,0.000000,0.000000,0.0,0.0,-0.25,-0.25,21.687600
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.250000,-0.250000,-0.25000,-0.250000,...,-0.25,0.977008,0.0,0.000000,0.000000,0.0,0.0,-0.25,-0.25,21.629474
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.250000,-0.250000,-0.25000,-0.250000,...,-0.25,0.976833,0.0,0.000000,0.000000,0.0,0.0,-0.25,-0.25,21.571348
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53495,0.591906,0.357764,0.000000,0.000000,0.552321,0.795304,0.946697,0.952227,0.84395,0.798303,...,0.00,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.00,0.00,29.290398
53496,0.612313,0.000000,0.000000,0.000000,0.864160,0.820531,0.000000,0.938813,0.94374,0.868504,...,0.00,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.00,0.00,27.945721
53497,0.612313,0.000000,0.000000,0.000000,0.864160,0.820531,0.000000,0.938813,0.94374,0.868504,...,0.00,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.00,0.00,27.945721
53498,0.634921,0.904555,0.956087,0.980208,0.157664,0.000000,-0.250000,-0.250000,-0.25000,-0.250000,...,-0.25,0.000000,0.0,0.994967,0.806688,0.0,0.0,-0.25,-0.25,14.582997


In [7]:
# Split df into X and y
y = df['reference'].copy()
X = df.drop('reference', axis=1).copy()

In [8]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

In [9]:
X_train

Unnamed: 0,value0,value1,value2,value3,value4,value5,value6,value7,value8,value9,...,value374,value375,value376,value377,value378,value379,value380,value381,value382,value383
36958,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,-0.250000,-0.25,...,0.00,0.00,0.000000,0.000000,0.000000,0.00000,0.998727,0.999059,0.00,-0.25
20128,0.000000,0.000000,0.709849,0.887755,0.000000,0.000000,0.000000,0.00,0.000000,-0.25,...,0.00,0.00,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.00,0.00
33249,0.097561,0.101777,0.000000,0.000000,0.229446,0.000000,0.000000,0.00,0.971077,-0.25,...,0.00,0.00,0.860490,0.998998,0.999965,0.99984,0.000000,0.000000,0.00,0.00
33896,0.000000,0.015177,0.000000,0.023474,0.000000,0.000000,0.000000,0.00,0.000000,-0.25,...,0.00,0.00,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.00,0.00
15142,0.000000,0.097826,0.870466,0.893671,0.000000,0.000000,0.000000,-0.25,-0.250000,-0.25,...,0.00,-0.25,0.992206,0.000000,0.000000,0.00000,0.000000,0.000000,-0.25,-0.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50057,0.000000,0.000000,0.000000,0.000000,0.869092,0.890979,0.000000,0.00,0.000000,-0.25,...,0.00,0.00,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.00,0.00
32511,0.000000,0.000000,0.803598,0.000000,0.000000,-0.250000,-0.250000,-0.25,-0.250000,-0.25,...,-0.25,-0.25,0.000000,0.000000,0.000000,0.00000,0.000000,-0.250000,-0.25,-0.25
5192,0.000000,0.000000,0.000000,0.000000,0.594512,0.566795,0.185011,0.00,-0.250000,-0.25,...,0.00,0.00,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.00,-0.25
12172,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,-0.25,...,0.00,0.00,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.00,-0.25


In [10]:
y_train

36958    53.168322
20128    78.796527
33249    34.740313
33896    29.947560
15142    15.880591
           ...    
50057    74.919418
32511    23.360237
5192     63.332494
12172    68.634920
33003    55.135923
Name: reference, Length: 37450, dtype: float64

In [12]:
X_train.describe()

Unnamed: 0,value0,value1,value2,value3,value4,value5,value6,value7,value8,value9,...,value374,value375,value376,value377,value378,value379,value380,value381,value382,value383
count,37450.0,37450.0,37450.0,37450.0,37450.0,37450.0,37450.0,37450.0,37450.0,37450.0,...,37450.0,37450.0,37450.0,37450.0,37450.0,37450.0,37450.0,37450.0,37450.0,37450.0
mean,0.05919,0.071638,0.147057,0.218527,0.273778,0.275567,0.2056,0.062025,-0.043025,-0.231485,...,0.004943,-0.029206,0.183695,0.320226,0.35927,0.34418,0.265875,0.0848,-0.031221,-0.155017
std,0.173515,0.196934,0.301534,0.359447,0.378443,0.369504,0.352174,0.291662,0.267587,0.100615,...,0.075429,0.085841,0.383887,0.463548,0.478132,0.472092,0.437623,0.28243,0.097597,0.1225
min,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25,-0.25,-0.25,...,-0.25,-0.25,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,-0.25
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25
75%,0.0,0.0,0.0,0.443693,0.679601,0.6595,0.44984,0.0,0.0,-0.25,...,0.0,0.0,0.0,0.996216,0.999686,0.999563,0.951105,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,0.998635,0.996468,0.999334,1.0,1.0,1.0,...,0.998254,0.953475,1.0,1.0,1.0,1.0,1.0,0.999848,0.996839,0.942851


### Training Without Scaling 

In [13]:
X_train.shape

(37450, 384)

In [14]:
inputs = tf.keras.Input(shape=(384,))
x = tf.keras.layers.Dense(128, activation='relu')(inputs)
x = tf.keras.layers.Dense(128, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='linear')(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

model.compile(
    optimizer = 'adam',
    loss = 'mse'
)

history = model.fit(
    X_train,
    y_train,
    validation_split = 0.2,
    batch_size = 32,
    epochs = 100,
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience = 3,
            restore_best_weights = True
        )
    ]
)

Epoch 1/100
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 125.8489 - val_loss: 16.7595
Epoch 2/100
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 11.1463 - val_loss: 8.1933
Epoch 3/100
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 5.2598 - val_loss: 4.3623
Epoch 4/100
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 3.2072 - val_loss: 2.9819
Epoch 5/100
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 2.3425 - val_loss: 2.5485
Epoch 6/100
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 1.8907 - val_loss: 2.0317
Epoch 7/100
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 1.5827 - val_loss: 2.0354
Epoch 8/100
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 1.4211 - val_loss: 1.5591
Epoch 9/100
[1m937/937[0m 

### Results 

In [20]:
y_pred = np.squeeze(model.predict(X_test))
y_pred

[1m502/502[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step


array([40.264927, 27.449657, 24.210627, ..., 28.329725, 85.55096 ,
       34.78535 ], shape=(16050,), dtype=float32)

In [21]:
y_test.shape, y_pred.shape

((16050,), (16050,))

In [23]:
rmse = np.sqrt(np.mean((y_test - y_pred)**2))
print("RMSE: {:.5f}".format(rmse))

RMSE: 1.23825


In [27]:
np.sum((y_test - y_test.mean())**2)

np.float64(8022031.62708539)

In [28]:
np.sum((y_test - y_pred)**2)

np.float64(24608.911201574076)

In [30]:
r2_score = 1 - (np.sum((y_test - y_pred)**2) / np.sum((y_test - y_test.mean())**2))
r2_score

np.float64(0.9969323343081216)

In [31]:
print("R^2 Score: {:.5f}".format(r2_score))

R^2 Score: 0.99693


### Training with Scaling

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

In [36]:
# Scale X
scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

In [37]:
X_train

Unnamed: 0,value0,value1,value2,value3,value4,value5,value6,value7,value8,value9,...,value374,value375,value376,value377,value378,value379,value380,value381,value382,value383
36958,-0.341127,-0.363771,-0.487703,-0.607962,-0.723441,-0.745784,-0.583810,-0.212663,-0.773497,-0.18402,...,-0.065533,0.340238,-0.478520,-0.690825,-0.751413,-0.729063,1.674640,3.237156,0.319898,-0.775380
20128,-0.341127,-0.363771,1.866456,1.861848,-0.723441,-0.745784,-0.583810,-0.212663,0.160791,-0.18402,...,-0.065533,0.340238,-0.478520,-0.690825,-0.751413,-0.729063,-0.607551,-0.300254,0.319898,1.265458
33249,0.221142,0.153044,-0.487703,-0.607962,-0.117145,-0.745784,-0.583810,-0.212663,3.789852,-0.18402,...,-0.065533,0.340238,1.763031,1.464318,1.340015,1.388859,-0.607551,-0.300254,0.319898,1.265458
33896,-0.341127,-0.286703,-0.487703,-0.542655,-0.723441,-0.745784,-0.583810,-0.212663,0.160791,-0.18402,...,-0.065533,0.340238,-0.478520,-0.690825,-0.751413,-0.729063,-0.607551,-0.300254,0.319898,1.265458
15142,-0.341127,0.132981,2.399129,1.878307,-0.723441,-0.745784,-0.583810,-1.069832,-0.773497,-0.18402,...,-0.065533,-2.572154,2.106148,-0.690825,-0.751413,-0.729063,-0.607551,-0.300254,-2.241680,-0.775380
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50057,-0.341127,-0.363771,-0.487703,-0.607962,1.573080,1.665532,-0.583810,-0.212663,0.160791,-0.18402,...,-0.065533,0.340238,-0.478520,-0.690825,-0.751413,-0.729063,-0.607551,-0.300254,0.319898,1.265458
32511,-0.341127,-0.363771,2.177367,-0.607962,-0.723441,-1.422376,-1.293695,-1.069832,-0.773497,-0.18402,...,-3.379955,-2.572154,-0.478520,-0.690825,-0.751413,-0.729063,-0.607551,-1.185440,-2.241680,-0.775380
5192,-0.341127,-0.363771,-0.487703,-0.607962,0.847520,0.788171,-0.058464,-0.212663,-0.773497,-0.18402,...,-0.065533,0.340238,-0.478520,-0.690825,-0.751413,-0.729063,-0.607551,-0.300254,0.319898,-0.775380
12172,-0.341127,-0.363771,-0.487703,-0.607962,-0.723441,-0.745784,-0.583810,-0.212663,0.160791,-0.18402,...,-0.065533,0.340238,-0.478520,-0.690825,-0.751413,-0.729063,-0.607551,-0.300254,0.319898,-0.775380


In [38]:
inputs = tf.keras.Input(shape=(384,))
x = tf.keras.layers.Dense(128, activation='relu')(inputs)
x = tf.keras.layers.Dense(128, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='linear')(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

model.compile(
    optimizer = 'adam',
    loss = 'mse'
)

history = model.fit(
    X_train,
    y_train,
    validation_split = 0.2,
    batch_size = 32,
    epochs = 100,
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience = 3,
            restore_best_weights = True
        )
    ]
)

Epoch 1/100
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 95.7393 - val_loss: 17.0509
Epoch 2/100
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 12.5082 - val_loss: 10.0927
Epoch 3/100
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 6.5872 - val_loss: 6.0457
Epoch 4/100
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 4.1482 - val_loss: 3.3656
Epoch 5/100
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 2.9187 - val_loss: 6.7629
Epoch 6/100
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 2.6659 - val_loss: 2.4975
Epoch 7/100
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 2.1091 - val_loss: 2.1895
Epoch 8/100
[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 1.7932 - val_loss: 3.9018
Epoch 9/100
[1m937/937[0m 

### Results

In [39]:
y_pred = np.squeeze(model.predict(X_test))
rmse = np.sqrt(np.mean((y_test - y_pred)**2))
print("RMSE: {:.5f}".format(rmse))

r2_score = 1 - (np.sum((y_test - y_pred)**2) / np.sum((y_test - y_test.mean())**2))
print("R^2 Score: {:.5f}".format(r2_score))

[1m502/502[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
RMSE: 0.90825
R^2 Score: 0.99835
