In [1]:
import numpy as np
import pandas as pd

In [2]:
# Labelled subset: its ICD codes will eventually be assigned by expert/medical rules.
# Until the corrected labels are provided, the ICD codes already in the file are used as targets.
# The file is parsed once so features and target are guaranteed to come from the same rows.
labelled_raw = pd.read_csv('Final_ToBeLabelled(ICD_INCLUDED)_EN.csv', sep='\t')
X_train = labelled_raw.drop(columns=['ICD', 'index_exam'])
y_train = labelled_raw['ICD'].astype('category')  # ICD codes as a categorical target
X_train.shape, y_train.shape

((192, 15), (192,))

In [3]:
# Unlabelled pool: the rows whose ICD codes we will try to predict.
unlabelled_path = 'Final_Unlabelled_EN.csv'
X_unlab = pd.read_csv(unlabelled_path, sep='\t').drop(columns=['ICD', 'index_exam'])
X_unlab.shape

(17174, 15)

In [4]:
# Preprocessing on X_train and X_unlab: one-hot encode the sex and age-bracket columns.
X_train_sex = pd.get_dummies(data=X_train['Weiblich/Männlich'])
X_train_age = pd.get_dummies(data=X_train['age'])
# The unlabelled dummies are reindexed on the training columns so that a category
# present in only one of the two files cannot produce a different feature layout
# (missing categories become all-zero columns, unseen ones are dropped).
X_unlab_sex = pd.get_dummies(data=X_unlab['Weiblich/Männlich']).reindex(
    columns=X_train_sex.columns, fill_value=0
)
X_unlab_age = pd.get_dummies(data=X_unlab['age']).reindex(
    columns=X_train_age.columns, fill_value=0
)

In [5]:
# Remove the UUID identifier and the raw age / sex columns (their one-hot versions
# are held in separate frames) by rebinding the names instead of mutating in place.
X_unlab = X_unlab.drop(columns=['UUID', 'age', 'Weiblich/Männlich'])
X_train = X_train.drop(columns=['age', 'Weiblich/Männlich', 'UUID'])

In [6]:
# Shapes of the training pieces: remaining features, age dummies, sex dummies
tuple(part.shape for part in (X_train, X_train_age, X_train_sex))

((192, 12), (192, 8), (192, 2))

In [7]:
# Shapes of the unlabelled pieces: remaining features, sex dummies, age dummies
tuple(part.shape for part in (X_unlab, X_unlab_sex, X_unlab_age))

((17174, 12), (17174, 2), (17174, 8))

In [8]:
# Peek at three random rows of the remaining training features
X_train.sample(n=3)

Unnamed: 0,Grübchenabstand DL-DR [mm],Rumpflänge VP-DM [mm],Rumpfneigung VP-DM [°],Lotabweichung VP-DM [°],Seitabweichung VP-DM (RMS) [mm],Oberflächenrotation (RMS) [°],Kyphosewinkel VP-T12 [°],Lordosewinkel T12-DM [°],Beckenneigung (Symmetrielinie) [°],Beckentorsion DL-DR [°],Beckenhochstand [°],Beckenrotation [°]
122,102.0,476.3,7.7,1.9,2.5,3.794401,45.4,39.8,29.8,0.9,-0.3,2.2
24,84.0,416.6,4.8,-1.9,6.1,6.384217,43.8,42.8,29.8,-2.1,-1.3,1.3
32,99.0,489.9,4.7,0.3,4.4,3.970195,53.2,29.4,17.2,1.0,-2.7,-0.2


In [9]:
# Peek at three random rows of the remaining unlabelled features
X_unlab.sample(n=3)

Unnamed: 0,Grübchenabstand DL-DR [mm],Rumpflänge VP-DM [mm],Rumpfneigung VP-DM [°],Lotabweichung VP-DM [°],Seitabweichung VP-DM (RMS) [mm],Oberflächenrotation (RMS) [°],Kyphosewinkel VP-T12 [°],Lordosewinkel T12-DM [°],Beckenneigung (Symmetrielinie) [°],Beckentorsion DL-DR [°],Beckenhochstand [°],Beckenrotation [°]
17171,92.6,491.2,10.2,-1.3,2.8,1.46761,29.0,2.4,3.1,-0.6,0.2,1.6
8606,87.4,478.0,6.0,-0.8,3.3,5.764056,27.7,19.6,15.1,-2.4,-1.5,6.6
11521,89.7,426.4,4.1,0.3,2.3,4.455964,56.3,29.3,16.2,1.5,-1.8,-3.3


In [10]:
# Preprocessed training matrix X_train_p: sex dummies, age dummies, then the
# remaining feature columns, everything cast to float.
train_parts = [X_train_sex, X_train_age, X_train]
X_train_p = pd.concat(train_parts, axis='columns').astype(float)
X_train_p.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192 entries, 0 to 191
Data columns (total 22 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   F                                   192 non-null    float64
 1   M                                   192 non-null    float64
 2   (1, 15]                             192 non-null    float64
 3   (15, 20]                            192 non-null    float64
 4   (20, 30]                            192 non-null    float64
 5   (30, 40]                            192 non-null    float64
 6   (40, 50]                            192 non-null    float64
 7   (50, 60]                            192 non-null    float64
 8   (60, 70]                            192 non-null    float64
 9   (70, 150]                           192 non-null    float64
 10  Grübchenabstand DL-DR [mm]          192 non-null    float64
 11  Rumpflänge VP-DM [mm]               192 non-n

In [11]:
# Preprocessed unlabelled matrix X_unlab_p: sex dummies, age dummies, then the
# remaining feature columns, everything cast to float.
unlab_parts = [X_unlab_sex, X_unlab_age, X_unlab]
X_unlab_p = pd.concat(unlab_parts, axis='columns').astype(float)
X_unlab_p.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17174 entries, 0 to 17173
Data columns (total 22 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   F                                   17174 non-null  float64
 1   M                                   17174 non-null  float64
 2   (1, 15]                             17174 non-null  float64
 3   (15, 20]                            17174 non-null  float64
 4   (20, 30]                            17174 non-null  float64
 5   (30, 40]                            17174 non-null  float64
 6   (40, 50]                            17174 non-null  float64
 7   (50, 60]                            17174 non-null  float64
 8   (60, 70]                            17174 non-null  float64
 9   (70, 150]                           17174 non-null  float64
 10  Grübchenabstand DL-DR [mm]          17174 non-null  float64
 11  Rumpflänge VP-DM [mm]               17174

In [12]:
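# Which model should we start with: the SVM-based one or the graph-based one (LabelPropagation)?
# Starting with an SVM wrapped in scikit-learn's SelfTrainingClassifier (self-training, not a true transductive SVM / S3VM)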
from sklearn.svm import SVC
from sklearn.semi_supervised import LabelPropagation, SelfTrainingClassifier
# '''
# Self-training classifier.

# This class allows a given supervised classifier to function as a
# semi-supervised classifier, allowing it to learn from unlabeled data. It
# does this by iteratively predicting pseudo-labels for the unlabeled data
# and adding them to the training set.

# '''

In [13]:
# Self-training framework: wrap a default-parameter SVC in SelfTrainingClassifier.
# NOTE(review): self-training pseudo-labels through the base estimator's
# predict_proba, which SVC presumably only exposes with probability=True --
# confirm against the scikit-learn docs before passing unlabeled rows to fit.
base_model = SVC()
ssmodel = SelfTrainingClassifier(base_estimator=base_model)

In [14]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides any warning raised by the
# fit / predict cells below -- consider narrowing it to a specific category.
warnings.filterwarnings('ignore')

In [15]:
# Fit the self-training wrapper on the labelled set only (X_train_p / y_train).
# NOTE(review): X_unlab_p is not part of this call, so no unlabeled rows reach the
# model here; this presumably reduces to a plain supervised SVC fit -- TODO confirm.
ssmodel.fit(X_train_p,y_train)

SelfTrainingClassifier(base_estimator=SVC())

In [16]:
# Accuracy on the same labelled rows the model was fitted on (training accuracy)
ssmodel.score(X=X_train_p, y=y_train)

1.0

In [17]:
# Predict ICD codes for the unlabelled dataset with the trained model
label_as = ssmodel.predict(X=X_unlab_p)
label_as[:10]

array(['M99.04', 'M54.90', 'M22.4', 'M92.5', 'M62.08', 'Q65.8', 'M48.06',
       'M19.98', 'M85.89', 'M47.89'], dtype=object)

In [18]:
# Newly labelled dataset: the unlabelled features plus the predicted ICD code
# appended as the last column.
new_label = X_unlab_p.assign(ICD=label_as)
new_label.sample(n=6)

Unnamed: 0,F,M,"(1, 15]","(15, 20]","(20, 30]","(30, 40]","(40, 50]","(50, 60]","(60, 70]","(70, 150]",...,Lotabweichung VP-DM [°],Seitabweichung VP-DM (RMS) [mm],Oberflächenrotation (RMS) [°],Kyphosewinkel VP-T12 [°],Lordosewinkel T12-DM [°],Beckenneigung (Symmetrielinie) [°],Beckentorsion DL-DR [°],Beckenhochstand [°],Beckenrotation [°],ICD
6514,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.2,4.1,4.185096,42.8,26.5,11.5,3.0,-3.0,7.6,M25.5-
5938,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,2.4,7.9,3.137411,49.4,40.3,30.9,-2.0,7.9,4.5,M72.2
9741,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.8,2.7,0.85994,44.7,51.1,38.1,2.6,-5.1,-1.6,M42.96
7811,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.3,3.5,1.846371,49.1,34.2,20.6,2.2,4.5,0.1,Q67.6
13248,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-1.3,13.1,3.540244,55.0,37.3,20.8,1.0,-2.5,5.2,M40.05
6876,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-1.0,1.4,0.891056,44.4,36.7,18.5,4.1,1.3,-1.3,M21.17


In [19]:
# Labelled dataset: the preprocessed training features plus their ICD column
X_train_new = X_train_p.assign(ICD=y_train)
X_train_new.sample(n=6)

Unnamed: 0,F,M,"(1, 15]","(15, 20]","(20, 30]","(30, 40]","(40, 50]","(50, 60]","(60, 70]","(70, 150]",...,Lotabweichung VP-DM [°],Seitabweichung VP-DM (RMS) [mm],Oberflächenrotation (RMS) [°],Kyphosewinkel VP-T12 [°],Lordosewinkel T12-DM [°],Beckenneigung (Symmetrielinie) [°],Beckentorsion DL-DR [°],Beckenhochstand [°],Beckenrotation [°],ICD
54,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,11.1,5.016658,35.7,34.3,34.2,-0.7,0.2,-4.0,M47.26
74,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.4,5.0,3.719479,33.7,33.3,23.9,1.2,4.2,-0.1,M21.61
13,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.6,3.5,2.79748,59.4,52.8,34.3,1.2,2.7,2.3,M23.22
160,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,2.847126,51.1,32.3,17.2,-0.8,0.0,1.8,M40.49
56,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.9,4.8,2.066514,43.7,25.4,11.8,2.5,-2.7,4.3,M94.26
171,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.6,5.9,7.119738,36.6,32.0,13.4,6.5,-10.6,3.1,M99.95


In [20]:
# Shapes of the labelled frame and the pseudo-labelled frame
tuple(frame.shape for frame in (X_train_new, new_label))

((192, 23), (17174, 23))

In [21]:
# Stack the labelled rows (X_train_new) on top of the pseudo-labelled rows
# (new_label) and renumber the index from 0.
frames_to_stack = (X_train_new, new_label)
df = pd.concat(objs=frames_to_stack, axis=0, ignore_index=True)
df.sample(n=3)

Unnamed: 0,F,M,"(1, 15]","(15, 20]","(20, 30]","(30, 40]","(40, 50]","(50, 60]","(60, 70]","(70, 150]",...,Lotabweichung VP-DM [°],Seitabweichung VP-DM (RMS) [mm],Oberflächenrotation (RMS) [°],Kyphosewinkel VP-T12 [°],Lordosewinkel T12-DM [°],Beckenneigung (Symmetrielinie) [°],Beckentorsion DL-DR [°],Beckenhochstand [°],Beckenrotation [°],ICD
8063,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.5,2.7,2.258612,34.6,18.9,8.3,-1.4,2.7,-1.2,M54.90
16147,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.4,7.7,6.053231,38.2,35.1,18.9,3.3,1.2,0.4,M62.51
4429,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.8,4.1,3.69785,41.4,26.7,25.1,-1.2,-2.3,-1.7,M41.20


In [22]:
# Column dtypes and non-null counts of the combined frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17366 entries, 0 to 17365
Data columns (total 23 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   F                                   17366 non-null  float64
 1   M                                   17366 non-null  float64
 2   (1, 15]                             17366 non-null  float64
 3   (15, 20]                            17366 non-null  float64
 4   (20, 30]                            17366 non-null  float64
 5   (30, 40]                            17366 non-null  float64
 6   (40, 50]                            17366 non-null  float64
 7   (50, 60]                            17366 non-null  float64
 8   (60, 70]                            17366 non-null  float64
 9   (70, 150]                           17366 non-null  float64
 10  Grübchenabstand DL-DR [mm]          17366 non-null  float64
 11  Rumpflänge VP-DM [mm]               17366

In [23]:
# Shape of the combined frame (X_train_new rows followed by new_label rows)
df.shape

(17366, 23)

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [25]:
# Features / target taken from the combined frame.
# NOTE: only the rows coming from X_train_new carry ICD codes read from the input
# file; the rows coming from new_label carry the codes predicted by ssmodel, so any
# score computed on this split largely measures agreement with that first model.
X = df.drop(columns='ICD')
y = df['ICD']
# 70/30 split stratified on y (ICD). The fixed random_state makes the split, and
# every score derived from it, reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
# shapes of the split datasets
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((12156, 22), (12156,), (5210, 22), (5210,))

In [26]:
# Scaling and classification chained into a single estimator; it can be used like
# any other estimator and avoids leaking the test set into the train set.
pipeline_steps = [('scale', StandardScaler()), ('model', SVC())]
pipe = Pipeline(steps=pipeline_steps)

In [27]:
# Fit the scaler + SVC pipeline on the training split
pipe.fit(X=X_train, y=y_train)

Pipeline(steps=[('scale', StandardScaler()), ('model', SVC())])

In [28]:
# Accuracy of the fitted pipeline on the held-out test split
pipe.score(X=X_test, y=y_test)

0.7420345489443378

In [29]:
# Predicted ICD codes for the held-out test split
y_pred = pipe.predict(X=X_test)

In [30]:
# First ten predictions for the test split
y_pred[:10]

array(['M47.84', 'M51.9', 'M92.5', 'Q72.9', 'M20.1', 'M25.56', 'M62.91',
       'M19.95', 'M25.51', 'M76.6'], dtype=object)

In [31]:
# Evaluation of our trained model
from sklearn.metrics import accuracy_score, confusion_matrix

In [32]:
# Column names of the combined frame (feature columns followed by 'ICD')
df.columns

Index(['F', 'M', '(1, 15]', '(15, 20]', '(20, 30]', '(30, 40]', '(40, 50]',
       '(50, 60]', '(60, 70]', '(70, 150]', 'Grübchenabstand DL-DR [mm]',
       'Rumpflänge VP-DM [mm]', 'Rumpfneigung VP-DM [°]',
       'Lotabweichung VP-DM [°]', 'Seitabweichung VP-DM (RMS) [mm]',
       'Oberflächenrotation (RMS) [°]', 'Kyphosewinkel VP-T12 [°]',
       'Lordosewinkel T12-DM [°]', 'Beckenneigung (Symmetrielinie) [°]',
       'Beckentorsion DL-DR [°]', 'Beckenhochstand [°]', 'Beckenrotation [°]',
       'ICD'],
      dtype='object')

In [33]:
# Confusion matrix over the distinct ICD codes.
# `labels` must list every class exactly once: passing the whole df.ICD column
# (one entry per row, with duplicates) sizes the matrix by the number of rows
# instead of the number of classes and leaves almost all of it empty.
icd_classes = np.unique(df['ICD'])
cm = confusion_matrix(y_test, y_pred, labels=icd_classes)
cm

array([[ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ..., 34,  0,  0],
       [ 0,  0,  0, ...,  0, 44,  0],
       [ 0,  0,  0, ...,  0,  0,  8]], dtype=int64)

In [34]:
# Accuracy on the test split, expressed as a percentage
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print(f'Accuracy Score: {accuracy_pct}')

Accuracy Score: 74.20345489443379


In [35]:
# import visualization lib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import plot_confusion_matrix

In [36]:
# plot the confusion matrix
# plt.figure(figsize = (20,15))
# plot_confusion_matrix(pipe,X_test,y_test)
# plt.show()
# sns.heatmap(cm, annot=True)