# Work Flow

<img src="work-flow.jpg" alt='work flow'/>

# Libraries 

In [1]:
import numpy as np
import pandas as pd

# Datasets

In [2]:
# Load the labelled and unlabelled datasets. Both files are tab-separated.
# Each file is parsed once; the feature/target views below are derived from
# the in-memory frames instead of re-reading the same CSV three times.
lab_dataset = pd.read_csv('Final_ToBeLabelled(ICD_INCLUDED)_EN.csv', sep='\t')
unlab_dataset = pd.read_csv('Final_Unlabelled_EN.csv', sep='\t')

# Columns removed from the feature frames: the ICD target and index_exam.
non_feature_cols = ['ICD', 'index_exam']

# Labelled data: features (X_lab) and target (y_lab, stored as a categorical).
X_lab = lab_dataset.drop(columns=non_feature_cols)
y_lab = lab_dataset['ICD'].astype('category')

# Unlabelled data: features (X_unlab) and its ICD column (y_unlab).
# .copy() keeps y_unlab independent of unlab_dataset, as it was when it came
# from a separate read_csv call.
X_unlab = unlab_dataset.drop(columns=non_feature_cols)
y_unlab = unlab_dataset['ICD'].copy()

In [10]:
# Size of the data before preprocessing: .shape of the two raw frames, then the
# two feature frames, then the two target series (unlabelled before labelled),
# displayed as a single tuple in the order written below.
lab_dataset.shape, unlab_dataset.shape, X_lab.shape,X_unlab.shape, y_unlab.shape, y_lab.shape

((192, 17), (17174, 17), (192, 15), (17174, 15), (17174,), (192,))

# Preprocessing

In [None]:
# Preprocessing: one-hot encode the sex and age columns of the labelled and
# unlabelled feature frames and build the float design matrices
# X_train (labelled) and X_unlab_p (unlabelled).
#
# The labelled features/target are X_lab / y_lab from the loading cell; the
# previous version of this cell referenced `X` and `y`, which are never
# defined in this notebook, and repeated the four get_dummies calls twice.
X_sex = pd.get_dummies(data=X_lab['Weiblich/Männlich'])
X_unlab_sex = pd.get_dummies(data=X_unlab['Weiblich/Männlich'])
X_age = pd.get_dummies(data=X_lab['age'])
X_unlab_age = pd.get_dummies(data=X_unlab['age'])

# Drop the UUID, age and sex columns. New frames are created instead of
# dropping in place, so X_lab / X_unlab stay intact and the cell can be re-run.
dropped_cols = ['UUID', 'age', 'Weiblich/Männlich']
X_lab_rest = X_lab.drop(columns=dropped_cols)
X_unlab_rest = X_unlab.drop(columns=dropped_cols)

# Encoded labelled features.
X_train = pd.concat([X_sex, X_age, X_lab_rest], axis=1).astype('float')
# Encoded unlabelled features.
X_unlab_p = pd.concat([X_unlab_sex, X_unlab_age, X_unlab_rest], axis=1).astype('float')

# The dummies are built separately per frame, so a category (e.g. an age) that
# occurs in only one frame yields different columns. Align the unlabelled frame
# to the training columns: missing dummies become 0 and dummies unseen during
# training are discarded. This is a no-op when the columns already match.
X_unlab_p = X_unlab_p.reindex(columns=X_train.columns, fill_value=0.0)

# summarize training set size
print('Labeled Train Set:', X_train.shape, y_lab.shape)
# summarize Unlabeled set size
print('Unlabeled Train Set:', X_unlab_p.shape)

A `supervised` learning algorithm will only have 192 rows from which to train a model.

A `semi-supervised` learning algorithm will have the 192 labeled rows as well as the 17174 unlabeled rows that could be used in numerous ways to improve the labeled training dataset.

Next, we can establish a baseline in performance on the semi-supervised learning dataset using a supervised learning algorithm fit only on the labeled training data.

This is important because we would expect a semi-supervised learning algorithm to outperform a supervised learning algorithm fit on the labeled data alone. If this is not the case, then the semi-supervised learning algorithm does not have skill.

In this case, we will use a `logistic regression` algorithm fit on the labeled portion of the training dataset.

# Model

In [None]:
# Supervised baseline: logistic regression fitted on the labelled rows only.
import warnings
# NOTE: this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')
# libraries
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardized copy of the training features, kept under the original name.
# (Previously this was computed and then ignored: the model was fitted on the
# unscaled X_train.)
scale = StandardScaler().fit_transform(X_train)

# Chain the scaler and the classifier so the scaling learned on the labelled
# data is applied automatically to anything later passed to
# model.predict / model.predict_proba.
model = make_pipeline(StandardScaler(), LogisticRegression())

# fit model on the labelled dataset (target is y_lab; `y` is not defined)
model.fit(X_train, y_lab)

The model can then be used to make predictions on the entire `Unlabeled` dataset and evaluated using classification accuracy.

In [None]:
# Score every row of the encoded unlabelled set with the fitted model:
# `label` holds the predicted class per row,
# `yhat` holds the per-class probabilities per row.
label = model.predict(X_unlab_p)
yhat = model.predict_proba(X_unlab_p)

In [None]:
# peek at the predicted class probabilities of the first 10 unlabelled rows
yhat[:10]

# Confidence

In [None]:
# Wrap the probability matrix in a DataFrame with one column per class.
# predict_proba orders its columns like model.classes_, so those are the
# column names. The per-sample label vector has one entry per training row,
# not one per class, and must not be used here.
df = pd.DataFrame(yhat, columns=model.classes_)
df.head()

In [None]:
# Highest predicted probability in each row, as a one-column frame.
maximum_value = df.max(axis=1).to_frame(name='Maximum_value')
# Predicted label of each row, as a one-column frame.
df_l = pd.DataFrame({'Predicted Labels': label})
# Side by side: predicted label, its maximum probability, and the full
# per-class probability table.
df_la_max = pd.concat([df_l, maximum_value, df], axis=1)
# show a random 20% of the rows
df_la_max.sample(frac=.2)

In [None]:
# plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('darkgrid')
# histogram of the maximum predicted probability per row
fig, ax = plt.subplots()
ax.hist(df_la_max['Maximum_value'])
plt.show()

Next, let’s explore how to apply the label propagation algorithm to the dataset.

# Label Propagation for Semi-Supervised Learning

In [None]:
# Label propagation fitted on the labelled rows only, then used to predict
# the unlabelled rows.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.semi_supervised import LabelPropagation

# Standardized copy of the training features, kept under the original name.
# (Previously this was computed and then ignored: the model was fitted on the
# unscaled X_train.)
scale = StandardScaler().fit_transform(X_train)

# Chain scaler and model so the same scaling is applied at fit and predict time.
model_ = make_pipeline(StandardScaler(), LabelPropagation())

# fit model on the labelled dataset (target is y_lab; `y` is not defined)
model_.fit(X_train, y_lab)

# make predictions on the unlabelled dataset
yhat_ = model_.predict(X_unlab_p)

With the Label Propagation algorithm available in scikit-learn, let's apply it to our semi-supervised learning dataset.

First, we must prepare the training dataset.

We can concatenate the input data of the training dataset into a single array.


In [None]:
# Attach the target column to each encoded feature frame (column-wise concat).
# The labelled target is y_lab; `y` is not defined in this notebook.
X_train_label = pd.concat([X_train, y_lab], axis=1)
X_test_unlabel = pd.concat([X_unlab_p, y_unlab], axis=1)
# shapes of the two combined frames
X_train_label.shape, X_test_unlabel.shape

In [None]:
# Stack the labelled rows on top of the unlabelled rows to form the input of
# the semi-supervised training set.
frames_to_stack = [X_train_label, X_test_unlabel]
X_train_mixed = pd.concat(frames_to_stack)
X_train_mixed

We can then create a list of -1 values (the marker for "unlabeled"), one for each row in the unlabeled portion of the training dataset.

In [None]:
# one -1 ("no label") entry per row of the unlabelled data
nolabel = [-1] * len(y_unlab)

This list can then be concatenated with the labels from the labeled portion of the training dataset to correspond with the <br>input array for the training dataset.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the labelled targets as integers. The labelled target is y_lab;
# `y` is not defined in this notebook.
le = LabelEncoder()
y_lab_encoded = le.fit_transform(y_lab)

# Recombine the training labels: encoded labelled targets first, then the -1
# markers for the unlabelled rows, matching the row order of X_train_mixed.
y_train_mixed = np.concatenate((y_lab_encoded, nolabel))
y_train_mixed

We can now train the LabelPropagation model on the entire training dataset.

In [None]:
from sklearn.model_selection import train_test_split

# Features of the mixed (labelled + unlabelled) set, without the ICD column.
mixed_features = X_train_mixed.drop(columns='ICD')

# 50/50 split into a training half and an evaluation half.
# NOTE: this rebinds X_train, which until now held the encoded labelled features.
X_train, X_eval, y_train, y_eval = train_test_split(
    mixed_features,
    y_train_mixed,
    test_size=0.5,
    random_state=41,
)

In [None]:
# Build a LabelPropagation model and fit it on the training half of the mixed
# set (this rebinds `model`); fit() returns the fitted estimator itself.
model = LabelPropagation().fit(X_train, y_train)
# display the fitted estimator
model

Next, we can use the model to make predictions on the holdout dataset <br>
and evaluate the model using classification accuracy.

In [None]:
# get labels for entire training dataset data
tran_labels = model.transduction_