In [32]:
# Imports
import pandas as pd
import numpy as np
from numpy import nan
from imblearn.over_sampling import RandomOverSampler

import matplotlib.pyplot as plt
import sklearn
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import KFold

from random import seed
from random import randint

print("Done importing")

Done importing


In [20]:
#Upload file
df=pd.read_csv("data/diagnostictool.csv")
print("Done uploading")

Done uploading


In [5]:
df.head()

Unnamed: 0,ICD10,Description,Symptom1,Symptom2,Symptom3,Symptom4,Symptom5,Symptom6,Symptom7,Symptom8,...,Symptom23,Symptom24,Symptom25,Symptom26,Symptom27,Symptom28,Symptom29,Symptom30,Symptom31,Symptom32
0,A09.0,Other and unspecified gastroenteritis and coli...,Chills,Dehydration,Diarrhea,Dry Mouth,Dry Skin,Fever,Increased Thirst,Malaise,...,,,,,,,,,,
1,A09.9,Gastroenteritis and colitis of unspecified origin,Chills,Dehydration,Diarrhea,Dry Mouth,Dry Skin,Fever,Increased Thirst,Malaise,...,,,,,,,,,,
2,B24,Unspecified human immunodeficiency virus [HIV]...,Chronic Cough,Diarrhea,Difficulty Breathing,Difficulty Swallowing,Fatigue,Fever,Fungal Infection Of The Mouth And Esophagus,Fungal Nails,...,Vomiting,Weight Loss,White Tongue,,,,,,,
3,D50.9,"Iron deficiency anaemia, unspecified",Difficulty Breathing,Dizziness,Fatigue,Palpitations,,,,,...,,,,,,,,,,
4,E03.9,"Hypothyroidism, unspecified",Blurred Vision,Coarse Hair,Cold Intolerance,Constipation,Decreased Hearing,Decreased Sweating,Depression,Dry Hair,...,Thyroid Gland Enlargement,Trouble Concentrating,Weight Gain,,,,,,,


In [2]:
def symptom_to_numeric(x):
    if str(x)=='nan':
        return "0"
    uniqueSymptoms= pd.unique(df[df.columns[2:len(df.columns)]].values.ravel())
    for index, symptom in enumerate(uniqueSymptoms):
        if x==symptom:
            return str(index+1)

In [3]:
def icd_to_numeric(x):
    icd10= pd.unique(df['ICD10'].values.ravel())
    for index, icd in enumerate(icd10):
        if x==icd:
            return str(index)

In [9]:
#Visualize amount of empty variables per columns
df.isnull().mean()

ICD10          0.00
Description    0.00
Symptom1       0.00
Symptom2       0.00
Symptom3       0.02
Symptom4       0.02
Symptom5       0.08
Symptom6       0.12
Symptom7       0.18
Symptom8       0.22
Symptom9       0.24
Symptom10      0.30
Symptom11      0.32
Symptom12      0.40
Symptom13      0.52
Symptom14      0.66
Symptom15      0.72
Symptom16      0.76
Symptom17      0.76
Symptom18      0.80
Symptom19      0.82
Symptom20      0.84
Symptom21      0.84
Symptom22      0.86
Symptom23      0.90
Symptom24      0.92
Symptom25      0.92
Symptom26      0.98
Symptom27      0.98
Symptom28      0.98
Symptom29      0.98
Symptom30      0.98
Symptom31      0.98
Symptom32      0.98
dtype: float64

In [27]:
# Discard columns with more than 50% empty variables
df_reduced= df.loc[:, :'Symptom12']
df_reduced.head()

Unnamed: 0,ICD10,Description,Symptom1,Symptom2,Symptom3,Symptom4,Symptom5,Symptom6,Symptom7,Symptom8,Symptom9,Symptom10,Symptom11,Symptom12
0,A09.0,Other and unspecified gastroenteritis and coli...,Chills,Dehydration,Diarrhea,Dry Mouth,Dry Skin,Fever,Increased Thirst,Malaise,Muscle Pain,Nausea,Stomach Cramps,Stomach Pain
1,A09.9,Gastroenteritis and colitis of unspecified origin,Chills,Dehydration,Diarrhea,Dry Mouth,Dry Skin,Fever,Increased Thirst,Malaise,Muscle Pain,Nausea,Stomach Cramps,Stomach Pain
2,B24,Unspecified human immunodeficiency virus [HIV]...,Chronic Cough,Diarrhea,Difficulty Breathing,Difficulty Swallowing,Fatigue,Fever,Fungal Infection Of The Mouth And Esophagus,Fungal Nails,Groin Swelling,Headache,Malaise,Malignancies
3,D50.9,"Iron deficiency anaemia, unspecified",Difficulty Breathing,Dizziness,Fatigue,Palpitations,,,,,,,,
4,E03.9,"Hypothyroidism, unspecified",Blurred Vision,Coarse Hair,Cold Intolerance,Constipation,Decreased Hearing,Decreased Sweating,Depression,Dry Hair,Dry Skin,Fatigue,Forgetfulness,Hair Loss


In [44]:
#Create dummy variables
dfdum=pd.DataFrame()
for column in df_reduced:
    if column!="ICD10" and column!="Description":
        dfdum[column]= df[column].apply(symptom_to_numeric)
    else:
        dfdum[column]= df[column]


In [45]:
dfdum.pop('Description')
dfdum.head()

Unnamed: 0,ICD10,Symptom1,Symptom2,Symptom3,Symptom4,Symptom5,Symptom6,Symptom7,Symptom8,Symptom9,Symptom10,Symptom11,Symptom12
0,A09.0,1,2,3,4,5,6,7,8,9,10,11,12
1,A09.9,1,2,3,4,5,6,7,8,9,10,11,12
2,B24,15,3,16,17,18,6,19,20,21,22,8,23
3,D50.9,16,36,18,37,0,0,0,0,0,0,0,0
4,E03.9,38,39,40,41,42,43,44,45,5,18,46,47


In [41]:
#Generate random dataset
num_symptoms = len(pd.unique(df[df.columns[2:len(df.columns)]].values.ravel()))
fake_patients=[]
# generate some integers
for n in range (10000):
    patient=[]
    for _ in range(12):
        value = randint(0, num_symptoms)
        patient.append(value)
    fake_patients.append(patient)
fake_patients[:1000]

[[17, 149, 206, 105, 103, 26, 195, 89, 129, 227, 116, 179],
 [5, 167, 21, 20, 193, 199, 182, 42, 151, 25, 86, 222],
 [204, 216, 187, 107, 184, 166, 30, 170, 86, 238, 172, 55],
 [182, 16, 59, 51, 52, 240, 43, 28, 150, 4, 224, 220],
 [165, 230, 20, 133, 93, 148, 127, 161, 32, 221, 234, 89],
 [193, 206, 175, 150, 16, 176, 176, 60, 88, 120, 176, 128],
 [161, 37, 96, 225, 230, 7, 227, 188, 198, 26, 17, 82],
 [142, 74, 198, 30, 182, 234, 187, 133, 35, 34, 129, 128],
 [63, 74, 184, 49, 141, 202, 81, 177, 232, 182, 204, 103],
 [51, 194, 184, 149, 16, 119, 229, 2, 196, 219, 168, 204],
 [101, 247, 60, 162, 117, 101, 244, 78, 16, 212, 85, 160],
 [149, 126, 29, 200, 56, 141, 201, 164, 148, 140, 0, 147],
 [12, 143, 190, 200, 61, 177, 110, 22, 112, 98, 71, 242],
 [68, 50, 94, 141, 230, 216, 211, 83, 37, 199, 39, 223],
 [148, 128, 98, 157, 111, 183, 139, 9, 246, 7, 37, 30],
 [82, 168, 199, 51, 62, 208, 221, 68, 106, 85, 224, 183],
 [140, 34, 62, 91, 25, 185, 1, 61, 57, 227, 236, 194],
 [103, 156, 163

In [46]:
# Assign ICD10 to each fake patient using 1-nn
from sklearn.neighbors import KNeighborsClassifier
        
y = dfdum.pop('ICD10')
X = dfdum
knn = KNeighborsClassifier(n_neighbors=1)  
knn.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform')

In [49]:
fake_icd = knn.predict(fake_patients)  
fake_icd[:5]

array(['H66.9', 'J00', 'J22', 'J06.8', 'I50.0'], dtype=object)

In [51]:
# Split-out validation dataset

validation_size = 0.20
seed = 7
scoring='accuracy'
X_train, X_validation, y_train, y_validation = model_selection.train_test_split(fake_patients, fake_icd, test_size=validation_size, random_state=seed)
y_train = [icd_to_numeric(x) for x in y_train]
y_validation=[icd_to_numeric(x) for x in y_validation]


In [53]:
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
	kfold = model_selection.KFold(n_splits=10, random_state=seed)
	cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
	results.append(cv_results)
	names.append(name)
	msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
	print(msg)



LR: 0.801000 (0.017409)
LDA: 0.851875 (0.012491)
KNN: 0.613250 (0.014686)
CART: 0.429250 (0.017286)
NB: 0.780500 (0.019049)




SVM: 0.177000 (0.012552)
