# KNN - BMD Dataset

In [51]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, neighbors
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Importing CSV

In [52]:
bmd = pd.read_csv('../../bmd/bmd.csv')

In [53]:
bmd.head()

Unnamed: 0,id,age,sex,fracture,weight_kg,height_cm,medication,waiting_time,bmd
0,469,57.052768,F,no fracture,64.0,155.5,Anticonvulsant,18,0.8793
1,8724,75.741225,F,no fracture,78.0,162.0,No medication,56,0.7946
2,6736,70.7789,M,no fracture,73.0,170.5,No medication,10,0.9067
3,24180,78.247175,F,no fracture,60.0,148.0,No medication,14,0.7112
4,17072,54.191877,M,no fracture,55.0,161.0,No medication,20,0.7909


# Analyzing numeric columns

In [54]:
bmd.describe()

Unnamed: 0,id,age,weight_kg,height_cm,waiting_time,bmd
count,169.0,169.0,169.0,169.0,169.0,169.0
mean,9102.556213,63.631531,64.66568,160.254438,19.739645,0.783104
std,8744.623598,12.356936,11.537171,7.928272,15.80057,0.166529
min,35.0,35.814058,36.0,142.0,5.0,0.4076
25%,2018.0,54.424211,56.0,154.0,9.0,0.6708
50%,6702.0,63.487837,64.5,160.5,14.0,0.7861
75%,17100.0,72.080558,73.0,166.0,24.0,0.8888
max,24208.0,88.753795,96.0,177.0,96.0,1.3624


In [55]:
bmd.groupby('fracture').mean()

# id: To be removed
# Age: The oldest you get, the biggest the probability to fracture.
# Weight: A higher weight helps preventing a fracture.
# Height: It does not really impact.
# Waiting time (time the patient had to wait for the densitometry (in minutes): the longest the time, the more probability to fracture.
# Bdm (bone mineral density): The most dense your bones ares, the less likely they are to fracture.

Unnamed: 0_level_0,id,age,weight_kg,height_cm,waiting_time,bmd
fracture,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
fracture,7869.04,69.77132,58.41,159.29,23.3,0.623308
no fracture,9620.840336,61.051787,67.294118,160.659664,18.243697,0.850245


# Analyzing string columns

In [56]:
cross_sex = pd.crosstab(bmd['fracture'], bmd['sex'])
cross_sex

# There is not a lot of difference between Men and Women

sex,F,M
fracture,Unnamed: 1_level_1,Unnamed: 2_level_1
fracture,25,25
no fracture,58,61


In [57]:
Female_Fracture = cross_sex['F']['fracture'] / (cross_sex['F']['fracture']  + cross_sex['F']['no fracture'])
Female_Fracture

0.30120481927710846

In [58]:
Male_Fracture = cross_sex['M']['fracture'] / (cross_sex['M']['fracture']  + cross_sex['M']['no fracture'])
Male_Fracture

0.29069767441860467

In [59]:
cross_med = pd.crosstab(bmd['fracture'], bmd['medication'])
cross_med

# Medication prevents fractures, especially for Gluco. For Anticonvulsant there are not many cases.

medication,Anticonvulsant,Glucocorticoids,No medication
fracture,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
fracture,4,5,41
no fracture,5,19,95


In [60]:
Anticonvulsant_fracture = cross_med['Anticonvulsant']['fracture'] / (cross_med['Anticonvulsant']['fracture']  
+ cross_med['Anticonvulsant']['no fracture'])

Anticonvulsant_fracture

0.4444444444444444

In [61]:
Glucocorticoids_fracture = cross_med['Glucocorticoids']['fracture'] / (cross_med['Glucocorticoids']['fracture']  
+ cross_med['Glucocorticoids']['no fracture'])

Glucocorticoids_fracture

0.20833333333333334

In [62]:
No_medication_fracture = cross_med['No medication']['fracture'] / (cross_med['No medication']['fracture']  
+ cross_med['No medication']['no fracture'])

No_medication_fracture

0.3014705882352941

# Removing columns "id" and sending "fracture" column to the end.

In [63]:
cols = bmd.columns.values.tolist()
cols

['id',
 'age',
 'sex',
 'fracture',
 'weight_kg',
 'height_cm',
 'medication',
 'waiting_time',
 'bmd']

In [64]:
cols_new = ['age',
 'sex',
 'weight_kg',
 'height_cm',
 'medication',
 'waiting_time',
 'bmd',
 'fracture']

In [65]:
bmd = bmd[cols_new]

In [66]:
dum_sex = pd.get_dummies(bmd['sex'], prefix = 'sex')
new_bmd = bmd.join(dum_sex)

In [67]:
dum_med = pd.get_dummies(new_bmd['medication'], prefix = 'medication')
new_bmd = new_bmd.join(dum_med)
new_bmd.head(3)

Unnamed: 0,age,sex,weight_kg,height_cm,medication,waiting_time,bmd,fracture,sex_F,sex_M,medication_Anticonvulsant,medication_Glucocorticoids,medication_No medication
0,57.052768,F,64.0,155.5,Anticonvulsant,18,0.8793,no fracture,1,0,1,0,0
1,75.741225,F,78.0,162.0,No medication,56,0.7946,no fracture,1,0,0,0,1
2,70.7789,M,73.0,170.5,No medication,10,0.9067,no fracture,0,1,0,0,1


In [68]:
cols = new_bmd.columns.values.tolist()
cols

['age',
 'sex',
 'weight_kg',
 'height_cm',
 'medication',
 'waiting_time',
 'bmd',
 'fracture',
 'sex_F',
 'sex_M',
 'medication_Anticonvulsant',
 'medication_Glucocorticoids',
 'medication_No medication']

In [69]:
cols_new = ['age',
 'weight_kg',
 'height_cm',
 'waiting_time',
 'bmd',
 'fracture',
 'sex_F',
 'sex_M',
 'medication_Anticonvulsant',
 'medication_Glucocorticoids',
 'medication_No medication']

In [70]:
new_bmd = new_bmd[cols_new]

In [71]:
new_bmd.head(5)

Unnamed: 0,age,weight_kg,height_cm,waiting_time,bmd,fracture,sex_F,sex_M,medication_Anticonvulsant,medication_Glucocorticoids,medication_No medication
0,57.052768,64.0,155.5,18,0.8793,no fracture,1,0,1,0,0
1,75.741225,78.0,162.0,56,0.7946,no fracture,1,0,0,0,1
2,70.7789,73.0,170.5,10,0.9067,no fracture,0,1,0,0,1
3,78.247175,60.0,148.0,14,0.7112,no fracture,1,0,0,0,1
4,54.191877,55.0,161.0,20,0.7909,no fracture,0,1,0,0,1


In [72]:
# We change the Fracture columns from string to numeric. True=1, False=0.

new_bmd['fracture'] = (new_bmd['fracture'] == 'fracture').astype(int)

In [73]:
new_bmd.head(5)

Unnamed: 0,age,weight_kg,height_cm,waiting_time,bmd,fracture,sex_F,sex_M,medication_Anticonvulsant,medication_Glucocorticoids,medication_No medication
0,57.052768,64.0,155.5,18,0.8793,0,1,0,1,0,0
1,75.741225,78.0,162.0,56,0.7946,0,1,0,0,0,1
2,70.7789,73.0,170.5,10,0.9067,0,0,1,0,0,1
3,78.247175,60.0,148.0,14,0.7112,0,1,0,0,0,1
4,54.191877,55.0,161.0,20,0.7909,0,0,1,0,0,1


In [74]:
new_bmd.fracture.value_counts()

0    119
1     50
Name: fracture, dtype: int64


# Select columns to build the model

In [75]:
new_bmd.columns.values.tolist()

['age',
 'weight_kg',
 'height_cm',
 'waiting_time',
 'bmd',
 'fracture',
 'sex_F',
 'sex_M',
 'medication_Anticonvulsant',
 'medication_Glucocorticoids',
 'medication_No medication']

In [76]:
X = ['age',
 'weight_kg',
 'height_cm',
 'waiting_time',
 'bmd',
 'sex_F',
 'sex_M',
 'medication_Anticonvulsant',
 'medication_Glucocorticoids',
 'medication_No medication']

Y = ['fracture']

# We put under X the columns to be used as predictors.
# Y is the column we want to predict

# We evaluate dtypes and assign numeric type to them

In [77]:
new_bmd.dtypes

age                           float64
weight_kg                     float64
height_cm                     float64
waiting_time                    int64
bmd                           float64
fracture                        int32
sex_F                           uint8
sex_M                           uint8
medication_Anticonvulsant       uint8
medication_Glucocorticoids      uint8
medication_No medication        uint8
dtype: object

In [78]:
new_bmd['sex_F'] = (new_bmd['sex_F'] == 'sex_F').astype(int)
new_bmd['sex_M'] = (new_bmd['sex_M'] == 'sex_M').astype(int)
new_bmd['medication_Anticonvulsant'] = (new_bmd['medication_Anticonvulsant'] == 'medication_Anticonvulsant').astype(int)
new_bmd['medication_Glucocorticoids'] = (new_bmd['medication_Glucocorticoids'] == 'medication_Glucocorticoids').astype(int)
new_bmd['medication_No medication'] = (new_bmd['medication_No medication'] == 'medication_No medication').astype(int) 
new_bmd.dtypes

  res_values = method(rvalues)


age                           float64
weight_kg                     float64
height_cm                     float64
waiting_time                    int64
bmd                           float64
fracture                        int32
sex_F                           int32
sex_M                           int32
medication_Anticonvulsant       int32
medication_Glucocorticoids      int32
medication_No medication        int32
dtype: object

# Splitting DataSet

In [89]:
X_train, X_test, Y_train, Y_test = train_test_split(new_bmd[X], new_bmd[Y], test_size=0.3)

In [90]:
clf = neighbors.KNeighborsClassifier()

In [91]:
clf.fit(X_train, Y_train)

  """Entry point for launching an IPython kernel.


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [92]:
accuracy = clf.score(X_test, Y_test)

In [93]:
accuracy

0.7450980392156863

# Fine Tunning our model (using 1000 iterations)

In [34]:
# We will finetune our model using two methods.
# We first will choose the best combination of columns to be used as predictors.
# Then we will change the parameters contained inside KNeighborsClassifier (algorithm, leaf_size, metric, metric_params, 
# n_neighbors, p, weights)
# We will always run 1000 times each option and calculate the mean score.

## Choose the right columns

In [35]:
A = ['age', 'weight_kg', 'height_cm', 'waiting_time', 'bmd', 'sex_F', 'sex_M', 'medication_Anticonvulsant', 'medication_Glucocorticoids', 'medication_No medication']
B = ['age', 'weight_kg', 'height_cm', 'waiting_time', 'bmd', 'medication_Anticonvulsant', 'medication_Glucocorticoids', 'medication_No medication']
C = ['age', 'weight_kg', 'height_cm', 'bmd']
D = ['age', 'bmd']
list = A,B,C,D

In [36]:
accuracyList = []
for i in list:
    X = new_bmd[i]
    Y = new_bmd['fracture']
    for r in range(1,1001):
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
        clf = neighbors.KNeighborsClassifier()
        clf.fit(X_train, Y_train)
        accuracy = clf.score(X_test, Y_test)
        accuracyList.append(accuracy)
        result = sum(accuracyList)/len(accuracyList)
    print('Columns: ' + str(i) + " - Result: " + str(result))


Columns: ['age', 'weight_kg', 'height_cm', 'waiting_time', 'bmd', 'sex_F', 'sex_M', 'medication_Anticonvulsant', 'medication_Glucocorticoids', 'medication_No medication'] - Result: 0.7321764705882339
Columns: ['age', 'weight_kg', 'height_cm', 'waiting_time', 'bmd', 'medication_Anticonvulsant', 'medication_Glucocorticoids', 'medication_No medication'] - Result: 0.7325686274509816
Columns: ['age', 'weight_kg', 'height_cm', 'bmd'] - Result: 0.7330522875816999
Columns: ['age', 'bmd'] - Result: 0.7302352941176258


In [37]:
# The best score is obtained using the variables 'age', 'weight_kg', 'height_cm', 'bmd' as predictors.

## Choosing the best value for "n_neighbors"

In [38]:
X = new_bmd[['age', 'weight_kg', 'height_cm', 'bmd']]
Y = new_bmd['fracture']

In [39]:
A = 5
B = 11
C = 19
D = 25
E = 35
F = 45
list = A,B,C,D,E,F

In [40]:
accuracyList = []
for i in list:
    for r in range(1,1001):
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
        clf = neighbors.KNeighborsClassifier(n_neighbors = i)
        clf.fit(X_train, Y_train)
        accuracy = clf.score(X_test, Y_test)
        accuracyList.append(accuracy)
        result = sum(accuracyList)/len(accuracyList)
    print('N_Neighbors: ' + str(i) + " - Result: " + str(result))


N_Neighbors: 5 - Result: 0.7353921568627452
N_Neighbors: 11 - Result: 0.7530784313725433
N_Neighbors: 19 - Result: 0.762444444444441
N_Neighbors: 25 - Result: 0.7661617647058919
N_Neighbors: 35 - Result: 0.7655137254902098
N_Neighbors: 45 - Result: 0.7597973856209491


In [41]:
# The best score is obtained with the value "25" for the n_neighbors parameter

## Choosing the best value for "algorithm"

In [42]:
A = "auto"
B = "ball_tree"
C = "kd_tree"
D = "brute"
list = A,B,C,D

In [43]:
accuracyList = []
for i in list:
    for r in range(1,1001):
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
        clf = neighbors.KNeighborsClassifier(algorithm = i, n_neighbors = 25)
        clf.fit(X_train, Y_train)
        accuracy = clf.score(X_test, Y_test)
        accuracyList.append(accuracy)
        result = sum(accuracyList)/len(accuracyList)
    print('Algorithm: ' + str(i) + " - Result: " + str(result))

Algorithm: auto - Result: 0.7774117647058819
Algorithm: ball_tree - Result: 0.7774117647058755
Algorithm: kd_tree - Result: 0.7779411764705864
Algorithm: brute - Result: 0.7773676470588345


In [46]:
# None of the results is much higher than the others. We will keep the default option (auto)

In [47]:
# Other parameters, like 'algorithm','leaf_size','metric','metric_params','n_job','weight' and 'p' do not improve the model. 

## Our final model 

In [48]:
for r in range(1,1001):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
    clf = neighbors.KNeighborsClassifier(n_neighbors = 25)
    clf.fit(X_train, Y_train)
    accuracy = clf.score(X_test, Y_test)
    accuracyList.append(accuracy)
    result = sum(accuracyList)/len(accuracyList)
print('Final model: ' + str(i) + " - Result: " + str(result))

Final model: brute - Result: 0.7774869281046202
