In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [40]:
# Render matplotlib figures inline, directly beneath the cell that creates them
%matplotlib inline

In [58]:
# Location of the raw asthma CSV inside the FitnessLive GitHub repository
url = (
    'https://raw.githubusercontent.com/amlannandy/FitnessLive/'
    'master/prediction-models/data/asthma.csv'
)

In [59]:
# Load the dataset from the remote CSV, then list the column labels it arrived with
asthma = pd.read_csv(filepath_or_buffer=url)
print(asthma.columns)

Index(['Diagnosis', 'ID', 'Imaginary Part', 'Unnamed: 3', 'Real Part',
       'Unnamed: 5', 'Gender', 'Age', 'Smoking', 'Unnamed: 9', 'Unnamed: 10',
       'Unnamed: 11', 'Unnamed: 12'],
      dtype='object')


In [60]:
# Discard the first three rows of the loaded file (presumably non-data /
# sub-header rows — TODO confirm against the CSV).
# A label-based slice is used instead of a positional one: on the default
# 0..N-1 index produced by read_csv it selects exactly the same rows the first
# time, but re-running this cell is a no-op rather than silently dropping
# three more rows each time.
asthma = asthma.loc[3:]

In [61]:
# Remove the three trailing 'Unnamed' columns. They are dropped by name rather
# than by position so that re-running this cell cannot strip further (real)
# columns; errors='ignore' makes a repeat run a no-op.
asthma = asthma.drop(
    columns=['Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12'],
    errors='ignore',
)

In [62]:
# Preview the top five rows of the trimmed frame
asthma.head(n=5)

Unnamed: 0,Diagnosis,ID,Imaginary Part,Unnamed: 3,Real Part,Unnamed: 5,Gender,Age,Smoking,Unnamed: 9
3,COPD,302-3,-325.39,-314.7503595,-473.73,-469.2631404,0.0,72.0,2.0,
4,COPD,303-3,-323.0,-317.4360556,-476.12,-471.8976667,1.0,73.0,3.0,
5,COPD,304-4,-327.78,-317.3996698,-473.73,-468.856388,1.0,76.0,2.0,
6,COPD,305-4,-325.39,-316.1557853,-478.52,-472.8697828,0.0,65.0,2.0,
7,COPD,306-3,-327.78,-318.6775535,-507.23,-469.0241943,1.0,60.0,2.0,


In [63]:
# Summarise each column's dtype and non-null count to spot columns with missing values
asthma.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 3 to 400
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Diagnosis       398 non-null    object 
 1   ID              398 non-null    object 
 2   Imaginary Part  99 non-null     object 
 3   Unnamed: 3      99 non-null     object 
 4   Real Part       99 non-null     object 
 5   Unnamed: 5      99 non-null     object 
 6   Gender          398 non-null    float64
 7   Age             398 non-null    float64
 8   Smoking         398 non-null    float64
 9   Unnamed: 9      0 non-null      float64
dtypes: float64(4), object(6)
memory usage: 31.2+ KB


In [68]:
# Drop the ID column. Reassigning (instead of inplace=True) with
# errors='ignore' keeps this cell safe to re-run: a second execution no longer
# raises KeyError because 'ID' is already gone.
asthma = asthma.drop(columns='ID', errors='ignore')

# Fill missing values: Diagnosis defaults to 'COPD', every other gap becomes 0.
# The column-specific fill is passed as a dict to DataFrame.fillna rather than
# calling fillna(inplace=True) on asthma['Diagnosis'] — the chained in-place
# form acts on a temporary object and does not update the frame under pandas
# Copy-on-Write.
# NOTE(review): most measurement values are missing (99 of 398 non-null in the
# info() output); filling them with 0 feeds made-up readings to the model —
# confirm this is intended.
asthma = asthma.fillna({'Diagnosis': 'COPD'}).fillna(0)

In [69]:
# Inspect the first 20 rows of the cleaned frame
asthma.head(n=20)

Unnamed: 0,Diagnosis,Imaginary Part,Unnamed: 3,Real Part,Unnamed: 5,Gender,Age,Smoking,Unnamed: 9
3,COPD,-325.39,-314.7503595,-473.73,-469.2631404,0.0,72.0,2.0,0.0
4,COPD,-323.0,-317.4360556,-476.12,-471.8976667,1.0,73.0,3.0,0.0
5,COPD,-327.78,-317.3996698,-473.73,-468.856388,1.0,76.0,2.0,0.0
6,COPD,-325.39,-316.1557853,-478.52,-472.8697828,0.0,65.0,2.0,0.0
7,COPD,-327.78,-318.6775535,-507.23,-469.0241943,1.0,60.0,2.0,0.0
8,COPD,-330.18,-320.6174777,-473.73,-467.3618538,1.0,76.0,2.0,0.0
9,COPD,0.0,0.0,0.0,0.0,1.0,77.0,2.0,0.0
10,COPD,-320.61,-307.5995856,-476.12,-470.1816328,1.0,74.0,2.0,0.0
11,COPD,-315.82,-300.104765,-473.73,-466.3786343,1.0,67.0,2.0,0.0
12,COPD,-320.61,-308.117898,-471.34,-466.13057,1.0,65.0,2.0,0.0


In [73]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Feature matrix: every column except the target. KNN works on numeric
# distances, so the string-valued Diagnosis column (e.g. 'COPD') is mapped to
# integer category codes and all columns are coerced to a numeric dtype (the
# measurement columns are loaded as object). Without this, fit() cannot convert
# the Diagnosis strings to float. Column count and order are unchanged.
# NOTE(review): integer codes impose an arbitrary ordering on Diagnosis and the
# features are unscaled — consider one-hot encoding and standardising.
# NOTE(review): the target is Gender, exactly as in the original cell — confirm
# that Diagnosis was not the intended label.
features = (
    asthma.drop(columns='Gender')
    .assign(Diagnosis=lambda frame: frame['Diagnosis'].astype('category').cat.codes)
    .apply(pd.to_numeric)
)
target = asthma['Gender']

# Stratified split with a fixed seed so class balance and results are reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features, target, stratify=target, random_state=66
)

# Sweep n_neighbors from 1 to 10, recording accuracy on both splits
training_accuracy = []
test_accuracy = []
neighbors_settings = range(1, 11)
for n_neighbors in neighbors_settings:
    # build and fit the model for this k
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    # accuracy on the data the model was fitted on
    training_accuracy.append(knn.score(X_train, y_train))
    # accuracy on the held-out data
    test_accuracy.append(knn.score(X_test, y_test))

In [None]:
# Final model: refit with k = 9 (presumably picked from the sweep above — TODO
# confirm) and report accuracy on both splits
knn = KNeighborsClassifier(n_neighbors=9).fit(X_train, y_train)
train_score = knn.score(X_train, y_train)
test_score = knn.score(X_test, y_test)
print('Accuracy of K-NN classifier on training set: {:.2f}'.format(train_score))
print('Accuracy of K-NN classifier on test set: {:.2f}'.format(test_score))

Accuracy of K-NN classifier on training set: 0.79
Accuracy of K-NN classifier on test set: 0.78


In [None]:
# One hand-written observation, shaped (1, n_features) as predict() expects.
# NOTE(review): these values do not resemble the feature ranges shown earlier
# in the notebook (e.g. Imaginary Part around -325) — confirm the feature order
# and that this sample belongs to this dataset.
sample = np.array([[6, 128, 72, 35, 0, 33.6, 0.627, 50]])
print(sample)

[[  6.    128.     72.     35.      0.     33.6     0.627  50.   ]]


In [None]:
# Classify the hand-built sample and show the predicted label next to the
# per-class probabilities
prob = knn.predict_proba(sample)
res = knn.predict(sample)
print(res, prob)

[1] [[0.44444444 0.55555556]]


In [None]:
import pickle

In [None]:
# Persist the fitted classifier to disk. The context manager guarantees the
# file handle is flushed and closed even if pickling fails (the bare open()
# call previously left it open).
# NOTE: only load model.pkl from a trusted source — unpickling can execute
# arbitrary code.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(knn, model_file)