In [1]:
import pandas as pd
import numpy as np
file = '../../../DATA/OSA_extreme_male.xlsx'
data = pd.read_excel(file)
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Patient,Gender,IAH,Weight,Height,Age,Cervical,Smoker,Snorer,Illness,OSA,BMI
0,P0005,1,9.0,80,173,32,40,2,4,2,Healthy,26.729927
1,P0006,1,2.0,109,190,32,42,2,4,2,Healthy,30.193906
2,P0007,1,34.0,86,169,39,42,2,4,2,Severe,30.110991
3,P0008,1,60.0,145,172,47,44,2,4,2,Severe,49.01298
4,P0013,1,3.7,90,180,36,40,3,4,2,Healthy,27.777778


In [2]:
df = df.drop(['Gender','Smoker', 'Snorer', 'Illness'], axis=1)
df.head()

Unnamed: 0,Patient,IAH,Weight,Height,Age,Cervical,OSA,BMI
0,P0005,9.0,80,173,32,40,Healthy,26.729927
1,P0006,2.0,109,190,32,42,Healthy,30.193906
2,P0007,34.0,86,169,39,42,Severe,30.110991
3,P0008,60.0,145,172,47,44,Severe,49.01298
4,P0013,3.7,90,180,36,40,Healthy,27.777778


In [3]:
#Index transformation
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
df['OSA']=le.fit_transform(df['OSA'])


In [4]:
df.head()

Unnamed: 0,Patient,IAH,Weight,Height,Age,Cervical,OSA,BMI
0,P0005,9.0,80,173,32,40,0,26.729927
1,P0006,2.0,109,190,32,42,0,30.193906
2,P0007,34.0,86,169,39,42,1,30.110991
3,P0008,60.0,145,172,47,44,1,49.01298
4,P0013,3.7,90,180,36,40,0,27.777778


In [5]:
df.set_index('Patient', inplace=True)

In [6]:
df.head()

Unnamed: 0_level_0,IAH,Weight,Height,Age,Cervical,OSA,BMI
Patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
P0005,9.0,80,173,32,40,0,26.729927
P0006,2.0,109,190,32,42,0,30.193906
P0007,34.0,86,169,39,42,1,30.110991
P0008,60.0,145,172,47,44,1,49.01298
P0013,3.7,90,180,36,40,0,27.777778


In [7]:
#features 
predictors = ['IAH','Weight', 'Height', 'Age', 'Cervical', 'BMI']
x = df[predictors].values
y = df['OSA'].values

# Logistic regression 
### Grid search

In [None]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2, random_state = 42)

### grid search with splitted data

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
param_grid = {"C": np.logspace(-3,3,7), "penalty" : ["l1","l2"]}

logreg_1 = LogisticRegression()
logreg_1_cv = GridSearchCV(logreg_1,param_grid,cv = 10)
logreg_1_cv.fit(x_train,y_train)

In [None]:
print("tuned hyperparameters: (best parameters): ", logreg_1_cv.best_params_)
print("accuracy: ", logreg_1_cv.best_score_)

### grid search with  NO splitted data

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
param_grid = {"C": np.logspace(-3,3,7), "penalty" : ["l1","l2"]}

logreg_2 = LogisticRegression()
logreg_2_cv = GridSearchCV(logreg_2,param_grid,cv = 10)
logreg_2_cv.fit(x_train,y_train)

print("tuned hyperparameters: (best parameters): ", logreg_2_cv.best_params_)
print("accuracy: ", logreg_2_cv.best_score_)

In [16]:
df.describe

<bound method NDFrame.describe of           IAH  Weight  Height  Age  Cervical  OSA        BMI
Patient                                                     
P0005     9.0      80     173   32        40    0  26.729927
P0006     2.0     109     190   32        42    0  30.193906
P0007    34.0      86     169   39        42    1  30.110991
P0008    60.0     145     172   47        44    1  49.012980
P0013     3.7      90     180   36        40    0  27.777778
...       ...     ...     ...  ...       ...  ...        ...
P0668    41.5     113     180   44        44    1  34.876543
P0669     2.7      85     180   33        40    0  26.234568
P0674    33.0      90     179   57        42    1  28.089011
P0679     9.2     105     180   35        45    0  32.407407
P0680    52.2      90     180   50        42    1  27.777778

[277 rows x 7 columns]>

In [20]:
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.model_selection import KFold
from sklearn import metrics


k = 5
kf = KFold(n_splits=k)
precisions = []
i = 0
accuracy_global = []
f1_global = []

for train, test in kf.split(x):
    i+=1
    x_train, x_test = x[train], x[test]
    y_train, y_test = y[train], y[test]
    model = LogisticRegression(C=0.001, penalty='l2').fit(x_train,y_train)
    y_out = model.predict(x_test)
    
    accuracy = metrics.accuracy_score(y_test, y_out)
    f1_score = metrics.f1_score(y_test, y_out, average="weighted")
   
    accuracy_global.append(accuracy)
    f1_global.append(f1_score)
    print(metrics.confusion_matrix(y_test, y_out))
    print(metrics.classification_report(y_test, y_out))
    
    
    print("Iteration {}:accuracy={:.4f} f1={:.4f}".format(i, accuracy, f1_score))
print("Global: accuracy={:.4f} f1={:.4f}\n".format(np.average(accuracy_global), np.average(f1_global)))



[[29  0]
 [ 0 27]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        29
           1       1.00      1.00      1.00        27

    accuracy                           1.00        56
   macro avg       1.00      1.00      1.00        56
weighted avg       1.00      1.00      1.00        56

Iteration 1:accuracy=1.0000 f1=1.0000
[[34  0]
 [ 0 22]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        34
           1       1.00      1.00      1.00        22

    accuracy                           1.00        56
   macro avg       1.00      1.00      1.00        56
weighted avg       1.00      1.00      1.00        56

Iteration 2:accuracy=1.0000 f1=1.0000
[[25  0]
 [ 0 30]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        25
           1       1.00      1.00      1.00        30

    accuracy                           1.00      

# KNN

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
param_grid = {"n-"}

logreg_2 = LogisticRegression()
logreg_2_cv = GridSearchCV(logreg_2,param_grid,cv = 10)
logreg_2_cv.fit(x_train,y_train)

print("tuned hyperparameters: (best parameters): ", logreg_2_cv.best_params_)
print("accuracy: ", logreg_2_cv.best_score_)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.model_selection import KFold
from sklearn import metrics

# Plot a confusion matrix.
# cm is the confusion matrix, names are the names of the classes.
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    

k = 5
kf = KFold(n_splits=k)
precisions = []
i = 0
accuracy_global = []
f1_global = []

for train, test in kf.split(x):
    i+=1
    x_train, x_test = x[train], x[test]
    y_train, y_test = y[train], y[test]
    model = KNeighborsClassifier(n_neighbors=4).fit(x_train,y_train)
    y_out = model.predict(x_test)
    
    # Measure accuracy
    score = metrics.accuracy_score(y_test, y_out)
    accuracy_global.append(accuracy)
    
    print("Iteration {}:accuracy={:.4f}".format(i, accuracy))
print("Global: accuracy={:.4f} \n".format(np.average(accuracy_global)))
