In [None]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from numpy import mean
from numpy import std
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from matplotlib import pyplot as plt
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
import itertools 
from matplotlib.pyplot import figure

In [None]:
# Working directory that holds classification.xlsx. Set the SPEECH_DATA_DIR
# environment variable to run the notebook on another machine; when it is not
# set, the original author's local path is used unchanged.
DATA_DIR = os.environ.get('SPEECH_DATA_DIR', '/Users/andrei-macpro/Documents/Data/Classification/speech')
os.chdir(DATA_DIR)

In [None]:
# Load the workbook from the current working directory using the openpyxl reader.
data = pd.read_excel(
    io='classification.xlsx',
    engine='openpyxl',
)

In [None]:
# Feature matrix: the sheet columns at positions 1 through 11.
feature_columns = data.iloc[:, 1:12]
X = feature_columns.to_numpy()
# Binary target from the last column: 'no_rad' -> 0, any other value -> 1.
y = data.iloc[:, -1].ne('no_rad').to_numpy().astype(int)

In [None]:
# Binary target from the last column: 'no_rad' -> 0, any other value -> 1.
# (This repeats the label encoding already done in the previous cell.)
y = data.iloc[:, -1].ne('no_rad').to_numpy().astype(int)

In [None]:
# Subject identifier of every row; used below as the grouping key for the folds.
groups = data['Subject_ID'].to_numpy(copy=True)

In [None]:
# Standardise every feature column (StandardScaler does not use y; it is only
# passed through, as in the combined fit_transform call).
# NOTE(review): the scaler is fitted on all rows before the cross-validation
# split, so test-fold rows contribute to the mean/std - confirm this is intended.
scaler = StandardScaler()
scaler.fit(X, y)
X = scaler.transform(X)

In [None]:
# Shuffle the rows once with a fixed seed; features, labels and subject ids are
# permuted together so they stay aligned.
shuffled_arrays = shuffle(X, y, groups, random_state=1)
X_shuffled, y_shuffled, groups_shuffled = shuffled_arrays

# Six folds in which no subject appears in both the train and the test part.
group_k_fold = GroupKFold(n_splits=6)
# One-shot generator of (train_index, test_index) pairs.
splits = group_k_fold.split(X=X_shuffled, y=y_shuffled, groups=groups_shuffled)

In [None]:
# Support-vector classifier with a linear kernel; every other setting is the
# library default.
SVC_KERNEL = 'linear'
model = SVC(kernel=SVC_KERNEL)

In [None]:
# Per-fold collectors, one entry appended per cross-validation fold.
list_indexes_train = []   # never filled in this cell
list_indexes_test = []    # [[subject ids of the test rows]], one wrapper list per fold
predictions = []          # predicted labels of the test rows
y_test_index = []         # [[true labels of the test rows]], one wrapper list per fold

fold_pairs = group_k_fold.split(X_shuffled, y=y_shuffled, groups=groups_shuffled)
for train_index, test_index in fold_pairs:
    # Refit the same estimator on this fold's training rows, then score its test rows.
    model.fit(X_shuffled[train_index], y_shuffled[train_index])
    predictions.append(model.predict(X_shuffled[test_index]))
    # The arrays are wrapped in a one-element list because the cell that
    # flattens the folds reads them back as [fold][0].
    list_indexes_test.append([groups_shuffled[test_index]])
    y_test_index.append([y_shuffled[test_index]])

In [None]:
# True labels of each fold's test rows, obtained by running the same split again.
labels = []
for train_index, test_index in group_k_fold.split(X_shuffled, groups=groups_shuffled, y=y_shuffled):
    labels += [y_shuffled[test_index]]

In [None]:
# Flatten the per-fold results into one list of
# (subject id, predicted label, true label) tuples, one tuple per test row.
#
# A comprehension is used so that no loop variable escapes into the notebook
# namespace: the previous `for x, y, z in ...` loop overwrote the label array
# `y` with a single prediction. Iterating over the collected folds also removes
# the hard-coded fold count of 6.
tuple_index_pred = [
    (subject_id, predicted_label, true_label)
    for fold_subjects, fold_predictions, fold_truth in zip(list_indexes_test, predictions, y_test_index)
    # fold_subjects and fold_truth are one-element lists wrapping the actual arrays.
    for subject_id, predicted_label, true_label in zip(fold_subjects[0], fold_predictions, fold_truth[0])
]

In [None]:
# One row per test recording. Column meaning:
#   'index'        - subject id of the recording
#   'label'        - label predicted by the classifier
#   'ground_truth' - true label
final_df = pd.DataFrame(tuple_index_pred, columns=['index', 'label', 'ground_truth'])
final_df

In [None]:
# Rows predicted as class 1: summarise how many are exact repeats of an earlier row.
# Author's note from the original run: 46 False and 19 True, i.e. 19 subjects
# have both recordings predicted as 1.
predicted_positive = final_df[final_df['label'].eq(1)]
predicted_positive.duplicated().describe()

In [None]:
# Rows predicted as class 0: summarise how many are exact repeats of an earlier row.
# Author's note from the original run: 38 False and 14 True, i.e. 14 subjects
# have both recordings predicted as 0.
predicted_negative = final_df[final_df['label'].eq(0)]
predicted_negative.duplicated().describe()

In [None]:
# Of the 56 subjects with two recordings, 12 have both recordings classified correctly; for the remaining 44, one or both recordings are misclassified.

In [None]:
# Step 1: flag the subjects whose recordings received the same predicted label.
# Despite its name, 'discarded' is True for the rows that AGREE (the same
# subject id / predicted label pair occurs more than once) and False for rows
# whose pair occurs only once.
final_df['discarded'] = final_df.duplicated(subset=['index', 'label'], keep=False)

In [None]:
# Show only the rows whose predicted label is repeated for the same subject.
# Author's notes from the original run: 66 rows agree and 51 do not; the 66
# rows correspond to 33 of the 56 subjects that have two recordings, and the
# remaining 23 subjects are left to the clinician.
final_df[final_df['discarded']]

In [None]:
# Rows whose (subject id, predicted label) pair occurs only once.
df_disagree = final_df[~final_df['discarded']]


In [None]:
# Number of rows that remain in df_disagree after the five rows removed in the
# next cell; the original run gave 46. (The earlier `... = 46` form was an
# assignment to an expression and stopped the notebook with a SyntaxError.)
len(df_disagree) - 5

In [None]:
# Row labels removed by hand from the disagreement table.
# NOTE(review): presumably the subjects with a single recording - confirm.
# Running this cell a second time raises KeyError because the labels are gone.
rows_to_drop = [102, 73, 83, 44, 47]
df_disagree = df_disagree.drop(index=rows_to_drop)

In [None]:
# Rows kept for scoring: those flagged True in 'discarded', i.e. the subject's
# predicted label is repeated. The accuracy below is computed over these rows.
agreement_mask = final_df['discarded']
final_discarded = final_df[agreement_mask]

In [None]:
# Number of kept rows whose predicted label equals the ground truth (a count,
# not yet a ratio). Computed column-wise so that no loop variable overwrites
# the notebook-level label array `y`, as the previous `for x, y in ...` loop did.
accuracy = int((final_discarded['label'] == final_discarded['ground_truth']).sum())
accuracy

In [None]:
# Confusion counts over the kept rows, with class 1 as the positive class.
# Computed column-wise so that no loop variable overwrites the notebook-level
# label array `y`, as the previous `for x, y in ...` loop did.
predicted_kept = final_discarded['label']
mismatch = predicted_kept != final_discarded['ground_truth']

fp = int((mismatch & (predicted_kept == 1)).sum())    # predicted 1, truth differs
fn = int((mismatch & (predicted_kept == 0)).sum())    # predicted 0, truth differs
tp = int((~mismatch & (predicted_kept == 1)).sum())   # predicted 1, truth agrees

In [None]:
# Percentage of kept rows that were classified correctly (69% in the original
# run). The denominator is the actual number of kept rows - 66 in the original
# run - so the figure stays correct if the data or the model changes.
accuracy / len(final_discarded) * 100

In [None]:
# Baseline for a random guess: class balance of the ground truth among the kept rows.
# Author's note from the original run: 18 subjects without rad and 15 with rad
# among those kept (value_counts itself counts rows, i.e. recordings).
kept_ground_truth = final_discarded['ground_truth']
kept_ground_truth.value_counts()

In [None]:
# Recall = TP / (TP + FN) and precision = TP / (TP + FP), from the counts above.
actual_positive_total = tp + fn
predicted_positive_total = tp + fp
recall = tp / actual_positive_total
precision = tp / predicted_positive_total
(recall, precision)

In [None]:
# Inputs for the person-level confusion matrix (and possibly a ROC curve later):
# true and predicted labels of the kept rows as plain NumPy arrays.
y_true = final_discarded['ground_truth'].to_numpy(copy=True)
y_pred = final_discarded['label'].to_numpy(copy=True)

In [None]:
# Confusion matrix over the kept rows (rows = true class, columns = predicted class).
# labels=[0, 1] pins the class order and guarantees a 2x2 matrix even when one
# class is missing from the kept rows, which the two-label plot below relies on.
cm_person = confusion_matrix(y_true, y_pred, labels=[0, 1])

In [None]:
# Heat-map of the person-level confusion matrix (rows = actual, columns = predicted).
class_labels = ['no rad', 'rad']
# Tick positions come from the fixed class list. They used to be derived from
# `test_index`, a loop variable left over from the last cross-validation fold;
# a fold whose test rows held a single class gave one tick for two labels and
# made the tick-label call fail.
tick_marks = np.arange(len(class_labels))

fig, ax = plt.subplots(figsize=(8, 6), dpi=80)
image = ax.imshow(cm_person, cmap=plt.cm.Blues, interpolation='nearest')
fig.colorbar(image, ax=ax)
ax.set_title('Confusion Matrix Person level')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_xticks(tick_marks)
ax.set_xticklabels(class_labels)
ax.set_yticks(tick_marks)
ax.set_yticklabels(class_labels)

# Write each count inside its cell: white text on dark cells, black on light ones.
thresh = cm_person.max() / 2.
for i, j in itertools.product(range(cm_person.shape[0]), range(cm_person.shape[1])):
    ax.text(j, i, format(cm_person[i, j], 'd'),
            horizontalalignment='center',
            color='white' if cm_person[i, j] > thresh else 'black')
plt.show()