In [None]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from numpy import mean
from numpy import std
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from matplotlib import pyplot as plt
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_predict

In [None]:
# Switch the working directory so that the relative path used when the
# spreadsheet is loaded resolves.
# NOTE(review): absolute, machine-specific path — the notebook only runs where
# this exact directory exists.
os.chdir('/Users/andrei-macpro/Documents/Data/Classification/speech')

In [None]:
# Load the dataset from the Excel workbook found in the current working
# directory (parsed with the openpyxl engine).
data = pd.read_excel('classification.xlsx', engine='openpyxl')

In [None]:
# Feature matrix: the sheet columns at positions 1 through 11.
feature_frame = data.iloc[:, 1:12]
X = feature_frame.to_numpy()

# Binary target built from the last column: 'no_rad' -> 0, anything else -> 1.
label_column = data.iloc[:, -1]
y = np.array([0 if label == 'no_rad' else 1 for label in label_column])

In [None]:
# One subject identifier per row, copied out of the Subject_ID column; it is
# shuffled together with X and y and then used as the `groups` argument of the
# cross-validation calls.
groups = np.array(data['Subject_ID'])

In [None]:
# Apply one seeded permutation to features, labels and subject ids so the
# three arrays stay row-aligned.
X_shuffled, y_shuffled, groups_shuffled = shuffle(X, y, groups, random_state=1)

In [None]:
# Model definition: standardise the features, then fit a linear-kernel SVM.
# Wrapping both in a Pipeline means the scaler is re-fit inside each
# cross-validation training split.
steps = [
    ('scaler', StandardScaler()),
    ('model', SVC(kernel='linear')),
]
pipeline = Pipeline(steps=steps)

In [None]:
# Six-fold splitter that is driven by the `groups` array passed to the
# cross-validation helpers.
cv = GroupKFold(n_splits=6)

In [None]:
# Per-fold accuracy of the pipeline under the `cv` splitter, with the subject
# ids as groups; n_jobs=-1 lets the folds run in parallel.
scores = cross_val_score(
    pipeline,
    X_shuffled,
    y_shuffled,
    groups=groups_shuffled,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
)

In [None]:
# Report mean and standard deviation of the fold accuracies, as percentages.
mean_pct = mean(scores) * 100
std_pct = std(scores) * 100
print('Accuracy: %.4f (%.3f)' % (mean_pct, std_pct))

In [None]:
# Repeat the grouped cross-validation on differently seeded row shuffles,
# printing each run's mean/std accuracy (in percent) and collecting the
# per-fold score arrays in `accuracy`.
# NOTE(review): the splitter itself is given no seed, so reshuffling the rows
# may not change which subjects land in which fold — confirm that these
# repetitions really differ from one another.
n_shuffles = 10
accuracy = []
for seed in range(n_shuffles):
    X_shuffle, y_shuffle, groups_shuffle = shuffle(X, y, groups, random_state=seed)
    scores = cross_val_score(
        pipeline,
        X_shuffle,
        y_shuffle,
        groups=groups_shuffle,
        cv=cv,
        scoring='accuracy',
    )
    run_mean_pct = mean(scores) * 100
    run_std_pct = std(scores) * 100
    print('Accuracy: %.4f (%.3f)' % (run_mean_pct, run_std_pct))
    accuracy.append(scores)

In [None]:
# Out-of-order cell: `y_pred` is first assigned by the cross_val_predict cell
# that follows, so evaluating it here raised NameError on a fresh kernel
# (Restart & Run All). The predictions are inspected in the cells after that
# assignment instead.

In [None]:
# One prediction per row, produced by cross_val_predict with the `cv`
# splitter and the subject ids as groups; n_jobs=-1 runs folds in parallel.
y_pred = cross_val_predict(
    pipeline,
    X_shuffled,
    y_shuffled,
    groups=groups_shuffled,
    cv=cv,
    n_jobs=-1,
)

In [None]:
# Number of predictions returned by cross_val_predict.
len(y_pred)

In [None]:
# Display the shuffled labels (0 = 'no_rad', 1 = any other value).
y_shuffled

In [None]:
# Display the shuffled subject ids, in the same row order as y_shuffled.
groups_shuffled

In [None]:
# Display labels, predictions and subject ids together (a tuple of the three
# row-aligned arrays).
y_shuffled, y_pred, groups_shuffled

In [None]:
# Prediction table with one row per sample: true label, predicted label and
# subject id. Built column-wise so the arrays keep their dtypes and pandas
# raises if their lengths ever differ (a row-wise zip would silently truncate
# to the shortest array).
final_df = pd.DataFrame(
    {
        'label': y_shuffled,
        'prediction': y_pred,
        's_id': groups_shuffled,
    }
)

In [None]:
# Display the prediction table.
final_df

In [None]:
# Among the rows predicted as class 1, summarise how many are exact
# duplicates of an earlier row (equal in every column).
# Author's note from an earlier run: 41 with False.
predicted_one = final_df[final_df['prediction'] == 1]
predicted_one.duplicated().describe()

In [None]:
# Among the rows predicted as class 0, summarise how many are exact
# duplicates of an earlier row (equal in every column).
# Author's note from an earlier run: 42 (the original note is cut off here).
predicted_zero = final_df[final_df['prediction'] == 0]
predicted_zero.duplicated().describe()

In [None]:
# Step 1: find the subjects whose rows disagree in their predicted label.
# A row is flagged True when at least one other row has the same subject id
# AND the same prediction. Despite the column name, True therefore marks
# agreement; False marks a row with no matching partner (a disagreement, or a
# subject that has only a single row).
final_df['discarded'] = final_df.duplicated(subset=['s_id', 'prediction'], keep=False)

In [None]:
# Display the per-row flag (True = same subject id and prediction as another row).
final_df['discarded']

In [None]:
# Recompute the (subject id, prediction) duplicate mask directly, as a check
# against the stored 'discarded' column.
final_df[['s_id','prediction']].duplicated(keep=False)

In [None]:
# Keep only the rows whose prediction agrees with another row of the same
# subject, i.e. the rows flagged as duplicated.
# Author's notes from an earlier run: 68 rows agree and 49 do not; the 68
# agreeing rows correspond to 34 of the 56 people that have both recordings,
# and the remaining 22 people are left to the clinician.
final_df[final_df['discarded']]

In [None]:
# Print every row and column of the prediction table, ordered by subject id.
# `final_df` is used because `df` is not assigned until a later cell, so the
# original reference raised NameError on a fresh kernel.
# NOTE(review): confirm `final_df` is the table that was meant to be shown.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(final_df.sort_values(by=['s_id']))

In [None]:
# Rows that have no same-subject partner with an equal prediction
# (flag is False).
df_disagree = final_df[~final_df['discarded']]

In [None]:
# Display the rows without an agreeing partner.
df_disagree

In [None]:
# Remove five rows by index label.
# NOTE(review): these labels are positions in the random_state=1 shuffle
# order, so they are only valid for that seed and this dataset; presumably
# they are subjects with a single recording — confirm. Re-running this cell
# raises KeyError because the labels are already gone.
rows_to_remove = [79, 0, 42, 88, 33]
df_disagree = df_disagree.drop(index=rows_to_remove)

In [None]:
# How many of the disagreeing rows have true label 0 ('no_rad').
# Author's notes from an earlier run: 22 out of 46 rows, i.e. 11 of the 23
# children with a disagreement actually had a no_rad label.
int((df_disagree['label'] == 0).sum())

In [None]:
# Rows whose prediction agrees with another row of the same subject. Despite
# the name, these are the rows that are kept for scoring.
final_discarded = final_df[final_df['discarded']]

In [None]:
# Number of agreeing rows.
# Author's note from an earlier run: 34 people whose two recordings agree.
# NOTE(review): len() counts rows, not people — confirm which one is meant.
len(final_discarded)

In [None]:
# `df` is the agreeing-rows table again (every row of `final_discarded`
# already has the flag set); it is kept because a later cell sorts and
# displays it.
df = final_discarded.loc[final_discarded['discarded']==True]

# Number of agreeing rows whose prediction equals the true label.
# Computed with a vectorised comparison instead of a `for x, y in zip(...)`
# loop: the loop variables overwrote the notebook-level label array `y`, which
# broke any earlier cell that was re-run afterwards.
# Note: `accuracy` is a count here (not a ratio) and replaces the list of
# per-fold scores stored under the same name earlier in the notebook; the name
# is kept because the following cell divides it by the row count.
accuracy = int((final_discarded['label'] == final_discarded['prediction']).sum())
accuracy

In [None]:
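# Author's note from an earlier run: 23 correct and 11 wrong. NOTE(review): the "out of 56" in the original note does not equal 23 + 11 = 34 — confirm the intended denominator.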

In [None]:
# Share of agreeing rows that were classified correctly. The denominator is
# the actual number of agreeing rows rather than a hard-coded 68, so the ratio
# stays right if the data or the shuffle seed changes.
accuracy / len(final_discarded)

In [None]:
# Confusion counts on the agreeing rows, with class 1 as the positive class.
#   tp: label 1 and predicted correctly
#   fn: label 1 but predicted wrongly  (a positive that was missed)
#   fp: label 0 but predicted wrongly  (a negative flagged as positive)
# The original loop had fp and fn swapped, which swapped recall and precision
# downstream. It also used `x, y` as loop variables and thereby overwrote the
# notebook-level label array `y`; the vectorised form avoids both problems.
true_label = final_discarded['label']
predicted_label = final_discarded['prediction']
is_wrong = true_label != predicted_label

tp = int((~is_wrong & (true_label == 1)).sum())
fn = int((is_wrong & (true_label == 1)).sum())
fp = int((is_wrong & (true_label == 0)).sum())

In [None]:
# Precision = TP / (TP + FP); recall = TP / (TP + FN).
# Displayed as the pair (recall, precision).
precision = tp / (tp + fp)
recall = tp / (tp + fn)
(recall, precision)

In [None]:
# Display `df` ordered by subject id.
df.sort_values(by=['s_id'])

In [None]:
# Display the agreeing rows ordered by subject id.
final_discarded.sort_values(by='s_id')